Index: user/dchagin/lemul/gnu/lib/libdialog/Makefile
===================================================================
--- user/dchagin/lemul/gnu/lib/libdialog/Makefile	(revision 274240)
+++ user/dchagin/lemul/gnu/lib/libdialog/Makefile	(revision 274241)
@@ -1,20 +1,23 @@
 # $FreeBSD$
 
 DIALOG=		${.CURDIR}/../../../contrib/dialog
 
 LIB=		dialog
 SHLIB_MAJOR=	8
 SRCS=		argv.c arrows.c buildlist.c buttons.c calendar.c checklist.c \
 		columns.c dlg_keys.c editbox.c fselect.c formbox.c guage.c \
 		help.c inputbox.c inputstr.c menubox.c mixedform.c \
 		mixedgauge.c mouse.c mousewget.c msgbox.c pause.c prgbox.c \
 		progressbox.c rangebox.c rc.c tailbox.c textbox.c timebox.c \
 		trace.c treeview.c ui_getc.c util.c version.c yesno.c
 INCS=		dialog.h dlg_colors.h dlg_config.h dlg_keys.h
 MAN=		dialog.3
 
+DPADD=		${LIBNCURSESW} ${LIBM}
+LDADD=		-lncursesw -lm
+
 CFLAGS+= 	-I${.CURDIR} -I${DIALOG} -D_XOPEN_SOURCE_EXTENDED -DGCC_UNUSED=__unused
 .PATH:		${DIALOG}
 WARNS?=		1
 
 .include <bsd.lib.mk>
Index: user/dchagin/lemul/gnu/lib
===================================================================
--- user/dchagin/lemul/gnu/lib	(revision 274240)
+++ user/dchagin/lemul/gnu/lib	(revision 274241)

Property changes on: user/dchagin/lemul/gnu/lib
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/gnu/lib:r273766-274240
Index: user/dchagin/lemul/sys/dev/hyperv/netvsc/hv_net_vsc.c
===================================================================
--- user/dchagin/lemul/sys/dev/hyperv/netvsc/hv_net_vsc.c	(revision 274240)
+++ user/dchagin/lemul/sys/dev/hyperv/netvsc/hv_net_vsc.c	(revision 274241)
@@ -1,1144 +1,1144 @@
 /*-
  * Copyright (c) 2009-2012 Microsoft Corp.
  * Copyright (c) 2010-2012 Citrix Inc.
  * Copyright (c) 2012 NetApp Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 /**
  * HyperV vmbus network VSC (virtual services client) module
  *
  */
 
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/socket.h>
 #include <sys/lock.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_arp.h>
 #include <machine/bus.h>
 #include <machine/atomic.h>
 
 #include <dev/hyperv/include/hyperv.h>
 #include "hv_net_vsc.h"
 #include "hv_rndis.h"
 #include "hv_rndis_filter.h"
 
 
 /*
  * Forward declarations
  */
 static void hv_nv_on_channel_callback(void *context);
 static int  hv_nv_init_send_buffer_with_net_vsp(struct hv_device *device);
 static int  hv_nv_init_rx_buffer_with_net_vsp(struct hv_device *device);
 static int  hv_nv_destroy_send_buffer(netvsc_dev *net_dev);
 static int  hv_nv_destroy_rx_buffer(netvsc_dev *net_dev);
 static int  hv_nv_connect_to_vsp(struct hv_device *device);
 static void hv_nv_on_send_completion(struct hv_device *device,
 				     hv_vm_packet_descriptor *pkt);
 static void hv_nv_on_receive(struct hv_device *device,
 			     hv_vm_packet_descriptor *pkt);
 static void hv_nv_send_receive_completion(struct hv_device *device,
 					  uint64_t tid);
 
 
 /*
  *
  */
 static inline netvsc_dev *
 hv_nv_alloc_net_device(struct hv_device *device)
 {
 	netvsc_dev *net_dev;
 	hn_softc_t *sc = device_get_softc(device->device);
 
 	net_dev = malloc(sizeof(netvsc_dev), M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (net_dev == NULL) {
 		return (NULL);
 	}
 
 	net_dev->dev = device;
 	net_dev->destroy = FALSE;
 	sc->net_dev = net_dev;
 
 	return (net_dev);
 }
 
 /*
  *
  */
 static inline netvsc_dev *
 hv_nv_get_outbound_net_device(struct hv_device *device)
 {
 	hn_softc_t *sc = device_get_softc(device->device);
 	netvsc_dev *net_dev = sc->net_dev;;
 
 	if ((net_dev != NULL) && net_dev->destroy) {
 		return (NULL);
 	}
 
 	return (net_dev);
 }
 
 /*
  *
  */
 static inline netvsc_dev *
 hv_nv_get_inbound_net_device(struct hv_device *device)
 {
 	hn_softc_t *sc = device_get_softc(device->device);
 	netvsc_dev *net_dev = sc->net_dev;;
 
 	if (net_dev == NULL) {
 		return (net_dev);
 	}
 	/*
 	 * When the device is being destroyed; we only
 	 * permit incoming packets if and only if there
 	 * are outstanding sends.
 	 */
 	if (net_dev->destroy && net_dev->num_outstanding_sends == 0) {
 		return (NULL);
 	}
 
 	return (net_dev);
 }
 
 /*
  * Net VSC initialize receive buffer with net VSP
  * 
  * Net VSP:  Network virtual services client, also known as the
  *     Hyper-V extensible switch and the synthetic data path.
  */
 static int 
 hv_nv_init_rx_buffer_with_net_vsp(struct hv_device *device)
 {
 	netvsc_dev *net_dev;
 	nvsp_msg *init_pkt;
 	int ret = 0;
 
 	net_dev = hv_nv_get_outbound_net_device(device);
 	if (!net_dev) {
 		return (ENODEV);
 	}
 
 	net_dev->rx_buf = contigmalloc(net_dev->rx_buf_size, M_DEVBUF,
 	    M_ZERO, 0UL, BUS_SPACE_MAXADDR, PAGE_SIZE, 0);
 	if (net_dev->rx_buf == NULL) {
 		ret = ENOMEM;
 		goto cleanup;
 	}
 
 	/*
 	 * Establish the GPADL handle for this buffer on this channel.
 	 * Note:  This call uses the vmbus connection rather than the
 	 * channel to establish the gpadl handle. 
 	 * GPADL:  Guest physical address descriptor list.
 	 */
 	ret = hv_vmbus_channel_establish_gpadl(
 		device->channel, net_dev->rx_buf,
 		net_dev->rx_buf_size, &net_dev->rx_buf_gpadl_handle);
 	if (ret != 0) {
 		goto cleanup;
 	}
 	
 	/* sema_wait(&ext->channel_init_sema); KYS CHECK */
 
 	/* Notify the NetVsp of the gpadl handle */
 	init_pkt = &net_dev->channel_init_packet;
 
 	memset(init_pkt, 0, sizeof(nvsp_msg));
 
 	init_pkt->hdr.msg_type = nvsp_msg_1_type_send_rx_buf;
 	init_pkt->msgs.vers_1_msgs.send_rx_buf.gpadl_handle =
 	    net_dev->rx_buf_gpadl_handle;
 	init_pkt->msgs.vers_1_msgs.send_rx_buf.id =
 	    NETVSC_RECEIVE_BUFFER_ID;
 
 	/* Send the gpadl notification request */
 
 	ret = hv_vmbus_channel_send_packet(device->channel, init_pkt,
 	    sizeof(nvsp_msg), (uint64_t)(uintptr_t)init_pkt,
 	    HV_VMBUS_PACKET_TYPE_DATA_IN_BAND,
 	    HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
 	if (ret != 0) {
 		goto cleanup;
 	}
 
 	sema_wait(&net_dev->channel_init_sema);
 
 	/* Check the response */
 	if (init_pkt->msgs.vers_1_msgs.send_rx_buf_complete.status
 	    != nvsp_status_success) {
 		ret = EINVAL;
 		goto cleanup;
 	}
 
 	net_dev->rx_section_count =
 	    init_pkt->msgs.vers_1_msgs.send_rx_buf_complete.num_sections;
 
 	net_dev->rx_sections = malloc(net_dev->rx_section_count *
 	    sizeof(nvsp_1_rx_buf_section), M_DEVBUF, M_NOWAIT);
 	if (net_dev->rx_sections == NULL) {
 		ret = EINVAL;
 		goto cleanup;
 	}
 	memcpy(net_dev->rx_sections, 
 	    init_pkt->msgs.vers_1_msgs.send_rx_buf_complete.sections,
 	    net_dev->rx_section_count * sizeof(nvsp_1_rx_buf_section));
 
 
 	/*
 	 * For first release, there should only be 1 section that represents
 	 * the entire receive buffer
 	 */
 	if (net_dev->rx_section_count != 1
 	    || net_dev->rx_sections->offset != 0) {
 		ret = EINVAL;
 		goto cleanup;
 	}
 
 	goto exit;
 
 cleanup:
 	hv_nv_destroy_rx_buffer(net_dev);
 	
 exit:
 	return (ret);
 }
 
 /*
  * Net VSC initialize send buffer with net VSP
  */
 static int 
 hv_nv_init_send_buffer_with_net_vsp(struct hv_device *device)
 {
 	netvsc_dev *net_dev;
 	nvsp_msg *init_pkt;
 	int ret = 0;
 
 	net_dev = hv_nv_get_outbound_net_device(device);
 	if (!net_dev) {
 		return (ENODEV);
 	}
 
 	net_dev->send_buf  = contigmalloc(net_dev->send_buf_size, M_DEVBUF,
 	    M_ZERO, 0UL, BUS_SPACE_MAXADDR, PAGE_SIZE, 0);
 	if (net_dev->send_buf == NULL) {
 		ret = ENOMEM;
 		goto cleanup;
 	}
 
 	/*
 	 * Establish the gpadl handle for this buffer on this channel.
 	 * Note:  This call uses the vmbus connection rather than the
 	 * channel to establish the gpadl handle. 
 	 */
 	ret = hv_vmbus_channel_establish_gpadl(device->channel,
 	    net_dev->send_buf, net_dev->send_buf_size,
 	    &net_dev->send_buf_gpadl_handle);
 	if (ret != 0) {
 		goto cleanup;
 	}
 
 	/* Notify the NetVsp of the gpadl handle */
 
 	init_pkt = &net_dev->channel_init_packet;
 
 	memset(init_pkt, 0, sizeof(nvsp_msg));
 
 	init_pkt->hdr.msg_type = nvsp_msg_1_type_send_send_buf;
 	init_pkt->msgs.vers_1_msgs.send_rx_buf.gpadl_handle =
 	    net_dev->send_buf_gpadl_handle;
 	init_pkt->msgs.vers_1_msgs.send_rx_buf.id =
 	    NETVSC_SEND_BUFFER_ID;
 
 	/* Send the gpadl notification request */
 
 	ret = hv_vmbus_channel_send_packet(device->channel, init_pkt,
 	    sizeof(nvsp_msg), (uint64_t)(uintptr_t)init_pkt,
 	    HV_VMBUS_PACKET_TYPE_DATA_IN_BAND,
 	    HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
 	if (ret != 0) {
 		goto cleanup;
 	}
 
 	sema_wait(&net_dev->channel_init_sema);
 
 	/* Check the response */
 	if (init_pkt->msgs.vers_1_msgs.send_send_buf_complete.status
 	    != nvsp_status_success) {
 		ret = EINVAL;
 		goto cleanup;
 	}
 
 	net_dev->send_section_size =
 	    init_pkt->msgs.vers_1_msgs.send_send_buf_complete.section_size;
 
 	goto exit;
 
 cleanup:
 	hv_nv_destroy_send_buffer(net_dev);
 	
 exit:
 	return (ret);
 }
 
 /*
  * Net VSC destroy receive buffer
  */
 static int
 hv_nv_destroy_rx_buffer(netvsc_dev *net_dev)
 {
 	nvsp_msg *revoke_pkt;
 	int ret = 0;
 
 	/*
 	 * If we got a section count, it means we received a
 	 * send_rx_buf_complete msg 
 	 * (ie sent nvsp_msg_1_type_send_rx_buf msg) therefore,
 	 * we need to send a revoke msg here
 	 */
 	if (net_dev->rx_section_count) {
 		/* Send the revoke receive buffer */
 		revoke_pkt = &net_dev->revoke_packet;
 		memset(revoke_pkt, 0, sizeof(nvsp_msg));
 
 		revoke_pkt->hdr.msg_type = nvsp_msg_1_type_revoke_rx_buf;
 		revoke_pkt->msgs.vers_1_msgs.revoke_rx_buf.id =
 		    NETVSC_RECEIVE_BUFFER_ID;
 
 		ret = hv_vmbus_channel_send_packet(net_dev->dev->channel,
 		    revoke_pkt, sizeof(nvsp_msg),
 		    (uint64_t)(uintptr_t)revoke_pkt,
 		    HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0);
 
 		/*
 		 * If we failed here, we might as well return and have a leak 
 		 * rather than continue and a bugchk
 		 */
 		if (ret != 0) {
 			return (ret);
 		}
 	}
 		
 	/* Tear down the gpadl on the vsp end */
 	if (net_dev->rx_buf_gpadl_handle) {
 		ret = hv_vmbus_channel_teardown_gpdal(net_dev->dev->channel,
 		    net_dev->rx_buf_gpadl_handle);
 		/*
 		 * If we failed here, we might as well return and have a leak 
 		 * rather than continue and a bugchk
 		 */
 		if (ret != 0) {
 			return (ret);
 		}
 		net_dev->rx_buf_gpadl_handle = 0;
 	}
 
 	if (net_dev->rx_buf) {
 		/* Free up the receive buffer */
 		contigfree(net_dev->rx_buf, net_dev->rx_buf_size, M_DEVBUF);
 		net_dev->rx_buf = NULL;
 	}
 
 	if (net_dev->rx_sections) {
 		free(net_dev->rx_sections, M_DEVBUF);
 		net_dev->rx_sections = NULL;
 		net_dev->rx_section_count = 0;
 	}
 
 	return (ret);
 }
 
 /*
  * Net VSC destroy send buffer
  */
 static int
 hv_nv_destroy_send_buffer(netvsc_dev *net_dev)
 {
 	nvsp_msg *revoke_pkt;
 	int ret = 0;
 
 	/*
 	 * If we got a section count, it means we received a
 	 * send_rx_buf_complete msg 
 	 * (ie sent nvsp_msg_1_type_send_rx_buf msg) therefore,
 	 * we need to send a revoke msg here
 	 */
 	if (net_dev->send_section_size) {
 		/* Send the revoke send buffer */
 		revoke_pkt = &net_dev->revoke_packet;
 		memset(revoke_pkt, 0, sizeof(nvsp_msg));
 
 		revoke_pkt->hdr.msg_type =
 		    nvsp_msg_1_type_revoke_send_buf;
 		revoke_pkt->msgs.vers_1_msgs.revoke_send_buf.id =
 		    NETVSC_SEND_BUFFER_ID;
 
 		ret = hv_vmbus_channel_send_packet(net_dev->dev->channel,
 		    revoke_pkt, sizeof(nvsp_msg),
 		    (uint64_t)(uintptr_t)revoke_pkt,
 		    HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0);
 		/*
 		 * If we failed here, we might as well return and have a leak 
 		 * rather than continue and a bugchk
 		 */
 		if (ret != 0) {
 			return (ret);
 		}
 	}
 		
 	/* Tear down the gpadl on the vsp end */
 	if (net_dev->send_buf_gpadl_handle) {
 		ret = hv_vmbus_channel_teardown_gpdal(net_dev->dev->channel,
 		    net_dev->send_buf_gpadl_handle);
 
 		/*
 		 * If we failed here, we might as well return and have a leak 
 		 * rather than continue and a bugchk
 		 */
 		if (ret != 0) {
 			return (ret);
 		}
 		net_dev->send_buf_gpadl_handle = 0;
 	}
 
 	if (net_dev->send_buf) {
 		/* Free up the receive buffer */
 		contigfree(net_dev->send_buf, net_dev->send_buf_size, M_DEVBUF);
 		net_dev->send_buf = NULL;
 	}
 
 	return (ret);
 }
 
 
 /*
  * Attempt to negotiate the caller-specified NVSP version
  *
  * For NVSP v2, Server 2008 R2 does not set
  * init_pkt->msgs.init_msgs.init_compl.negotiated_prot_vers
  * to the negotiated version, so we cannot rely on that.
  */
 static int
 hv_nv_negotiate_nvsp_protocol(struct hv_device *device, netvsc_dev *net_dev,
 			      uint32_t nvsp_ver)
 {
 	nvsp_msg *init_pkt;
 	int ret;
 
 	init_pkt = &net_dev->channel_init_packet;
 	memset(init_pkt, 0, sizeof(nvsp_msg));
 	init_pkt->hdr.msg_type = nvsp_msg_type_init;
 
 	/*
 	 * Specify parameter as the only acceptable protocol version
 	 */
 	init_pkt->msgs.init_msgs.init.p1.protocol_version = nvsp_ver;
 	init_pkt->msgs.init_msgs.init.protocol_version_2 = nvsp_ver;
 
 	/* Send the init request */
 	ret = hv_vmbus_channel_send_packet(device->channel, init_pkt,
 	    sizeof(nvsp_msg), (uint64_t)(uintptr_t)init_pkt,
 	    HV_VMBUS_PACKET_TYPE_DATA_IN_BAND,
 	    HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
 	if (ret != 0)
 		return (-1);
 
 	sema_wait(&net_dev->channel_init_sema);
 
 	if (init_pkt->msgs.init_msgs.init_compl.status != nvsp_status_success)
 		return (EINVAL);
 
 	return (0);
 }
 
 /*
  * Send NDIS version 2 config packet containing MTU.
  *
  * Not valid for NDIS version 1.
  */
 static int
 hv_nv_send_ndis_config(struct hv_device *device, uint32_t mtu)
 {
 	netvsc_dev *net_dev;
 	nvsp_msg *init_pkt;
 	int ret;
 
 	net_dev = hv_nv_get_outbound_net_device(device);
 	if (!net_dev)
 		return (-ENODEV);
 
 	/*
 	 * Set up configuration packet, write MTU
 	 * Indicate we are capable of handling VLAN tags
 	 */
 	init_pkt = &net_dev->channel_init_packet;
 	memset(init_pkt, 0, sizeof(nvsp_msg));
 	init_pkt->hdr.msg_type = nvsp_msg_2_type_send_ndis_config;
 	init_pkt->msgs.vers_2_msgs.send_ndis_config.mtu = mtu;
 	init_pkt->
 		msgs.vers_2_msgs.send_ndis_config.capabilities.u1.u2.ieee8021q
 		= 1;
 
 	/* Send the configuration packet */
 	ret = hv_vmbus_channel_send_packet(device->channel, init_pkt,
 	    sizeof(nvsp_msg), (uint64_t)(uintptr_t)init_pkt,
 	    HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0);
 	if (ret != 0)
 		return (-EINVAL);
 
 	return (0);
 }
 
 /*
  * Net VSC connect to VSP
  */
 static int
 hv_nv_connect_to_vsp(struct hv_device *device)
 {
 	netvsc_dev *net_dev;
 	nvsp_msg *init_pkt;
 	uint32_t nvsp_vers;
 	uint32_t ndis_version;
 	int ret = 0;
 	device_t dev = device->device;
 	hn_softc_t *sc = device_get_softc(dev);
-	struct ifnet *ifp = sc->arpcom.ac_ifp;
+	struct ifnet *ifp = sc->hn_ifp;
 
 	net_dev = hv_nv_get_outbound_net_device(device);
 	if (!net_dev) {
 		return (ENODEV);
 	}
 
 	/*
 	 * Negotiate the NVSP version.  Try NVSP v2 first.
 	 */
 	nvsp_vers = NVSP_PROTOCOL_VERSION_2;
 	ret = hv_nv_negotiate_nvsp_protocol(device, net_dev, nvsp_vers);
 	if (ret != 0) {
 		/* NVSP v2 failed, try NVSP v1 */
 		nvsp_vers = NVSP_PROTOCOL_VERSION_1;
 		ret = hv_nv_negotiate_nvsp_protocol(device, net_dev, nvsp_vers);
 		if (ret != 0) {
 			/* NVSP v1 failed, return bad status */
 			return (ret);
 		}
 	}
 	net_dev->nvsp_version = nvsp_vers;
 
 	/*
 	 * Set the MTU if supported by this NVSP protocol version
 	 * This needs to be right after the NVSP init message per Haiyang
 	 */
 	if (nvsp_vers >= NVSP_PROTOCOL_VERSION_2)
 		ret = hv_nv_send_ndis_config(device, ifp->if_mtu);
 
 	/*
 	 * Send the NDIS version
 	 */
 	init_pkt = &net_dev->channel_init_packet;
 
 	memset(init_pkt, 0, sizeof(nvsp_msg));
 
 	/*
 	 * Updated to version 5.1, minimum, for VLAN per Haiyang
 	 */
 	ndis_version = NDIS_VERSION;
 
 	init_pkt->hdr.msg_type = nvsp_msg_1_type_send_ndis_vers;
 	init_pkt->msgs.vers_1_msgs.send_ndis_vers.ndis_major_vers =
 	    (ndis_version & 0xFFFF0000) >> 16;
 	init_pkt->msgs.vers_1_msgs.send_ndis_vers.ndis_minor_vers =
 	    ndis_version & 0xFFFF;
 
 	/* Send the init request */
 
 	ret = hv_vmbus_channel_send_packet(device->channel, init_pkt,
 	    sizeof(nvsp_msg), (uint64_t)(uintptr_t)init_pkt,
 	    HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0);
 	if (ret != 0) {
 		goto cleanup;
 	}
 	/*
 	 * TODO:  BUGBUG - We have to wait for the above msg since the netvsp
 	 * uses KMCL which acknowledges packet (completion packet) 
 	 * since our Vmbus always set the
 	 * HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED flag
 	 */
 	/* sema_wait(&NetVscChannel->channel_init_sema); */
 
 	/* Post the big receive buffer to NetVSP */
 	ret = hv_nv_init_rx_buffer_with_net_vsp(device);
 	if (ret == 0)
 		ret = hv_nv_init_send_buffer_with_net_vsp(device);
 
 cleanup:
 	return (ret);
 }
 
 /*
  * Net VSC disconnect from VSP
  */
 static void
 hv_nv_disconnect_from_vsp(netvsc_dev *net_dev)
 {
 	hv_nv_destroy_rx_buffer(net_dev);
 	hv_nv_destroy_send_buffer(net_dev);
 }
 
 /*
  * Net VSC on device add
  * 
  * Callback when the device belonging to this driver is added
  */
 netvsc_dev *
 hv_nv_on_device_add(struct hv_device *device, void *additional_info)
 {
 	netvsc_dev *net_dev;
 	netvsc_packet *packet;
 	netvsc_packet *next_packet;
 	int i, ret = 0;
 
 	net_dev = hv_nv_alloc_net_device(device);
 	if (!net_dev)
 		goto cleanup;
 
 	/* Initialize the NetVSC channel extension */
 	net_dev->rx_buf_size = NETVSC_RECEIVE_BUFFER_SIZE;
 	mtx_init(&net_dev->rx_pkt_list_lock, "HV-RPL", NULL,
 	    MTX_SPIN | MTX_RECURSE);
 
 	net_dev->send_buf_size = NETVSC_SEND_BUFFER_SIZE;
 
 	/* Same effect as STAILQ_HEAD_INITIALIZER() static initializer */
 	STAILQ_INIT(&net_dev->myrx_packet_list);
 
 	/* 
 	 * malloc a sufficient number of netvsc_packet buffers to hold
 	 * a packet list.  Add them to the netvsc device packet queue.
 	 */
 	for (i=0; i < NETVSC_RECEIVE_PACKETLIST_COUNT; i++) {
 		packet = malloc(sizeof(netvsc_packet) +
 		    (NETVSC_RECEIVE_SG_COUNT * sizeof(hv_vmbus_page_buffer)),
 		    M_DEVBUF, M_NOWAIT | M_ZERO);
 		if (!packet) {
 			break;
 		}
 		STAILQ_INSERT_TAIL(&net_dev->myrx_packet_list, packet,
 		    mylist_entry);
 	}
 
 	sema_init(&net_dev->channel_init_sema, 0, "netdev_sema");
 
 	/*
 	 * Open the channel
 	 */
 	ret = hv_vmbus_channel_open(device->channel,
 	    NETVSC_DEVICE_RING_BUFFER_SIZE, NETVSC_DEVICE_RING_BUFFER_SIZE,
 	    NULL, 0, hv_nv_on_channel_callback, device);
 	if (ret != 0)
 		goto cleanup;
 
 	/*
 	 * Connect with the NetVsp
 	 */
 	ret = hv_nv_connect_to_vsp(device);
 	if (ret != 0)
 		goto close;
 
 	return (net_dev);
 
 close:
 	/* Now, we can close the channel safely */
 
 	hv_vmbus_channel_close(device->channel);
 
 cleanup:
 	/*
 	 * Free the packet buffers on the netvsc device packet queue.
 	 * Release other resources.
 	 */
 	if (net_dev) {
 		sema_destroy(&net_dev->channel_init_sema);
 
 		packet = STAILQ_FIRST(&net_dev->myrx_packet_list);
 		while (packet != NULL) {
 			next_packet = STAILQ_NEXT(packet, mylist_entry);
 			free(packet, M_DEVBUF);
 			packet = next_packet;
 		}
 		/* Reset the list to initial state */
 		STAILQ_INIT(&net_dev->myrx_packet_list);
 
 		mtx_destroy(&net_dev->rx_pkt_list_lock);
 
 		free(net_dev, M_DEVBUF);
 	}
 
 	return (NULL);
 }
 
 /*
  * Net VSC on device remove
  */
 int
 hv_nv_on_device_remove(struct hv_device *device, boolean_t destroy_channel)
 {
 	netvsc_packet *net_vsc_pkt;
 	netvsc_packet *next_net_vsc_pkt;
 	hn_softc_t *sc = device_get_softc(device->device);
 	netvsc_dev *net_dev = sc->net_dev;;
 	
 	/* Stop outbound traffic ie sends and receives completions */
 	mtx_lock(&device->channel->inbound_lock);
 	net_dev->destroy = TRUE;
 	mtx_unlock(&device->channel->inbound_lock);
 
 	/* Wait for all send completions */
 	while (net_dev->num_outstanding_sends) {
 		DELAY(100);
 	}
 
 	hv_nv_disconnect_from_vsp(net_dev);
 
 	/* At this point, no one should be accessing net_dev except in here */
 
 	/* Now, we can close the channel safely */
 
 	if (!destroy_channel) {
 		device->channel->state =
 		    HV_CHANNEL_CLOSING_NONDESTRUCTIVE_STATE;
 	}
 
 	hv_vmbus_channel_close(device->channel);
 
 	/* Release all resources */
 	net_vsc_pkt = STAILQ_FIRST(&net_dev->myrx_packet_list);
 	while (net_vsc_pkt != NULL) {
 		next_net_vsc_pkt = STAILQ_NEXT(net_vsc_pkt, mylist_entry);
 		free(net_vsc_pkt, M_DEVBUF);
 		net_vsc_pkt = next_net_vsc_pkt;
 	}
 
 	/* Reset the list to initial state */
 	STAILQ_INIT(&net_dev->myrx_packet_list);
 
 	mtx_destroy(&net_dev->rx_pkt_list_lock);
 	sema_destroy(&net_dev->channel_init_sema);
 	free(net_dev, M_DEVBUF);
 
 	return (0);
 }
 
 /*
  * Net VSC on send completion
  */
 static void 
 hv_nv_on_send_completion(struct hv_device *device, hv_vm_packet_descriptor *pkt)
 {
 	netvsc_dev *net_dev;
 	nvsp_msg *nvsp_msg_pkt;
 	netvsc_packet *net_vsc_pkt;
 
 	net_dev = hv_nv_get_inbound_net_device(device);
 	if (!net_dev) {
 		return;
 	}
 
 	nvsp_msg_pkt =
 	    (nvsp_msg *)((unsigned long)pkt + (pkt->data_offset8 << 3));
 
 	if (nvsp_msg_pkt->hdr.msg_type == nvsp_msg_type_init_complete
 		|| nvsp_msg_pkt->hdr.msg_type
 			== nvsp_msg_1_type_send_rx_buf_complete
 		|| nvsp_msg_pkt->hdr.msg_type
 			== nvsp_msg_1_type_send_send_buf_complete) {
 		/* Copy the response back */
 		memcpy(&net_dev->channel_init_packet, nvsp_msg_pkt,
 		    sizeof(nvsp_msg));			
 		sema_post(&net_dev->channel_init_sema);
 	} else if (nvsp_msg_pkt->hdr.msg_type ==
 				   nvsp_msg_1_type_send_rndis_pkt_complete) {
 		/* Get the send context */
 		net_vsc_pkt =
 		    (netvsc_packet *)(unsigned long)pkt->transaction_id;
 
 		/* Notify the layer above us */
 		net_vsc_pkt->compl.send.on_send_completion(
 		    net_vsc_pkt->compl.send.send_completion_context);
 
 		atomic_subtract_int(&net_dev->num_outstanding_sends, 1);
 	}
 }
 
 /*
  * Net VSC on send
  * Sends a packet on the specified Hyper-V device.
  * Returns 0 on success, non-zero on failure.
  */
 int
 hv_nv_on_send(struct hv_device *device, netvsc_packet *pkt)
 {
 	netvsc_dev *net_dev;
 	nvsp_msg send_msg;
 	int ret;
 
 	net_dev = hv_nv_get_outbound_net_device(device);
 	if (!net_dev)
 		return (ENODEV);
 
 	send_msg.hdr.msg_type = nvsp_msg_1_type_send_rndis_pkt;
 	if (pkt->is_data_pkt) {
 		/* 0 is RMC_DATA */
 		send_msg.msgs.vers_1_msgs.send_rndis_pkt.chan_type = 0;
 	} else {
 		/* 1 is RMC_CONTROL */
 		send_msg.msgs.vers_1_msgs.send_rndis_pkt.chan_type = 1;
 	}
 
 	/* Not using send buffer section */
 	send_msg.msgs.vers_1_msgs.send_rndis_pkt.send_buf_section_idx =
 	    0xFFFFFFFF;
 	send_msg.msgs.vers_1_msgs.send_rndis_pkt.send_buf_section_size = 0;
 
 	if (pkt->page_buf_count) {
 		ret = hv_vmbus_channel_send_packet_pagebuffer(device->channel,
 		    pkt->page_buffers, pkt->page_buf_count,
 		    &send_msg, sizeof(nvsp_msg), (uint64_t)(uintptr_t)pkt);
 	} else {
 		ret = hv_vmbus_channel_send_packet(device->channel,
 		    &send_msg, sizeof(nvsp_msg), (uint64_t)(uintptr_t)pkt,
 		    HV_VMBUS_PACKET_TYPE_DATA_IN_BAND,
 		    HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
 	}
 
 	/* Record outstanding send only if send_packet() succeeded */
 	if (ret == 0)
 		atomic_add_int(&net_dev->num_outstanding_sends, 1);
 
 	return (ret);
 }
 
 /*
  * Net VSC on receive
  *
  * In the FreeBSD Hyper-V virtual world, this function deals exclusively
  * with virtual addresses.
  */
 static void 
 hv_nv_on_receive(struct hv_device *device, hv_vm_packet_descriptor *pkt)
 {
 	netvsc_dev *net_dev;
 	hv_vm_transfer_page_packet_header *vm_xfer_page_pkt;
 	nvsp_msg *nvsp_msg_pkt;
 	netvsc_packet *net_vsc_pkt = NULL;
 	unsigned long start;
 	xfer_page_packet *xfer_page_pkt = NULL;
 	STAILQ_HEAD(PKT_LIST, netvsc_packet_) mylist_head =
 	    STAILQ_HEAD_INITIALIZER(mylist_head);
 	int count = 0;
 	int i = 0;
 
 	net_dev = hv_nv_get_inbound_net_device(device);
 	if (!net_dev)
 		return;
 
 	/*
 	 * All inbound packets other than send completion should be
 	 * xfer page packet.
 	 */
 	if (pkt->type != HV_VMBUS_PACKET_TYPE_DATA_USING_TRANSFER_PAGES)
 		return;
 
 	nvsp_msg_pkt = (nvsp_msg *)((unsigned long)pkt
 		+ (pkt->data_offset8 << 3));
 
 	/* Make sure this is a valid nvsp packet */
 	if (nvsp_msg_pkt->hdr.msg_type != nvsp_msg_1_type_send_rndis_pkt)
 		return;
 	
 	vm_xfer_page_pkt = (hv_vm_transfer_page_packet_header *)pkt;
 
 	if (vm_xfer_page_pkt->transfer_page_set_id
 		!= NETVSC_RECEIVE_BUFFER_ID) {
 		return;
 	}
 
 	STAILQ_INIT(&mylist_head);
 
 	/*
 	 * Grab free packets (range count + 1) to represent this xfer page
 	 * packet.  +1 to represent the xfer page packet itself.  We grab it
 	 * here so that we know exactly how many we can fulfill.
 	 */
 	mtx_lock_spin(&net_dev->rx_pkt_list_lock);
 	while (!STAILQ_EMPTY(&net_dev->myrx_packet_list)) {	
 		net_vsc_pkt = STAILQ_FIRST(&net_dev->myrx_packet_list);
 		STAILQ_REMOVE_HEAD(&net_dev->myrx_packet_list, mylist_entry);
 
 		STAILQ_INSERT_TAIL(&mylist_head, net_vsc_pkt, mylist_entry);
 
 		if (++count == vm_xfer_page_pkt->range_count + 1)
 			break;
 	}
 
 	mtx_unlock_spin(&net_dev->rx_pkt_list_lock);
 
 	/*
 	 * We need at least 2 netvsc pkts (1 to represent the xfer page
 	 * and at least 1 for the range) i.e. we can handle some of the
 	 * xfer page packet ranges...
 	 */
 	if (count < 2) {
 		/* Return netvsc packet to the freelist */
 		mtx_lock_spin(&net_dev->rx_pkt_list_lock);
 		for (i=count; i != 0; i--) {
 			net_vsc_pkt = STAILQ_FIRST(&mylist_head);
 			STAILQ_REMOVE_HEAD(&mylist_head, mylist_entry);
 
 			STAILQ_INSERT_TAIL(&net_dev->myrx_packet_list,
 			    net_vsc_pkt, mylist_entry);
 		}
 		mtx_unlock_spin(&net_dev->rx_pkt_list_lock);
 
 		hv_nv_send_receive_completion(device,
 		    vm_xfer_page_pkt->d.transaction_id);
 
 		return;
 	}
 
 	/* Take the first packet in the list */
 	xfer_page_pkt = (xfer_page_packet *)STAILQ_FIRST(&mylist_head);
 	STAILQ_REMOVE_HEAD(&mylist_head, mylist_entry);
 
 	/* This is how many data packets we can supply */
 	xfer_page_pkt->count = count - 1;
 
 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
 	for (i=0; i < (count - 1); i++) {
 		net_vsc_pkt = STAILQ_FIRST(&mylist_head);
 		STAILQ_REMOVE_HEAD(&mylist_head, mylist_entry);
 
 		/*
 		 * Initialize the netvsc packet
 		 */
 		net_vsc_pkt->xfer_page_pkt = xfer_page_pkt;
 		net_vsc_pkt->compl.rx.rx_completion_context = net_vsc_pkt;
 		net_vsc_pkt->device = device;
 		/* Save this so that we can send it back */
 		net_vsc_pkt->compl.rx.rx_completion_tid =
 		    vm_xfer_page_pkt->d.transaction_id;
 
 		net_vsc_pkt->tot_data_buf_len =
 		    vm_xfer_page_pkt->ranges[i].byte_count;
 		net_vsc_pkt->page_buf_count = 1;
 
 		net_vsc_pkt->page_buffers[0].length =
 		    vm_xfer_page_pkt->ranges[i].byte_count;
 
 		/* The virtual address of the packet in the receive buffer */
 		start = ((unsigned long)net_dev->rx_buf +
 		    vm_xfer_page_pkt->ranges[i].byte_offset);
 		start = ((unsigned long)start) & ~(PAGE_SIZE - 1);
 
 		/* Page number of the virtual page containing packet start */
 		net_vsc_pkt->page_buffers[0].pfn = start >> PAGE_SHIFT;
 
 		/* Calculate the page relative offset */
 		net_vsc_pkt->page_buffers[0].offset =
 		    vm_xfer_page_pkt->ranges[i].byte_offset & (PAGE_SIZE - 1);
 
 		/*
 		 * In this implementation, we are dealing with virtual
 		 * addresses exclusively.  Since we aren't using physical
 		 * addresses at all, we don't care if a packet crosses a
 		 * page boundary.  For this reason, the original code to
 		 * check for and handle page crossings has been removed.
 		 */
 
 		/*
 		 * Pass it to the upper layer.  The receive completion call
 		 * has been moved into this function.
 		 */
 		hv_rf_on_receive(device, net_vsc_pkt);
 
 		/*
 		 * Moved completion call back here so that all received 
 		 * messages (not just data messages) will trigger a response
 		 * message back to the host.
 		 */
 		hv_nv_on_receive_completion(net_vsc_pkt);
 	}
 }
 
 /*
  * Net VSC send receive completion
  */
 static void
 hv_nv_send_receive_completion(struct hv_device *device, uint64_t tid)
 {
 	nvsp_msg rx_comp_msg;
 	int retries = 0;
 	int ret = 0;
 	
 	rx_comp_msg.hdr.msg_type = nvsp_msg_1_type_send_rndis_pkt_complete;
 
 	/* Pass in the status */
 	rx_comp_msg.msgs.vers_1_msgs.send_rndis_pkt_complete.status =
 	    nvsp_status_success;
 
 retry_send_cmplt:
 	/* Send the completion */
 	ret = hv_vmbus_channel_send_packet(device->channel, &rx_comp_msg,
 	    sizeof(nvsp_msg), tid, HV_VMBUS_PACKET_TYPE_COMPLETION, 0);
 	if (ret == 0) {
 		/* success */
 		/* no-op */
 	} else if (ret == EAGAIN) {
 		/* no more room... wait a bit and attempt to retry 3 times */
 		retries++;
 
 		if (retries < 4) {
 			DELAY(100);
 			goto retry_send_cmplt;
 		}
 	}
 }
 
 /*
  * Net VSC on receive completion
  *
  * Send a receive completion packet to RNDIS device (ie NetVsp)
  */
 void
 hv_nv_on_receive_completion(void *context)
 {
 	netvsc_packet *packet = (netvsc_packet *)context;
 	struct hv_device *device = (struct hv_device *)packet->device;
 	netvsc_dev    *net_dev;
 	uint64_t       tid = 0;
 	boolean_t send_rx_completion = FALSE;
 
 	/*
 	 * Even though it seems logical to do a hv_nv_get_outbound_net_device()
 	 * here to send out receive completion, we are using
 	 * hv_nv_get_inbound_net_device() since we may have disabled
 	 * outbound traffic already.
 	 */
 	net_dev = hv_nv_get_inbound_net_device(device);
 	if (net_dev == NULL)
 		return;
 	
 	/* Overloading use of the lock. */
 	mtx_lock_spin(&net_dev->rx_pkt_list_lock);
 
 	packet->xfer_page_pkt->count--;
 
 	/*
 	 * Last one in the line that represent 1 xfer page packet.
 	 * Return the xfer page packet itself to the free list.
 	 */
 	if (packet->xfer_page_pkt->count == 0) {
 		send_rx_completion = TRUE;
 		tid = packet->compl.rx.rx_completion_tid;
 		STAILQ_INSERT_TAIL(&net_dev->myrx_packet_list,
 		    (netvsc_packet *)(packet->xfer_page_pkt), mylist_entry);
 	}
 
 	/* Put the packet back on the free list */
 	STAILQ_INSERT_TAIL(&net_dev->myrx_packet_list, packet, mylist_entry);
 	mtx_unlock_spin(&net_dev->rx_pkt_list_lock);
 
 	/* Send a receive completion for the xfer page packet */
 	if (send_rx_completion)
 		hv_nv_send_receive_completion(device, tid);
 }
 
 /*
  * Net VSC on channel callback
  */
 static void
 hv_nv_on_channel_callback(void *context)
 {
 	/* Fixme:  Magic number */
 	const int net_pkt_size = 2048;
 	struct hv_device *device = (struct hv_device *)context;
 	netvsc_dev *net_dev;
 	uint32_t bytes_rxed;
 	uint64_t request_id;
 	uint8_t  *packet;
 	hv_vm_packet_descriptor *desc;
 	uint8_t *buffer;
 	int     bufferlen = net_pkt_size;
 	int     ret = 0;
 
 	packet = malloc(net_pkt_size * sizeof(uint8_t), M_DEVBUF, M_NOWAIT);
 	if (!packet)
 		return;
 
 	buffer = packet;
 
 	net_dev = hv_nv_get_inbound_net_device(device);
 	if (net_dev == NULL)
 		goto out;
 
 	do {
 		ret = hv_vmbus_channel_recv_packet_raw(device->channel,
 		    buffer, bufferlen, &bytes_rxed, &request_id);
 		if (ret == 0) {
 			if (bytes_rxed > 0) {
 				desc = (hv_vm_packet_descriptor *)buffer;
 				switch (desc->type) {
 				case HV_VMBUS_PACKET_TYPE_COMPLETION:
 					hv_nv_on_send_completion(device, desc);
 					break;
 				case HV_VMBUS_PACKET_TYPE_DATA_USING_TRANSFER_PAGES:
 					hv_nv_on_receive(device, desc);
 					break;
 				default:
 					break;
 				}
 			} else {
 				break;
 			}
 		} else if (ret == ENOBUFS) {
 			/* Handle large packet */
 			free(buffer, M_DEVBUF);
 			buffer = malloc(bytes_rxed, M_DEVBUF, M_NOWAIT);
 			if (buffer == NULL) {
 				break;
 			}
 			bufferlen = bytes_rxed;
 		}
 	} while (1);
 
 out:
 	free(buffer, M_DEVBUF);
 }
 
Index: user/dchagin/lemul/sys/dev/hyperv/netvsc/hv_net_vsc.h
===================================================================
--- user/dchagin/lemul/sys/dev/hyperv/netvsc/hv_net_vsc.h	(revision 274240)
+++ user/dchagin/lemul/sys/dev/hyperv/netvsc/hv_net_vsc.h	(revision 274241)
@@ -1,999 +1,998 @@
 /*-
  * Copyright (c) 2009-2012 Microsoft Corp.
  * Copyright (c) 2010-2012 Citrix Inc.
  * Copyright (c) 2012 NetApp Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 /*
  * HyperV vmbus (virtual machine bus) network VSC (virtual services client)
  * header file
  *
  * (Updated from unencumbered NvspProtocol.h)
  */
 
 #ifndef __HV_NET_VSC_H__
 #define __HV_NET_VSC_H__
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/lock.h>
 #include <sys/sx.h>
 
 #include <dev/hyperv/include/hyperv.h>
 
 
 #define NVSP_INVALID_PROTOCOL_VERSION           (0xFFFFFFFF)
 
 #define NVSP_PROTOCOL_VERSION_1                 2
 #define NVSP_PROTOCOL_VERSION_2                 0x30002
 #define NVSP_MIN_PROTOCOL_VERSION               (NVSP_PROTOCOL_VERSION_1)
 #define NVSP_MAX_PROTOCOL_VERSION               (NVSP_PROTOCOL_VERSION_2)
 
 #define NVSP_PROTOCOL_VERSION_CURRENT           NVSP_PROTOCOL_VERSION_2
 
 #define NVSP_OPERATIONAL_STATUS_OK              (0x00000000)
 #define NVSP_OPERATIONAL_STATUS_DEGRADED        (0x00000001)
 #define NVSP_OPERATIONAL_STATUS_NONRECOVERABLE  (0x00000002)
 #define NVSP_OPERATIONAL_STATUS_NO_CONTACT      (0x00000003)
 #define NVSP_OPERATIONAL_STATUS_LOST_COMMUNICATION (0x00000004)
 
 /*
  * Maximun number of transfer pages (packets) the VSP will use on a receive
  */
 #define NVSP_MAX_PACKETS_PER_RECEIVE            375
 
 
 typedef enum nvsp_msg_type_ {
 	nvsp_msg_type_none                      = 0,
 
 	/*
 	 * Init Messages
 	 */
 	nvsp_msg_type_init                      = 1,
 	nvsp_msg_type_init_complete             = 2,
 
 	nvsp_version_msg_start                  = 100,
 
 	/*
 	 * Version 1 Messages
 	 */
 	nvsp_msg_1_type_send_ndis_vers          = nvsp_version_msg_start,
 
 	nvsp_msg_1_type_send_rx_buf,
 	nvsp_msg_1_type_send_rx_buf_complete,
 	nvsp_msg_1_type_revoke_rx_buf,
 
 	nvsp_msg_1_type_send_send_buf,
 	nvsp_msg_1_type_send_send_buf_complete,
 	nvsp_msg_1_type_revoke_send_buf,
 
 	nvsp_msg_1_type_send_rndis_pkt,
 	nvsp_msg_1_type_send_rndis_pkt_complete,
     
 	/*
 	 * Version 2 Messages
 	 */
 	nvsp_msg_2_type_send_chimney_delegated_buf,
 	nvsp_msg_2_type_send_chimney_delegated_buf_complete,
 	nvsp_msg_2_type_revoke_chimney_delegated_buf,
 
 	nvsp_msg_2_type_resume_chimney_rx_indication,
 
 	nvsp_msg_2_type_terminate_chimney,
 	nvsp_msg_2_type_terminate_chimney_complete,
 
 	nvsp_msg_2_type_indicate_chimney_event,
 
 	nvsp_msg_2_type_send_chimney_packet,
 	nvsp_msg_2_type_send_chimney_packet_complete,
 
 	nvsp_msg_2_type_post_chimney_rx_request,
 	nvsp_msg_2_type_post_chimney_rx_request_complete,
 
 	nvsp_msg_2_type_alloc_rx_buf,
 	nvsp_msg_2_type_alloc_rx_buf_complete,
 
 	nvsp_msg_2_type_free_rx_buf,
 
 	nvsp_msg_2_send_vmq_rndis_pkt,
 	nvsp_msg_2_send_vmq_rndis_pkt_complete,
 
 	nvsp_msg_2_type_send_ndis_config,
 
 	nvsp_msg_2_type_alloc_chimney_handle,
 	nvsp_msg_2_type_alloc_chimney_handle_complete,
 } nvsp_msg_type;
 
 typedef enum nvsp_status_ {
 	nvsp_status_none = 0,
 	nvsp_status_success,
 	nvsp_status_failure,
 	/* Deprecated */
 	nvsp_status_prot_vers_range_too_new,
 	/* Deprecated */
 	nvsp_status_prot_vers_range_too_old,
 	nvsp_status_invalid_rndis_pkt,
 	nvsp_status_busy,
 	nvsp_status_max,
 } nvsp_status;
 
 typedef struct nvsp_msg_hdr_ {
 	uint32_t                                msg_type;
 } __packed nvsp_msg_hdr;
 
 /*
  * Init Messages
  */
 
 /*
  * This message is used by the VSC to initialize the channel
  * after the channels has been opened. This message should 
  * never include anything other then versioning (i.e. this
  * message will be the same for ever).
  *
  * Forever is a long time.  The values have been redefined
  * in Win7 to indicate major and minor protocol version
  * number.
  */
 typedef struct nvsp_msg_init_ {
 	union {
 		struct {
 			uint16_t                minor_protocol_version;
 			uint16_t                major_protocol_version;
 		} s;
 		/* Formerly min_protocol_version */
 		uint32_t                        protocol_version;
 	} p1;
 	/* Formerly max_protocol_version */
 	uint32_t                                protocol_version_2;
 } __packed nvsp_msg_init;
 
 /*
  * This message is used by the VSP to complete the initialization
  * of the channel. This message should never include anything other 
  * then versioning (i.e. this message will be the same forever).
  */
 typedef struct nvsp_msg_init_complete_ {
 	/* Deprecated */
 	uint32_t                                negotiated_prot_vers;
 	uint32_t                                max_mdl_chain_len;
 	uint32_t                                status;
 } __packed nvsp_msg_init_complete;
 
 typedef union nvsp_msg_init_uber_ {
 	nvsp_msg_init                           init;
 	nvsp_msg_init_complete                  init_compl;
 } __packed nvsp_msg_init_uber;
 
 /*
  * Version 1 Messages
  */
 
 /*
  * This message is used by the VSC to send the NDIS version
  * to the VSP.  The VSP can use this information when handling
  * OIDs sent by the VSC.
  */
 typedef struct nvsp_1_msg_send_ndis_version_ {
 	uint32_t                                ndis_major_vers;
 	/* Deprecated */
 	uint32_t                                ndis_minor_vers;
 } __packed nvsp_1_msg_send_ndis_version;
 
 /*
  * This message is used by the VSC to send a receive buffer
  * to the VSP. The VSP can then use the receive buffer to
  * send data to the VSC.
  */
 typedef struct nvsp_1_msg_send_rx_buf_ {
 	uint32_t                                gpadl_handle;
 	uint16_t                                id;
 } __packed nvsp_1_msg_send_rx_buf;
 
 typedef struct nvsp_1_rx_buf_section_ {
 	uint32_t                                offset;
 	uint32_t                                sub_allocation_size;
 	uint32_t                                num_sub_allocations;
 	uint32_t                                end_offset;
 } __packed nvsp_1_rx_buf_section;
 
 /*
  * This message is used by the VSP to acknowledge a receive 
  * buffer send by the VSC.  This message must be sent by the 
  * VSP before the VSP uses the receive buffer.
  */
 typedef struct nvsp_1_msg_send_rx_buf_complete_ {
 	uint32_t                                status;
 	uint32_t                                num_sections;
 
 	/*
 	 * The receive buffer is split into two parts, a large
 	 * suballocation section and a small suballocation
 	 * section. These sections are then suballocated by a 
 	 * certain size.
 	 *
 	 * For example, the following break up of the receive
 	 * buffer has 6 large suballocations and 10 small
 	 * suballocations.
 	 *
 	 * |            Large Section          |  |   Small Section   |
 	 * ------------------------------------------------------------
 	 * |     |     |     |     |     |     |  | | | | | | | | | | |
 	 * |                                      |  
 	 * LargeOffset                            SmallOffset
 	 */
 	nvsp_1_rx_buf_section                   sections[1];
 
 } __packed nvsp_1_msg_send_rx_buf_complete;
 
 /*
  * This message is sent by the VSC to revoke the receive buffer.
  * After the VSP completes this transaction, the VSP should never
  * use the receive buffer again.
  */
 typedef struct nvsp_1_msg_revoke_rx_buf_ {
 	uint16_t                                id;
 } __packed nvsp_1_msg_revoke_rx_buf;
 
 /*
  * This message is used by the VSC to send a send buffer
  * to the VSP. The VSC can then use the send buffer to
  * send data to the VSP.
  */
 typedef struct nvsp_1_msg_send_send_buf_ {
 	uint32_t                                gpadl_handle;
 	uint16_t                                id;
 } __packed nvsp_1_msg_send_send_buf;
 
 /*
  * This message is used by the VSP to acknowledge a send 
  * buffer sent by the VSC. This message must be sent by the 
  * VSP before the VSP uses the sent buffer.
  */
 typedef struct nvsp_1_msg_send_send_buf_complete_ {
 	uint32_t                                status;
 
 	/*
 	 * The VSC gets to choose the size of the send buffer and
 	 * the VSP gets to choose the sections size of the buffer.
 	 * This was done to enable dynamic reconfigurations when
 	 * the cost of GPA-direct buffers decreases.
 	 */
 	uint32_t                                section_size;
 } __packed nvsp_1_msg_send_send_buf_complete;
 
 /*
  * This message is sent by the VSC to revoke the send buffer.
  * After the VSP completes this transaction, the vsp should never
  * use the send buffer again.
  */
 typedef struct nvsp_1_msg_revoke_send_buf_ {
 	uint16_t                                id;
 } __packed nvsp_1_msg_revoke_send_buf;
 
 /*
  * This message is used by both the VSP and the VSC to send
  * an RNDIS message to the opposite channel endpoint.
  */
 typedef struct nvsp_1_msg_send_rndis_pkt_ {
 	/*
 	 * This field is specified by RNIDS.  They assume there's
 	 * two different channels of communication. However, 
 	 * the Network VSP only has one.  Therefore, the channel
 	 * travels with the RNDIS packet.
 	 */
 	uint32_t                                chan_type;
 
 	/*
 	 * This field is used to send part or all of the data
 	 * through a send buffer. This values specifies an 
 	 * index into the send buffer.  If the index is 
 	 * 0xFFFFFFFF, then the send buffer is not being used
 	 * and all of the data was sent through other VMBus
 	 * mechanisms.
 	 */
 	uint32_t                                send_buf_section_idx;
 	uint32_t                                send_buf_section_size;
 } __packed nvsp_1_msg_send_rndis_pkt;
 
 /*
  * This message is used by both the VSP and the VSC to complete
  * a RNDIS message to the opposite channel endpoint.  At this
  * point, the initiator of this message cannot use any resources
  * associated with the original RNDIS packet.
  */
 typedef struct nvsp_1_msg_send_rndis_pkt_complete_ {
 	uint32_t                                status;
 } __packed nvsp_1_msg_send_rndis_pkt_complete;
 
 
 /*
  * Version 2 Messages
  */
 
 /*
  * This message is used by the VSC to send the NDIS version
  * to the VSP.  The VSP can use this information when handling
  * OIDs sent by the VSC.
  */
 typedef struct nvsp_2_netvsc_capabilities_ {
 	union {
 		uint64_t                        as_uint64;
 		struct {
 			uint64_t                vmq           : 1;
 			uint64_t                chimney       : 1;
 			uint64_t                sriov         : 1;
 			uint64_t                ieee8021q     : 1;
 			uint64_t                correlationid : 1;
 			uint64_t                teaming       : 1;
 		} u2;
 	} u1;
 } __packed nvsp_2_netvsc_capabilities;
 
 typedef struct nvsp_2_msg_send_ndis_config_ {
 	uint32_t                                mtu;
 	uint32_t                                reserved;
 	nvsp_2_netvsc_capabilities              capabilities;
 } __packed nvsp_2_msg_send_ndis_config;
 
 /*
  * NvspMessage2TypeSendChimneyDelegatedBuffer
  */
 typedef struct nvsp_2_msg_send_chimney_buf_
 {
 	/*
 	 * On WIN7 beta, delegated_obj_max_size is defined as a uint32_t
 	 * Since WIN7 RC, it was split into two uint16_t.  To have the same
 	 * struct layout, delegated_obj_max_size shall be the first field.
 	 */
 	uint16_t                                delegated_obj_max_size;
 
 	/*
 	 * The revision # of chimney protocol used between NVSC and NVSP.
 	 *
 	 * This revision is NOT related to the chimney revision between
 	 * NDIS protocol and miniport drivers.
 	 */
 	uint16_t                                revision;
 
 	uint32_t                                gpadl_handle;
 } __packed nvsp_2_msg_send_chimney_buf;
 
 
 /* Unsupported chimney revision 0 (only present in WIN7 beta) */
 #define NVSP_CHIMNEY_REVISION_0                 0
 
 /* WIN7 Beta Chimney QFE */
 #define NVSP_CHIMNEY_REVISION_1                 1
 
 /* The chimney revision since WIN7 RC */
 #define NVSP_CHIMNEY_REVISION_2                 2
 
 
 /*
  * NvspMessage2TypeSendChimneyDelegatedBufferComplete
  */
 typedef struct nvsp_2_msg_send_chimney_buf_complete_ {
 	uint32_t                                status;
 
 	/*
 	 * Maximum number outstanding sends and pre-posted receives.
 	 *
 	 * NVSC should not post more than SendQuota/ReceiveQuota packets.
 	 * Otherwise, it can block the non-chimney path for an indefinite
 	 * amount of time.
 	 * (since chimney sends/receives are affected by the remote peer).
 	 *
 	 * Note: NVSP enforces the quota restrictions on a per-VMBCHANNEL
 	 * basis.  It doesn't enforce the restriction separately for chimney
 	 * send/receive.  If NVSC doesn't voluntarily enforce "SendQuota",
 	 * it may kill its own network connectivity.
 	 */
 	uint32_t                                send_quota;
 	uint32_t                                rx_quota;
 } __packed nvsp_2_msg_send_chimney_buf_complete;
 
 /*
  * NvspMessage2TypeRevokeChimneyDelegatedBuffer
  */
 typedef struct nvsp_2_msg_revoke_chimney_buf_ {
 	uint32_t                                gpadl_handle;
 } __packed nvsp_2_msg_revoke_chimney_buf;
 
 
 #define NVSP_CHIMNEY_OBJECT_TYPE_NEIGHBOR       0
 #define NVSP_CHIMNEY_OBJECT_TYPE_PATH4          1
 #define NVSP_CHIMNEY_OBJECT_TYPE_PATH6          2
 #define NVSP_CHIMNEY_OBJECT_TYPE_TCP            3
 
 /*
  * NvspMessage2TypeAllocateChimneyHandle
  */
 typedef struct nvsp_2_msg_alloc_chimney_handle_ {
 	uint64_t                                vsc_context;
 	uint32_t                                object_type;
 } __packed nvsp_2_msg_alloc_chimney_handle;
 
 /*
  * NvspMessage2TypeAllocateChimneyHandleComplete
  */
 typedef struct nvsp_2_msg_alloc_chimney_handle_complete_ {
 	uint32_t                                vsp_handle;
 } __packed nvsp_2_msg_alloc_chimney_handle_complete;
 
 
 /*
  * NvspMessage2TypeResumeChimneyRXIndication
  */
 typedef struct nvsp_2_msg_resume_chimney_rx_indication {
 	/*
 	 * Handle identifying the offloaded connection
 	 */
 	uint32_t                                vsp_tcp_handle;
 } __packed nvsp_2_msg_resume_chimney_rx_indication;
 
 
 #define NVSP_2_MSG_TERMINATE_CHIMNEY_FLAGS_FIRST_STAGE      (0x01u)
 #define NVSP_2_MSG_TERMINATE_CHIMNEY_FLAGS_RESERVED         (~(0x01u))
 
 /*
  * NvspMessage2TypeTerminateChimney
  */
 typedef struct nvsp_2_msg_terminate_chimney_ {
 	/*
 	* Handle identifying the offloaded object
 	*/
 	uint32_t                                vsp_handle;
 
 	/*
 	 * Terminate Offload Flags
 	 *     Bit 0:
 	 *         When set to 0, terminate the offload at the destination NIC
 	 *     Bit 1-31:  Reserved, shall be zero
 	 */
 	uint32_t                                flags;
 
 	union {
 		/*
 		 * This field is valid only when bit 0 of flags is clear.
 		 * It specifies the index into the premapped delegated
 		 * object buffer.  The buffer was sent through the
 		 * NvspMessage2TypeSendChimneyDelegatedBuffer
 		 * message at initialization time.
 		 *
 		 * NVSP will write the delegated state into the delegated
 		 * buffer upon upload completion.
 		 */
 		uint32_t                        index;
 
 		/*
 		 * This field is valid only when bit 0 of flags is set.
 		 *
 		 * The seqence number of the most recently accepted RX
 		 * indication when VSC sets its TCP context into
 		 * "terminating" state.
 		 *
 		 * This allows NVSP to determines if there are any in-flight
 		 * RX indications for which the acceptance state is still
 		 * undefined.
 		 */
 		uint64_t                        last_accepted_rx_seq_no;
 	} f0;
 } __packed nvsp_2_msg_terminate_chimney;
 
 
 #define NVSP_TERMINATE_CHIMNEY_COMPLETE_FLAG_DATA_CORRUPTED     0x0000001u
 
 /*
  * NvspMessage2TypeTerminateChimneyComplete
  */
 typedef struct nvsp_2_msg_terminate_chimney_complete_ {
 	uint64_t                                vsc_context;
 	uint32_t                                flags;
 } __packed nvsp_2_msg_terminate_chimney_complete;
 
 /*
  * NvspMessage2TypeIndicateChimneyEvent
  */
 typedef struct nvsp_2_msg_indicate_chimney_event_ {
 	/*
 	 * When VscTcpContext is 0, event_type is an NDIS_STATUS event code
 	 * Otherwise, EventType is an TCP connection event (defined in
 	 * NdisTcpOffloadEventHandler chimney DDK document).
 	 */
 	uint32_t                                event_type;
 
 	/*
 	 * When VscTcpContext is 0, EventType is an NDIS_STATUS event code
 	 * Otherwise, EventType is an TCP connection event specific information
 	 * (defined in NdisTcpOffloadEventHandler chimney DDK document).
 	 */
 	uint32_t                                event_specific_info;
 
 	/*
 	 * If not 0, the event is per-TCP connection event.  This field
 	 * contains the VSC's TCP context.
 	 * If 0, the event indication is global.
 	 */
 	uint64_t                                vsc_tcp_context;
 } __packed nvsp_2_msg_indicate_chimney_event;
 
 
 #define NVSP_1_CHIMNEY_SEND_INVALID_OOB_INDEX       0xffffu
 #define NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX   0xffffu
 
 /*
  * NvspMessage2TypeSendChimneyPacket
  */
 typedef struct nvsp_2_msg_send_chimney_pkt_ {
     /*
      * Identify the TCP connection for which this chimney send is
      */
     uint32_t                                    vsp_tcp_handle;
 
     /*
      * This field is used to send part or all of the data
      * through a send buffer. This values specifies an
      * index into the send buffer. If the index is
      * 0xFFFF, then the send buffer is not being used
      * and all of the data was sent through other VMBus
      * mechanisms.
      */
     uint16_t                                    send_buf_section_index;
     uint16_t                                    send_buf_section_size;
 
     /*
      * OOB Data Index
      * This an index to the OOB data buffer. If the index is 0xFFFFFFFF,
      * then there is no OOB data.
      *
      * This field shall be always 0xFFFFFFFF for now. It is reserved for
      * the future.
      */
     uint16_t                                    oob_data_index;
 
     /*
      * DisconnectFlags = 0
      *      Normal chimney send. See MiniportTcpOffloadSend for details.
      *
      * DisconnectFlags = TCP_DISCONNECT_GRACEFUL_CLOSE (0x01)
      *      Graceful disconnect. See MiniportTcpOffloadDisconnect for details.
      *
      * DisconnectFlags = TCP_DISCONNECT_ABORTIVE_CLOSE (0x02)
      *      Abortive disconnect. See MiniportTcpOffloadDisconnect for details.
      */
     uint16_t                                    disconnect_flags;
 
     uint32_t                                    seq_no;
 } __packed nvsp_2_msg_send_chimney_pkt;
 
 /*
  * NvspMessage2TypeSendChimneyPacketComplete
  */
 typedef struct nvsp_2_msg_send_chimney_pkt_complete_ {
     /*
      * The NDIS_STATUS for the chimney send
      */
     uint32_t                                    status;
 
     /*
      * Number of bytes that have been sent to the peer (and ACKed by the peer).
      */
     uint32_t                                    bytes_transferred;
 } __packed nvsp_2_msg_send_chimney_pkt_complete;
 
 
 #define NVSP_1_CHIMNEY_RECV_FLAG_NO_PUSH        0x0001u
 #define NVSP_1_CHIMNEY_RECV_INVALID_OOB_INDEX   0xffffu
 
 /*
  * NvspMessage2TypePostChimneyRecvRequest
  */
 typedef struct nvsp_2_msg_post_chimney_rx_request_ {
 	/*
 	 * Identify the TCP connection which this chimney receive request
 	 * is for.
 	 */
 	uint32_t                                vsp_tcp_handle;
 
 	/*
 	 * OOB Data Index
 	 * This an index to the OOB data buffer. If the index is 0xFFFFFFFF,
 	 * then there is no OOB data.
 	 *
 	 * This field shall be always 0xFFFFFFFF for now. It is reserved for
 	 * the future.
 	 */
 	uint32_t                                oob_data_index;
 
 	/*
 	 * Bit 0
 	 *      When it is set, this is a "no-push" receive.
 	 *      When it is clear, this is a "push" receive.
 	 *
 	 * Bit 1-15:  Reserved and shall be zero
 	 */
 	uint16_t                                flags;
 
 	/*
 	 * For debugging and diagnoses purpose.
 	 * The SeqNo is per TCP connection and starts from 0.
 	 */
 	uint32_t                                seq_no;
 } __packed nvsp_2_msg_post_chimney_rx_request;
 
 /*
  * NvspMessage2TypePostChimneyRecvRequestComplete
  */
 typedef struct nvsp_2_msg_post_chimney_rx_request_complete_ {
 	/*
 	 * The NDIS_STATUS for the chimney send
 	 */
 	uint32_t                                status;
 
 	/*
 	 * Number of bytes that have been sent to the peer (and ACKed by
 	 * the peer).
 	 */
 	uint32_t                                bytes_xferred;
 } __packed nvsp_2_msg_post_chimney_rx_request_complete;
 
 /*
  * NvspMessage2TypeAllocateReceiveBuffer
  */
 typedef struct nvsp_2_msg_alloc_rx_buf_ {
 	/*
 	 * Allocation ID to match the allocation request and response
 	 */
 	uint32_t                                allocation_id;
 
 	/*
 	 * Length of the VM shared memory receive buffer that needs to
 	 * be allocated
 	 */
 	uint32_t                                length;
 } __packed nvsp_2_msg_alloc_rx_buf;
 
 /*
  * NvspMessage2TypeAllocateReceiveBufferComplete
  */
 typedef struct nvsp_2_msg_alloc_rx_buf_complete_ {
 	/*
 	 * The NDIS_STATUS code for buffer allocation
 	 */
 	uint32_t                                status;
 
 	/*
 	 * Allocation ID from NVSP_2_MESSAGE_ALLOCATE_RECEIVE_BUFFER
 	 */
 	uint32_t                                allocation_id;
 
 	/*
 	 * GPADL handle for the allocated receive buffer
 	 */
 	uint32_t                                gpadl_handle;
 
 	/*
 	 * Receive buffer ID that is further used in
 	 * NvspMessage2SendVmqRndisPacket
 	 */
 	uint64_t                                rx_buf_id;
 } __packed nvsp_2_msg_alloc_rx_buf_complete;
 
 /*
  * NvspMessage2TypeFreeReceiveBuffer
  */
 typedef struct nvsp_2_msg_free_rx_buf_ {
 	/*
 	 * Receive buffer ID previous returned in
 	 * NvspMessage2TypeAllocateReceiveBufferComplete message
 	 */
 	uint64_t                                rx_buf_id;
 } __packed nvsp_2_msg_free_rx_buf;
 
 /*
  * This structure is used in defining the buffers in
  * NVSP_2_MESSAGE_SEND_VMQ_RNDIS_PACKET structure
  */
 typedef struct nvsp_xfer_page_range_ {
 	/*
 	 * Specifies the ID of the receive buffer that has the buffer. This
 	 * ID can be the general receive buffer ID specified in
 	 * NvspMessage1TypeSendReceiveBuffer or it can be the shared memory
 	 * receive buffer ID allocated by the VSC and specified in
 	 * NvspMessage2TypeAllocateReceiveBufferComplete message
 	 */
 	uint64_t                                xfer_page_set_id;
 
 	/*
 	 * Number of bytes
 	 */
 	uint32_t                                byte_count;
 
 	/*
 	 * Offset in bytes from the beginning of the buffer
 	 */
 	uint32_t                                byte_offset;
 } __packed nvsp_xfer_page_range;
 
 /*
  * NvspMessage2SendVmqRndisPacket
  */
 typedef struct nvsp_2_msg_send_vmq_rndis_pkt_ {
 	/*
 	 * This field is specified by RNIDS. They assume there's
 	 * two different channels of communication. However,
 	 * the Network VSP only has one. Therefore, the channel
 	 * travels with the RNDIS packet. It must be RMC_DATA
 	 */
 	uint32_t                                channel_type;
 
 	/*
 	 * Only the Range element corresponding to the RNDIS header of
 	 * the first RNDIS message in the multiple RNDIS messages sent
 	 * in one NVSP message.  Information about the data portions as well
 	 * as the subsequent RNDIS messages in the same NVSP message are
 	 * embedded in the RNDIS header itself
 	 */
 	nvsp_xfer_page_range                    range;
 } __packed nvsp_2_msg_send_vmq_rndis_pkt;
 
 /*
  * This message is used by the VSC to complete
  * a RNDIS VMQ message to the VSP.  At this point,
  * the initiator of this message can use any resources
  * associated with the original RNDIS VMQ packet.
  */
 typedef struct nvsp_2_msg_send_vmq_rndis_pkt_complete_
 {
 	uint32_t                                status;
 } __packed nvsp_2_msg_send_vmq_rndis_pkt_complete;
 
 
 typedef union nvsp_1_msg_uber_ {
 	nvsp_1_msg_send_ndis_version            send_ndis_vers;
 
 	nvsp_1_msg_send_rx_buf                  send_rx_buf;
 	nvsp_1_msg_send_rx_buf_complete         send_rx_buf_complete;
 	nvsp_1_msg_revoke_rx_buf                revoke_rx_buf;
 
 	nvsp_1_msg_send_send_buf                send_send_buf;
 	nvsp_1_msg_send_send_buf_complete       send_send_buf_complete;
 	nvsp_1_msg_revoke_send_buf              revoke_send_buf;
 
 	nvsp_1_msg_send_rndis_pkt               send_rndis_pkt;
 	nvsp_1_msg_send_rndis_pkt_complete      send_rndis_pkt_complete;
 } __packed nvsp_1_msg_uber;
 
 
 typedef union nvsp_2_msg_uber_ {
 	nvsp_2_msg_send_ndis_config             send_ndis_config;
 
 	nvsp_2_msg_send_chimney_buf             send_chimney_buf;
 	nvsp_2_msg_send_chimney_buf_complete    send_chimney_buf_complete;
 	nvsp_2_msg_revoke_chimney_buf           revoke_chimney_buf;
 
 	nvsp_2_msg_resume_chimney_rx_indication resume_chimney_rx_indication;
 	nvsp_2_msg_terminate_chimney            terminate_chimney;
 	nvsp_2_msg_terminate_chimney_complete   terminate_chimney_complete;
 	nvsp_2_msg_indicate_chimney_event       indicate_chimney_event;
 
 	nvsp_2_msg_send_chimney_pkt             send_chimney_packet;
 	nvsp_2_msg_send_chimney_pkt_complete    send_chimney_packet_complete;
 	nvsp_2_msg_post_chimney_rx_request      post_chimney_rx_request;
 	nvsp_2_msg_post_chimney_rx_request_complete
 	                                       post_chimney_rx_request_complete;
 
 	nvsp_2_msg_alloc_rx_buf                 alloc_rx_buffer;
 	nvsp_2_msg_alloc_rx_buf_complete        alloc_rx_buffer_complete;
 	nvsp_2_msg_free_rx_buf                  free_rx_buffer;
 
 	nvsp_2_msg_send_vmq_rndis_pkt           send_vmq_rndis_pkt;
 	nvsp_2_msg_send_vmq_rndis_pkt_complete  send_vmq_rndis_pkt_complete;
 	nvsp_2_msg_alloc_chimney_handle         alloc_chimney_handle;
 	nvsp_2_msg_alloc_chimney_handle_complete alloc_chimney_handle_complete;
 } __packed nvsp_2_msg_uber;
 
 
 typedef union nvsp_all_msgs_ {
 	nvsp_msg_init_uber                      init_msgs;
 	nvsp_1_msg_uber                         vers_1_msgs;
 	nvsp_2_msg_uber                         vers_2_msgs;
 } __packed nvsp_all_msgs;
 
 /*
  * ALL Messages
  */
 typedef struct nvsp_msg_ {
 	nvsp_msg_hdr                            hdr; 
 	nvsp_all_msgs                           msgs;
 } __packed nvsp_msg;
 
 
 /*
  * The following arguably belongs in a separate header file
  */
 
 /*
  * Defines
  */
 
 #define NETVSC_SEND_BUFFER_SIZE			(64*1024)   /* 64K */
 #define NETVSC_SEND_BUFFER_ID			0xface
 
 
 #define NETVSC_RECEIVE_BUFFER_SIZE		(1024*1024) /* 1MB */
 
 #define NETVSC_RECEIVE_BUFFER_ID		0xcafe
 
 #define NETVSC_RECEIVE_SG_COUNT			1
 
 /* Preallocated receive packets */
 #define NETVSC_RECEIVE_PACKETLIST_COUNT		256
 
 /*
  * Maximum MTU we permit to be configured for a netvsc interface.
  * When the code was developed, a max MTU of 12232 was tested and
  * proven to work.  9K is a reasonable maximum for an Ethernet.
  */
 #define NETVSC_MAX_CONFIGURABLE_MTU		(9 * 1024)
 
 /*
  * Data types
  */
 
 /*
  * Per netvsc channel-specific
  */
 typedef struct netvsc_dev_ {
 	struct hv_device			*dev;
 	int					num_outstanding_sends;
 
 	/* List of free preallocated NETVSC_PACKET to represent RX packet */
 	STAILQ_HEAD(PQ, netvsc_packet_)		myrx_packet_list;
 	struct mtx				rx_pkt_list_lock;
 
 	/* Send buffer allocated by us but manages by NetVSP */
 	void					*send_buf;
 	uint32_t				send_buf_size;
 	uint32_t				send_buf_gpadl_handle;
 	uint32_t				send_section_size;
 
 	/* Receive buffer allocated by us but managed by NetVSP */
 	void					*rx_buf;
 	uint32_t				rx_buf_size;
 	uint32_t				rx_buf_gpadl_handle;
 	uint32_t				rx_section_count;
 	nvsp_1_rx_buf_section			*rx_sections;
 
 	/* Used for NetVSP initialization protocol */
 	struct sema				channel_init_sema;
 	nvsp_msg				channel_init_packet;
 
 	nvsp_msg				revoke_packet;
 	/*uint8_t				hw_mac_addr[HW_MACADDR_LEN];*/
 
 	/* Holds rndis device info */
 	void					*extension;
 
 	hv_bool_uint8_t				destroy;
 	/* Negotiated NVSP version */
 	uint32_t				nvsp_version;
 } netvsc_dev;
 
 
 typedef void (*pfn_on_send_rx_completion)(void *);
 
 #define NETVSC_DEVICE_RING_BUFFER_SIZE   (64 * PAGE_SIZE)
 #define NETVSC_PACKET_MAXPAGE            16
 
 
 typedef struct xfer_page_packet_ {
 	/*
 	 * This needs to be here because the network RX code casts
 	 * an instantiation of this structure to a netvsc_packet.
 	 */
 	STAILQ_ENTRY(netvsc_packet_) mylist_entry;
 
 	uint32_t count;
 } xfer_page_packet;
 
 typedef struct netvsc_packet_ {
 	/*
 	 * List used when enqueued on &net_dev->rx_packet_list,
 	 * and when enqueued within the netvsc code
 	 */
 	STAILQ_ENTRY(netvsc_packet_) mylist_entry;
 	struct hv_device           *device;
 	hv_bool_uint8_t            is_data_pkt;      /* One byte */
 	uint16_t		   vlan_tci;
 	xfer_page_packet           *xfer_page_pkt;
 
 	/* Completion */
 	union {
 		struct {
 			uint64_t   rx_completion_tid;
 			void	   *rx_completion_context;
 			/* This is no longer used */
 			pfn_on_send_rx_completion   on_rx_completion;
 		} rx;
 		struct {
 			uint64_t    send_completion_tid;
 			void	    *send_completion_context;
 			/* Still used in netvsc and filter code */
 			pfn_on_send_rx_completion   on_send_completion;
 		} send;
 	} compl;
 
 	void		*extension;
 	uint32_t	tot_data_buf_len;
 	uint32_t	page_buf_count;
 	hv_vmbus_page_buffer	page_buffers[NETVSC_PACKET_MAXPAGE];
 } netvsc_packet;
 
 typedef struct {
 	uint8_t		mac_addr[6];  /* Assumption unsigned long */
 	hv_bool_uint8_t	link_state;
 } netvsc_device_info;
 
 /*
  * Device-specific softc structure
  */
 typedef struct hn_softc {
 	struct ifnet    *hn_ifp;
-	struct arpcom   arpcom;
 	device_t        hn_dev;
 	uint8_t         hn_unit;
 	int             hn_carrier;
 	int             hn_if_flags;
 	struct mtx      hn_lock;
 	int             hn_initdone;
 	/* See hv_netvsc_drv_freebsd.c for rules on how to use */
 	int             temp_unusable;
 	struct hv_device  *hn_dev_obj;
 	netvsc_dev  	*net_dev;
 } hn_softc_t;
 
 
 /*
  * Externs
  */
 extern int hv_promisc_mode;
 
 extern void netvsc_linkstatus_callback(struct hv_device *device_obj,
 				       uint32_t status);
 extern int  netvsc_recv(struct hv_device *device_obj, netvsc_packet *packet);
 extern void netvsc_xmit_completion(void *context);
 
 extern void hv_nv_on_receive_completion(void *context);
 extern netvsc_dev *hv_nv_on_device_add(struct hv_device *device, void *additional_info);
 extern int  hv_nv_on_device_remove(struct hv_device *device,
 				   boolean_t destroy_channel);
 extern int  hv_nv_on_send(struct hv_device *device, netvsc_packet *pkt);
 
 #endif  /* __HV_NET_VSC_H__ */
 
Index: user/dchagin/lemul/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c
===================================================================
--- user/dchagin/lemul/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c	(revision 274240)
+++ user/dchagin/lemul/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c	(revision 274241)
@@ -1,1018 +1,1016 @@
 /*-
  * Copyright (c) 2010-2012 Citrix Inc.
  * Copyright (c) 2009-2012 Microsoft Corp.
  * Copyright (c) 2012 NetApp Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*-
  * Copyright (c) 2004-2006 Kip Macy
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sockio.h>
 #include <sys/mbuf.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/kernel.h>
 #include <sys/socket.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/sx.h>
 
 #include <net/if.h>
 #include <net/if_arp.h>
 #include <net/ethernet.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
 
 #include <net/bpf.h>
 
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/if_vlan_var.h>
 
 #include <netinet/in_systm.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/if_ether.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/pmap.h>
 
 #include <machine/bus.h>
 #include <machine/resource.h>
 #include <machine/frame.h>
 #include <machine/vmparam.h>
 
 #include <sys/bus.h>
 #include <sys/rman.h>
 #include <sys/mutex.h>
 #include <sys/errno.h>
 #include <sys/types.h>
 #include <machine/atomic.h>
 
 #include <machine/intr_machdep.h>
 
 #include <dev/hyperv/include/hyperv.h>
 #include "hv_net_vsc.h"
 #include "hv_rndis.h"
 #include "hv_rndis_filter.h"
 
 
 /* Short for Hyper-V network interface */
 #define NETVSC_DEVNAME    "hn"
 
 /*
  * It looks like offset 0 of buf is reserved to hold the softc pointer.
  * The sc pointer evidently not needed, and is not presently populated.
  * The packet offset is where the netvsc_packet starts in the buffer.
  */
 #define HV_NV_SC_PTR_OFFSET_IN_BUF         0
 #define HV_NV_PACKET_OFFSET_IN_BUF         16
 
 
 /*
  * Data types
  */
 
 struct hv_netvsc_driver_context {
 	uint32_t		drv_inited;
 };
 
 /*
  * Be aware that this sleepable mutex will exhibit WITNESS errors when
  * certain TCP and ARP code paths are taken.  This appears to be a
  * well-known condition, as all other drivers checked use a sleeping
  * mutex to protect their transmit paths.
  * Also Be aware that mutexes do not play well with semaphores, and there
  * is a conflicting semaphore in a certain channel code path.
  */
 #define NV_LOCK_INIT(_sc, _name) \
 	    mtx_init(&(_sc)->hn_lock, _name, MTX_NETWORK_LOCK, MTX_DEF)
 #define NV_LOCK(_sc)		mtx_lock(&(_sc)->hn_lock)
 #define NV_LOCK_ASSERT(_sc)	mtx_assert(&(_sc)->hn_lock, MA_OWNED)
 #define NV_UNLOCK(_sc)		mtx_unlock(&(_sc)->hn_lock)
 #define NV_LOCK_DESTROY(_sc)	mtx_destroy(&(_sc)->hn_lock)
 
 
 /*
  * Globals
  */
 
 int hv_promisc_mode = 0;    /* normal mode by default */
 
 /* The one and only one */
 static struct hv_netvsc_driver_context g_netvsc_drv;
 
 
 /*
  * Forward declarations
  */
 static void hn_stop(hn_softc_t *sc);
 static void hn_ifinit_locked(hn_softc_t *sc);
 static void hn_ifinit(void *xsc);
 static int  hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data);
 static int  hn_start_locked(struct ifnet *ifp);
 static void hn_start(struct ifnet *ifp);
 
 
 /*
  * NetVsc driver initialization
  * Note:  Filter init is no longer required
  */
 static int
 netvsc_drv_init(void)
 {
 	return (0);
 }
 
 /*
  * NetVsc global initialization entry point
  */
 static void
 netvsc_init(void)
 {
 	if (bootverbose)
 		printf("Netvsc initializing... ");
 
 	/*
 	 * XXXKYS: cleanup initialization
 	 */
 	if (!cold && !g_netvsc_drv.drv_inited) {
 		g_netvsc_drv.drv_inited = 1;
 		netvsc_drv_init();
 		if (bootverbose)
 			printf("done!\n");
 	} else if (bootverbose)
 		printf("Already initialized!\n");
 }
 
 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
 static const hv_guid g_net_vsc_device_type = {
 	.data = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
 		0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
 };
 
 /*
  * Standard probe entry point.
  *
  */
 static int
 netvsc_probe(device_t dev)
 {
 	const char *p;
 
 	p = vmbus_get_type(dev);
 	if (!memcmp(p, &g_net_vsc_device_type.data, sizeof(hv_guid))) {
 		device_set_desc(dev, "Synthetic Network Interface");
 		if (bootverbose)
 			printf("Netvsc probe... DONE \n");
 
 		return (BUS_PROBE_DEFAULT);
 	}
 
 	return (ENXIO);
 }
 
 /*
  * Standard attach entry point.
  *
  * Called when the driver is loaded.  It allocates needed resources,
  * and initializes the "hardware" and software.
  */
 static int
 netvsc_attach(device_t dev)
 {
 	struct hv_device *device_ctx = vmbus_get_devctx(dev);
 	netvsc_device_info device_info;
 	hn_softc_t *sc;
 	int unit = device_get_unit(dev);
 	struct ifnet *ifp;
 	int ret;
 
 	netvsc_init();
 
 	sc = device_get_softc(dev);
 	if (sc == NULL) {
 		return (ENOMEM);
 	}
 
 	bzero(sc, sizeof(hn_softc_t));
 	sc->hn_unit = unit;
 	sc->hn_dev = dev;
 
 	NV_LOCK_INIT(sc, "NetVSCLock");
 
 	sc->hn_dev_obj = device_ctx;
 
-	ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
+	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
 	ifp->if_softc = sc;
 
 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
 	ifp->if_dunit = unit;
 	ifp->if_dname = NETVSC_DEVNAME;
 
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
 	ifp->if_ioctl = hn_ioctl;
 	ifp->if_start = hn_start;
 	ifp->if_init = hn_ifinit;
 	/* needed by hv_rf_on_device_add() code */
 	ifp->if_mtu = ETHERMTU;
 	IFQ_SET_MAXLEN(&ifp->if_snd, 512);
 	ifp->if_snd.ifq_drv_maxlen = 511;
 	IFQ_SET_READY(&ifp->if_snd);
 
 	/*
 	 * Tell upper layers that we support full VLAN capability.
 	 */
 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
 	ifp->if_capenable |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
 
 	ret = hv_rf_on_device_add(device_ctx, &device_info);
 	if (ret != 0) {
 		if_free(ifp);
 
 		return (ret);
 	}
 	if (device_info.link_state == 0) {
 		sc->hn_carrier = 1;
 	}
 
 	ether_ifattach(ifp, device_info.mac_addr);
 
 	return (0);
 }
 
 /*
  * Standard detach entry point
  */
 static int
 netvsc_detach(device_t dev)
 {
 	struct hv_device *hv_device = vmbus_get_devctx(dev); 
 
 	if (bootverbose)
 		printf("netvsc_detach\n");
 
 	/*
 	 * XXXKYS:  Need to clean up all our
 	 * driver state; this is the driver
 	 * unloading.
 	 */
 
 	/*
 	 * XXXKYS:  Need to stop outgoing traffic and unregister
 	 * the netdevice.
 	 */
 
 	hv_rf_on_device_remove(hv_device, HV_RF_NV_DESTROY_CHANNEL);
 
 	return (0);
 }
 
 /*
  * Standard shutdown entry point
  */
 static int
 netvsc_shutdown(device_t dev)
 {
 	return (0);
 }
 
 /*
  * Send completion processing
  *
  * Note:  It looks like offset 0 of buf is reserved to hold the softc
  * pointer.  The sc pointer is not currently needed in this function, and
  * it is not presently populated by the TX function.
  */
 void
 netvsc_xmit_completion(void *context)
 {
 	netvsc_packet *packet = (netvsc_packet *)context;
 	struct mbuf *mb;
 	uint8_t *buf;
 
 	mb = (struct mbuf *)(uintptr_t)packet->compl.send.send_completion_tid;
 	buf = ((uint8_t *)packet) - HV_NV_PACKET_OFFSET_IN_BUF;
 
 	free(buf, M_DEVBUF);
 
 	if (mb != NULL) {
 		m_freem(mb);
 	}
 }
 
 /*
  * Start a transmit of one or more packets
  */
 static int
 hn_start_locked(struct ifnet *ifp)
 {
 	hn_softc_t *sc = ifp->if_softc;
 	struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);
 	uint8_t *buf;
 	netvsc_packet *packet;
 	struct mbuf *m_head, *m;
 	struct mbuf *mc_head = NULL;
 	int i;
 	int num_frags;
 	int len;
 	int xlen;
 	int rppi_size;
 	int retries = 0;
 	int ret = 0;
 
 	while (!IFQ_DRV_IS_EMPTY(&sc->hn_ifp->if_snd)) {
 		IFQ_DRV_DEQUEUE(&sc->hn_ifp->if_snd, m_head);
 		if (m_head == NULL) {
 			break;
 		}
 
 		len = 0;
 		num_frags = 0;
 		xlen = 0;
 
 		/* Walk the mbuf list computing total length and num frags */
 		for (m = m_head; m != NULL; m = m->m_next) {
 			if (m->m_len != 0) {
 				num_frags++;
 				len += m->m_len;
 			}
 		}
 
 		/*
 		 * Reserve the number of pages requested.  Currently,
 		 * one page is reserved for the message in the RNDIS
 		 * filter packet
 		 */
 		num_frags += HV_RF_NUM_TX_RESERVED_PAGE_BUFS;
 
 		/* If exceeds # page_buffers in netvsc_packet */
 		if (num_frags > NETVSC_PACKET_MAXPAGE) {
 			m_freem(m);
 
 			return (EINVAL);
 		}
 
 		rppi_size = 0;
 		if (m_head->m_flags & M_VLANTAG) {
 			rppi_size = sizeof(rndis_per_packet_info) + 
 			    sizeof(ndis_8021q_info);
 		}
 
 		/*
 		 * Allocate a buffer with space for a netvsc packet plus a
 		 * number of reserved areas.  First comes a (currently 16
 		 * bytes, currently unused) reserved data area.  Second is
 		 * the netvsc_packet, which includes (currently 4) page
 		 * buffers.  Third (optional) is a rndis_per_packet_info
 		 * struct, but only if a VLAN tag should be inserted into the
 		 * Ethernet frame by the Hyper-V infrastructure.  Fourth is
 		 * an area reserved for an rndis_filter_packet struct.
 		 * Changed malloc to M_NOWAIT to avoid sleep under spin lock.
 		 * No longer reserving extra space for page buffers, as they
 		 * are already part of the netvsc_packet.
 		 */
 		buf = malloc(HV_NV_PACKET_OFFSET_IN_BUF +
 		    sizeof(netvsc_packet) + rppi_size +
 		    sizeof(rndis_filter_packet),
 		    M_DEVBUF, M_ZERO | M_NOWAIT);
 		if (buf == NULL) {
 			m_freem(m);
 
 			return (ENOMEM);
 		}
 
 		packet = (netvsc_packet *)(buf + HV_NV_PACKET_OFFSET_IN_BUF);
 		*(vm_offset_t *)buf = HV_NV_SC_PTR_OFFSET_IN_BUF;
 
 		/*
 		 * extension points to the area reserved for the
 		 * rndis_filter_packet, which is placed just after
 		 * the netvsc_packet (and rppi struct, if present;
 		 * length is updated later).
 		 */
 		packet->extension = packet + 1;
 
 		/* Set up the rndis header */
 		packet->page_buf_count = num_frags;
 
 		/* Initialize it from the mbuf */
 		packet->tot_data_buf_len = len;
 
 		/*
 		 * If the Hyper-V infrastructure needs to embed a VLAN tag,
 		 * initialize netvsc_packet and rppi struct values as needed.
 		 */
 		if (rppi_size) {
 			/* Lower layers need the VLAN TCI */
 			packet->vlan_tci = m_head->m_pkthdr.ether_vtag;
 		}
 
 		/*
 		 * Fill the page buffers with mbuf info starting at index
 		 * HV_RF_NUM_TX_RESERVED_PAGE_BUFS.
 		 */
 		i = HV_RF_NUM_TX_RESERVED_PAGE_BUFS;
 		for (m = m_head; m != NULL; m = m->m_next) {
 			if (m->m_len) {
 				vm_offset_t paddr =
 				    vtophys(mtod(m, vm_offset_t));
 				packet->page_buffers[i].pfn =
 				    paddr >> PAGE_SHIFT;
 				packet->page_buffers[i].offset =
 				    paddr & (PAGE_SIZE - 1);
 				packet->page_buffers[i].length = m->m_len;
 				i++;
 			}
 		}
 
 		/*
 		 * If bpf, copy the mbuf chain.  This is less expensive than
 		 * it appears; the mbuf clusters are not copied, only their
 		 * reference counts are incremented.
 		 * Needed to avoid a race condition where the completion
 		 * callback is invoked, freeing the mbuf chain, before the
 		 * bpf_mtap code has a chance to run.
 		 */
 		if (ifp->if_bpf) {
 			mc_head = m_copypacket(m_head, M_NOWAIT);
 		}
 retry_send:
 		/* Set the completion routine */
 		packet->compl.send.on_send_completion = netvsc_xmit_completion;
 		packet->compl.send.send_completion_context = packet;
 		packet->compl.send.send_completion_tid = (uint64_t)(uintptr_t)m_head;
 
 		/* Removed critical_enter(), does not appear necessary */
 		ret = hv_rf_on_send(device_ctx, packet);
 
 		if (ret == 0) {
 			if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 			/* if bpf && mc_head, call bpf_mtap code */
 			if (mc_head) {
 				ETHER_BPF_MTAP(ifp, mc_head);
 			}
 		} else {
 			retries++;
 			if (retries < 4) {
 				goto retry_send;
 			}
 
 			IF_PREPEND(&ifp->if_snd, m_head);
 			ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 
 			/*
 			 * Null the mbuf pointer so the completion function
 			 * does not free the mbuf chain.  We just pushed the
 			 * mbuf chain back on the if_snd queue.
 			 */
 			packet->compl.send.send_completion_tid = 0;
 
 			/*
 			 * Release the resources since we will not get any
 			 * send completion
 			 */
 			netvsc_xmit_completion(packet);
 		}
 
 		/* if bpf && mc_head, free the mbuf chain copy */
 		if (mc_head) {
 			m_freem(mc_head);
 		}
 	}
 
 	return (ret);
 }
 
 /*
  * Link up/down notification
  */
 void
 netvsc_linkstatus_callback(struct hv_device *device_obj, uint32_t status)
 {
 	hn_softc_t *sc = device_get_softc(device_obj->device);
 
 	if (sc == NULL) {
 		return;
 	}
 
 	if (status == 1) {
 		sc->hn_carrier = 1;
 	} else {
 		sc->hn_carrier = 0;
 	}
 }
 
 /*
  * Append the specified data to the indicated mbuf chain,
  * Extend the mbuf chain if the new data does not fit in
  * existing space.
  *
  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
  * There should be an equivalent in the kernel mbuf code,
  * but there does not appear to be one yet.
  *
  * Differs from m_append() in that additional mbufs are
  * allocated with cluster size MJUMPAGESIZE, and filled
  * accordingly.
  *
  * Return 1 if able to complete the job; otherwise 0.
  */
 static int
 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
 {
 	struct mbuf *m, *n;
 	int remainder, space;
 
 	for (m = m0; m->m_next != NULL; m = m->m_next)
 		;
 	remainder = len;
 	space = M_TRAILINGSPACE(m);
 	if (space > 0) {
 		/*
 		 * Copy into available space.
 		 */
 		if (space > remainder)
 			space = remainder;
 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
 		m->m_len += space;
 		cp += space;
 		remainder -= space;
 	}
 	while (remainder > 0) {
 		/*
 		 * Allocate a new mbuf; could check space
 		 * and allocate a cluster instead.
 		 */
 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
 		if (n == NULL)
 			break;
 		n->m_len = min(MJUMPAGESIZE, remainder);
 		bcopy(cp, mtod(n, caddr_t), n->m_len);
 		cp += n->m_len;
 		remainder -= n->m_len;
 		m->m_next = n;
 		m = n;
 	}
 	if (m0->m_flags & M_PKTHDR)
 		m0->m_pkthdr.len += len - remainder;
 
 	return (remainder == 0);
 }
 
 
 /*
  * Called when we receive a data packet from the "wire" on the
  * specified device
  *
  * Note:  This is no longer used as a callback
  */
 int
 netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet)
 {
 	hn_softc_t *sc = (hn_softc_t *)device_get_softc(device_ctx->device);
 	struct mbuf *m_new;
 	struct ifnet *ifp;
 	int size;
 	int i;
 
 	if (sc == NULL) {
 		return (0); /* TODO: KYS how can this be! */
 	}
 
 	ifp = sc->hn_ifp;
-	
-	ifp = sc->arpcom.ac_ifp;
 
 	if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 		return (0);
 	}
 
 	/*
 	 * Bail out if packet contains more data than configured MTU.
 	 */
 	if (packet->tot_data_buf_len > (ifp->if_mtu + ETHER_HDR_LEN)) {
 		return (0);
 	}
 
 	/*
 	 * Get an mbuf with a cluster.  For packets 2K or less,
 	 * get a standard 2K cluster.  For anything larger, get a
 	 * 4K cluster.  Any buffers larger than 4K can cause problems
 	 * if looped around to the Hyper-V TX channel, so avoid them.
 	 */
 	size = MCLBYTES;
 
 	if (packet->tot_data_buf_len > MCLBYTES) {
 		/* 4096 */
 		size = MJUMPAGESIZE;
 	}
 
 	m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
 
 	if (m_new == NULL)
 		return (0);
 
 	/*
 	 * Remove trailing junk from RX data buffer.
 	 * Fixme:  This will not work for multiple Hyper-V RX buffers.
 	 * Fortunately, the channel gathers all RX data into one buffer.
 	 *
 	 * L2 frame length, with L2 header, not including CRC
 	 */
 	packet->page_buffers[0].length = packet->tot_data_buf_len;
 
 	/*
 	 * Copy the received packet to one or more mbufs. 
 	 * The copy is required since the memory pointed to by netvsc_packet
 	 * cannot be deallocated
 	 */
 	for (i=0; i < packet->page_buf_count; i++) {
 		/* Shift virtual page number to form virtual page address */
 		uint8_t *vaddr = (uint8_t *)(uintptr_t)
 		    (packet->page_buffers[i].pfn << PAGE_SHIFT);
 
 		hv_m_append(m_new, packet->page_buffers[i].length,
 		    vaddr + packet->page_buffers[i].offset);
 	}
 
 	m_new->m_pkthdr.rcvif = ifp;
 
 	if ((packet->vlan_tci != 0) &&
 			    (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0) {
 		m_new->m_pkthdr.ether_vtag = packet->vlan_tci;
 		m_new->m_flags |= M_VLANTAG;
 	}
 
 	/*
 	 * Note:  Moved RX completion back to hv_nv_on_receive() so all
 	 * messages (not just data messages) will trigger a response.
 	 */
 
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 
 	/* We're not holding the lock here, so don't release it */
 	(*ifp->if_input)(ifp, m_new);
 
 	return (0);
 }
 
 /*
  * Rules for using sc->temp_unusable:
  * 1.  sc->temp_unusable can only be read or written while holding NV_LOCK()
  * 2.  code reading sc->temp_unusable under NV_LOCK(), and finding 
  *     sc->temp_unusable set, must release NV_LOCK() and exit
  * 3.  to retain exclusive control of the interface,
  *     sc->temp_unusable must be set by code before releasing NV_LOCK()
  * 4.  only code setting sc->temp_unusable can clear sc->temp_unusable
  * 5.  code setting sc->temp_unusable must eventually clear sc->temp_unusable
  */
 
 /*
  * Standard ioctl entry point.  Called when the user wants to configure
  * the interface.
  */
 static int
 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	hn_softc_t *sc = ifp->if_softc;
 	struct ifreq *ifr = (struct ifreq *)data;
 	netvsc_device_info device_info;
 	struct hv_device *hn_dev;
 	int mask, error = 0;
 	int retry_cnt = 500;
 	
 	switch(cmd) {
 
 	case SIOCSIFADDR:
 	case SIOCGIFADDR:
 		error = ether_ioctl(ifp, cmd, data);
 		break;
 	case SIOCSIFMTU:
 		hn_dev = vmbus_get_devctx(sc->hn_dev);
 
 		/* Check MTU value change */
 		if (ifp->if_mtu == ifr->ifr_mtu)
 			break;
 
 		if (ifr->ifr_mtu > NETVSC_MAX_CONFIGURABLE_MTU) {
 			error = EINVAL;
 			break;
 		}
 
 		/* Obtain and record requested MTU */
 		ifp->if_mtu = ifr->ifr_mtu;
  		
 		do {
 			NV_LOCK(sc);
 			if (!sc->temp_unusable) {
 				sc->temp_unusable = TRUE;
 				retry_cnt = -1;
 			}
 			NV_UNLOCK(sc);
 			if (retry_cnt > 0) {
 				retry_cnt--;
 				DELAY(5 * 1000);
 			}
 		} while (retry_cnt > 0);
 
 		if (retry_cnt == 0) {
 			error = EINVAL;
 			break;
 		}
 
 		/* We must remove and add back the device to cause the new
 		 * MTU to take effect.  This includes tearing down, but not
 		 * deleting the channel, then bringing it back up.
 		 */
 		error = hv_rf_on_device_remove(hn_dev, HV_RF_NV_RETAIN_CHANNEL);
 		if (error) {
 			NV_LOCK(sc);
 			sc->temp_unusable = FALSE;
 			NV_UNLOCK(sc);
 			break;
 		}
 		error = hv_rf_on_device_add(hn_dev, &device_info);
 		if (error) {
 			NV_LOCK(sc);
 			sc->temp_unusable = FALSE;
 			NV_UNLOCK(sc);
 			break;
 		}
 
 		hn_ifinit_locked(sc);
 
 		NV_LOCK(sc);
 		sc->temp_unusable = FALSE;
 		NV_UNLOCK(sc);
 		break;
 	case SIOCSIFFLAGS:
 		do {
                        NV_LOCK(sc);
                        if (!sc->temp_unusable) {
                                sc->temp_unusable = TRUE;
                                retry_cnt = -1;
                        }
                        NV_UNLOCK(sc);
                        if (retry_cnt > 0) {
                       	        retry_cnt--;
                         	DELAY(5 * 1000);
                        }
                 } while (retry_cnt > 0);
 
                 if (retry_cnt == 0) {
                        error = EINVAL;
                        break;
                 }
 
 		if (ifp->if_flags & IFF_UP) {
 			/*
 			 * If only the state of the PROMISC flag changed,
 			 * then just use the 'set promisc mode' command
 			 * instead of reinitializing the entire NIC. Doing
 			 * a full re-init means reloading the firmware and
 			 * waiting for it to start up, which may take a
 			 * second or two.
 			 */
 #ifdef notyet
 			/* Fixme:  Promiscuous mode? */
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
 			    ifp->if_flags & IFF_PROMISC &&
 			    !(sc->hn_if_flags & IFF_PROMISC)) {
 				/* do something here for Hyper-V */
 			} else if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
 			    !(ifp->if_flags & IFF_PROMISC) &&
 			    sc->hn_if_flags & IFF_PROMISC) {
 				/* do something here for Hyper-V */
 			} else
 #endif
 				hn_ifinit_locked(sc);
 		} else {
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 				hn_stop(sc);
 			}
 		}
 		NV_LOCK(sc);
 		sc->temp_unusable = FALSE;
 		NV_UNLOCK(sc);
 		sc->hn_if_flags = ifp->if_flags;
 		error = 0;
 		break;
 	case SIOCSIFCAP:
 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
 		if (mask & IFCAP_HWCSUM) {
 			if (IFCAP_HWCSUM & ifp->if_capenable) {
 				ifp->if_capenable &= ~IFCAP_HWCSUM;
 			} else {
 				ifp->if_capenable |= IFCAP_HWCSUM;
 			}
 		}
 		error = 0;
 		break;
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 #ifdef notyet
 		/* Fixme:  Multicast mode? */
 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 			NV_LOCK(sc);
 			netvsc_setmulti(sc);
 			NV_UNLOCK(sc);
 			error = 0;
 		}
 #endif
 		/* FALLTHROUGH */
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 		error = EINVAL;
 		break;
 	default:
 		error = ether_ioctl(ifp, cmd, data);
 		break;
 	}
 
 	return (error);
 }
 
 /*
  *
  */
 static void
 hn_stop(hn_softc_t *sc)
 {
 	struct ifnet *ifp;
 	int ret;
 	struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);
 
 	ifp = sc->hn_ifp;
 
 	if (bootverbose)
 		printf(" Closing Device ...\n");
 
 	ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
 	sc->hn_initdone = 0;
 
 	ret = hv_rf_on_close(device_ctx);
 }
 
 /*
  * FreeBSD transmit entry point
  */
 static void
 hn_start(struct ifnet *ifp)
 {
 	hn_softc_t *sc;
 
 	sc = ifp->if_softc;
 	NV_LOCK(sc);
 	if (sc->temp_unusable) {
 		NV_UNLOCK(sc);
 		return;
 	}
 	hn_start_locked(ifp);
 	NV_UNLOCK(sc);
 }
 
 /*
  *
  */
 static void
 hn_ifinit_locked(hn_softc_t *sc)
 {
 	struct ifnet *ifp;
 	struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);
 	int ret;
 
 	ifp = sc->hn_ifp;
 
 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 		return;
 	}
 
 	hv_promisc_mode = 1;
 
 	ret = hv_rf_on_open(device_ctx);
 	if (ret != 0) {
 		return;
 	} else {
 		sc->hn_initdone = 1;
 	}
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 }
 
 /*
  *
  */
 static void
 hn_ifinit(void *xsc)
 {
 	hn_softc_t *sc = xsc;
 
 	NV_LOCK(sc);
 	if (sc->temp_unusable) {
 		NV_UNLOCK(sc);
 		return;
 	}
 	sc->temp_unusable = TRUE;
 	NV_UNLOCK(sc);
 
 	hn_ifinit_locked(sc);
 
 	NV_LOCK(sc);
 	sc->temp_unusable = FALSE;
 	NV_UNLOCK(sc);
 }
 
 #ifdef LATER
 /*
  *
  */
 static void
 hn_watchdog(struct ifnet *ifp)
 {
 	hn_softc_t *sc;
 	sc = ifp->if_softc;
 
 	printf("hn%d: watchdog timeout -- resetting\n", sc->hn_unit);
 	hn_ifinit(sc);    /*???*/
 	if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 }
 #endif
 
 static device_method_t netvsc_methods[] = {
         /* Device interface */
         DEVMETHOD(device_probe,         netvsc_probe),
         DEVMETHOD(device_attach,        netvsc_attach),
         DEVMETHOD(device_detach,        netvsc_detach),
         DEVMETHOD(device_shutdown,      netvsc_shutdown),
 
         { 0, 0 }
 };
 
 static driver_t netvsc_driver = {
         NETVSC_DEVNAME,
         netvsc_methods,
         sizeof(hn_softc_t)
 };
 
 static devclass_t netvsc_devclass;
 
 DRIVER_MODULE(hn, vmbus, netvsc_driver, netvsc_devclass, 0, 0);
 MODULE_VERSION(hn, 1);
 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
 SYSINIT(netvsc_initx, SI_SUB_KTHREAD_IDLE, SI_ORDER_MIDDLE + 1, netvsc_init,
      NULL);
 
Index: user/dchagin/lemul/sys/dev/hyperv
===================================================================
--- user/dchagin/lemul/sys/dev/hyperv	(revision 274240)
+++ user/dchagin/lemul/sys/dev/hyperv	(revision 274241)

Property changes on: user/dchagin/lemul/sys/dev/hyperv
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/sys/dev/hyperv:r273766-274240
Index: user/dchagin/lemul/sys/dev/ixl/ixl_txrx.c
===================================================================
--- user/dchagin/lemul/sys/dev/ixl/ixl_txrx.c	(revision 274240)
+++ user/dchagin/lemul/sys/dev/ixl/ixl_txrx.c	(revision 274241)
@@ -1,1673 +1,1673 @@
 /******************************************************************************
 
   Copyright (c) 2013-2014, Intel Corporation 
   All rights reserved.
   
   Redistribution and use in source and binary forms, with or without 
   modification, are permitted provided that the following conditions are met:
   
    1. Redistributions of source code must retain the above copyright notice, 
       this list of conditions and the following disclaimer.
   
    2. Redistributions in binary form must reproduce the above copyright 
       notice, this list of conditions and the following disclaimer in the 
       documentation and/or other materials provided with the distribution.
   
    3. Neither the name of the Intel Corporation nor the names of its 
       contributors may be used to endorse or promote products derived from 
       this software without specific prior written permission.
   
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   POSSIBILITY OF SUCH DAMAGE.
 
 ******************************************************************************/
 /*$FreeBSD$*/
 
 /*
 **	IXL driver TX/RX Routines:
 **	    This was seperated to allow usage by
 ** 	    both the BASE and the VF drivers.
 */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "ixl.h"
 
 /* Local Prototypes */
 static void	ixl_rx_checksum(struct mbuf *, u32, u32, u8);
 static void	ixl_refresh_mbufs(struct ixl_queue *, int);
 static int      ixl_xmit(struct ixl_queue *, struct mbuf **);
 static int	ixl_tx_setup_offload(struct ixl_queue *,
 		    struct mbuf *, u32 *, u32 *);
 static bool	ixl_tso_setup(struct ixl_queue *, struct mbuf *);
 
 static __inline void ixl_rx_discard(struct rx_ring *, int);
 static __inline void ixl_rx_input(struct rx_ring *, struct ifnet *,
 		    struct mbuf *, u8);
 
 /*
 ** Multiqueue Transmit driver
 **
 */
 int
 ixl_mq_start(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ixl_vsi		*vsi = ifp->if_softc;
 	struct ixl_queue	*que;
 	struct tx_ring		*txr;
 	int 			err, i;
 
 	/* Which queue to use */
 	if ((m->m_flags & M_FLOWID) != 0)
 		i = m->m_pkthdr.flowid % vsi->num_queues;
 	else
 		i = curcpu % vsi->num_queues;
 
 	/* Check for a hung queue and pick alternative */
 	if (((1 << i) & vsi->active_queues) == 0)
 		i = ffsl(vsi->active_queues);
 
 	que = &vsi->queues[i];
 	txr = &que->txr;
 
 	err = drbr_enqueue(ifp, txr->br, m);
 	if (err)
 		return(err);
 	if (IXL_TX_TRYLOCK(txr)) {
 		ixl_mq_start_locked(ifp, txr);
 		IXL_TX_UNLOCK(txr);
 	} else
 		taskqueue_enqueue(que->tq, &que->tx_task);
 
 	return (0);
 }
 
 int
 ixl_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr)
 {
 	struct ixl_queue	*que = txr->que;
 	struct ixl_vsi		*vsi = que->vsi;
         struct mbuf		*next;
         int			err = 0;
 
 
 	if (((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) ||
 	    vsi->link_active == 0)
 		return (ENETDOWN);
 
 	/* Process the transmit queue */
 	while ((next = drbr_peek(ifp, txr->br)) != NULL) {
 		if ((err = ixl_xmit(que, &next)) != 0) {
 			if (next == NULL)
 				drbr_advance(ifp, txr->br);
 			else
 				drbr_putback(ifp, txr->br, next);
 			break;
 		}
 		drbr_advance(ifp, txr->br);
 		/* Send a copy of the frame to the BPF listener */
 		ETHER_BPF_MTAP(ifp, next);
 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 			break;
 	}
 
 	if (txr->avail < IXL_TX_CLEANUP_THRESHOLD)
 		ixl_txeof(que);
 
 	return (err);
 }
 
 /*
  * Called from a taskqueue to drain queued transmit packets.
  */
 void
 ixl_deferred_mq_start(void *arg, int pending)
 {
 	struct ixl_queue	*que = arg;
         struct tx_ring		*txr = &que->txr;
 	struct ixl_vsi		*vsi = que->vsi;
         struct ifnet		*ifp = vsi->ifp;
         
 	IXL_TX_LOCK(txr);
 	if (!drbr_empty(ifp, txr->br))
 		ixl_mq_start_locked(ifp, txr);
 	IXL_TX_UNLOCK(txr);
 }
 
 /*
 ** Flush all queue ring buffers
 */
 void
 ixl_qflush(struct ifnet *ifp)
 {
 	struct ixl_vsi	*vsi = ifp->if_softc;
 
         for (int i = 0; i < vsi->num_queues; i++) {
 		struct ixl_queue *que = &vsi->queues[i];
 		struct tx_ring	*txr = &que->txr;
 		struct mbuf	*m;
 		IXL_TX_LOCK(txr);
 		while ((m = buf_ring_dequeue_sc(txr->br)) != NULL)
 			m_freem(m);
 		IXL_TX_UNLOCK(txr);
 	}
 	if_qflush(ifp);
 }
 
 /*
 ** Find mbuf chains passed to the driver 
 ** that are 'sparse', using more than 8
 ** mbufs to deliver an mss-size chunk of data
 */
 static inline bool
 ixl_tso_detect_sparse(struct mbuf *mp)
 {
 	struct mbuf	*m;
 	int		num = 0, mss;
 	bool		ret = FALSE;
 
 	mss = mp->m_pkthdr.tso_segsz;
 	for (m = mp->m_next; m != NULL; m = m->m_next) {
 		num++;
 		mss -= m->m_len;
 		if (mss < 1)
 			break;
 		if (m->m_next == NULL)
 			break;
 	}
 	if (num > IXL_SPARSE_CHAIN)
 		ret = TRUE;
 
 	return (ret);
 }
 
 
 /*********************************************************************
  *
  *  This routine maps the mbufs to tx descriptors, allowing the
  *  TX engine to transmit the packets. 
  *  	- return 0 on success, positive on failure
  *
  **********************************************************************/
 #define IXL_TXD_CMD (I40E_TX_DESC_CMD_EOP | I40E_TX_DESC_CMD_RS)
 
 static int
 ixl_xmit(struct ixl_queue *que, struct mbuf **m_headp)
 {
 	struct ixl_vsi		*vsi = que->vsi;
 	struct i40e_hw		*hw = vsi->hw;
 	struct tx_ring		*txr = &que->txr;
 	struct ixl_tx_buf	*buf;
 	struct i40e_tx_desc	*txd = NULL;
 	struct mbuf		*m_head, *m;
 	int             	i, j, error, nsegs, maxsegs;
 	int			first, last = 0;
 	u16			vtag = 0;
 	u32			cmd, off;
 	bus_dmamap_t		map;
 	bus_dma_tag_t		tag;
 	bus_dma_segment_t	segs[IXL_MAX_TSO_SEGS];
 
 
 	cmd = off = 0;
 	m_head = *m_headp;
 
         /*
          * Important to capture the first descriptor
          * used because it will contain the index of
          * the one we tell the hardware to report back
          */
         first = txr->next_avail;
 	buf = &txr->buffers[first];
 	map = buf->map;
 	tag = txr->tx_tag;
 	maxsegs = IXL_MAX_TX_SEGS;
 
 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
 		/* Use larger mapping for TSO */
 		tag = txr->tso_tag;
 		maxsegs = IXL_MAX_TSO_SEGS;
 		if (ixl_tso_detect_sparse(m_head)) {
 			m = m_defrag(m_head, M_NOWAIT);
 			if (m == NULL) {
 				m_freem(*m_headp);
 				*m_headp = NULL;
 				return (ENOBUFS);
 			}
 			*m_headp = m;
 		}
 	}
 
 	/*
 	 * Map the packet for DMA.
 	 */
 	error = bus_dmamap_load_mbuf_sg(tag, map,
 	    *m_headp, segs, &nsegs, BUS_DMA_NOWAIT);
 
 	if (error == EFBIG) {
 		struct mbuf *m;
 
 		m = m_collapse(*m_headp, M_NOWAIT, maxsegs);
 		if (m == NULL) {
 			que->mbuf_defrag_failed++;
 			m_freem(*m_headp);
 			*m_headp = NULL;
 			return (ENOBUFS);
 		}
 		*m_headp = m;
 
 		/* Try it again */
 		error = bus_dmamap_load_mbuf_sg(tag, map,
 		    *m_headp, segs, &nsegs, BUS_DMA_NOWAIT);
 
 		if (error == ENOMEM) {
 			que->tx_dma_setup++;
 			return (error);
 		} else if (error != 0) {
 			que->tx_dma_setup++;
 			m_freem(*m_headp);
 			*m_headp = NULL;
 			return (error);
 		}
 	} else if (error == ENOMEM) {
 		que->tx_dma_setup++;
 		return (error);
 	} else if (error != 0) {
 		que->tx_dma_setup++;
 		m_freem(*m_headp);
 		*m_headp = NULL;
 		return (error);
 	}
 
 	/* Make certain there are enough descriptors */
 	if (nsegs > txr->avail - 2) {
 		txr->no_desc++;
 		error = ENOBUFS;
 		goto xmit_fail;
 	}
 	m_head = *m_headp;
 
 	/* Set up the TSO/CSUM offload */
 	if (m_head->m_pkthdr.csum_flags & CSUM_OFFLOAD) {
 		error = ixl_tx_setup_offload(que, m_head, &cmd, &off);
 		if (error)
 			goto xmit_fail;
 	}
 
 	cmd |= I40E_TX_DESC_CMD_ICRC;
 	/* Grab the VLAN tag */
 	if (m_head->m_flags & M_VLANTAG) {
 		cmd |= I40E_TX_DESC_CMD_IL2TAG1;
 		vtag = htole16(m_head->m_pkthdr.ether_vtag);
 	}
 
 	i = txr->next_avail;
 	for (j = 0; j < nsegs; j++) {
 		bus_size_t seglen;
 
 		buf = &txr->buffers[i];
 		buf->tag = tag; /* Keep track of the type tag */
 		txd = &txr->base[i];
 		seglen = segs[j].ds_len;
 
 		txd->buffer_addr = htole64(segs[j].ds_addr);
 		txd->cmd_type_offset_bsz =
 		    htole64(I40E_TX_DESC_DTYPE_DATA
 		    | ((u64)cmd  << I40E_TXD_QW1_CMD_SHIFT)
 		    | ((u64)off << I40E_TXD_QW1_OFFSET_SHIFT)
 		    | ((u64)seglen  << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)
 		    | ((u64)vtag  << I40E_TXD_QW1_L2TAG1_SHIFT));
 
 		last = i; /* descriptor that will get completion IRQ */
 
 		if (++i == que->num_desc)
 			i = 0;
 
 		buf->m_head = NULL;
 		buf->eop_index = -1;
 	}
 	/* Set the last descriptor for report */
 	txd->cmd_type_offset_bsz |=
 	    htole64(((u64)IXL_TXD_CMD << I40E_TXD_QW1_CMD_SHIFT));
 	txr->avail -= nsegs;
 	txr->next_avail = i;
 
 	buf->m_head = m_head;
 	/* Swap the dma map between the first and last descriptor */
 	txr->buffers[first].map = buf->map;
 	buf->map = map;
 	bus_dmamap_sync(tag, map, BUS_DMASYNC_PREWRITE);
 
         /* Set the index of the descriptor that will be marked done */
         buf = &txr->buffers[first];
 	buf->eop_index = last;
 
         bus_dmamap_sync(txr->dma.tag, txr->dma.map,
             BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	/*
 	 * Advance the Transmit Descriptor Tail (Tdt), this tells the
 	 * hardware that this frame is available to transmit.
 	 */
 	++txr->total_packets;
 	wr32(hw, txr->tail, i);
 
 	ixl_flush(hw);
 	/* Mark outstanding work */
 	if (que->busy == 0)
 		que->busy = 1;
 	return (0);
 
 xmit_fail:
 	bus_dmamap_unload(tag, buf->map);
 	return (error);
 }
 
 
 /*********************************************************************
  *
  *  Allocate memory for tx_buffer structures. The tx_buffer stores all
  *  the information needed to transmit a packet on the wire. This is
  *  called only once at attach, setup is done every reset.
  *
  **********************************************************************/
 int
 ixl_allocate_tx_data(struct ixl_queue *que)
 {
 	struct tx_ring		*txr = &que->txr;
 	struct ixl_vsi		*vsi = que->vsi;
 	device_t		dev = vsi->dev;
 	struct ixl_tx_buf	*buf;
 	int			error = 0;
 
 	/*
 	 * Setup DMA descriptor areas.
 	 */
 	if ((error = bus_dma_tag_create(NULL,		/* parent */
 			       1, 0,			/* alignment, bounds */
 			       BUS_SPACE_MAXADDR,	/* lowaddr */
 			       BUS_SPACE_MAXADDR,	/* highaddr */
 			       NULL, NULL,		/* filter, filterarg */
 			       IXL_TSO_SIZE,		/* maxsize */
 			       IXL_MAX_TX_SEGS,		/* nsegments */
 			       PAGE_SIZE,		/* maxsegsize */
 			       0,			/* flags */
 			       NULL,			/* lockfunc */
 			       NULL,			/* lockfuncarg */
 			       &txr->tx_tag))) {
 		device_printf(dev,"Unable to allocate TX DMA tag\n");
 		goto fail;
 	}
 
 	/* Make a special tag for TSO */
 	if ((error = bus_dma_tag_create(NULL,		/* parent */
 			       1, 0,			/* alignment, bounds */
 			       BUS_SPACE_MAXADDR,	/* lowaddr */
 			       BUS_SPACE_MAXADDR,	/* highaddr */
 			       NULL, NULL,		/* filter, filterarg */
 			       IXL_TSO_SIZE,		/* maxsize */
 			       IXL_MAX_TSO_SEGS,	/* nsegments */
 			       PAGE_SIZE,		/* maxsegsize */
 			       0,			/* flags */
 			       NULL,			/* lockfunc */
 			       NULL,			/* lockfuncarg */
 			       &txr->tso_tag))) {
 		device_printf(dev,"Unable to allocate TX TSO DMA tag\n");
 		goto fail;
 	}
 
 	if (!(txr->buffers =
 	    (struct ixl_tx_buf *) malloc(sizeof(struct ixl_tx_buf) *
 	    que->num_desc, M_DEVBUF, M_NOWAIT | M_ZERO))) {
 		device_printf(dev, "Unable to allocate tx_buffer memory\n");
 		error = ENOMEM;
 		goto fail;
 	}
 
         /* Create the descriptor buffer default dma maps */
 	buf = txr->buffers;
 	for (int i = 0; i < que->num_desc; i++, buf++) {
 		buf->tag = txr->tx_tag;
 		error = bus_dmamap_create(buf->tag, 0, &buf->map);
 		if (error != 0) {
 			device_printf(dev, "Unable to create TX DMA map\n");
 			goto fail;
 		}
 	}
 fail:
 	return (error);
 }
 
 
 /*********************************************************************
  *
  *  (Re)Initialize a queue transmit ring.
  *	- called by init, it clears the descriptor ring,
  *	  and frees any stale mbufs 
  *
  **********************************************************************/
 void
 ixl_init_tx_ring(struct ixl_queue *que)
 {
 	struct tx_ring *txr = &que->txr;
 	struct ixl_tx_buf *buf;
 
 	/* Clear the old ring contents */
 	IXL_TX_LOCK(txr);
 	bzero((void *)txr->base,
 	      (sizeof(struct i40e_tx_desc)) * que->num_desc);
 
 	/* Reset indices */
 	txr->next_avail = 0;
 	txr->next_to_clean = 0;
 
 #ifdef IXL_FDIR
 	/* Initialize flow director */
 	txr->atr_rate = ixl_atr_rate;
 	txr->atr_count = 0;
 #endif
 
 	/* Free any existing tx mbufs. */
         buf = txr->buffers;
 	for (int i = 0; i < que->num_desc; i++, buf++) {
 		if (buf->m_head != NULL) {
 			bus_dmamap_sync(buf->tag, buf->map,
 			    BUS_DMASYNC_POSTWRITE);
 			bus_dmamap_unload(buf->tag, buf->map);
 			m_freem(buf->m_head);
 			buf->m_head = NULL;
 		}
 		/* Clear the EOP index */
 		buf->eop_index = -1;
         }
 
 	/* Set number of descriptors available */
 	txr->avail = que->num_desc;
 
 	bus_dmamap_sync(txr->dma.tag, txr->dma.map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	IXL_TX_UNLOCK(txr);
 }
 
 
 /*********************************************************************
  *
  *  Free transmit ring related data structures.
  *
  **********************************************************************/
 void
 ixl_free_que_tx(struct ixl_queue *que)
 {
 	struct tx_ring *txr = &que->txr;
 	struct ixl_tx_buf *buf;
 
 	INIT_DBG_IF(que->vsi->ifp, "queue %d: begin", que->me);
 
 	for (int i = 0; i < que->num_desc; i++) {
 		buf = &txr->buffers[i];
 		if (buf->m_head != NULL) {
 			bus_dmamap_sync(buf->tag, buf->map,
 			    BUS_DMASYNC_POSTWRITE);
 			bus_dmamap_unload(buf->tag,
 			    buf->map);
 			m_freem(buf->m_head);
 			buf->m_head = NULL;
 			if (buf->map != NULL) {
 				bus_dmamap_destroy(buf->tag,
 				    buf->map);
 				buf->map = NULL;
 			}
 		} else if (buf->map != NULL) {
 			bus_dmamap_unload(buf->tag,
 			    buf->map);
 			bus_dmamap_destroy(buf->tag,
 			    buf->map);
 			buf->map = NULL;
 		}
 	}
 	if (txr->br != NULL)
 		buf_ring_free(txr->br, M_DEVBUF);
 	if (txr->buffers != NULL) {
 		free(txr->buffers, M_DEVBUF);
 		txr->buffers = NULL;
 	}
 	if (txr->tx_tag != NULL) {
 		bus_dma_tag_destroy(txr->tx_tag);
 		txr->tx_tag = NULL;
 	}
 	if (txr->tso_tag != NULL) {
 		bus_dma_tag_destroy(txr->tso_tag);
 		txr->tso_tag = NULL;
 	}
 
 	INIT_DBG_IF(que->vsi->ifp, "queue %d: end", que->me);
 	return;
 }
 
 /*********************************************************************
  *
  *  Setup descriptor for hw offloads 
  *
  **********************************************************************/
 
 static int
 ixl_tx_setup_offload(struct ixl_queue *que,
     struct mbuf *mp, u32 *cmd, u32 *off)
 {
 	struct ether_vlan_header	*eh;
 #ifdef INET
 	struct ip			*ip = NULL;
 #endif
 	struct tcphdr			*th = NULL;
 #ifdef INET6
 	struct ip6_hdr			*ip6;
 #endif
 	int				elen, ip_hlen = 0, tcp_hlen;
 	u16				etype;
 	u8				ipproto = 0;
 	bool				tso = FALSE;
 
 
 	/* Set up the TSO context descriptor if required */
 	if (mp->m_pkthdr.csum_flags & CSUM_TSO) {
 		tso = ixl_tso_setup(que, mp);
 		if (tso)
 			++que->tso;
 		else
 			return (ENXIO);
 	}
 
 	/*
 	 * Determine where frame payload starts.
 	 * Jump over vlan headers if already present,
 	 * helpful for QinQ too.
 	 */
 	eh = mtod(mp, struct ether_vlan_header *);
 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
 		etype = ntohs(eh->evl_proto);
 		elen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 	} else {
 		etype = ntohs(eh->evl_encap_proto);
 		elen = ETHER_HDR_LEN;
 	}
 
 	switch (etype) {
 #ifdef INET
 		case ETHERTYPE_IP:
 			ip = (struct ip *)(mp->m_data + elen);
 			ip_hlen = ip->ip_hl << 2;
 			ipproto = ip->ip_p;
 			th = (struct tcphdr *)((caddr_t)ip + ip_hlen);
 			/* The IP checksum must be recalculated with TSO */
 			if (tso)
 				*cmd |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
 			else
 				*cmd |= I40E_TX_DESC_CMD_IIPT_IPV4;
 			break;
 #endif
 #ifdef INET6
 		case ETHERTYPE_IPV6:
 			ip6 = (struct ip6_hdr *)(mp->m_data + elen);
 			ip_hlen = sizeof(struct ip6_hdr);
 			ipproto = ip6->ip6_nxt;
 			th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen);
 			*cmd |= I40E_TX_DESC_CMD_IIPT_IPV6;
 			break;
 #endif
 		default:
 			break;
 	}
 
 	*off |= (elen >> 1) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
 	*off |= (ip_hlen >> 2) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
 
 	switch (ipproto) {
 		case IPPROTO_TCP:
 			tcp_hlen = th->th_off << 2;
 			if (mp->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) {
 				*cmd |= I40E_TX_DESC_CMD_L4T_EOFT_TCP;
 				*off |= (tcp_hlen >> 2) <<
 				    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
 			}
 #ifdef IXL_FDIR
 			ixl_atr(que, th, etype);
 #endif
 			break;
 		case IPPROTO_UDP:
 			if (mp->m_pkthdr.csum_flags & (CSUM_UDP|CSUM_UDP_IPV6)) {
 				*cmd |= I40E_TX_DESC_CMD_L4T_EOFT_UDP;
 				*off |= (sizeof(struct udphdr) >> 2) <<
 				    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
 			}
 			break;
 
 		case IPPROTO_SCTP:
 			if (mp->m_pkthdr.csum_flags & (CSUM_SCTP|CSUM_SCTP_IPV6)) {
 				*cmd |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
 				*off |= (sizeof(struct sctphdr) >> 2) <<
 				    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
 			}
 			/* Fall Thru */
 		default:
 			break;
 	}
 
         return (0);
 }
 
 
 /**********************************************************************
  *
  *  Setup context for hardware segmentation offload (TSO)
  *
  **********************************************************************/
 static bool
 ixl_tso_setup(struct ixl_queue *que, struct mbuf *mp)
 {
 	struct tx_ring			*txr = &que->txr;
 	struct i40e_tx_context_desc	*TXD;
 	struct ixl_tx_buf		*buf;
 	u32				cmd, mss, type, tsolen;
 	u16				etype;
 	int				idx, elen, ip_hlen, tcp_hlen;
 	struct ether_vlan_header	*eh;
 #ifdef INET
 	struct ip			*ip;
 #endif
 #ifdef INET6
 	struct ip6_hdr			*ip6;
 #endif
 #if defined(INET6) || defined(INET)
 	struct tcphdr			*th;
 #endif
 	u64				type_cmd_tso_mss;
 
 	/*
 	 * Determine where frame payload starts.
 	 * Jump over vlan headers if already present
 	 */
 	eh = mtod(mp, struct ether_vlan_header *);
 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
 		elen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 		etype = eh->evl_proto;
 	} else {
 		elen = ETHER_HDR_LEN;
 		etype = eh->evl_encap_proto;
 	}
 
         switch (ntohs(etype)) {
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 		ip6 = (struct ip6_hdr *)(mp->m_data + elen);
 		if (ip6->ip6_nxt != IPPROTO_TCP)
 			return (ENXIO);
 		ip_hlen = sizeof(struct ip6_hdr);
 		th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen);
 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
 		tcp_hlen = th->th_off << 2;
 		break;
 #endif
 #ifdef INET
 	case ETHERTYPE_IP:
 		ip = (struct ip *)(mp->m_data + elen);
 		if (ip->ip_p != IPPROTO_TCP)
 			return (ENXIO);
 		ip->ip_sum = 0;
 		ip_hlen = ip->ip_hl << 2;
 		th = (struct tcphdr *)((caddr_t)ip + ip_hlen);
 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
 		tcp_hlen = th->th_off << 2;
 		break;
 #endif
 	default:
 		printf("%s: CSUM_TSO but no supported IP version (0x%04x)",
 		    __func__, ntohs(etype));
 		return FALSE;
         }
 
         /* Ensure we have at least the IP+TCP header in the first mbuf. */
         if (mp->m_len < elen + ip_hlen + sizeof(struct tcphdr))
 		return FALSE;
 
 	idx = txr->next_avail;
 	buf = &txr->buffers[idx];
 	TXD = (struct i40e_tx_context_desc *) &txr->base[idx];
 	tsolen = mp->m_pkthdr.len - (elen + ip_hlen + tcp_hlen);
 
 	type = I40E_TX_DESC_DTYPE_CONTEXT;
 	cmd = I40E_TX_CTX_DESC_TSO;
 	mss = mp->m_pkthdr.tso_segsz;
 
 	type_cmd_tso_mss = ((u64)type << I40E_TXD_CTX_QW1_DTYPE_SHIFT) |
 	    ((u64)cmd << I40E_TXD_CTX_QW1_CMD_SHIFT) |
 	    ((u64)tsolen << I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) |
 	    ((u64)mss << I40E_TXD_CTX_QW1_MSS_SHIFT);
 	TXD->type_cmd_tso_mss = htole64(type_cmd_tso_mss);
 
 	TXD->tunneling_params = htole32(0);
 	buf->m_head = NULL;
 	buf->eop_index = -1;
 
 	if (++idx == que->num_desc)
 		idx = 0;
 
 	txr->avail--;
 	txr->next_avail = idx;
 
 	return TRUE;
 }
 
 /*             
 ** ixl_get_tx_head - Retrieve the value from the 
 **    location the HW records its HEAD index
 */
 static inline u32
 ixl_get_tx_head(struct ixl_queue *que)
 {
 	struct tx_ring  *txr = &que->txr;
 	void *head = &txr->base[que->num_desc];
 	return LE32_TO_CPU(*(volatile __le32 *)head);
 }
 
 /**********************************************************************
  *
  *  Examine each tx_buffer in the used queue. If the hardware is done
  *  processing the packet then free associated resources. The
  *  tx_buffer is put back on the free queue.
  *
  **********************************************************************/
 bool
 ixl_txeof(struct ixl_queue *que)
 {
 	struct tx_ring		*txr = &que->txr;
 	u32			first, last, head, done, processed;
 	struct ixl_tx_buf	*buf;
 	struct i40e_tx_desc	*tx_desc, *eop_desc;
 
 
 	mtx_assert(&txr->mtx, MA_OWNED);
 
 
 	/* These are not the descriptors you seek, move along :) */
 	if (txr->avail == que->num_desc) {
 		que->busy = 0;
 		return FALSE;
 	}
 
 	processed = 0;
 	first = txr->next_to_clean;
 	buf = &txr->buffers[first];
 	tx_desc = (struct i40e_tx_desc *)&txr->base[first];
 	last = buf->eop_index;
 	if (last == -1)
 		return FALSE;
 	eop_desc = (struct i40e_tx_desc *)&txr->base[last];
 
 	/* Get the Head WB value */
 	head = ixl_get_tx_head(que);
 
 	/*
 	** Get the index of the first descriptor
 	** BEYOND the EOP and call that 'done'.
 	** I do this so the comparison in the
 	** inner while loop below can be simple
 	*/
 	if (++last == que->num_desc) last = 0;
 	done = last;
 
         bus_dmamap_sync(txr->dma.tag, txr->dma.map,
             BUS_DMASYNC_POSTREAD);
 	/*
 	** The HEAD index of the ring is written in a 
 	** defined location, this rather than a done bit
 	** is what is used to keep track of what must be
 	** 'cleaned'.
 	*/
 	while (first != head) {
 		/* We clean the range of the packet */
 		while (first != done) {
 			++txr->avail;
 			++processed;
 
 			if (buf->m_head) {
 				txr->bytes += /* for ITR adjustment */
 				    buf->m_head->m_pkthdr.len;
 				txr->tx_bytes += /* for TX stats */
 				    buf->m_head->m_pkthdr.len;
 				bus_dmamap_sync(buf->tag,
 				    buf->map,
 				    BUS_DMASYNC_POSTWRITE);
 				bus_dmamap_unload(buf->tag,
 				    buf->map);
 				m_freem(buf->m_head);
 				buf->m_head = NULL;
 				buf->map = NULL;
 			}
 			buf->eop_index = -1;
 
 			if (++first == que->num_desc)
 				first = 0;
 
 			buf = &txr->buffers[first];
 			tx_desc = &txr->base[first];
 		}
 		++txr->packets;
 		/* See if there is more work now */
 		last = buf->eop_index;
 		if (last != -1) {
 			eop_desc = &txr->base[last];
 			/* Get next done point */
 			if (++last == que->num_desc) last = 0;
 			done = last;
 		} else
 			break;
 	}
 	bus_dmamap_sync(txr->dma.tag, txr->dma.map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 
 	txr->next_to_clean = first;
 
 
 	/*
 	** Hang detection, we know there's
 	** work outstanding or the first return
 	** would have been taken, so indicate an
 	** unsuccessful pass, in local_timer if
 	** the value is too great the queue will
 	** be considered hung. If anything has been
 	** cleaned then reset the state.
 	*/
 	if ((processed == 0) && (que->busy != IXL_QUEUE_HUNG))
 		++que->busy;
 
 	if (processed)
 		que->busy = 1; /* Note this turns off HUNG */
 
 	/*
 	 * If there are no pending descriptors, clear the timeout.
 	 */
 	if (txr->avail == que->num_desc) {
 		que->busy = 0;
 		return FALSE;
 	}
 
 	return TRUE;
 }
 
 /*********************************************************************
  *
  *  Refresh mbuf buffers for RX descriptor rings
  *   - now keeps its own state so discards due to resource
  *     exhaustion are unnecessary, if an mbuf cannot be obtained
  *     it just returns, keeping its placeholder, thus it can simply
  *     be recalled to try again.
  *
  **********************************************************************/
 static void
 ixl_refresh_mbufs(struct ixl_queue *que, int limit)
 {
 	struct ixl_vsi		*vsi = que->vsi;
 	struct rx_ring		*rxr = &que->rxr;
 	bus_dma_segment_t	hseg[1];
 	bus_dma_segment_t	pseg[1];
 	struct ixl_rx_buf	*buf;
 	struct mbuf		*mh, *mp;
 	int			i, j, nsegs, error;
 	bool			refreshed = FALSE;
 
 	i = j = rxr->next_refresh;
 	/* Control the loop with one beyond */
 	if (++j == que->num_desc)
 		j = 0;
 
 	while (j != limit) {
 		buf = &rxr->buffers[i];
 		if (rxr->hdr_split == FALSE)
 			goto no_split;
 
 		if (buf->m_head == NULL) {
 			mh = m_gethdr(M_NOWAIT, MT_DATA);
 			if (mh == NULL)
 				goto update;
 		} else
 			mh = buf->m_head;
 
 		mh->m_pkthdr.len = mh->m_len = MHLEN;
 		mh->m_len = MHLEN;
 		mh->m_flags |= M_PKTHDR;
 		/* Get the memory mapping */
 		error = bus_dmamap_load_mbuf_sg(rxr->htag,
 		    buf->hmap, mh, hseg, &nsegs, BUS_DMA_NOWAIT);
 		if (error != 0) {
 			printf("Refresh mbufs: hdr dmamap load"
 			    " failure - %d\n", error);
 			m_free(mh);
 			buf->m_head = NULL;
 			goto update;
 		}
 		buf->m_head = mh;
 		bus_dmamap_sync(rxr->htag, buf->hmap,
 		    BUS_DMASYNC_PREREAD);
 		rxr->base[i].read.hdr_addr =
 		   htole64(hseg[0].ds_addr);
 
 no_split:
 		if (buf->m_pack == NULL) {
 			mp = m_getjcl(M_NOWAIT, MT_DATA,
 			    M_PKTHDR, rxr->mbuf_sz);
 			if (mp == NULL)
 				goto update;
 		} else
 			mp = buf->m_pack;
 
 		mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz;
 		/* Get the memory mapping */
 		error = bus_dmamap_load_mbuf_sg(rxr->ptag,
 		    buf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT);
 		if (error != 0) {
 			printf("Refresh mbufs: payload dmamap load"
 			    " failure - %d\n", error);
 			m_free(mp);
 			buf->m_pack = NULL;
 			goto update;
 		}
 		buf->m_pack = mp;
 		bus_dmamap_sync(rxr->ptag, buf->pmap,
 		    BUS_DMASYNC_PREREAD);
 		rxr->base[i].read.pkt_addr =
 		   htole64(pseg[0].ds_addr);
 		/* Used only when doing header split */
 		rxr->base[i].read.hdr_addr = 0;
 
 		refreshed = TRUE;
 		/* Next is precalculated */
 		i = j;
 		rxr->next_refresh = i;
 		if (++j == que->num_desc)
 			j = 0;
 	}
 update:
 	if (refreshed) /* Update hardware tail index */
 		wr32(vsi->hw, rxr->tail, rxr->next_refresh);
 	return;
 }
 
 
 /*********************************************************************
  *
  *  Allocate memory for rx_buffer structures. Since we use one
  *  rx_buffer per descriptor, the maximum number of rx_buffer's
  *  that we'll need is equal to the number of receive descriptors
  *  that we've defined.
  *
  **********************************************************************/
 int
 ixl_allocate_rx_data(struct ixl_queue *que)
 {
 	struct rx_ring		*rxr = &que->rxr;
 	struct ixl_vsi		*vsi = que->vsi;
 	device_t 		dev = vsi->dev;
 	struct ixl_rx_buf 	*buf;
 	int             	i, bsize, error;
 
 	bsize = sizeof(struct ixl_rx_buf) * que->num_desc;
 	if (!(rxr->buffers =
 	    (struct ixl_rx_buf *) malloc(bsize,
 	    M_DEVBUF, M_NOWAIT | M_ZERO))) {
 		device_printf(dev, "Unable to allocate rx_buffer memory\n");
 		error = ENOMEM;
 		return (error);
 	}
 
 	if ((error = bus_dma_tag_create(NULL,	/* parent */
 				   1, 0,	/* alignment, bounds */
 				   BUS_SPACE_MAXADDR,	/* lowaddr */
 				   BUS_SPACE_MAXADDR,	/* highaddr */
 				   NULL, NULL,		/* filter, filterarg */
 				   MSIZE,		/* maxsize */
 				   1,			/* nsegments */
 				   MSIZE,		/* maxsegsize */
 				   0,			/* flags */
 				   NULL,		/* lockfunc */
 				   NULL,		/* lockfuncarg */
 				   &rxr->htag))) {
 		device_printf(dev, "Unable to create RX DMA htag\n");
 		return (error);
 	}
 
 	if ((error = bus_dma_tag_create(NULL,	/* parent */
 				   1, 0,	/* alignment, bounds */
 				   BUS_SPACE_MAXADDR,	/* lowaddr */
 				   BUS_SPACE_MAXADDR,	/* highaddr */
 				   NULL, NULL,		/* filter, filterarg */
 				   MJUM16BYTES,		/* maxsize */
 				   1,			/* nsegments */
 				   MJUM16BYTES,		/* maxsegsize */
 				   0,			/* flags */
 				   NULL,		/* lockfunc */
 				   NULL,		/* lockfuncarg */
 				   &rxr->ptag))) {
 		device_printf(dev, "Unable to create RX DMA ptag\n");
 		return (error);
 	}
 
 	for (i = 0; i < que->num_desc; i++) {
 		buf = &rxr->buffers[i];
 		error = bus_dmamap_create(rxr->htag,
 		    BUS_DMA_NOWAIT, &buf->hmap);
 		if (error) {
 			device_printf(dev, "Unable to create RX head map\n");
 			break;
 		}
 		error = bus_dmamap_create(rxr->ptag,
 		    BUS_DMA_NOWAIT, &buf->pmap);
 		if (error) {
 			device_printf(dev, "Unable to create RX pkt map\n");
 			break;
 		}
 	}
 
 	return (error);
 }
 
 
 /*********************************************************************
  *
  *  (Re)Initialize the queue receive ring and its buffers.
  *
  **********************************************************************/
 int
 ixl_init_rx_ring(struct ixl_queue *que)
 {
 	struct	rx_ring 	*rxr = &que->rxr;
-#if defined(INET6) || defined(INET)
 	struct ixl_vsi		*vsi = que->vsi;
+#if defined(INET6) || defined(INET)
 	struct ifnet		*ifp = vsi->ifp;
 	struct lro_ctrl		*lro = &rxr->lro;
 #endif
 	struct ixl_rx_buf	*buf;
 	bus_dma_segment_t	pseg[1], hseg[1];
 	int			rsize, nsegs, error = 0;
 
 	IXL_RX_LOCK(rxr);
 	/* Clear the ring contents */
 	rsize = roundup2(que->num_desc *
 	    sizeof(union i40e_rx_desc), DBA_ALIGN);
 	bzero((void *)rxr->base, rsize);
 	/* Cleanup any existing buffers */
 	for (int i = 0; i < que->num_desc; i++) {
 		buf = &rxr->buffers[i];
 		if (buf->m_head != NULL) {
 			bus_dmamap_sync(rxr->htag, buf->hmap,
 			    BUS_DMASYNC_POSTREAD);
 			bus_dmamap_unload(rxr->htag, buf->hmap);
 			buf->m_head->m_flags |= M_PKTHDR;
 			m_freem(buf->m_head);
 		}
 		if (buf->m_pack != NULL) {
 			bus_dmamap_sync(rxr->ptag, buf->pmap,
 			    BUS_DMASYNC_POSTREAD);
 			bus_dmamap_unload(rxr->ptag, buf->pmap);
 			buf->m_pack->m_flags |= M_PKTHDR;
 			m_freem(buf->m_pack);
 		}
 		buf->m_head = NULL;
 		buf->m_pack = NULL;
 	}
 
 	/* header split is off */
 	rxr->hdr_split = FALSE;
 
 	/* Now replenish the mbufs */
 	for (int j = 0; j != que->num_desc; ++j) {
 		struct mbuf	*mh, *mp;
 
 		buf = &rxr->buffers[j];
 		/*
 		** Don't allocate mbufs if not
 		** doing header split, its wasteful
 		*/ 
 		if (rxr->hdr_split == FALSE)
 			goto skip_head;
 
 		/* First the header */
 		buf->m_head = m_gethdr(M_NOWAIT, MT_DATA);
 		if (buf->m_head == NULL) {
 			error = ENOBUFS;
 			goto fail;
 		}
 		m_adj(buf->m_head, ETHER_ALIGN);
 		mh = buf->m_head;
 		mh->m_len = mh->m_pkthdr.len = MHLEN;
 		mh->m_flags |= M_PKTHDR;
 		/* Get the memory mapping */
 		error = bus_dmamap_load_mbuf_sg(rxr->htag,
 		    buf->hmap, buf->m_head, hseg,
 		    &nsegs, BUS_DMA_NOWAIT);
 		if (error != 0) /* Nothing elegant to do here */
 			goto fail;
 		bus_dmamap_sync(rxr->htag,
 		    buf->hmap, BUS_DMASYNC_PREREAD);
 		/* Update descriptor */
 		rxr->base[j].read.hdr_addr = htole64(hseg[0].ds_addr);
 
 skip_head:
 		/* Now the payload cluster */
 		buf->m_pack = m_getjcl(M_NOWAIT, MT_DATA,
 		    M_PKTHDR, rxr->mbuf_sz);
 		if (buf->m_pack == NULL) {
 			error = ENOBUFS;
                         goto fail;
 		}
 		mp = buf->m_pack;
 		mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz;
 		/* Get the memory mapping */
 		error = bus_dmamap_load_mbuf_sg(rxr->ptag,
 		    buf->pmap, mp, pseg,
 		    &nsegs, BUS_DMA_NOWAIT);
 		if (error != 0)
                         goto fail;
 		bus_dmamap_sync(rxr->ptag,
 		    buf->pmap, BUS_DMASYNC_PREREAD);
 		/* Update descriptor */
 		rxr->base[j].read.pkt_addr = htole64(pseg[0].ds_addr);
 		rxr->base[j].read.hdr_addr = 0;
 	}
 
 
 	/* Setup our descriptor indices */
 	rxr->next_check = 0;
 	rxr->next_refresh = 0;
 	rxr->lro_enabled = FALSE;
 	rxr->split = 0;
 	rxr->bytes = 0;
 	rxr->discard = FALSE;
 
 	wr32(vsi->hw, rxr->tail, que->num_desc - 1);
 	ixl_flush(vsi->hw);
 
 #if defined(INET6) || defined(INET)
 	/*
 	** Now set up the LRO interface:
 	*/
 	if (ifp->if_capenable & IFCAP_LRO) {
 		int err = tcp_lro_init(lro);
 		if (err) {
 			if_printf(ifp, "queue %d: LRO Initialization failed!\n", que->me);
 			goto fail;
 		}
 		INIT_DBG_IF(ifp, "queue %d: RX Soft LRO Initialized", que->me);
 		rxr->lro_enabled = TRUE;
 		lro->ifp = vsi->ifp;
 	}
 #endif
 
 	bus_dmamap_sync(rxr->dma.tag, rxr->dma.map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 
 fail:
 	IXL_RX_UNLOCK(rxr);
 	return (error);
 }
 
 
 /*********************************************************************
  *
  *  Free station receive ring data structures
  *
  **********************************************************************/
 void
 ixl_free_que_rx(struct ixl_queue *que)
 {
 	struct rx_ring		*rxr = &que->rxr;
 	struct ixl_rx_buf	*buf;
 
 	INIT_DBG_IF(que->vsi->ifp, "queue %d: begin", que->me);
 
 	/* Cleanup any existing buffers */
 	if (rxr->buffers != NULL) {
 		for (int i = 0; i < que->num_desc; i++) {
 			buf = &rxr->buffers[i];
 			if (buf->m_head != NULL) {
 				bus_dmamap_sync(rxr->htag, buf->hmap,
 				    BUS_DMASYNC_POSTREAD);
 				bus_dmamap_unload(rxr->htag, buf->hmap);
 				buf->m_head->m_flags |= M_PKTHDR;
 				m_freem(buf->m_head);
 			}
 			if (buf->m_pack != NULL) {
 				bus_dmamap_sync(rxr->ptag, buf->pmap,
 				    BUS_DMASYNC_POSTREAD);
 				bus_dmamap_unload(rxr->ptag, buf->pmap);
 				buf->m_pack->m_flags |= M_PKTHDR;
 				m_freem(buf->m_pack);
 			}
 			buf->m_head = NULL;
 			buf->m_pack = NULL;
 			if (buf->hmap != NULL) {
 				bus_dmamap_destroy(rxr->htag, buf->hmap);
 				buf->hmap = NULL;
 			}
 			if (buf->pmap != NULL) {
 				bus_dmamap_destroy(rxr->ptag, buf->pmap);
 				buf->pmap = NULL;
 			}
 		}
 		if (rxr->buffers != NULL) {
 			free(rxr->buffers, M_DEVBUF);
 			rxr->buffers = NULL;
 		}
 	}
 
 	if (rxr->htag != NULL) {
 		bus_dma_tag_destroy(rxr->htag);
 		rxr->htag = NULL;
 	}
 	if (rxr->ptag != NULL) {
 		bus_dma_tag_destroy(rxr->ptag);
 		rxr->ptag = NULL;
 	}
 
 	INIT_DBG_IF(que->vsi->ifp, "queue %d: end", que->me);
 	return;
 }
 
 static __inline void
 ixl_rx_input(struct rx_ring *rxr, struct ifnet *ifp, struct mbuf *m, u8 ptype)
 {
 
 #if defined(INET6) || defined(INET)
         /*
          * ATM LRO is only for IPv4/TCP packets and TCP checksum of the packet
          * should be computed by hardware. Also it should not have VLAN tag in
          * ethernet header.
          */
         if (rxr->lro_enabled &&
             (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0 &&
             (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) ==
             (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) {
                 /*
                  * Send to the stack if:
                  **  - LRO not enabled, or
                  **  - no LRO resources, or
                  **  - lro enqueue fails
                  */
                 if (rxr->lro.lro_cnt != 0)
                         if (tcp_lro_rx(&rxr->lro, m, 0) == 0)
                                 return;
         }
 #endif
 	IXL_RX_UNLOCK(rxr);
         (*ifp->if_input)(ifp, m);
 	IXL_RX_LOCK(rxr);
 }
 
 
 static __inline void
 ixl_rx_discard(struct rx_ring *rxr, int i)
 {
 	struct ixl_rx_buf	*rbuf;
 
 	rbuf = &rxr->buffers[i];
 
         if (rbuf->fmp != NULL) {/* Partial chain ? */
 		rbuf->fmp->m_flags |= M_PKTHDR;
                 m_freem(rbuf->fmp);
                 rbuf->fmp = NULL;
 	}
 
 	/*
 	** With advanced descriptors the writeback
 	** clobbers the buffer addrs, so its easier
 	** to just free the existing mbufs and take
 	** the normal refresh path to get new buffers
 	** and mapping.
 	*/
 	if (rbuf->m_head) {
 		m_free(rbuf->m_head);
 		rbuf->m_head = NULL;
 	}
  
 	if (rbuf->m_pack) {
 		m_free(rbuf->m_pack);
 		rbuf->m_pack = NULL;
 	}
 
 	return;
 }
 
 
 /*********************************************************************
  *
  *  This routine executes in interrupt context. It replenishes
  *  the mbufs in the descriptor and sends data which has been
  *  dma'ed into host memory to upper layer.
  *
  *  We loop at most count times if count is > 0, or until done if
  *  count < 0.
  *
  *  Return TRUE for more work, FALSE for all clean.
  *********************************************************************/
 bool
 ixl_rxeof(struct ixl_queue *que, int count)
 {
 	struct ixl_vsi		*vsi = que->vsi;
 	struct rx_ring		*rxr = &que->rxr;
 	struct ifnet		*ifp = vsi->ifp;
 #if defined(INET6) || defined(INET)
 	struct lro_ctrl		*lro = &rxr->lro;
 	struct lro_entry	*queued;
 #endif
 	int			i, nextp, processed = 0;
 	union i40e_rx_desc	*cur;
 	struct ixl_rx_buf	*rbuf, *nbuf;
 
 
 	IXL_RX_LOCK(rxr);
 
 
 	for (i = rxr->next_check; count != 0;) {
 		struct mbuf	*sendmp, *mh, *mp;
 		u32		rsc, status, error;
 		u16		hlen, plen, vtag;
 		u64		qword;
 		u8		ptype;
 		bool		eop;
  
 		/* Sync the ring. */
 		bus_dmamap_sync(rxr->dma.tag, rxr->dma.map,
 		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 
 		cur = &rxr->base[i];
 		qword = le64toh(cur->wb.qword1.status_error_len);
 		status = (qword & I40E_RXD_QW1_STATUS_MASK)
 		    >> I40E_RXD_QW1_STATUS_SHIFT;
 		error = (qword & I40E_RXD_QW1_ERROR_MASK)
 		    >> I40E_RXD_QW1_ERROR_SHIFT;
 		plen = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK)
 		    >> I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
 		hlen = (qword & I40E_RXD_QW1_LENGTH_HBUF_MASK)
 		    >> I40E_RXD_QW1_LENGTH_HBUF_SHIFT;
 		ptype = (qword & I40E_RXD_QW1_PTYPE_MASK)
 		    >> I40E_RXD_QW1_PTYPE_SHIFT;
 
 		if ((status & (1 << I40E_RX_DESC_STATUS_DD_SHIFT)) == 0) {
 			++rxr->not_done;
 			break;
 		}
 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 			break;
 
 		count--;
 		sendmp = NULL;
 		nbuf = NULL;
 		rsc = 0;
 		cur->wb.qword1.status_error_len = 0;
 		rbuf = &rxr->buffers[i];
 		mh = rbuf->m_head;
 		mp = rbuf->m_pack;
 		eop = (status & (1 << I40E_RX_DESC_STATUS_EOF_SHIFT));
 		if (status & (1 << I40E_RX_DESC_STATUS_L2TAG1P_SHIFT))
 			vtag = le16toh(cur->wb.qword0.lo_dword.l2tag1);
 		else
 			vtag = 0;
 
 		/*
 		** Make sure bad packets are discarded,
 		** note that only EOP descriptor has valid
 		** error results.
 		*/
                 if (eop && (error & (1 << I40E_RX_DESC_ERROR_RXE_SHIFT))) {
 			rxr->discarded++;
 			ixl_rx_discard(rxr, i);
 			goto next_desc;
 		}
 
 		/* Prefetch the next buffer */
 		if (!eop) {
 			nextp = i + 1;
 			if (nextp == que->num_desc)
 				nextp = 0;
 			nbuf = &rxr->buffers[nextp];
 			prefetch(nbuf);
 		}
 
 		/*
 		** The header mbuf is ONLY used when header 
 		** split is enabled, otherwise we get normal 
 		** behavior, ie, both header and payload
 		** are DMA'd into the payload buffer.
 		**
 		** Rather than using the fmp/lmp global pointers
 		** we now keep the head of a packet chain in the
 		** buffer struct and pass this along from one
 		** descriptor to the next, until we get EOP.
 		*/
 		if (rxr->hdr_split && (rbuf->fmp == NULL)) {
 			if (hlen > IXL_RX_HDR)
 				hlen = IXL_RX_HDR;
 			mh->m_len = hlen;
 			mh->m_flags |= M_PKTHDR;
 			mh->m_next = NULL;
 			mh->m_pkthdr.len = mh->m_len;
 			/* Null buf pointer so it is refreshed */
 			rbuf->m_head = NULL;
 			/*
 			** Check the payload length, this
 			** could be zero if its a small
 			** packet.
 			*/
 			if (plen > 0) {
 				mp->m_len = plen;
 				mp->m_next = NULL;
 				mp->m_flags &= ~M_PKTHDR;
 				mh->m_next = mp;
 				mh->m_pkthdr.len += mp->m_len;
 				/* Null buf pointer so it is refreshed */
 				rbuf->m_pack = NULL;
 				rxr->split++;
 			}
 			/*
 			** Now create the forward
 			** chain so when complete 
 			** we wont have to.
 			*/
                         if (eop == 0) {
 				/* stash the chain head */
                                 nbuf->fmp = mh;
 				/* Make forward chain */
                                 if (plen)
                                         mp->m_next = nbuf->m_pack;
                                 else
                                         mh->m_next = nbuf->m_pack;
                         } else {
 				/* Singlet, prepare to send */
                                 sendmp = mh;
                                 if (vtag) {
                                         sendmp->m_pkthdr.ether_vtag = vtag;
                                         sendmp->m_flags |= M_VLANTAG;
                                 }
                         }
 		} else {
 			/*
 			** Either no header split, or a
 			** secondary piece of a fragmented
 			** split packet.
 			*/
 			mp->m_len = plen;
 			/*
 			** See if there is a stored head
 			** that determines what we are
 			*/
 			sendmp = rbuf->fmp;
 			rbuf->m_pack = rbuf->fmp = NULL;
 
 			if (sendmp != NULL) /* secondary frag */
 				sendmp->m_pkthdr.len += mp->m_len;
 			else {
 				/* first desc of a non-ps chain */
 				sendmp = mp;
 				sendmp->m_flags |= M_PKTHDR;
 				sendmp->m_pkthdr.len = mp->m_len;
 				if (vtag) {
 					sendmp->m_pkthdr.ether_vtag = vtag;
 					sendmp->m_flags |= M_VLANTAG;
 				}
                         }
 			/* Pass the head pointer on */
 			if (eop == 0) {
 				nbuf->fmp = sendmp;
 				sendmp = NULL;
 				mp->m_next = nbuf->m_pack;
 			}
 		}
 		++processed;
 		/* Sending this frame? */
 		if (eop) {
 			sendmp->m_pkthdr.rcvif = ifp;
 			/* gather stats */
 			rxr->rx_packets++;
 			rxr->rx_bytes += sendmp->m_pkthdr.len;
 			/* capture data for dynamic ITR adjustment */
 			rxr->packets++;
 			rxr->bytes += sendmp->m_pkthdr.len;
 			if ((ifp->if_capenable & IFCAP_RXCSUM) != 0)
 				ixl_rx_checksum(sendmp, status, error, ptype);
 			sendmp->m_pkthdr.flowid = que->msix;
 			sendmp->m_flags |= M_FLOWID;
 		}
 next_desc:
 		bus_dmamap_sync(rxr->dma.tag, rxr->dma.map,
 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 
 		/* Advance our pointers to the next descriptor. */
 		if (++i == que->num_desc)
 			i = 0;
 
 		/* Now send to the stack or do LRO */
 		if (sendmp != NULL) {
 			rxr->next_check = i;
 			ixl_rx_input(rxr, ifp, sendmp, ptype);
 			i = rxr->next_check;
 		}
 
                /* Every 8 descriptors we go to refresh mbufs */
 		if (processed == 8) {
 			ixl_refresh_mbufs(que, i);
 			processed = 0;
 		}
 	}
 
 	/* Refresh any remaining buf structs */
 	if (ixl_rx_unrefreshed(que))
 		ixl_refresh_mbufs(que, i);
 
 	rxr->next_check = i;
 
 #if defined(INET6) || defined(INET)
 	/*
 	 * Flush any outstanding LRO work
 	 */
 	while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
 		SLIST_REMOVE_HEAD(&lro->lro_active, next);
 		tcp_lro_flush(lro, queued);
 	}
 #endif
 
 	IXL_RX_UNLOCK(rxr);
 	return (FALSE);
 }
 
 
 /*********************************************************************
  *
  *  Verify that the hardware indicated that the checksum is valid.
  *  Inform the stack about the status of checksum so that stack
  *  doesn't spend time verifying the checksum.
  *
  *********************************************************************/
 static void
 ixl_rx_checksum(struct mbuf * mp, u32 status, u32 error, u8 ptype)
 {
 	struct i40e_rx_ptype_decoded decoded;
 
 	decoded = decode_rx_desc_ptype(ptype);
 
 	/* Errors? */
  	if (error & ((1 << I40E_RX_DESC_ERROR_IPE_SHIFT) |
 	    (1 << I40E_RX_DESC_ERROR_L4E_SHIFT))) {
 		mp->m_pkthdr.csum_flags = 0;
 		return;
 	}
 
 	/* IPv6 with extension headers likely have bad csum */
 	if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
 	    decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6)
 		if (status &
 		    (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT)) {
 			mp->m_pkthdr.csum_flags = 0;
 			return;
 		}
 
  
 	/* IP Checksum Good */
 	mp->m_pkthdr.csum_flags = CSUM_IP_CHECKED;
 	mp->m_pkthdr.csum_flags |= CSUM_IP_VALID;
 
 	if (status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)) {
 		mp->m_pkthdr.csum_flags |= 
 		    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
 		mp->m_pkthdr.csum_data |= htons(0xffff);
 	}
 	return;
 }
 
 #if __FreeBSD_version >= 1100000
 uint64_t
 ixl_get_counter(if_t ifp, ift_counter cnt)
 {
 	struct ixl_vsi *vsi;
 
 	vsi = if_getsoftc(ifp);
 
 	switch (cnt) {
 	case IFCOUNTER_IPACKETS:
 		return (vsi->ipackets);
 	case IFCOUNTER_IERRORS:
 		return (vsi->ierrors);
 	case IFCOUNTER_OPACKETS:
 		return (vsi->opackets);
 	case IFCOUNTER_OERRORS:
 		return (vsi->oerrors);
 	case IFCOUNTER_COLLISIONS:
 		/* Collisions are by standard impossible in 40G/10G Ethernet */
 		return (0);
 	case IFCOUNTER_IBYTES:
 		return (vsi->ibytes);
 	case IFCOUNTER_OBYTES:
 		return (vsi->obytes);
 	case IFCOUNTER_IMCASTS:
 		return (vsi->imcasts);
 	case IFCOUNTER_OMCASTS:
 		return (vsi->omcasts);
 	case IFCOUNTER_IQDROPS:
 		return (vsi->iqdrops);
 	case IFCOUNTER_OQDROPS:
 		return (vsi->oqdrops);
 	case IFCOUNTER_NOPROTO:
 		return (vsi->noproto);
 	default:
 		return (if_get_counter_default(ifp, cnt));
 	}
 }
 #endif
 
Index: user/dchagin/lemul/sys/dev/usb/serial/u3g.c
===================================================================
--- user/dchagin/lemul/sys/dev/usb/serial/u3g.c	(revision 274240)
+++ user/dchagin/lemul/sys/dev/usb/serial/u3g.c	(revision 274241)
@@ -1,1211 +1,1213 @@
 /*
  * Copyright (c) 2008 AnyWi Technologies
  * Author: Andrea Guzzo <aguzzo@anywi.com>
  * * based on uark.c 1.1 2006/08/14 08:30:22 jsg *
  * * parts from ubsa.c 183348 2008-09-25 12:00:56Z phk *
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
  * copyright notice and this permission notice appear in all copies.
  *
  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  *
  * $FreeBSD$
  */
 
 /*
  * NOTE:
  *
  * - The detour through the tty layer is ridiculously expensive wrt
  *   buffering due to the high speeds.
  *
  *   We should consider adding a simple r/w device which allows
  *   attaching of PPP in a more efficient way.
  *
  */
 
 
 #include <sys/stdint.h>
 #include <sys/stddef.h>
 #include <sys/param.h>
 #include <sys/queue.h>
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
 #include <sys/module.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/condvar.h>
 #include <sys/sysctl.h>
 #include <sys/sx.h>
 #include <sys/unistd.h>
 #include <sys/callout.h>
 #include <sys/malloc.h>
 #include <sys/priv.h>
 
 #include <dev/usb/usb.h>
 #include <dev/usb/usbdi.h>
 #include <dev/usb/usbdi_util.h>
 #include <dev/usb/usb_cdc.h>
 #include "usbdevs.h"
 
 #define	USB_DEBUG_VAR u3g_debug
 #include <dev/usb/usb_debug.h>
 #include <dev/usb/usb_process.h>
 #include <dev/usb/usb_msctest.h>
 
 #include <dev/usb/serial/usb_serial.h>
 #include <dev/usb/quirk/usb_quirk.h>
 
 #ifdef USB_DEBUG
 static int u3g_debug = 0;
 
 static SYSCTL_NODE(_hw_usb, OID_AUTO, u3g, CTLFLAG_RW, 0, "USB 3g");
 SYSCTL_INT(_hw_usb_u3g, OID_AUTO, debug, CTLFLAG_RW,
     &u3g_debug, 0, "Debug level");
 #endif
 
 #define	U3G_MAXPORTS		12
 #define	U3G_CONFIG_INDEX	0
 #define	U3G_BSIZE		2048
 #define	U3G_TXSIZE		(U3G_BSIZE / U3G_TXFRAMES)
 #define	U3G_TXFRAMES		4
 
 /* Eject methods; See also usb_quirks.h:UQ_MSC_EJECT_* */
 #define	U3GINIT_HUAWEI		1	/* Requires Huawei init command */
 #define	U3GINIT_SIERRA		2	/* Requires Sierra init command */
 #define	U3GINIT_SCSIEJECT	3	/* Requires SCSI eject command */
 #define	U3GINIT_REZERO		4	/* Requires SCSI rezero command */
 #define	U3GINIT_ZTESTOR		5	/* Requires ZTE SCSI command */
 #define	U3GINIT_CMOTECH		6	/* Requires CMOTECH SCSI command */
 #define	U3GINIT_WAIT		7	/* Device reappears after a delay */
 #define	U3GINIT_SAEL_M460	8	/* Requires vendor init */
 #define	U3GINIT_HUAWEISCSI	9	/* Requires Huawei SCSI init command */
 #define	U3GINIT_HUAWEISCSI2	10	/* Requires Huawei SCSI init command (2) */
 #define	U3GINIT_TCT		11	/* Requires TCT Mobile init command */
 
 enum {
 	U3G_BULK_WR,
 	U3G_BULK_RD,
 	U3G_INTR,
 	U3G_N_TRANSFER,
 };
 
 struct u3g_softc {
 	struct ucom_super_softc sc_super_ucom;
 	struct ucom_softc sc_ucom[U3G_MAXPORTS];
 
 	struct usb_xfer *sc_xfer[U3G_MAXPORTS][U3G_N_TRANSFER];
 	uint8_t sc_iface[U3G_MAXPORTS];			/* local status register */
 	uint8_t sc_lsr[U3G_MAXPORTS];			/* local status register */
 	uint8_t sc_msr[U3G_MAXPORTS];			/* u3g status register */
 	uint16_t sc_line[U3G_MAXPORTS];			/* line status */
 
 	struct usb_device *sc_udev;
 	struct mtx sc_mtx;
 
 	uint8_t sc_numports;
 };
 
 static device_probe_t u3g_probe;
 static device_attach_t u3g_attach;
 static device_detach_t u3g_detach;
 static void u3g_free_softc(struct u3g_softc *);
 
 static usb_callback_t u3g_write_callback;
 static usb_callback_t u3g_read_callback;
 static usb_callback_t u3g_intr_callback;
 
 static void u3g_cfg_get_status(struct ucom_softc *, uint8_t *, uint8_t *);
 static void u3g_cfg_set_dtr(struct ucom_softc *, uint8_t);
 static void u3g_cfg_set_rts(struct ucom_softc *, uint8_t);
 static void u3g_start_read(struct ucom_softc *ucom);
 static void u3g_stop_read(struct ucom_softc *ucom);
 static void u3g_start_write(struct ucom_softc *ucom);
 static void u3g_stop_write(struct ucom_softc *ucom);
 static void u3g_poll(struct ucom_softc *ucom);
 static void u3g_free(struct ucom_softc *ucom);
 
 
 static void u3g_test_autoinst(void *, struct usb_device *,
 		struct usb_attach_arg *);
 static int u3g_driver_loaded(struct module *mod, int what, void *arg);
 
 static eventhandler_tag u3g_etag;
 
 static const struct usb_config u3g_config[U3G_N_TRANSFER] = {
 
 	[U3G_BULK_WR] = {
 		.type = UE_BULK,
 		.endpoint = UE_ADDR_ANY,
 		.direction = UE_DIR_OUT,
 		.bufsize = U3G_BSIZE,/* bytes */
 		.frames = U3G_TXFRAMES,
 		.flags = {.pipe_bof = 1,.force_short_xfer = 1,},
 		.callback = &u3g_write_callback,
 	},
 
 	[U3G_BULK_RD] = {
 		.type = UE_BULK,
 		.endpoint = UE_ADDR_ANY,
 		.direction = UE_DIR_IN,
 		.bufsize = U3G_BSIZE,/* bytes */
 		.flags = {.pipe_bof = 1,.short_xfer_ok = 1,},
 		.callback = &u3g_read_callback,
 	},
 
 	[U3G_INTR] = {
 		.type = UE_INTERRUPT,
 		.endpoint = UE_ADDR_ANY,
 		.direction = UE_DIR_IN,
 		.flags = {.pipe_bof = 1,.short_xfer_ok = 1,.no_pipe_ok = 1,},
 		.bufsize = 0,	/* use wMaxPacketSize */
 		.callback = &u3g_intr_callback,
 	},
 };
 
 static const struct ucom_callback u3g_callback = {
 	.ucom_cfg_get_status = &u3g_cfg_get_status,
 	.ucom_cfg_set_dtr = &u3g_cfg_set_dtr,
 	.ucom_cfg_set_rts = &u3g_cfg_set_rts,
 	.ucom_start_read = &u3g_start_read,
 	.ucom_stop_read = &u3g_stop_read,
 	.ucom_start_write = &u3g_start_write,
 	.ucom_stop_write = &u3g_stop_write,
 	.ucom_poll = &u3g_poll,
 	.ucom_free = &u3g_free,
 };
 
 static device_method_t u3g_methods[] = {
 	DEVMETHOD(device_probe, u3g_probe),
 	DEVMETHOD(device_attach, u3g_attach),
 	DEVMETHOD(device_detach, u3g_detach),
 	DEVMETHOD_END
 };
 
 static devclass_t u3g_devclass;
 
 static driver_t u3g_driver = {
 	.name = "u3g",
 	.methods = u3g_methods,
 	.size = sizeof(struct u3g_softc),
 };
 
 DRIVER_MODULE(u3g, uhub, u3g_driver, u3g_devclass, u3g_driver_loaded, 0);
 MODULE_DEPEND(u3g, ucom, 1, 1, 1);
 MODULE_DEPEND(u3g, usb, 1, 1, 1);
 MODULE_VERSION(u3g, 1);
 
 static const STRUCT_USB_HOST_ID u3g_devs[] = {
 #define	U3G_DEV(v,p,i) { USB_VPI(USB_VENDOR_##v, USB_PRODUCT_##v##_##p, i) }
 	U3G_DEV(ACERP, H10, 0),
 	U3G_DEV(AIRPLUS, MCD650, 0),
 	U3G_DEV(AIRPRIME, PC5220, 0),
 	U3G_DEV(AIRPRIME, AC313U, 0),
 	U3G_DEV(ALINK, 3G, 0),
 	U3G_DEV(ALINK, 3GU, 0),
 	U3G_DEV(ALINK, DWM652U5, 0),
 	U3G_DEV(AMOI, H01, 0),
 	U3G_DEV(AMOI, H01A, 0),
 	U3G_DEV(AMOI, H02, 0),
 	U3G_DEV(ANYDATA, ADU_500A, 0),
 	U3G_DEV(ANYDATA, ADU_620UW, 0),
 	U3G_DEV(ANYDATA, ADU_E100X, 0),
 	U3G_DEV(AXESSTEL, DATAMODEM, 0),
 	U3G_DEV(CMOTECH, CDMA_MODEM1, 0),
 	U3G_DEV(CMOTECH, CGU628, U3GINIT_CMOTECH),
 	U3G_DEV(DELL, U5500, 0),
 	U3G_DEV(DELL, U5505, 0),
 	U3G_DEV(DELL, U5510, 0),
 	U3G_DEV(DELL, U5520, 0),
 	U3G_DEV(DELL, U5520_2, 0),
 	U3G_DEV(DELL, U5520_3, 0),
 	U3G_DEV(DELL, U5700, 0),
 	U3G_DEV(DELL, U5700_2, 0),
 	U3G_DEV(DELL, U5700_3, 0),
 	U3G_DEV(DELL, U5700_4, 0),
 	U3G_DEV(DELL, U5720, 0),
 	U3G_DEV(DELL, U5720_2, 0),
 	U3G_DEV(DELL, U5730, 0),
 	U3G_DEV(DELL, U5730_2, 0),
 	U3G_DEV(DELL, U5730_3, 0),
 	U3G_DEV(DELL, U740, 0),
 	U3G_DEV(DLINK, DWR510_CD, U3GINIT_SCSIEJECT),
 	U3G_DEV(DLINK, DWR510, 0),
 	U3G_DEV(DLINK3, DWM652, 0),
 	U3G_DEV(HP, EV2200, 0),
 	U3G_DEV(HP, HS2300, 0),
 	U3G_DEV(HP, UN2420_QDL, 0),
 	U3G_DEV(HP, UN2420, 0),
 	U3G_DEV(HUAWEI, E1401, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1402, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1403, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1404, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1405, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1406, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1407, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1408, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1409, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E140A, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E140B, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E140D, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E140E, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E140F, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1410, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1411, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1412, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1413, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1414, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1415, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1416, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1417, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1418, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1419, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E141A, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E141B, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E141C, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E141D, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E141E, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E141F, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1420, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1421, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1422, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1423, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1424, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1425, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1426, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1427, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1428, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1429, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E142A, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E142B, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E142C, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E142D, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E142E, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E142F, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1430, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1431, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1432, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1433, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1434, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1435, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1436, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1437, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1438, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1439, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E143A, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E143B, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E143C, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E143D, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E143E, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E143F, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E173, 0),
 	U3G_DEV(HUAWEI, E173_INIT, U3GINIT_HUAWEISCSI),
 	U3G_DEV(HUAWEI, E3131, 0),
 	U3G_DEV(HUAWEI, E3131_INIT, U3GINIT_HUAWEISCSI),
 	U3G_DEV(HUAWEI, E180V, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E220, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E220BIS, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E392, U3GINIT_HUAWEISCSI),
 	U3G_DEV(HUAWEI, MOBILE, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, E1752, U3GINIT_HUAWEISCSI),
 	U3G_DEV(HUAWEI, E1820, U3GINIT_HUAWEISCSI),
 	U3G_DEV(HUAWEI, K3772, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, K3772_INIT, U3GINIT_HUAWEISCSI2),
 	U3G_DEV(HUAWEI, K3765, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, K3765_INIT, U3GINIT_HUAWEISCSI),
 	U3G_DEV(HUAWEI, K3770, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, K3770_INIT, U3GINIT_HUAWEISCSI),
 	U3G_DEV(HUAWEI, K4505, U3GINIT_HUAWEI),
 	U3G_DEV(HUAWEI, K4505_INIT, U3GINIT_HUAWEISCSI),
 	U3G_DEV(HUAWEI, ETS2055, U3GINIT_HUAWEI),
 	U3G_DEV(KYOCERA2, CDMA_MSM_K, 0),
 	U3G_DEV(KYOCERA2, KPC680, 0),
 	U3G_DEV(LONGCHEER, WM66, U3GINIT_HUAWEI),
 	U3G_DEV(LONGCHEER, DISK, U3GINIT_TCT),
 	U3G_DEV(LONGCHEER, W14, 0),
 	U3G_DEV(LONGCHEER, XSSTICK, 0),
 	U3G_DEV(MERLIN, V620, 0),
 	U3G_DEV(NEOTEL, PRIME, 0),
 	U3G_DEV(NOVATEL, E725, 0),
 	U3G_DEV(NOVATEL, ES620, 0),
 	U3G_DEV(NOVATEL, ES620_2, 0),
 	U3G_DEV(NOVATEL, EU730, 0),
 	U3G_DEV(NOVATEL, EU740, 0),
 	U3G_DEV(NOVATEL, EU870D, 0),
 	U3G_DEV(NOVATEL, MC760, 0),
 	U3G_DEV(NOVATEL, MC547, 0),
 	U3G_DEV(NOVATEL, MC679, 0),
 	U3G_DEV(NOVATEL, MC950D, 0),
 	U3G_DEV(NOVATEL, MC990D, 0),
 	U3G_DEV(NOVATEL, MIFI2200, U3GINIT_SCSIEJECT),
 	U3G_DEV(NOVATEL, MIFI2200V, U3GINIT_SCSIEJECT),
 	U3G_DEV(NOVATEL, U720, 0),
 	U3G_DEV(NOVATEL, U727, 0),
 	U3G_DEV(NOVATEL, U727_2, 0),
 	U3G_DEV(NOVATEL, U740, 0),
 	U3G_DEV(NOVATEL, U740_2, 0),
 	U3G_DEV(NOVATEL, U760, U3GINIT_SCSIEJECT),
 	U3G_DEV(NOVATEL, U870, 0),
 	U3G_DEV(NOVATEL, V620, 0),
 	U3G_DEV(NOVATEL, V640, 0),
 	U3G_DEV(NOVATEL, V720, 0),
 	U3G_DEV(NOVATEL, V740, 0),
 	U3G_DEV(NOVATEL, X950D, 0),
 	U3G_DEV(NOVATEL, XU870, 0),
 	U3G_DEV(MOTOROLA2, MB886, U3GINIT_SCSIEJECT),
 	U3G_DEV(OPTION, E6500, 0),
 	U3G_DEV(OPTION, E6501, 0),
 	U3G_DEV(OPTION, E6601, 0),
 	U3G_DEV(OPTION, E6721, 0),
 	U3G_DEV(OPTION, E6741, 0),
 	U3G_DEV(OPTION, E6761, 0),
 	U3G_DEV(OPTION, E6800, 0),
 	U3G_DEV(OPTION, E7021, 0),
 	U3G_DEV(OPTION, E7041, 0),
 	U3G_DEV(OPTION, E7061, 0),
 	U3G_DEV(OPTION, E7100, 0),
 	U3G_DEV(OPTION, GE40X, 0),
 	U3G_DEV(OPTION, GT3G, 0),
 	U3G_DEV(OPTION, GT3GPLUS, 0),
 	U3G_DEV(OPTION, GT3GQUAD, 0),
 	U3G_DEV(OPTION, GT3G_1, 0),
 	U3G_DEV(OPTION, GT3G_2, 0),
 	U3G_DEV(OPTION, GT3G_3, 0),
 	U3G_DEV(OPTION, GT3G_4, 0),
 	U3G_DEV(OPTION, GT3G_5, 0),
 	U3G_DEV(OPTION, GT3G_6, 0),
 	U3G_DEV(OPTION, GTHSDPA, 0),
 	U3G_DEV(OPTION, GTM380, 0),
 	U3G_DEV(OPTION, GTMAX36, 0),
 	U3G_DEV(OPTION, GTMAX380HSUPAE, 0),
 	U3G_DEV(OPTION, GTMAXHSUPA, 0),
 	U3G_DEV(OPTION, GTMAXHSUPAE, 0),
 	U3G_DEV(OPTION, VODAFONEMC3G, 0),
 	U3G_DEV(QISDA, H20_1, 0),
 	U3G_DEV(QISDA, H20_2, 0),
 	U3G_DEV(QISDA, H21_1, 0),
 	U3G_DEV(QISDA, H21_2, 0),
 	U3G_DEV(QUALCOMM, NTT_L02C_MODEM, U3GINIT_SCSIEJECT),
 	U3G_DEV(QUALCOMM2, AC8700, 0),
 	U3G_DEV(QUALCOMM2, MF330, 0),
 	U3G_DEV(QUALCOMM2, SIM5218, 0),
 	U3G_DEV(QUALCOMM2, VW110L, U3GINIT_SCSIEJECT),
 	U3G_DEV(QUALCOMM2, GOBI2000_QDL, 0),
 	U3G_DEV(QUALCOMM2, GOBI2000, 0),
 	U3G_DEV(QUALCOMMINC, AC2726, 0),
 	U3G_DEV(QUALCOMMINC, AC8700, 0),
 	U3G_DEV(QUALCOMMINC, AC8710, 0),
 	U3G_DEV(QUALCOMMINC, CDMA_MSM, U3GINIT_SCSIEJECT),
 	U3G_DEV(QUALCOMMINC, E0002, 0),
 	U3G_DEV(QUALCOMMINC, E0003, 0),
 	U3G_DEV(QUALCOMMINC, E0004, 0),
 	U3G_DEV(QUALCOMMINC, E0005, 0),
 	U3G_DEV(QUALCOMMINC, E0006, 0),
 	U3G_DEV(QUALCOMMINC, E0007, 0),
 	U3G_DEV(QUALCOMMINC, E0008, 0),
 	U3G_DEV(QUALCOMMINC, E0009, 0),
 	U3G_DEV(QUALCOMMINC, E000A, 0),
 	U3G_DEV(QUALCOMMINC, E000B, 0),
 	U3G_DEV(QUALCOMMINC, E000C, 0),
 	U3G_DEV(QUALCOMMINC, E000D, 0),
 	U3G_DEV(QUALCOMMINC, E000E, 0),
 	U3G_DEV(QUALCOMMINC, E000F, 0),
 	U3G_DEV(QUALCOMMINC, E0010, 0),
 	U3G_DEV(QUALCOMMINC, E0011, 0),
 	U3G_DEV(QUALCOMMINC, E0012, 0),
 	U3G_DEV(QUALCOMMINC, E0013, 0),
 	U3G_DEV(QUALCOMMINC, E0014, 0),
 	U3G_DEV(QUALCOMMINC, E0017, 0),
 	U3G_DEV(QUALCOMMINC, E0018, 0),
 	U3G_DEV(QUALCOMMINC, E0019, 0),
 	U3G_DEV(QUALCOMMINC, E0020, 0),
 	U3G_DEV(QUALCOMMINC, E0021, 0),
 	U3G_DEV(QUALCOMMINC, E0022, 0),
 	U3G_DEV(QUALCOMMINC, E0023, 0),
 	U3G_DEV(QUALCOMMINC, E0024, 0),
 	U3G_DEV(QUALCOMMINC, E0025, 0),
 	U3G_DEV(QUALCOMMINC, E0026, 0),
 	U3G_DEV(QUALCOMMINC, E0027, 0),
 	U3G_DEV(QUALCOMMINC, E0028, 0),
 	U3G_DEV(QUALCOMMINC, E0029, 0),
 	U3G_DEV(QUALCOMMINC, E0030, 0),
 	U3G_DEV(QUALCOMMINC, E0032, 0),
 	U3G_DEV(QUALCOMMINC, E0033, 0),
 	U3G_DEV(QUALCOMMINC, E0037, 0),
 	U3G_DEV(QUALCOMMINC, E0039, 0),
 	U3G_DEV(QUALCOMMINC, E0042, 0),
 	U3G_DEV(QUALCOMMINC, E0043, 0),
 	U3G_DEV(QUALCOMMINC, E0048, 0),
 	U3G_DEV(QUALCOMMINC, E0049, 0),
 	U3G_DEV(QUALCOMMINC, E0051, 0),
 	U3G_DEV(QUALCOMMINC, E0052, 0),
 	U3G_DEV(QUALCOMMINC, E0054, 0),
 	U3G_DEV(QUALCOMMINC, E0055, 0),
 	U3G_DEV(QUALCOMMINC, E0057, 0),
 	U3G_DEV(QUALCOMMINC, E0058, 0),
 	U3G_DEV(QUALCOMMINC, E0059, 0),
 	U3G_DEV(QUALCOMMINC, E0060, 0),
 	U3G_DEV(QUALCOMMINC, E0061, 0),
 	U3G_DEV(QUALCOMMINC, E0062, 0),
 	U3G_DEV(QUALCOMMINC, E0063, 0),
 	U3G_DEV(QUALCOMMINC, E0064, 0),
 	U3G_DEV(QUALCOMMINC, E0066, 0),
 	U3G_DEV(QUALCOMMINC, E0069, 0),
 	U3G_DEV(QUALCOMMINC, E0070, 0),
 	U3G_DEV(QUALCOMMINC, E0073, 0),
 	U3G_DEV(QUALCOMMINC, E0076, 0),
 	U3G_DEV(QUALCOMMINC, E0078, 0),
 	U3G_DEV(QUALCOMMINC, E0082, 0),
 	U3G_DEV(QUALCOMMINC, E0086, 0),
 	U3G_DEV(QUALCOMMINC, SURFSTICK, 0),
 	U3G_DEV(QUALCOMMINC, E2002, 0),
 	U3G_DEV(QUALCOMMINC, E2003, 0),
 	U3G_DEV(QUALCOMMINC, K3772_Z, 0),
 	U3G_DEV(QUALCOMMINC, K3772_Z_INIT, U3GINIT_SCSIEJECT),
+	U3G_DEV(QUALCOMMINC, MF195E, 0),
+	U3G_DEV(QUALCOMMINC, MF195E_INIT, U3GINIT_SCSIEJECT),
 	U3G_DEV(QUALCOMMINC, MF626, 0),
 	U3G_DEV(QUALCOMMINC, MF628, 0),
 	U3G_DEV(QUALCOMMINC, MF633R, 0),
 	U3G_DEV(QUANTA, GKE, 0),
 	U3G_DEV(QUANTA, GLE, 0),
 	U3G_DEV(QUANTA, GLX, 0),
 	U3G_DEV(QUANTA, Q101, 0),
 	U3G_DEV(QUANTA, Q111, 0),
 	U3G_DEV(SIERRA, AC402, 0),
 	U3G_DEV(SIERRA, AC595U, 0),
 	U3G_DEV(SIERRA, AC313U, 0),
 	U3G_DEV(SIERRA, AC597E, 0),
 	U3G_DEV(SIERRA, AC875E, 0),
 	U3G_DEV(SIERRA, AC875U, 0),
 	U3G_DEV(SIERRA, AC875U_2, 0),
 	U3G_DEV(SIERRA, AC880, 0),
 	U3G_DEV(SIERRA, AC880E, 0),
 	U3G_DEV(SIERRA, AC880U, 0),
 	U3G_DEV(SIERRA, AC881, 0),
 	U3G_DEV(SIERRA, AC881E, 0),
 	U3G_DEV(SIERRA, AC881U, 0),
 	U3G_DEV(SIERRA, AC885E, 0),
 	U3G_DEV(SIERRA, AC885E_2, 0),
 	U3G_DEV(SIERRA, AC885U, 0),
 	U3G_DEV(SIERRA, AIRCARD580, 0),
 	U3G_DEV(SIERRA, AIRCARD595, 0),
 	U3G_DEV(SIERRA, AIRCARD875, 0),
 	U3G_DEV(SIERRA, C22, 0),
 	U3G_DEV(SIERRA, C597, 0),
 	U3G_DEV(SIERRA, C888, 0),
 	U3G_DEV(SIERRA, E0029, 0),
 	U3G_DEV(SIERRA, E6892, 0),
 	U3G_DEV(SIERRA, E6893, 0),
 	U3G_DEV(SIERRA, EM5625, 0),
 	U3G_DEV(SIERRA, EM5725, 0),
 	U3G_DEV(SIERRA, MC5720, 0),
 	U3G_DEV(SIERRA, MC5720_2, 0),
 	U3G_DEV(SIERRA, MC5725, 0),
 	U3G_DEV(SIERRA, MC5727, 0),
 	U3G_DEV(SIERRA, MC5727_2, 0),
 	U3G_DEV(SIERRA, MC5728, 0),
 	U3G_DEV(SIERRA, MC8700, 0),
 	U3G_DEV(SIERRA, MC8755, 0),
 	U3G_DEV(SIERRA, MC8755_2, 0),
 	U3G_DEV(SIERRA, MC8755_3, 0),
 	U3G_DEV(SIERRA, MC8755_4, 0),
 	U3G_DEV(SIERRA, MC8765, 0),
 	U3G_DEV(SIERRA, MC8765_2, 0),
 	U3G_DEV(SIERRA, MC8765_3, 0),
 	U3G_DEV(SIERRA, MC8775, 0),
 	U3G_DEV(SIERRA, MC8775_2, 0),
 	U3G_DEV(SIERRA, MC8780, 0),
 	U3G_DEV(SIERRA, MC8780_2, 0),
 	U3G_DEV(SIERRA, MC8780_3, 0),
 	U3G_DEV(SIERRA, MC8781, 0),
 	U3G_DEV(SIERRA, MC8781_2, 0),
 	U3G_DEV(SIERRA, MC8781_3, 0),
 	U3G_DEV(SIERRA, MC8785, 0),
 	U3G_DEV(SIERRA, MC8785_2, 0),
 	U3G_DEV(SIERRA, MC8790, 0),
 	U3G_DEV(SIERRA, MC8791, 0),
 	U3G_DEV(SIERRA, MC8792, 0),
 	U3G_DEV(SIERRA, MINI5725, 0),
 	U3G_DEV(SIERRA, T11, 0),
 	U3G_DEV(SIERRA, T598, 0),
 	U3G_DEV(SILABS, SAEL, U3GINIT_SAEL_M460),
 	U3G_DEV(STELERA, C105, 0),
 	U3G_DEV(STELERA, E1003, 0),
 	U3G_DEV(STELERA, E1004, 0),
 	U3G_DEV(STELERA, E1005, 0),
 	U3G_DEV(STELERA, E1006, 0),
 	U3G_DEV(STELERA, E1007, 0),
 	U3G_DEV(STELERA, E1008, 0),
 	U3G_DEV(STELERA, E1009, 0),
 	U3G_DEV(STELERA, E100A, 0),
 	U3G_DEV(STELERA, E100B, 0),
 	U3G_DEV(STELERA, E100C, 0),
 	U3G_DEV(STELERA, E100D, 0),
 	U3G_DEV(STELERA, E100E, 0),
 	U3G_DEV(STELERA, E100F, 0),
 	U3G_DEV(STELERA, E1010, 0),
 	U3G_DEV(STELERA, E1011, 0),
 	U3G_DEV(STELERA, E1012, 0),
 	U3G_DEV(TCTMOBILE, X060S, 0),
 	U3G_DEV(TCTMOBILE, X080S, U3GINIT_TCT),
 	U3G_DEV(TELIT, UC864E, 0),
 	U3G_DEV(TELIT, UC864G, 0),
 	U3G_DEV(TLAYTECH, TEU800, 0),
 	U3G_DEV(TOSHIBA, G450, 0),
 	U3G_DEV(TOSHIBA, HSDPA, 0),
 	U3G_DEV(YISO, C893, 0),
 	U3G_DEV(WETELECOM, WM_D200, 0),
 	/* Autoinstallers */
 	U3G_DEV(NOVATEL, ZEROCD, U3GINIT_SCSIEJECT),
 	U3G_DEV(OPTION, GTICON322, U3GINIT_REZERO),
 	U3G_DEV(QUALCOMMINC, ZTE_STOR, U3GINIT_ZTESTOR),
 	U3G_DEV(QUALCOMMINC, ZTE_STOR2, U3GINIT_SCSIEJECT),
 	U3G_DEV(QUANTA, Q101_STOR, U3GINIT_SCSIEJECT),
 	U3G_DEV(SIERRA, TRUINSTALL, U3GINIT_SIERRA),
 #undef	U3G_DEV
 };
 
 static int
 u3g_sierra_init(struct usb_device *udev)
 {
 	struct usb_device_request req;
 
 	req.bmRequestType = UT_VENDOR;
 	req.bRequest = UR_SET_INTERFACE;
 	USETW(req.wValue, UF_DEVICE_REMOTE_WAKEUP);
 	USETW(req.wIndex, UHF_PORT_CONNECTION);
 	USETW(req.wLength, 0);
 
 	if (usbd_do_request_flags(udev, NULL, &req,
 	    NULL, 0, NULL, USB_MS_HZ)) {
 		/* ignore any errors */
 	}
 	return (0);
 }
 
 static int
 u3g_huawei_init(struct usb_device *udev)
 {
 	struct usb_device_request req;
 
 	req.bmRequestType = UT_WRITE_DEVICE;
 	req.bRequest = UR_SET_FEATURE;
 	USETW(req.wValue, UF_DEVICE_REMOTE_WAKEUP);
 	USETW(req.wIndex, UHF_PORT_SUSPEND);
 	USETW(req.wLength, 0);
 
 	if (usbd_do_request_flags(udev, NULL, &req,
 	    NULL, 0, NULL, USB_MS_HZ)) {
 		/* ignore any errors */
 	}
 	return (0);
 }
 
 static void
 u3g_sael_m460_init(struct usb_device *udev)
 {
 	static const uint8_t setup[][24] = {
 	     { 0x41, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
 	     { 0x41, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00 },
 	     { 0x41, 0x13, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 
 	       0x01, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00,
 	       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
 	     { 0xc1, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x40, 0x02 },
 	     { 0xc1, 0x08, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00 },
 	     { 0x41, 0x07, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00 },
 	     { 0xc1, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02 },
 	     { 0x41, 0x01, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00 },
 	     { 0x41, 0x07, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00 },
 	     { 0x41, 0x03, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00 },
 	     { 0x41, 0x19, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00,
 	       0x00, 0x00, 0x00, 0x00, 0x11, 0x13 },
 	     { 0x41, 0x13, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 
 	       0x09, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
 	       0x0a, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00 },
 	     { 0x41, 0x12, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00 },
 	     { 0x41, 0x01, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00 },
 	     { 0x41, 0x07, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00 },
 	     { 0x41, 0x03, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00 },
 	     { 0x41, 0x19, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00,
 	       0x00, 0x00, 0x00, 0x00, 0x11, 0x13 },
 	     { 0x41, 0x13, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00,
 	       0x09, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 
 	       0x0a, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00 },
 	     { 0x41, 0x07, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00 },
 	};
 
 	struct usb_device_request req;
 	usb_error_t err;
 	uint16_t len;
 	uint8_t buf[0x300];
 	uint8_t n;
 
 	DPRINTFN(1, "\n");
 
 	if (usbd_req_set_alt_interface_no(udev, NULL, 0, 0)) {
 		DPRINTFN(0, "Alt setting 0 failed\n");
 		return;
 	}
 
 	for (n = 0; n != (sizeof(setup)/sizeof(setup[0])); n++) {
 
 		memcpy(&req, setup[n], sizeof(req));
 
 		len = UGETW(req.wLength);
 		if (req.bmRequestType & UE_DIR_IN) {
 			if (len > sizeof(buf)) {
 				DPRINTFN(0, "too small buffer\n");
 				continue;
 			}
 			err = usbd_do_request(udev, NULL, &req, buf);
 		} else {
 			if (len > (sizeof(setup[0]) - 8)) {
 				DPRINTFN(0, "too small buffer\n");
 				continue;
 			}
 			err = usbd_do_request(udev, NULL, &req, 
 			    __DECONST(uint8_t *, &setup[n][8]));
 		}
 		if (err) {
 			DPRINTFN(1, "request %u failed\n",
 			    (unsigned int)n);
 			/*
 			 * Some of the requests will fail. Stop doing
 			 * requests when we are getting timeouts so
 			 * that we don't block the explore/attach
 			 * thread forever.
 			 */
 			if (err == USB_ERR_TIMEOUT)
 				break;
 		}
 	}
 }
 
 /*
  * The following function handles 3G modem devices (E220, Mobile,
  * etc.) with auto-install flash disks for Windows/MacOSX on the first
  * interface.  After some command or some delay they change appearance
  * to a modem.
  */
 static void
 u3g_test_autoinst(void *arg, struct usb_device *udev,
     struct usb_attach_arg *uaa)
 {
 	struct usb_interface *iface;
 	struct usb_interface_descriptor *id;
 	int error;
 	unsigned long method;
 
 	if (uaa->dev_state != UAA_DEV_READY)
 		return;
 
 	iface = usbd_get_iface(udev, 0);
 	if (iface == NULL)
 		return;
 	id = iface->idesc;
 	if (id == NULL || id->bInterfaceClass != UICLASS_MASS)
 		return;
 
 	if (usb_test_quirk(uaa, UQ_MSC_EJECT_HUAWEI))
 		method = U3GINIT_HUAWEI;
 	else if (usb_test_quirk(uaa, UQ_MSC_EJECT_SIERRA))
 		method = U3GINIT_SIERRA;
 	else if (usb_test_quirk(uaa, UQ_MSC_EJECT_SCSIEJECT))
 		method = U3GINIT_SCSIEJECT;
 	else if (usb_test_quirk(uaa, UQ_MSC_EJECT_REZERO))
 		method = U3GINIT_REZERO;
 	else if (usb_test_quirk(uaa, UQ_MSC_EJECT_ZTESTOR))
 		method = U3GINIT_ZTESTOR;
 	else if (usb_test_quirk(uaa, UQ_MSC_EJECT_CMOTECH))
 		method = U3GINIT_CMOTECH;
 	else if (usb_test_quirk(uaa, UQ_MSC_EJECT_WAIT))
 		method = U3GINIT_WAIT;
 	else if (usb_test_quirk(uaa, UQ_MSC_EJECT_HUAWEISCSI))
 		method = U3GINIT_HUAWEISCSI;
 	else if (usb_test_quirk(uaa, UQ_MSC_EJECT_HUAWEISCSI2))
 		method = U3GINIT_HUAWEISCSI2;
 	else if (usb_test_quirk(uaa, UQ_MSC_EJECT_TCT))
 		method = U3GINIT_TCT;
 	else if (usbd_lookup_id_by_uaa(u3g_devs, sizeof(u3g_devs), uaa) == 0)
 		method = USB_GET_DRIVER_INFO(uaa);
 	else
 		return;		/* no device match */
 
 	if (bootverbose) {
 		printf("Ejecting %s %s using method %ld\n",
 		       usb_get_manufacturer(udev),
 		       usb_get_product(udev), method);
 	}
 
 	switch (method) {
 		case U3GINIT_HUAWEI:
 			error = u3g_huawei_init(udev);
 			break;
 		case U3GINIT_HUAWEISCSI:
 			error = usb_msc_eject(udev, 0, MSC_EJECT_HUAWEI);
 			break;
 		case U3GINIT_HUAWEISCSI2:
 			error = usb_msc_eject(udev, 0, MSC_EJECT_HUAWEI2);
 			break;
 		case U3GINIT_SCSIEJECT:
 			error = usb_msc_eject(udev, 0, MSC_EJECT_STOPUNIT);
 			break;
 		case U3GINIT_REZERO:
 			error = usb_msc_eject(udev, 0, MSC_EJECT_REZERO);
 			break;
 		case U3GINIT_ZTESTOR:
 			error = usb_msc_eject(udev, 0, MSC_EJECT_STOPUNIT);
 			if (error == 0)
 			    error = usb_msc_eject(udev, 0, MSC_EJECT_ZTESTOR);
 			break;
 		case U3GINIT_CMOTECH:
 			error = usb_msc_eject(udev, 0, MSC_EJECT_CMOTECH);
 			break;
 		case U3GINIT_TCT:
 			error = usb_msc_eject(udev, 0, MSC_EJECT_TCT);
 			break;
 		case U3GINIT_SIERRA:
 			error = u3g_sierra_init(udev);
 			break;
 		case U3GINIT_WAIT:
 			/* Just pretend we ejected, the card will timeout */
 			error = 0;
 			break;
 		default:
 			/* no 3G eject quirks */
 			error = EOPNOTSUPP;
 			break;
 	}
 	if (error == 0) {
 		/* success, mark the udev as disappearing */
 		uaa->dev_state = UAA_DEV_EJECTING;
 	}
 }
 
 static int
 u3g_driver_loaded(struct module *mod, int what, void *arg)
 {
 	switch (what) {
 	case MOD_LOAD:
 		/* register our autoinstall handler */
 		u3g_etag = EVENTHANDLER_REGISTER(usb_dev_configured,
 		    u3g_test_autoinst, NULL, EVENTHANDLER_PRI_ANY);
 		break;
 	case MOD_UNLOAD:
 		EVENTHANDLER_DEREGISTER(usb_dev_configured, u3g_etag);
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
  	return (0);
 }
 
 static int
 u3g_probe(device_t self)
 {
 	struct usb_attach_arg *uaa = device_get_ivars(self);
 
 	if (uaa->usb_mode != USB_MODE_HOST) {
 		return (ENXIO);
 	}
 	if (uaa->info.bConfigIndex != U3G_CONFIG_INDEX) {
 		return (ENXIO);
 	}
 	if (uaa->info.bInterfaceClass != UICLASS_VENDOR) {
 		return (ENXIO);
 	}
 	return (usbd_lookup_id_by_uaa(u3g_devs, sizeof(u3g_devs), uaa));
 }
 
 static int
 u3g_attach(device_t dev)
 {
 	struct usb_config u3g_config_tmp[U3G_N_TRANSFER];
 	struct usb_attach_arg *uaa = device_get_ivars(dev);
 	struct u3g_softc *sc = device_get_softc(dev);
 	struct usb_interface *iface;
 	struct usb_interface_descriptor *id;
 	uint32_t iface_valid;
 	int error, type, nports;
 	int ep, n;
 	uint8_t i;
 
 	DPRINTF("sc=%p\n", sc);
 
 	type = USB_GET_DRIVER_INFO(uaa);
 	if (type == U3GINIT_SAEL_M460
 	    || usb_test_quirk(uaa, UQ_MSC_EJECT_SAEL_M460)) {
 		u3g_sael_m460_init(uaa->device);
 	}
 
 	/* copy in USB config */
 	for (n = 0; n != U3G_N_TRANSFER; n++) 
 		u3g_config_tmp[n] = u3g_config[n];
 
 	device_set_usb_desc(dev);
 	mtx_init(&sc->sc_mtx, "u3g", NULL, MTX_DEF);
 	ucom_ref(&sc->sc_super_ucom);
 
 	sc->sc_udev = uaa->device;
 
 	/* Claim all interfaces on the device */
 	iface_valid = 0;
 	for (i = uaa->info.bIfaceIndex; i < USB_IFACE_MAX; i++) {
 		iface = usbd_get_iface(uaa->device, i);
 		if (iface == NULL)
 			break;
 		id = usbd_get_interface_descriptor(iface);
 		if (id == NULL || id->bInterfaceClass != UICLASS_VENDOR)
 			continue;
 		usbd_set_parent_iface(uaa->device, i, uaa->info.bIfaceIndex);
 		iface_valid |= (1<<i);
 	}
 
 	i = 0;		/* interface index */
 	ep = 0;		/* endpoint index */
 	nports = 0;	/* number of ports */
 	while (i < USB_IFACE_MAX) {
 		if ((iface_valid & (1<<i)) == 0) {
 			i++;
 			continue;
 		}
 
 		/* update BULK endpoint index */
 		for (n = 0; n < U3G_N_TRANSFER; n++)
 			u3g_config_tmp[n].ep_index = ep;
 
 		/* try to allocate a set of BULK endpoints */
 		error = usbd_transfer_setup(uaa->device, &i,
 		    sc->sc_xfer[nports], u3g_config_tmp, U3G_N_TRANSFER,
 		    &sc->sc_ucom[nports], &sc->sc_mtx);
 		if (error) {
 			/* next interface */
 			i++;
 			ep = 0;
 			continue;
 		}
 
 		iface = usbd_get_iface(uaa->device, i);
 		id = usbd_get_interface_descriptor(iface);
 		sc->sc_iface[nports] = id->bInterfaceNumber;
 
 		if (bootverbose && sc->sc_xfer[nports][U3G_INTR]) {
 			device_printf(dev, "port %d supports modem control\n",
 				      nports);
 		}
 
 		/* set stall by default */
 		mtx_lock(&sc->sc_mtx);
 		usbd_xfer_set_stall(sc->sc_xfer[nports][U3G_BULK_WR]);
 		usbd_xfer_set_stall(sc->sc_xfer[nports][U3G_BULK_RD]);
 		mtx_unlock(&sc->sc_mtx);
 
 		nports++;	/* found one port */
 		ep++;
 		if (nports == U3G_MAXPORTS)
 			break;
 	}
 	if (nports == 0) {
 		device_printf(dev, "no ports found\n");
 		goto detach;
 	}
 	sc->sc_numports = nports;
 
 	error = ucom_attach(&sc->sc_super_ucom, sc->sc_ucom,
 	    sc->sc_numports, sc, &u3g_callback, &sc->sc_mtx);
 	if (error) {
 		DPRINTF("ucom_attach failed\n");
 		goto detach;
 	}
 	ucom_set_pnpinfo_usb(&sc->sc_super_ucom, dev);
 	device_printf(dev, "Found %u port%s.\n", sc->sc_numports,
 	    sc->sc_numports > 1 ? "s":"");
 
 	return (0);
 
 detach:
 	u3g_detach(dev);
 	return (ENXIO);
 }
 
 static int
 u3g_detach(device_t dev)
 {
 	struct u3g_softc *sc = device_get_softc(dev);
 	uint8_t subunit;
 
 	DPRINTF("sc=%p\n", sc);
 
 	/* NOTE: It is not dangerous to detach more ports than attached! */
 	ucom_detach(&sc->sc_super_ucom, sc->sc_ucom);
 
 	for (subunit = 0; subunit != U3G_MAXPORTS; subunit++)
 		usbd_transfer_unsetup(sc->sc_xfer[subunit], U3G_N_TRANSFER);
 
 	device_claim_softc(dev);
 
 	u3g_free_softc(sc);
 
 	return (0);
 }
 
 UCOM_UNLOAD_DRAIN(u3g);
 
 static void
 u3g_free_softc(struct u3g_softc *sc)
 {
 	if (ucom_unref(&sc->sc_super_ucom)) {
 		mtx_destroy(&sc->sc_mtx);
 		device_free_softc(sc);
 	}
 }
 
 static void
 u3g_free(struct ucom_softc *ucom)
 {
 	u3g_free_softc(ucom->sc_parent);
 }
 
 static void
 u3g_start_read(struct ucom_softc *ucom)
 {
 	struct u3g_softc *sc = ucom->sc_parent;
 
 	/* start interrupt endpoint (if configured) */
 	usbd_transfer_start(sc->sc_xfer[ucom->sc_subunit][U3G_INTR]);
 
 	/* start read endpoint */
 	usbd_transfer_start(sc->sc_xfer[ucom->sc_subunit][U3G_BULK_RD]);
 }
 
 static void
 u3g_stop_read(struct ucom_softc *ucom)
 {
 	struct u3g_softc *sc = ucom->sc_parent;
 
 	/* stop interrupt endpoint (if configured) */
 	usbd_transfer_stop(sc->sc_xfer[ucom->sc_subunit][U3G_INTR]);
 
 	/* stop read endpoint */
 	usbd_transfer_stop(sc->sc_xfer[ucom->sc_subunit][U3G_BULK_RD]);
 }
 
 static void
 u3g_start_write(struct ucom_softc *ucom)
 {
 	struct u3g_softc *sc = ucom->sc_parent;
 
 	usbd_transfer_start(sc->sc_xfer[ucom->sc_subunit][U3G_BULK_WR]);
 }
 
 static void
 u3g_stop_write(struct ucom_softc *ucom)
 {
 	struct u3g_softc *sc = ucom->sc_parent;
 
 	usbd_transfer_stop(sc->sc_xfer[ucom->sc_subunit][U3G_BULK_WR]);
 }
 
 static void
 u3g_write_callback(struct usb_xfer *xfer, usb_error_t error)
 {
 	struct ucom_softc *ucom = usbd_xfer_softc(xfer);
 	struct usb_page_cache *pc;
 	uint32_t actlen;
 	uint32_t frame;
 
 	switch (USB_GET_STATE(xfer)) {
 	case USB_ST_TRANSFERRED:
 	case USB_ST_SETUP:
 tr_setup:
 		for (frame = 0; frame != U3G_TXFRAMES; frame++) {
 			usbd_xfer_set_frame_offset(xfer, frame * U3G_TXSIZE, frame);
 
 			pc = usbd_xfer_get_frame(xfer, frame);
 			if (ucom_get_data(ucom, pc, 0, U3G_TXSIZE, &actlen) == 0)
 				break;
 			usbd_xfer_set_frame_len(xfer, frame, actlen);
 		}
 		if (frame != 0) {
 			usbd_xfer_set_frames(xfer, frame);
 			usbd_transfer_submit(xfer);
 		}
 		break;
 
 	default:			/* Error */
 		if (error != USB_ERR_CANCELLED) {
 			/* do a builtin clear-stall */
 			usbd_xfer_set_stall(xfer);
 			goto tr_setup;
 		}
 		break;
 	}
 }
 
 static void
 u3g_read_callback(struct usb_xfer *xfer, usb_error_t error)
 {
 	struct ucom_softc *ucom = usbd_xfer_softc(xfer);
 	struct usb_page_cache *pc;
 	int actlen;
 
 	usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL);
 
 	switch (USB_GET_STATE(xfer)) {
 	case USB_ST_TRANSFERRED:
 		pc = usbd_xfer_get_frame(xfer, 0);
 		ucom_put_data(ucom, pc, 0, actlen);
 
 	case USB_ST_SETUP:
 tr_setup:
 		usbd_xfer_set_frame_len(xfer, 0, usbd_xfer_max_len(xfer));
 		usbd_transfer_submit(xfer);
 		break;
 
 	default:			/* Error */
 		if (error != USB_ERR_CANCELLED) {
 			/* do a builtin clear-stall */
 			usbd_xfer_set_stall(xfer);
 			goto tr_setup;
 		}
 		break;
 	}
 }
 
 static void
 u3g_cfg_get_status(struct ucom_softc *ucom, uint8_t *lsr, uint8_t *msr)
 {
 	struct u3g_softc *sc = ucom->sc_parent;
 
 	*lsr = sc->sc_lsr[ucom->sc_subunit];
 	*msr = sc->sc_msr[ucom->sc_subunit];
 }
 
 static void
 u3g_cfg_set_line(struct ucom_softc *ucom)
 {
 	struct u3g_softc *sc = ucom->sc_parent;
 	struct usb_device_request req;
 
 	req.bmRequestType = UT_WRITE_CLASS_INTERFACE;
 	req.bRequest = UCDC_SET_CONTROL_LINE_STATE;
 	USETW(req.wValue, sc->sc_line[ucom->sc_subunit]);
 	req.wIndex[0] = sc->sc_iface[ucom->sc_subunit];
 	req.wIndex[1] = 0;
 	USETW(req.wLength, 0);
 
 	ucom_cfg_do_request(sc->sc_udev, ucom,
 	    &req, NULL, 0, 1000);
 }
 
 static void
 u3g_cfg_set_dtr(struct ucom_softc *ucom, uint8_t onoff)
 {
 	struct u3g_softc *sc = ucom->sc_parent;
 
 	DPRINTF("onoff = %d\n", onoff);
 
 	if (onoff)
 		sc->sc_line[ucom->sc_subunit] |= UCDC_LINE_DTR;
 	else
 		sc->sc_line[ucom->sc_subunit] &= ~UCDC_LINE_DTR;
 
 	u3g_cfg_set_line(ucom);
 }
 
 static void
 u3g_cfg_set_rts(struct ucom_softc *ucom, uint8_t onoff)
 {
 	struct u3g_softc *sc = ucom->sc_parent;
 
 	DPRINTF("onoff = %d\n", onoff);
 
 	if (onoff)
 		sc->sc_line[ucom->sc_subunit] |= UCDC_LINE_RTS;
 	else
 		sc->sc_line[ucom->sc_subunit] &= ~UCDC_LINE_RTS;
 
 	u3g_cfg_set_line(ucom);
 }
 
 static void
 u3g_intr_callback(struct usb_xfer *xfer, usb_error_t error)
 {
 	struct ucom_softc *ucom = usbd_xfer_softc(xfer);
 	struct u3g_softc *sc = ucom->sc_parent;
 	struct usb_page_cache *pc;
 	struct usb_cdc_notification pkt;
 	int actlen;
 	uint16_t wLen;
 	uint8_t mstatus;
 
 	usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL);
 
 	switch (USB_GET_STATE(xfer)) {
 	case USB_ST_TRANSFERRED:
 		if (actlen < 8) {	/* usb_cdc_notification with 2 data bytes */
 			DPRINTF("message too short (expected 8, received %d)\n", actlen);
 			goto tr_setup;
 		}
 		pc = usbd_xfer_get_frame(xfer, 0);
 		usbd_copy_out(pc, 0, &pkt, actlen);
 
 		wLen = UGETW(pkt.wLength);
 		if (wLen < 2) {
 			DPRINTF("message too short (expected 2 data bytes, received %d)\n", wLen);
 			goto tr_setup;
 		}
 
 		if (pkt.bmRequestType == UCDC_NOTIFICATION
 		    && pkt.bNotification == UCDC_N_SERIAL_STATE) {
 			/*
 		         * Set the serial state in ucom driver based on
 		         * the bits from the notify message
 		         */
 			DPRINTF("notify bytes = 0x%02x, 0x%02x\n",
 			    pkt.data[0], pkt.data[1]);
 
 			/* currently, lsr is always zero. */
 			sc->sc_lsr[ucom->sc_subunit] = 0;
 			sc->sc_msr[ucom->sc_subunit] = 0;
 
 			mstatus = pkt.data[0];
 
 			if (mstatus & UCDC_N_SERIAL_RI)
 				sc->sc_msr[ucom->sc_subunit] |= SER_RI;
 			if (mstatus & UCDC_N_SERIAL_DSR)
 				sc->sc_msr[ucom->sc_subunit] |= SER_DSR;
 			if (mstatus & UCDC_N_SERIAL_DCD)
 				sc->sc_msr[ucom->sc_subunit] |= SER_DCD;
 			ucom_status_change(ucom);
 		}
 
 	case USB_ST_SETUP:
 tr_setup:
 		usbd_xfer_set_frame_len(xfer, 0, usbd_xfer_max_len(xfer));
 		usbd_transfer_submit(xfer);
 		return;
 
 	default:			/* Error */
 		if (error != USB_ERR_CANCELLED) {
 			/* try to clear stall first */
 			usbd_xfer_set_stall(xfer);
 			goto tr_setup;
 		}
 		return;
 	}
 }
 
 static void
 u3g_poll(struct ucom_softc *ucom)
 {
 	struct u3g_softc *sc = ucom->sc_parent;
 	usbd_transfer_poll(sc->sc_xfer[ucom->sc_subunit], U3G_N_TRANSFER);
 }
Index: user/dchagin/lemul/sys/dev/usb/usbdevs
===================================================================
--- user/dchagin/lemul/sys/dev/usb/usbdevs	(revision 274240)
+++ user/dchagin/lemul/sys/dev/usb/usbdevs	(revision 274241)
@@ -1,4587 +1,4589 @@
 $FreeBSD$
 /* $NetBSD: usbdevs,v 1.392 2004/12/29 08:38:44 imp Exp $ */
 
 /*-
  * Copyright (c) 1998-2004 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Lennart Augustsson (lennart@augustsson.net) at
  * Carlstedt Research & Technology.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * List of known USB vendors
  *
  * USB.org publishes a VID list of USB-IF member companies at
  * http://www.usb.org/developers/tools
  * Note that it does not show companies that have obtained a Vendor ID
  * without becoming full members.
  *
  * Please note that these IDs do not do anything. Adding an ID here and
  * regenerating the usbdevs.h and usbdevs_data.h only makes a symbolic name
  * available to the source code and does not change any functionality, nor
  * does it make your device available to a specific driver.
  * It will however make the descriptive string available if a device does not
  * provide the string itself.
  *
  * After adding a vendor ID VNDR and a product ID PRDCT you will have the
  * following extra defines:
  * #define USB_VENDOR_VNDR              0x????
  * #define USB_PRODUCT_VNDR_PRDCT       0x????
  *
  * You may have to add these defines to the respective probe routines to
  * make the device recognised by the appropriate device driver.
  */
 
 vendor UNKNOWN1		0x0053	Unknown vendor
 vendor UNKNOWN2		0x0105	Unknown vendor
 vendor EGALAX2		0x0123	eGalax, Inc.
 vendor CHIPSBANK	0x0204	Chipsbank Microelectronics Co.
 vendor HUMAX		0x02ad	HUMAX
 vendor LTS		0x0386	LTS
 vendor BWCT		0x03da	Bernd Walter Computer Technology
 vendor AOX		0x03e8	AOX
 vendor THESYS		0x03e9	Thesys
 vendor DATABROADCAST	0x03ea	Data Broadcasting
 vendor ATMEL		0x03eb	Atmel
 vendor IWATSU		0x03ec	Iwatsu America
 vendor MITSUMI		0x03ee	Mitsumi
 vendor HP		0x03f0	Hewlett Packard
 vendor GENOA		0x03f1	Genoa
 vendor OAK		0x03f2	Oak
 vendor ADAPTEC		0x03f3	Adaptec
 vendor DIEBOLD		0x03f4	Diebold
 vendor SIEMENSELECTRO	0x03f5	Siemens Electromechanical
 vendor EPSONIMAGING	0x03f8	Epson Imaging
 vendor KEYTRONIC	0x03f9	KeyTronic
 vendor OPTI		0x03fb	OPTi
 vendor ELITEGROUP	0x03fc	Elitegroup
 vendor XILINX		0x03fd	Xilinx
 vendor FARALLON		0x03fe	Farallon Communications
 vendor NATIONAL		0x0400	National Semiconductor
 vendor NATIONALREG	0x0401	National Registry
 vendor ACERLABS		0x0402	Acer Labs
 vendor FTDI		0x0403	Future Technology Devices
 vendor NCR		0x0404	NCR
 vendor SYNOPSYS2	0x0405	Synopsys
 vendor FUJITSUICL	0x0406	Fujitsu-ICL
 vendor FUJITSU2		0x0407	Fujitsu Personal Systems
 vendor QUANTA		0x0408	Quanta
 vendor NEC		0x0409	NEC
 vendor KODAK		0x040a	Eastman Kodak
 vendor WELTREND		0x040b	Weltrend
 vendor VIA		0x040d	VIA
 vendor MCCI		0x040e	MCCI
 vendor MELCO		0x0411	Melco
 vendor LEADTEK		0x0413	Leadtek
 vendor WINBOND		0x0416	Winbond
 vendor PHOENIX		0x041a	Phoenix
 vendor CREATIVE		0x041e	Creative Labs
 vendor NOKIA		0x0421	Nokia
 vendor ADI		0x0422	ADI Systems
 vendor CATC		0x0423	Computer Access Technology
 vendor SMC2		0x0424	Standard Microsystems
 vendor MOTOROLA_HK	0x0425	Motorola HK
 vendor GRAVIS		0x0428	Advanced Gravis Computer
 vendor CIRRUSLOGIC	0x0429	Cirrus Logic
 vendor INNOVATIVE	0x042c	Innovative Semiconductors
 vendor MOLEX		0x042f	Molex
 vendor SUN		0x0430	Sun Microsystems
 vendor UNISYS		0x0432	Unisys
 vendor TAUGA		0x0436	Taugagreining HF
 vendor AMD		0x0438	Advanced Micro Devices
 vendor LEXMARK		0x043d	Lexmark International
 vendor LG		0x043e	LG Electronics
 vendor NANAO		0x0440	NANAO
 vendor GATEWAY		0x0443	Gateway 2000
 vendor NMB		0x0446	NMB
 vendor ALPS		0x044e	Alps Electric
 vendor THRUST		0x044f	Thrustmaster
 vendor TI		0x0451	Texas Instruments
 vendor ANALOGDEVICES	0x0456	Analog Devices
 vendor SIS		0x0457	Silicon Integrated Systems Corp.
 vendor KYE		0x0458	KYE Systems
 vendor DIAMOND2		0x045a	Diamond (Supra)
 vendor RENESAS		0x045b	Renesas
 vendor MICROSOFT	0x045e	Microsoft
 vendor PRIMAX		0x0461	Primax Electronics
 vendor MGE		0x0463	MGE UPS Systems
 vendor AMP		0x0464	AMP
 vendor CHERRY		0x046a	Cherry Mikroschalter
 vendor MEGATRENDS	0x046b	American Megatrends
 vendor LOGITECH		0x046d	Logitech
 vendor BTC		0x046e	Behavior Tech. Computer
 vendor PHILIPS		0x0471	Philips
 vendor SUN2		0x0472	Sun Microsystems (offical)
 vendor SANYO		0x0474	Sanyo Electric
 vendor SEAGATE		0x0477	Seagate
 vendor CONNECTIX	0x0478	Connectix
 vendor SEMTECH		0x047a	Semtech
 vendor KENSINGTON	0x047d	Kensington
 vendor LUCENT		0x047e	Lucent
 vendor PLANTRONICS	0x047f	Plantronics
 vendor KYOCERA		0x0482	Kyocera Wireless Corp.
 vendor STMICRO		0x0483	STMicroelectronics
 vendor FOXCONN		0x0489	Foxconn
 vendor MEIZU		0x0492	Meizu Electronics
 vendor YAMAHA		0x0499	YAMAHA
 vendor COMPAQ		0x049f	Compaq
 vendor HITACHI		0x04a4	Hitachi
 vendor ACERP		0x04a5	Acer Peripherals
 vendor DAVICOM		0x04a6	Davicom
 vendor VISIONEER	0x04a7	Visioneer
 vendor CANON		0x04a9	Canon
 vendor NIKON		0x04b0	Nikon
 vendor PAN		0x04b1	Pan International
 vendor IBM		0x04b3	IBM
 vendor CYPRESS		0x04b4	Cypress Semiconductor
 vendor ROHM		0x04b5	ROHM
 vendor COMPAL		0x04b7	Compal
 vendor EPSON		0x04b8	Seiko Epson
 vendor RAINBOW		0x04b9	Rainbow Technologies
 vendor IODATA		0x04bb	I-O Data
 vendor TDK		0x04bf	TDK
 vendor 3COMUSR		0x04c1	U.S. Robotics
 vendor METHODE		0x04c2	Methode Electronics Far East
 vendor MAXISWITCH	0x04c3	Maxi Switch
 vendor LOCKHEEDMER	0x04c4	Lockheed Martin Energy Research
 vendor FUJITSU		0x04c5	Fujitsu
 vendor TOSHIBAAM	0x04c6	Toshiba America
 vendor MICROMACRO	0x04c7	Micro Macro Technologies
 vendor KONICA		0x04c8	Konica
 vendor LITEON		0x04ca	Lite-On Technology
 vendor FUJIPHOTO	0x04cb	Fuji Photo Film
 vendor PHILIPSSEMI	0x04cc	Philips Semiconductors
 vendor TATUNG		0x04cd	Tatung Co. Of America
 vendor SCANLOGIC	0x04ce	ScanLogic
 vendor MYSON		0x04cf	Myson Technology
 vendor DIGI2		0x04d0	Digi
 vendor ITTCANON		0x04d1	ITT Canon
 vendor ALTEC		0x04d2	Altec Lansing
 vendor LSI		0x04d4	LSI
 vendor MENTORGRAPHICS	0x04d6	Mentor Graphics
 vendor ITUNERNET	0x04d8	I-Tuner Networks
 vendor HOLTEK		0x04d9	Holtek Semiconductor, Inc.
 vendor PANASONIC	0x04da	Panasonic (Matsushita)
 vendor HUANHSIN		0x04dc	Huan Hsin
 vendor SHARP		0x04dd	Sharp
 vendor IIYAMA		0x04e1	Iiyama
 vendor SHUTTLE		0x04e6	Shuttle Technology
 vendor ELO		0x04e7	Elo TouchSystems
 vendor SAMSUNG		0x04e8	Samsung Electronics
 vendor NORTHSTAR	0x04eb	Northstar
 vendor TOKYOELECTRON	0x04ec	Tokyo Electron
 vendor ANNABOOKS	0x04ed	Annabooks
 vendor JVC		0x04f1	JVC
 vendor CHICONY		0x04f2	Chicony Electronics
 vendor ELAN		0x04f3	Elan
 vendor NEWNEX		0x04f7	Newnex
 vendor BROTHER		0x04f9	Brother Industries
 vendor DALLAS		0x04fa	Dallas Semiconductor
 vendor AIPTEK2		0x04fc	AIPTEK International
 vendor PFU		0x04fe	PFU
 vendor FUJIKURA		0x0501	Fujikura/DDK
 vendor ACER		0x0502	Acer
 vendor 3COM		0x0506	3Com
 vendor HOSIDEN		0x0507	Hosiden Corporation
 vendor AZTECH		0x0509	Aztech Systems
 vendor BELKIN		0x050d	Belkin Components
 vendor KAWATSU		0x050f	Kawatsu Semiconductor
 vendor FCI		0x0514	FCI
 vendor LONGWELL		0x0516	Longwell
 vendor COMPOSITE	0x0518	Composite
 vendor STAR		0x0519	Star Micronics
 vendor APC		0x051d	American Power Conversion
 vendor SCIATLANTA	0x051e	Scientific Atlanta
 vendor TSM		0x0520	TSM
 vendor CONNECTEK	0x0522	Advanced Connectek USA
 vendor NETCHIP		0x0525	NetChip Technology
 vendor ALTRA		0x0527	ALTRA
 vendor ATI		0x0528	ATI Technologies
 vendor AKS		0x0529	Aladdin Knowledge Systems
 vendor TEKOM		0x052b	Tekom
 vendor CANONDEV		0x052c	Canon
 vendor WACOMTECH	0x0531	Wacom
 vendor INVENTEC		0x0537	Inventec
 vendor SHYHSHIUN	0x0539	Shyh Shiun Terminals
 vendor PREHWERKE	0x053a	Preh Werke Gmbh & Co. KG
 vendor SYNOPSYS		0x053f	Synopsys
 vendor UNIACCESS	0x0540	Universal Access
 vendor VIEWSONIC	0x0543	ViewSonic
 vendor XIRLINK		0x0545	Xirlink
 vendor ANCHOR		0x0547	Anchor Chips
 vendor SONY		0x054c	Sony
 vendor FUJIXEROX	0x0550	Fuji Xerox
 vendor VISION		0x0553	VLSI Vision
 vendor ASAHIKASEI	0x0556	Asahi Kasei Microsystems
 vendor ATEN		0x0557	ATEN International
 vendor SAMSUNG2		0x055d	Samsung Electronics
 vendor MUSTEK		0x055f	Mustek Systems
 vendor TELEX		0x0562	Telex Communications
 vendor CHINON		0x0564	Chinon
 vendor PERACOM		0x0565	Peracom Networks
 vendor ALCOR2		0x0566	Alcor Micro
 vendor XYRATEX		0x0567	Xyratex
 vendor WACOM		0x056a	WACOM
 vendor ETEK		0x056c	e-TEK Labs
 vendor EIZO		0x056d	EIZO
 vendor ELECOM		0x056e	Elecom
 vendor CONEXANT		0x0572	Conexant
 vendor HAUPPAUGE	0x0573	Hauppauge Computer Works
 vendor BAFO		0x0576	BAFO/Quality Computer Accessories
 vendor YEDATA		0x057b	Y-E Data
 vendor AVM		0x057c	AVM
 vendor QUICKSHOT	0x057f	Quickshot
 vendor ROLAND		0x0582	Roland
 vendor ROCKFIRE		0x0583	Rockfire
 vendor RATOC		0x0584	RATOC Systems
 vendor ZYXEL		0x0586	ZyXEL Communication
 vendor INFINEON		0x058b	Infineon
 vendor MICREL		0x058d	Micrel
 vendor ALCOR		0x058f	Alcor Micro
 vendor OMRON		0x0590	OMRON
 vendor ZORAN		0x0595	Zoran Microelectronics
 vendor NIIGATA		0x0598	Niigata
 vendor IOMEGA		0x059b	Iomega
 vendor ATREND		0x059c	A-Trend Technology
 vendor AID		0x059d	Advanced Input Devices
 vendor LACIE		0x059f	LaCie
 vendor FUJIFILM		0x05a2	Fuji Film
 vendor ARC		0x05a3	ARC
 vendor ORTEK		0x05a4	Ortek
 vendor CISCOLINKSYS3	0x05a6	Cisco-Linksys
 vendor BOSE		0x05a7	Bose
 vendor OMNIVISION	0x05a9	OmniVision
 vendor INSYSTEM		0x05ab	In-System Design
 vendor APPLE		0x05ac	Apple Computer
 vendor YCCABLE		0x05ad	Y.C. Cable
 vendor DIGITALPERSONA	0x05ba	DigitalPersona
 vendor 3G		0x05bc	3G Green Green Globe
 vendor RAFI		0x05bd	RAFI
 vendor TYCO		0x05be	Tyco
 vendor KAWASAKI		0x05c1	Kawasaki
 vendor DIGI		0x05c5	Digi International
 vendor QUALCOMM2	0x05c6	Qualcomm
 vendor QTRONIX		0x05c7	Qtronix
 vendor FOXLINK		0x05c8	Foxlink
 vendor RICOH		0x05ca	Ricoh
 vendor ELSA		0x05cc	ELSA
 vendor SCIWORX		0x05ce	sci-worx
 vendor BRAINBOXES	0x05d1	Brainboxes Limited
 vendor ULTIMA		0x05d8	Ultima
 vendor AXIOHM		0x05d9	Axiohm Transaction Solutions
 vendor MICROTEK		0x05da	Microtek
 vendor SUNTAC		0x05db	SUN Corporation
 vendor LEXAR		0x05dc	Lexar Media
 vendor ADDTRON		0x05dd	Addtron
 vendor SYMBOL		0x05e0	Symbol Technologies
 vendor SYNTEK		0x05e1	Syntek
 vendor GENESYS		0x05e3	Genesys Logic
 vendor FUJI		0x05e5	Fuji Electric
 vendor KEITHLEY		0x05e6	Keithley Instruments
 vendor EIZONANAO	0x05e7	EIZO Nanao
 vendor KLSI		0x05e9	Kawasaki LSI
 vendor FFC		0x05eb	FFC
 vendor ANKO		0x05ef	Anko Electronic
 vendor PIENGINEERING	0x05f3	P.I. Engineering
 vendor AOC		0x05f6	AOC International
 vendor CHIC		0x05fe	Chic Technology
 vendor BARCO		0x0600	Barco Display Systems
 vendor BRIDGE		0x0607	Bridge Information
 vendor SOLIDYEAR	0x060b	Solid Year
 vendor BIORAD		0x0614	Bio-Rad Laboratories
 vendor MACALLY		0x0618	Macally
 vendor ACTLABS		0x061c	Act Labs
 vendor ALARIS		0x0620	Alaris
 vendor APEX		0x0624	Apex
 vendor CREATIVE3	0x062a	Creative Labs
 vendor MICRON		0x0634	Micron Technology
 vendor VIVITAR		0x0636	Vivitar
 vendor GUNZE		0x0637	Gunze Electronics USA
 vendor AVISION		0x0638	Avision
 vendor TEAC		0x0644	TEAC
 vendor ACTON		0x0647	Acton Research Corp.
 vendor OPTO		0x065a	Optoelectronics Co., Ltd
 vendor SGI		0x065e	Silicon Graphics
 vendor SANWASUPPLY	0x0663	Sanwa Supply
 vendor MEGATEC		0x0665	Megatec
 vendor LINKSYS		0x066b	Linksys
 vendor ACERSA		0x066e	Acer Semiconductor America
 vendor SIGMATEL		0x066f	Sigmatel
 vendor DRAYTEK		0x0675	DrayTek
 vendor AIWA		0x0677	Aiwa
 vendor ACARD		0x0678	ACARD Technology
 vendor PROLIFIC		0x067b	Prolific Technology
 vendor SIEMENS		0x067c	Siemens
 vendor AVANCELOGIC	0x0680	Avance Logic
 vendor SIEMENS2		0x0681	Siemens
 vendor MINOLTA		0x0686	Minolta
 vendor CHPRODUCTS	0x068e	CH Products
 vendor HAGIWARA		0x0693	Hagiwara Sys-Com
 vendor CTX		0x0698	Chuntex
 vendor ASKEY		0x069a	Askey Computer
 vendor SAITEK		0x06a3	Saitek
 vendor ALCATELT		0x06b9	Alcatel Telecom
 vendor AGFA		0x06bd	AGFA-Gevaert
 vendor ASIAMD		0x06be	Asia Microelectronic Development
 vendor BIZLINK		0x06c4	Bizlink International
 vendor KEYSPAN		0x06cd	Keyspan / InnoSys Inc.
 vendor CONTEC		0x06ce	Contec products
 vendor AASHIMA		0x06d6	Aashima Technology
 vendor LIEBERT		0x06da	Liebert
 vendor MULTITECH	0x06e0	MultiTech
 vendor ADS		0x06e1	ADS Technologies
 vendor ALCATELM		0x06e4	Alcatel Microelectronics
 vendor SIRIUS		0x06ea	Sirius Technologies
 vendor GUILLEMOT	0x06f8	Guillemot
 vendor BOSTON		0x06fd	Boston Acoustics
 vendor SMC		0x0707	Standard Microsystems
 vendor PUTERCOM		0x0708	Putercom
 vendor MCT		0x0711	MCT
 vendor IMATION		0x0718	Imation
 vendor TECLAST		0x071b	Teclast
 vendor SONYERICSSON	0x0731	Sony Ericsson
 vendor EICON		0x0734	Eicon Networks
 vendor SYNTECH		0x0745	Syntech Information
 vendor DIGITALSTREAM	0x074e	Digital Stream
 vendor AUREAL		0x0755	Aureal Semiconductor
 vendor MAUDIO		0x0763	M-Audio
 vendor CYBERPOWER	0x0764	Cyber Power Systems, Inc.
 vendor SURECOM		0x0769	Surecom Technology
 vendor HIDGLOBAL	0x076b	HID Global
 vendor LINKSYS2		0x077b	Linksys
 vendor GRIFFIN		0x077d	Griffin Technology
 vendor SANDISK		0x0781	SanDisk
 vendor JENOPTIK		0x0784	Jenoptik
 vendor LOGITEC		0x0789	Logitec
 vendor NOKIA2		0x078b	Nokia
 vendor BRIMAX		0x078e	Brimax
 vendor AXIS		0x0792	Axis Communications
 vendor ABL		0x0794	ABL Electronics
 vendor SAGEM		0x079b	Sagem
 vendor SUNCOMM		0x079c	Sun Communications, Inc.
 vendor ALFADATA		0x079d	Alfadata Computer
 vendor NATIONALTECH	0x07a2	National Technical Systems
 vendor ONNTO		0x07a3	Onnto
 vendor BE		0x07a4	Be
 vendor ADMTEK		0x07a6	ADMtek
 vendor COREGA		0x07aa	Corega
 vendor FREECOM		0x07ab	Freecom
 vendor MICROTECH	0x07af	Microtech
 vendor GENERALINSTMNTS	0x07b2	General Instruments (Motorola)
 vendor OLYMPUS		0x07b4	Olympus
 vendor ABOCOM		0x07b8	AboCom Systems
 vendor KEISOKUGIKEN	0x07c1	Keisokugiken
 vendor ONSPEC		0x07c4	OnSpec
 vendor APG		0x07c5	APG Cash Drawer
 vendor BUG		0x07c8	B.U.G.
 vendor ALLIEDTELESYN	0x07c9	Allied Telesyn International
 vendor AVERMEDIA	0x07ca	AVerMedia Technologies
 vendor SIIG		0x07cc	SIIG
 vendor CASIO		0x07cf	CASIO
 vendor DLINK2		0x07d1	D-Link
 vendor APTIO		0x07d2	Aptio Products
 vendor ARASAN		0x07da	Arasan Chip Systems
 vendor ALLIEDCABLE	0x07e6	Allied Cable
 vendor STSN		0x07ef	STSN
 vendor CENTURY		0x07f7	Century Corp
 vendor NEWLINK		0x07ff	NEWlink
 vendor MAGTEK		0x0801	Mag-Tek
 vendor ZOOM		0x0803	Zoom Telephonics
 vendor PCS		0x0810	Personal Communication Systems
 vendor ALPHASMART	0x081e	AlphaSmart, Inc.
 vendor BROADLOGIC	0x0827	BroadLogic
 vendor HANDSPRING	0x082d	Handspring
 vendor PALM		0x0830	Palm Computing
 vendor SOURCENEXT	0x0833	SOURCENEXT
 vendor ACTIONSTAR	0x0835	Action Star Enterprise
 vendor SAMSUNG_TECHWIN	0x0839	Samsung Techwin
 vendor ACCTON		0x083a	Accton Technology
 vendor DIAMOND		0x0841	Diamond
 vendor NETGEAR		0x0846	BayNETGEAR
 vendor TOPRE		0x0853	Topre Corporation
 vendor ACTIVEWIRE	0x0854	ActiveWire
 vendor BBELECTRONICS	0x0856	B&B Electronics
 vendor PORTGEAR		0x085a	PortGear
 vendor NETGEAR2		0x0864	Netgear
 vendor SYSTEMTALKS	0x086e	System Talks
 vendor METRICOM		0x0870	Metricom
 vendor ADESSOKBTEK	0x087c	ADESSO/Kbtek America
 vendor JATON		0x087d	Jaton
 vendor APT		0x0880	APT Technologies
 vendor BOCARESEARCH	0x0885	Boca Research
 vendor ANDREA		0x08a8	Andrea Electronics
 vendor BURRBROWN	0x08bb	Burr-Brown Japan
 vendor 2WIRE		0x08c8	2Wire
 vendor AIPTEK		0x08ca	AIPTEK International
 vendor SMARTBRIDGES	0x08d1	SmartBridges
 vendor FUJITSUSIEMENS	0x08d4	Fujitsu-Siemens
 vendor BILLIONTON	0x08dd	Billionton Systems
 vendor GEMALTO		0x08e6	Gemalto SA
 vendor EXTENDED		0x08e9	Extended Systems
 vendor MSYSTEMS		0x08ec	M-Systems
 vendor DIGIANSWER	0x08fd	Digianswer
 vendor AUTHENTEC	0x08ff	AuthenTec
 vendor AUDIOTECHNICA	0x0909	Audio-Technica
 vendor TRUMPION		0x090a	Trumpion Microelectronics
 vendor FEIYA		0x090c	Feiya
 vendor ALATION		0x0910	Alation Systems
 vendor GLOBESPAN	0x0915	Globespan
 vendor CONCORDCAMERA	0x0919	Concord Camera
 vendor GARMIN		0x091e	Garmin International
 vendor GOHUBS		0x0921	GoHubs
 vendor XEROX		0x0924	Xerox
 vendor BIOMETRIC	0x0929	American Biometric Company
 vendor TOSHIBA		0x0930	Toshiba
 vendor PLEXTOR		0x093b	Plextor
 vendor INTREPIDCS	0x093c	Intrepid
 vendor YANO		0x094f	Yano
 vendor KINGSTON		0x0951	Kingston Technology
 vendor BLUEWATER	0x0956	BlueWater Systems
 vendor AGILENT		0x0957	Agilent Technologies
 vendor GUDE		0x0959	Gude ADS
 vendor PORTSMITH	0x095a	Portsmith
 vendor ACERW		0x0967	Acer
 vendor ADIRONDACK	0x0976	Adirondack Wire & Cable
 vendor BECKHOFF		0x0978	Beckhoff
 vendor MINDSATWORK	0x097a	Minds At Work
 vendor POINTCHIPS	0x09a6	PointChips
 vendor INTERSIL		0x09aa	Intersil
 vendor ALTIUS		0x09b3	Altius Solutions
 vendor ARRIS		0x09c1	Arris Interactive
 vendor ACTIVCARD	0x09c3	ACTIVCARD
 vendor ACTISYS		0x09c4	ACTiSYS
 vendor NOVATEL2		0x09d7	Novatel Wireless
 vendor AFOURTECH	0x09da	A-FOUR TECH
 vendor AIMEX		0x09dc	AIMEX
 vendor ADDONICS		0x09df	Addonics Technologies
 vendor AKAI		0x09e8	AKAI professional M.I.
 vendor ARESCOM		0x09f5	ARESCOM
 vendor BAY		0x09f9	Bay Associates
 vendor ALTERA		0x09fb	Altera
 vendor CSR		0x0a12	Cambridge Silicon Radio
 vendor TREK		0x0a16	Trek Technology
 vendor ASAHIOPTICAL	0x0a17	Asahi Optical
 vendor BOCASYSTEMS	0x0a43	Boca Systems
 vendor SHANTOU		0x0a46	ShanTou
 vendor MEDIAGEAR	0x0a48	MediaGear
 vendor BROADCOM		0x0a5c	Broadcom
 vendor GREENHOUSE	0x0a6b	GREENHOUSE
 vendor MEDELI		0x0a67	Medeli
 vendor GEOCAST		0x0a79	Geocast Network Systems
 vendor EGO		0x0a92	EGO systems
 vendor IDQUANTIQUE	0x0aba	ID Quantique
 vendor IDTECH		0x0acd	ID TECH
 vendor ZYDAS		0x0ace	Zydas Technology Corporation
 vendor NEODIO		0x0aec	Neodio
 vendor OPTION		0x0af0	Option N.V.
 vendor ASUS		0x0b05	ASUSTeK Computer
 vendor TODOS		0x0b0c	Todos Data System
 vendor SIIG2		0x0b39	SIIG
 vendor TEKRAM		0x0b3b	Tekram Technology
 vendor HAL		0x0b41	HAL Corporation
 vendor EMS		0x0b43	EMS Production
 vendor NEC2		0x0b62	NEC
 vendor ADLINK		0x0b63	ADLINK Technoligy, Inc.
 vendor ATI2		0x0b6f	ATI
 vendor ZEEVO		0x0b7a	Zeevo, Inc.
 vendor KURUSUGAWA	0x0b7e	Kurusugawa Electronics, Inc.
 vendor SMART		0x0b8c	Smart Technologies
 vendor ASIX		0x0b95	ASIX Electronics
 vendor O2MICRO		0x0b97	O2 Micro, Inc.
 vendor USR		0x0baf	U.S. Robotics
 vendor AMBIT		0x0bb2	Ambit Microsystems
 vendor HTC		0x0bb4	HTC
 vendor REALTEK		0x0bda	Realtek
 vendor ERICSSON2	0x0bdb	Ericsson
 vendor MEI		0x0bed	MEI
 vendor ADDONICS2	0x0bf6	Addonics Technology
 vendor FSC		0x0bf8	Fujitsu Siemens Computers
 vendor AGATE		0x0c08	Agate Technologies
 vendor DMI		0x0c0b	DMI
 vendor CANYON		0x0c10	Canyon
 vendor ICOM		0x0c26	Icom Inc.
 vendor GNOTOMETRICS	0x0c33	GN Otometrics
 vendor CHICONY2		0x0c45	Chicony
 vendor REINERSCT	0x0c4b	Reiner-SCT
 vendor SEALEVEL		0x0c52	Sealevel System
 vendor JETI		0x0c6c	Jeti
 vendor LUWEN		0x0c76	Luwen
 vendor ELEKTOR		0x0c7d	ELEKTOR Electronics
 vendor KYOCERA2		0x0c88	Kyocera Wireless Corp.
 vendor ZCOM		0x0cde	Z-Com
 vendor ATHEROS2		0x0cf3	Atheros Communications
 vendor POSIFLEX		0x0d3a	POSIFLEX
 vendor TANGTOP		0x0d3d	Tangtop
 vendor KOBIL		0x0d46	KOBIL
 vendor SMC3		0x0d5c	Standard Microsystems
 vendor ADDON		0x0d7d	Add-on Technology
 vendor ACDC		0x0d7e	American Computer & Digital Components
 vendor CMEDIA		0x0d8c	CMEDIA
 vendor CONCEPTRONIC	0x0d8e	Conceptronic
 vendor SKANHEX		0x0d96	Skanhex Technology, Inc.
 vendor MSI		0x0db0	Micro Star International
 vendor ELCON		0x0db7	ELCON Systemtechnik
 vendor UNKNOWN4		0x0dcd	Unknown vendor
 vendor NETAC		0x0dd8	Netac
 vendor SITECOMEU	0x0df6	Sitecom Europe
 vendor MOBILEACTION	0x0df7	Mobile Action
 vendor AMIGO		0x0e0b	Amigo Technology
 vendor SPEEDDRAGON	0x0e55	Speed Dragon Multimedia
 vendor HAWKING		0x0e66	Hawking
 vendor FOSSIL		0x0e67	Fossil, Inc
 vendor GMATE		0x0e7e	G.Mate, Inc
 vendor MEDIATEK		0x0e8d	MediaTek, Inc.
 vendor OTI		0x0ea0	Ours Technology
 vendor YISO		0x0eab	Yiso Wireless Co.
 vendor PILOTECH		0x0eaf	Pilotech
 vendor NOVATECH		0x0eb0	NovaTech
 vendor ITEGNO		0x0eba	iTegno
 vendor WINMAXGROUP	0x0ed1	WinMaxGroup
 vendor TOD		0x0ede	TOD
 vendor EGALAX		0x0eef	eGalax, Inc.
 vendor AIRPRIME		0x0f3d	AirPrime, Inc.
 vendor MICROTUNE	0x0f4d	Microtune
 vendor VTECH		0x0f88	VTech
 vendor FALCOM		0x0f94	Falcom Wireless Communications GmbH
 vendor RIM		0x0fca	Research In Motion
 vendor DYNASTREAM	0x0fcf	Dynastream Innovations
 vendor LARSENBRUSGAARD	0x0fd8	Larsen and Brusgaard
 vendor OWL		0x0fde	OWL
 vendor KONTRON		0x0fe6	Kontron AG
 vendor QUALCOMM		0x1004	Qualcomm
 vendor APACER		0x1005	Apacer
 vendor MOTOROLA4	0x100d	Motorola
 vendor HP3		0x103c	Hewlett Packard
 vendor AIRPLUS		0x1011  Airplus
 vendor DESKNOTE		0x1019	Desknote
 vendor NEC3		0x1033	NEC
 vendor TTI		0x103e	Thurlby Thandar Instruments
 vendor GIGABYTE		0x1044	GIGABYTE
 vendor WESTERN		0x1058	Western Digital
 vendor MOTOROLA		0x1063	Motorola
 vendor CCYU		0x1065	CCYU Technology
 vendor CURITEL		0x106c	Curitel Communications Inc
 vendor SILABS2		0x10a6	SILABS2
 vendor USI		0x10ab	USI
 vendor LIEBERT2		0x10af	Liebert
 vendor PLX		0x10b5	PLX
 vendor ASANTE		0x10bd	Asante
 vendor SILABS		0x10c4	Silicon Labs
 vendor SILABS3		0x10c5	Silicon Labs
 vendor SILABS4		0x10ce	Silicon Labs
 vendor ACTIONS		0x10d6  Actions
 vendor ANALOG		0x1110	Analog Devices
 vendor TENX		0x1130	Ten X Technology, Inc.
 vendor ISSC		0x1131	Integrated System Solution Corp.
 vendor JRC		0x1145	Japan Radio Company
 vendor SPHAIRON		0x114b	Sphairon Access Systems GmbH
 vendor DELORME		0x1163	DeLorme
 vendor SERVERWORKS	0x1166	ServerWorks
 vendor DLINK3		0x1186	Dlink
 vendor ACERCM		0x1189	Acer Communications & Multimedia
 vendor SIERRA		0x1199	Sierra Wireless
 vendor SANWA		0x11ad	Sanwa Electric Instrument Co., Ltd.
 vendor TOPFIELD		0x11db	Topfield Co., Ltd
 vendor SIEMENS3		0x11f5	Siemens
 vendor NETINDEX		0x11f6	NetIndex
 vendor ALCATEL		0x11f7	Alcatel
 vendor INTERBIOMETRICS	0x1209	Interbiometrics
 vendor UNKNOWN3		0x1233	Unknown vendor
 vendor TSUNAMI		0x1241	Tsunami
 vendor PHEENET		0x124a	Pheenet
 vendor TARGUS		0x1267	Targus
 vendor TWINMOS		0x126f	TwinMOS
 vendor TENDA		0x1286	Tenda
 vendor TESTO		0x128d	Testo products
 vendor CREATIVE2	0x1292	Creative Labs
 vendor BELKIN2		0x1293	Belkin Components
 vendor CYBERTAN		0x129b	CyberTAN Technology
 vendor HUAWEI		0x12d1	Huawei Technologies
 vendor ARANEUS		0x12d8	Araneus Information Systems
 vendor TAPWAVE		0x12ef	Tapwave
 vendor AINCOMM		0x12fd	Aincomm
 vendor MOBILITY		0x1342	Mobility
 vendor DICKSMITH	0x1371	Dick Smith Electronics
 vendor NETGEAR3		0x1385	Netgear
 vendor BALTECH		0x13ad	Baltech
 vendor CISCOLINKSYS	0x13b1	Cisco-Linksys
 vendor SHARK		0x13d2	Shark
 vendor AZUREWAVE	0x13d3	AsureWave
 vendor INITIO		0x13fd	Initio Corporation
 vendor EMTEC		0x13fe	Emtec
 vendor NOVATEL		0x1410	Novatel Wireless
 vendor MERLIN		0x1416	Merlin
 vendor REDOCTANE	0x1430	RedOctane
 vendor WISTRONNEWEB	0x1435	Wistron NeWeb
 vendor RADIOSHACK	0x1453	Radio Shack
 vendor FIC		0x1457	FIC / OpenMoko
 vendor HUAWEI3COM	0x1472	Huawei-3Com
 vendor ABOCOM2		0x1482	AboCom Systems
 vendor SILICOM		0x1485	Silicom
 vendor RALINK		0x148f	Ralink Technology
 vendor IMAGINATION	0x149a	Imagination Technologies
 vendor ATP		0x14af	ATP Electronics
 vendor CONCEPTRONIC2	0x14b2	Conceptronic
 vendor SUPERTOP		0x14cd	Super Top
 vendor PLANEX3		0x14ea	Planex Communications
 vendor SILICONPORTALS	0x1527	Silicon Portals
 vendor UBIQUAM		0x1529	UBIQUAM Co., Ltd.
 vendor JMICRON		0x152d	JMicron
 vendor UBLOX		0x1546	U-blox
 vendor PNY		0x154b	PNY
 vendor OWEN		0x1555	Owen
 vendor OQO		0x1557	OQO
 vendor UMEDIA		0x157e	U-MEDIA Communications
 vendor FIBERLINE	0x1582	Fiberline
 vendor FREESCALE	0x15a2  Freescale Semiconductor, Inc.
 vendor AFATECH		0x15a4	Afatech Technologies, Inc.
 vendor SPARKLAN		0x15a9	SparkLAN
 vendor OLIMEX		0x15ba	Olimex
 vendor SOUNDGRAPH 	0x15c2	Soundgraph, Inc.
 vendor AMIT2		0x15c5	AMIT
 vendor TEXTECH		0x15ca	Textech International Ltd.
 vendor SOHOWARE		0x15e8	SOHOware
 vendor UMAX		0x1606	UMAX Data Systems
 vendor INSIDEOUT	0x1608	Inside Out Networks
 vendor AMOI		0x1614	Amoi Electronics
 vendor GOODWAY		0x1631	Good Way Technology
 vendor ENTREGA		0x1645	Entrega
 vendor ACTIONTEC	0x1668	Actiontec Electronics
 vendor CLIPSAL		0x166a	Clipsal
 vendor CISCOLINKSYS2	0x167b  Cisco-Linksys
 vendor ATHEROS		0x168c	Atheros Communications
 vendor GIGASET		0x1690	Gigaset
 vendor GLOBALSUN	0x16ab	Global Sun Technology
 vendor ANYDATA		0x16d5	AnyDATA Corporation
 vendor JABLOTRON	0x16d6	Jablotron
 vendor CMOTECH		0x16d8	C-motech
 vendor WIENERPLEINBAUS	0x16dc	WIENER Plein & Baus GmbH.
 vendor AXESSTEL		0x1726  Axesstel Co., Ltd.
 vendor LINKSYS4		0x1737	Linksys
 vendor SENAO		0x1740	Senao
 vendor ASUS2		0x1761	ASUS
 vendor SWEEX2		0x177f	Sweex
 vendor METAGEEK		0x1781	MetaGeek
 vendor KAMSTRUP		0x17a8	Kamstrup A/S
 vendor LENOVO		0x17ef	Lenovo
 vendor WAVESENSE	0x17f4	WaveSense
 vendor VAISALA		0x1843	Vaisala
 vendor AMIT		0x18c5	AMIT
 vendor GOOGLE		0x18d1	Google
 vendor QCOM		0x18e8	Qcom
 vendor ELV		0x18ef	ELV
 vendor LINKSYS3		0x1915	Linksys
 vendor QUALCOMMINC	0x19d2	Qualcomm, Incorporated
 vendor BAYER		0x1a79	Bayer
 vendor WCH2		0x1a86	QinHeng Electronics
 vendor STELERA		0x1a8d	Stelera Wireless
 vendor SEL		0x1adb	Schweitzer Engineering Laboratories
 vendor CORSAIR		0x1b1c	Corsair
 vendor MATRIXORBITAL	0x1b3d	Matrix Orbital
 vendor OVISLINK		0x1b75	OvisLink
 vendor TML		0x1b91	The Mobility Lab
 vendor TCTMOBILE	0x1bbb  TCT Mobile
 vendor ALTI2		0x1bc9	Alti-2 products
 vendor SUNPLUS		0x1bcf	Sunplus Innovation Technology Inc.
 vendor WAGO		0x1be3  WAGO Kontakttechnik GmbH.
 vendor TELIT		0x1bc7  Telit
 vendor IONICS		0x1c0c	Ionics PlugComputer
 vendor LONGCHEER	0x1c9e	Longcheer Holdings, Ltd.
 vendor MPMAN		0x1cae	MpMan
 vendor DRESDENELEKTRONIK 0x1cf1 dresden elektronik
 vendor NEOTEL		0x1d09	Neotel
 vendor DREAMLINK	0x1d34	Dream Link
 vendor PEGATRON		0x1d4d	Pegatron
 vendor QISDA		0x1da5  Qisda
 vendor METAGEEK2	0x1dd5	MetaGeek
 vendor ALINK		0x1e0e  Alink
 vendor AIRTIES		0x1eda	AirTies
 vendor FESTO		0x1e29	Festo
 vendor LAKESHORE	0x1fb9	Lake Shore Cryotronics, Inc.
 vendor VERTEX		0x1fe7  Vertex Wireless Co., Ltd.
 vendor DLINK		0x2001	D-Link
 vendor PLANEX2		0x2019	Planex Communications
 vendor HAUPPAUGE2	0x2040	Hauppauge Computer Works
 vendor TLAYTECH		0x20b9	Tlay Tech
 vendor ENCORE		0x203d	Encore
 vendor QIHARDWARE	0x20b7	QI-hardware
 vendor PARA		0x20b8	PARA Industrial
 vendor SIMTEC		0x20df	Simtec Electronics
 vendor TRENDNET		0x20f4  TRENDnet
 vendor RTSYSTEMS	0x2100	RTSYSTEMS
 vendor VIALABS		0x2109	VIA Labs
 vendor ERICSSON		0x2282	Ericsson
 vendor MOTOROLA2	0x22b8	Motorola
 vendor WETELECOM	0x22de	WeTelecom
 vendor WESTMOUNTAIN	0x2405	West Mountain Radio
 vendor TRIPPLITE	0x2478	Tripp-Lite
 vendor HIROSE		0x2631	Hirose Electric
 vendor NHJ		0x2770	NHJ
 vendor PLANEX		0x2c02	Planex Communications
 vendor VIDZMEDIA	0x3275	VidzMedia Pte Ltd
 vendor LINKINSTRUMENTS	0x3195	Link Instruments Inc.
 vendor AEI		0x3334	AEI
 vendor HANK		0x3353	Hank Connection
 vendor PQI		0x3538	PQI
 vendor DAISY		0x3579	Daisy Technology
 vendor NI		0x3923	National Instruments
 vendor MICRONET		0x3980	Micronet Communications
 vendor IODATA2		0x40bb	I-O Data
 vendor IRIVER		0x4102	iRiver
 vendor DELL		0x413c	Dell
 vendor WCH		0x4348	QinHeng Electronics
 vendor ACEECA		0x4766	Aceeca
 vendor FEIXUN		0x4855	FeiXun Communication
 vendor PAPOUCH		0x5050	Papouch products
 vendor AVERATEC		0x50c2	Averatec
 vendor SWEEX		0x5173	Sweex
 vendor PROLIFIC2	0x5372	Prolific Technologies
 vendor ONSPEC2		0x55aa	OnSpec Electronic Inc.
 vendor ZINWELL		0x5a57	Zinwell
 vendor SITECOM		0x6189	Sitecom
 vendor ARKMICRO		0x6547	Arkmicro Technologies Inc.
 vendor 3COM2		0x6891	3Com
 vendor EDIMAX		0x7392	Edimax
 vendor INTEL		0x8086	Intel
 vendor INTEL2		0x8087	Intel
 vendor ALLWIN		0x8516	ALLWIN Tech
 vendor SITECOM2		0x9016	Sitecom
 vendor MOSCHIP		0x9710	MosChip Semiconductor
 vendor NETGEAR4		0x9846  Netgear
 vendor MARVELL		0x9e88	Marvell Technology Group Ltd.
 vendor 3COM3		0xa727	3Com
 vendor EVOLUTION	0xdeee	Evolution Robotics products
 vendor DATAAPEX		0xdaae	DataApex
 vendor HP2		0xf003	Hewlett Packard
 vendor LOGILINK		0xfc08	LogiLink
 vendor USRP		0xfffe	GNU Radio USRP
 
 /*
  * List of known products.  Grouped by vendor.
  */
 
 /* 3Com products */
 product 3COM HOMECONN		0x009d	HomeConnect Camera
 product 3COM 3CREB96		0x00a0	Bluetooth USB Adapter
 product 3COM 3C19250		0x03e8	3C19250 Ethernet Adapter
 product 3COM 3CRSHEW696		0x0a01	3CRSHEW696 Wireless Adapter
 product 3COM 3C460		0x11f8	HomeConnect 3C460
 product 3COM USR56K		0x3021	U.S.Robotics 56000 Voice FaxModem Pro
 product 3COM 3C460B		0x4601	HomeConnect 3C460B
 product 3COM2 3CRUSB10075	0xa727	3CRUSB10075
 product 3COM3 AR5523_1		0x6893	AR5523
 product 3COM3 AR5523_2		0x6895	AR5523
 product 3COM3 AR5523_3		0x6897	AR5523
 
 product 3COMUSR OFFICECONN	0x0082	3Com OfficeConnect Analog Modem
 product 3COMUSR USRISDN		0x008f	3Com U.S. Robotics Pro ISDN TA
 product 3COMUSR HOMECONN	0x009d	3Com HomeConnect Camera
 product 3COMUSR USR56K		0x3021	U.S. Robotics 56000 Voice FaxModem Pro
 
 /* AboCom products */
 product ABOCOM XX1		0x110c	XX1
 product ABOCOM XX2		0x200c	XX2
 product ABOCOM RT2770		0x2770	RT2770
 product ABOCOM RT2870		0x2870	RT2870
 product ABOCOM RT3070		0x3070	RT3070
 product ABOCOM RT3071		0x3071	RT3071
 product ABOCOM RT3072		0x3072	RT3072
 product ABOCOM2 RT2870_1	0x3c09	RT2870
 product ABOCOM URE450		0x4000	URE450 Ethernet Adapter
 product ABOCOM UFE1000		0x4002	UFE1000 Fast Ethernet Adapter
 product ABOCOM DSB650TX_PNA	0x4003	1/10/100 Ethernet Adapter
 product ABOCOM XX4		0x4004	XX4
 product ABOCOM XX5		0x4007	XX5
 product ABOCOM XX6		0x400b	XX6
 product ABOCOM XX7		0x400c	XX7
 product ABOCOM RTL8151		0x401a	RTL8151
 product ABOCOM XX8		0x4102	XX8
 product ABOCOM XX9		0x4104	XX9
 product ABOCOM UF200		0x420a	UF200 Ethernet
 product ABOCOM WL54		0x6001	WL54
 product ABOCOM XX10		0xabc1	XX10
 product ABOCOM BWU613		0xb000	BWU613
 product ABOCOM HWU54DM		0xb21b	HWU54DM
 product ABOCOM RT2573_2		0xb21c	RT2573
 product ABOCOM RT2573_3		0xb21d	RT2573
 product ABOCOM RT2573_4		0xb21e	RT2573
 product ABOCOM RTL8188CU_1	0x8188  RTL8188CU
 product ABOCOM RTL8188CU_2	0x8189  RTL8188CU
 product ABOCOM RTL8192CU	0x8178  RTL8192CU
 product ABOCOM WUG2700		0xb21f	WUG2700
 
 /* Acton Research Corp. */
 product ACTON SPECTRAPRO	0x0100	FTDI compatible adapter
 
 /* Accton products */
 product ACCTON USB320_EC	0x1046	USB320-EC Ethernet Adapter
 product ACCTON 2664W		0x3501	2664W
 product ACCTON 111		0x3503	T-Sinus 111 Wireless Adapter
 product ACCTON SMCWUSBG_NF	0x4505	SMCWUSB-G (no firmware)
 product ACCTON SMCWUSBG		0x4506	SMCWUSB-G
 product ACCTON SMCWUSBTG2_NF	0x4507	SMCWUSBT-G2 (no firmware)
 product ACCTON SMCWUSBTG2	0x4508	SMCWUSBT-G2
 product ACCTON PRISM_GT		0x4521	PrismGT USB 2.0 WLAN
 product ACCTON SS1001		0x5046	SpeedStream Ethernet Adapter
 product ACCTON RT2870_2		0x6618	RT2870
 product ACCTON RT3070		0x7511	RT3070
 product ACCTON RT2770		0x7512	RT2770
 product ACCTON RT2870_3		0x7522	RT2870
 product ACCTON RT2870_5		0x8522	RT2870
 product ACCTON RT3070_4		0xa512	RT3070
 product ACCTON RT2870_4		0xa618	RT2870
 product	ACCTON RT3070_1		0xa701	RT3070
 product	ACCTON RT3070_2		0xa702	RT3070
 product ACCTON RT2870_1		0xb522	RT2870
 product	ACCTON RT3070_3		0xc522	RT3070
 product	ACCTON RT3070_5		0xd522	RT3070
 product ACCTON RTL8192SU        0xc512  RTL8192SU
 product ACCTON ZD1211B		0xe501	ZD1211B
 
 /* Aceeca products */
 product ACEECA MEZ1000		0x0001	MEZ1000 RDA
 
 /* Acer Communications & Multimedia (oemd by Surecom) */
 product ACERCM EP1427X2		0x0893	EP-1427X-2 Ethernet Adapter
 
 /* Acer Labs products */
 product ACERLABS M5632		0x5632	USB 2.0 Data Link
 
 /* Acer Peripherals, Inc. products */
 product ACERP ACERSCAN_C310U	0x12a6	Acerscan C310U
 product ACERP ACERSCAN_320U	0x2022	Acerscan 320U
 product ACERP ACERSCAN_640U	0x2040	Acerscan 640U
 product ACERP ACERSCAN_620U	0x2060	Acerscan 620U
 product ACERP ACERSCAN_4300U	0x20b0	Benq 3300U/4300U
 product ACERP ACERSCAN_640BT	0x20be	Acerscan 640BT
 product ACERP ACERSCAN_1240U	0x20c0	Acerscan 1240U
 product ACERP S81		0x4027	BenQ S81 phone
 product ACERP H10		0x4068	AWL400 Wireless Adapter
 product ACERP ATAPI		0x6003	ATA/ATAPI Adapter
 product ACERP AWL300		0x9000	AWL300 Wireless Adapter
 product ACERP AWL400		0x9001	AWL400 Wireless Adapter
 
 /* Acer Warp products */
 product ACERW WARPLINK		0x0204	Warplink
 
 /* Actions products */
 product ACTIONS MP4		0x1101  Actions MP4 Player
 
 /* Actiontec, Inc. products */
 product ACTIONTEC PRISM_25	0x0408	Prism2.5 Wireless Adapter
 product ACTIONTEC PRISM_25A	0x0421	Prism2.5 Wireless Adapter A
 product ACTIONTEC FREELAN	0x6106	ROPEX FreeLan 802.11b
 product ACTIONTEC UAT1		0x7605	UAT1 Wireless Ethernet Adapter
 
 /* ACTiSYS products */
 product ACTISYS IR2000U		0x0011	ACT-IR2000U FIR
 
 /* ActiveWire, Inc. products */
 product ACTIVEWIRE IOBOARD	0x0100	I/O Board
 product ACTIVEWIRE IOBOARD_FW1	0x0101	I/O Board, rev. 1 firmware
 
 /* Adaptec products */
 product ADAPTEC AWN8020		0x0020	AWN-8020 WLAN
 
 /* Addtron products */
 product ADDTRON AWU120		0xff31	AWU-120
 
 /* ADLINK Texhnology products */
 product ADLINK ND6530		0x6530	ND-6530 USB-Serial
 
 /* ADMtek products */
 product ADMTEK PEGASUSII_4	0x07c2	AN986A Ethernet
 product ADMTEK PEGASUS		0x0986	AN986 Ethernet
 product ADMTEK PEGASUSII	0x8511	AN8511 Ethernet
 product ADMTEK PEGASUSII_2	0x8513	AN8513 Ethernet
 product ADMTEK PEGASUSII_3	0x8515	AN8515 Ethernet
 
 /* ADDON products */
 /* PNY OEMs these */
 product ADDON ATTACHE		0x1300	USB 2.0 Flash Drive
 product ADDON ATTACHE		0x1300	USB 2.0 Flash Drive
 product ADDON A256MB		0x1400	Attache 256MB USB 2.0 Flash Drive
 product ADDON DISKPRO512	0x1420	USB 2.0 Flash Drive (DANE-ELEC zMate 512MB USB flash drive)
 
 /* Addonics products */
 product ADDONICS2 CABLE_205	0xa001	Cable 205
 
 /* ADS products */
 product ADS UBS10BT		0x0008	UBS-10BT Ethernet
 product ADS UBS10BTX		0x0009	UBS-10BT Ethernet
 
 /* AEI products */
 product AEI FASTETHERNET	0x1701	Fast Ethernet
 
 /* Afatech Technologies, Inc. */
 product AFATECH AFATECH1336	0x1336	Flash Card Reader
 
 /* Agate Technologies products */
 product AGATE QDRIVE		0x0378	Q-Drive
 
 /* AGFA products */
 product AGFA SNAPSCAN1212U	0x0001	SnapScan 1212U
 product AGFA SNAPSCAN1236U	0x0002	SnapScan 1236U
 product AGFA SNAPSCANTOUCH	0x0100	SnapScan Touch
 product AGFA SNAPSCAN1212U2	0x2061	SnapScan 1212U
 product AGFA SNAPSCANE40	0x208d	SnapScan e40
 product AGFA SNAPSCANE50	0x208f	SnapScan e50
 product AGFA SNAPSCANE20	0x2091	SnapScan e20
 product AGFA SNAPSCANE25	0x2095	SnapScan e25
 product AGFA SNAPSCANE26	0x2097	SnapScan e26
 product AGFA SNAPSCANE52	0x20fd	SnapScan e52
 
 /* Ain Communication Technology products */
 product AINCOMM AWU2000B	0x1001	AWU2000B Wireless Adapter
 
 /* AIPTEK products */
 product AIPTEK POCKETCAM3M	0x2011	PocketCAM 3Mega
 product AIPTEK2 PENCAM_MEGA_1_3 0x504a	PenCam Mega 1.3
 product AIPTEK2 SUNPLUS_TECH	0x0c15	Sunplus Technology Inc.
 
 /* AirPlis products */
 product AIRPLUS MCD650		0x3198	MCD650 modem
 
 /* AirPrime products */
 product AIRPRIME PC5220		0x0112	CDMA Wireless PC Card
 product AIRPRIME USB308		0x68A3	USB308 HSPA+ USB Modem
 product AIRPRIME AC313U		0x68aa	Sierra Wireless AirCard 313U
 
 /* AirTies products */
 product AIRTIES RT3070		0x2310	RT3070
 
 /* AKS products */
 product AKS USBHASP		0x0001	USB-HASP 0.06
 
 /* Alcatel products */
 product ALCATEL OT535		0x02df	One Touch 535/735
 
 /* Alcor Micro, Inc. products */
 product ALCOR2 KBD_HUB		0x2802	Kbd Hub
 
 product ALCOR DUMMY		0x0000	Dummy product
 product ALCOR SDCR_6335		0x6335	SD/MMC Card Reader
 product ALCOR SDCR_6362		0x6362	SD/MMC Card Reader
 product ALCOR SDCR_6366		0x6366	SD/MMC Card Reader
 product ALCOR TRANSCEND		0x6387	Transcend JetFlash Drive
 product ALCOR MA_KBD_HUB	0x9213	MacAlly Kbd Hub
 product ALCOR AU9814		0x9215	AU9814 Hub
 product ALCOR UMCR_9361		0x9361	USB Multimedia Card Reader
 product ALCOR SM_KBD		0x9410	MicroConnectors/StrongMan Keyboard
 product ALCOR NEC_KBD_HUB	0x9472	NEC Kbd Hub
 product ALCOR AU9720 		0x9720	USB2 - RS-232
 product ALCOR AU6390	0x6390	AU6390 USB-IDE converter
 
 /* Alink products */
 product ALINK DWM652U5		0xce16	DWM-652
 product ALINK 3G		0x9000	3G modem
 product ALINK 3GU		0x9200	3G modem
 
 /* Altec Lansing products */
 product ALTEC ADA70		0x0070	ADA70 Speakers
 product ALTEC ASC495		0xff05	ASC495 Speakers
 
 /* Alti-2 products */
 product ALTI2 N3		0x6001	FTDI compatible adapter
 
 /* Allied Telesyn International products */
 product ALLIEDTELESYN ATUSB100	0xb100	AT-USB100
 
 /* ALLWIN Tech products */
 product ALLWIN RT2070		0x2070	RT2070
 product ALLWIN RT2770		0x2770	RT2770
 product ALLWIN RT2870		0x2870	RT2870
 product ALLWIN RT3070		0x3070	RT3070
 product ALLWIN RT3071		0x3071	RT3071
 product ALLWIN RT3072		0x3072	RT3072
 product ALLWIN RT3572		0x3572	RT3572
 
 /* AlphaSmart, Inc. products */
 product ALPHASMART DANA_KB	0xdbac	AlphaSmart Dana Keyboard
 product ALPHASMART DANA_SYNC	0xdf00	AlphaSmart Dana HotSync
 
 /* Amoi products */
 product AMOI H01		0x0800	H01 3G modem
 product AMOI H01A		0x7002	H01A 3G modem
 product AMOI H02		0x0802	H02 3G modem
 
 /* American Power Conversion products */
 product APC UPS			0x0002	Uninterruptible Power Supply
 
 /* Ambit Microsystems products */
 product AMBIT WLAN		0x0302	WLAN
 product AMBIT NTL_250		0x6098	NTL 250 cable modem
 
 /* Apacer products */
 product APACER HT202		0xb113	USB 2.0 Flash Drive
 
 /* American Power Conversion products */
 product APC UPS			0x0002	Uninterruptible Power Supply
 
 /* Amigo Technology products */
 product AMIGO RT2870_1		0x9031	RT2870
 product AMIGO RT2870_2		0x9041	RT2870
 
 /* AMIT products */
 product AMIT CGWLUSB2GO		0x0002	CG-WLUSB2GO
 product AMIT CGWLUSB2GNR	0x0008	CG-WLUSB2GNR
 product AMIT RT2870_1		0x0012	RT2870
 
 /* AMIT(2) products */
 product AMIT2 RT2870		0x0008	RT2870
 
 /* Analog Devices products */
 product ANALOGDEVICES GNICE	0xf000	FTDI compatible adapter
 product ANALOGDEVICES GNICEPLUS	0xf001	FTDI compatible adapter
 
 /* Anchor products */
 product ANCHOR SERIAL		0x2008	Serial
 product ANCHOR EZUSB		0x2131	EZUSB
 product ANCHOR EZLINK		0x2720	EZLINK
 
 /* AnyData products */
 product ANYDATA ADU_620UW	0x6202	CDMA 2000 EV-DO USB Modem
 product ANYDATA ADU_E100X	0x6501	CDMA 2000 1xRTT/EV-DO USB Modem
 product ANYDATA ADU_500A	0x6502	CDMA 2000 EV-DO USB Modem
 
 /* AOX, Inc. products */
 product AOX USB101		0x0008	Ethernet
 
 /* American Power Conversion products */
 product APC UPS			0x0002	Uninterruptible Power Supply
 
 /* Apple Computer products */
 product APPLE DUMMY		0x0000	Dummy product
 product APPLE IMAC_KBD		0x0201	USB iMac Keyboard
 product APPLE KBD		0x0202	USB Keyboard M2452
 product APPLE EXT_KBD		0x020c	Apple Extended USB Keyboard
 /* MacbookAir, aka wellspring */
 product APPLE WELLSPRING_ANSI	0x0223	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING_ISO	0x0224	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING_JIS	0x0225	Apple Internal Keyboard/Trackpad
 /* MacbookProPenryn, aka wellspring2 */
 product APPLE WELLSPRING2_ANSI	0x0230	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING2_ISO	0x0231	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING2_JIS	0x0232	Apple Internal Keyboard/Trackpad
 /* Macbook5,1 (unibody), aka wellspring3 */
 product APPLE WELLSPRING3_ANSI	0x0236	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING3_ISO	0x0237	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING3_JIS	0x0238	Apple Internal Keyboard/Trackpad
 /* MacbookAir3,2 (unibody), aka wellspring4 */
 product APPLE WELLSPRING4_ANSI	0x023f	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING4_ISO	0x0240	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING4_JIS	0x0241	Apple Internal Keyboard/Trackpad
 /* MacbookAir3,1 (unibody), aka wellspring4 */
 product APPLE WELLSPRING4A_ANSI	0x0242	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING4A_ISO	0x0243	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING4A_JIS	0x0244	Apple Internal Keyboard/Trackpad
 /* Macbook8 (unibody, March 2011) */
 product APPLE WELLSPRING5_ANSI	0x0245	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING5_ISO	0x0246	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING5_JIS	0x0247	Apple Internal Keyboard/Trackpad
 /* MacbookAir4,1 (unibody, July 2011) */
 product APPLE WELLSPRING6A_ANSI	0x0249	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING6A_ISO	0x024a	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING6A_JIS	0x024b	Apple Internal Keyboard/Trackpad
 /* MacbookAir4,2 (unibody, July 2011) */
 product APPLE WELLSPRING6_ANSI	0x024c	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING6_ISO	0x024d	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING6_JIS	0x024e	Apple Internal Keyboard/Trackpad
 /* Macbook8,2 (unibody) */
 product APPLE WELLSPRING5A_ANSI	0x0252	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING5A_ISO	0x0253	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING5A_JIS	0x0254	Apple Internal Keyboard/Trackpad
 /* MacbookPro10,1 (unibody, June 2012) */
 product APPLE WELLSPRING7_ANSI	0x0262	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING7_ISO	0x0263	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING7_JIS	0x0264	Apple Internal Keyboard/Trackpad
 /* MacbookPro10,2 (unibody, October 2012) */
 product APPLE WELLSPRING7A_ANSI	0x0259	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING7A_ISO	0x025a	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING7A_JIS	0x025b	Apple Internal Keyboard/Trackpad
 /* MacbookAir6,2 (unibody, June 2013) */
 product APPLE WELLSPRING8_ANSI	0x0290	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING8_ISO	0x0291	Apple Internal Keyboard/Trackpad
 product APPLE WELLSPRING8_JIS	0x0292	Apple Internal Keyboard/Trackpad
 product APPLE MOUSE		0x0301	Mouse M4848
 product APPLE OPTMOUSE		0x0302	Optical mouse
 product APPLE MIGHTYMOUSE	0x0304	Mighty Mouse
 product APPLE KBD_HUB		0x1001	Hub in Apple USB Keyboard
 product APPLE EXT_KBD_HUB	0x1003	Hub in Apple Extended USB Keyboard
 product APPLE SPEAKERS		0x1101	Speakers
 product APPLE IPOD		0x1201	iPod
 product APPLE IPOD2G		0x1202	iPod 2G
 product APPLE IPOD3G		0x1203	iPod 3G
 product APPLE IPOD_04		0x1204	iPod '04'
 product APPLE IPODMINI		0x1205	iPod Mini
 product APPLE IPOD_06		0x1206	iPod '06'
 product APPLE IPOD_07		0x1207	iPod '07'
 product APPLE IPOD_08		0x1208	iPod '08'
 product APPLE IPODVIDEO		0x1209	iPod Video
 product APPLE IPODNANO		0x120a	iPod Nano
 product APPLE IPHONE		0x1290	iPhone
 product APPLE IPOD_TOUCH	0x1291	iPod Touch
 product APPLE IPHONE_3G		0x1292	iPhone 3G
 product APPLE IPHONE_3GS	0x1294	iPhone 3GS
 product APPLE IPHONE_4		0x1297	iPhone 4
 product APPLE IPHONE_4S		0x12a0	iPhone 4S
 product APPLE IPHONE_5		0x12a8  iPhone 5
 product APPLE IPAD		0x129a	iPad
 product APPLE ETHERNET		0x1402	Ethernet A1277
 
 /* Arkmicro Technologies */
 product ARKMICRO ARK3116	0x0232	ARK3116 Serial
 
 /* Asahi Optical products */
 product ASAHIOPTICAL OPTIO230	0x0004	Digital camera
 product ASAHIOPTICAL OPTIO330	0x0006	Digital camera
 
 /* Asante products */
 product ASANTE EA		0x1427	Ethernet
 
 /* ASIX Electronics products */
 product ASIX AX88172		0x1720	10/100 Ethernet
 product ASIX AX88178		0x1780	AX88178
 product ASIX AX88178A		0x178a	AX88178A USB 2.0 10/100/1000 Ethernet
 product ASIX AX88179		0x1790	AX88179 USB 3.0 10/100/1000 Ethernet
 product ASIX AX88772		0x7720	AX88772
 product ASIX AX88772A		0x772a	AX88772A USB 2.0 10/100 Ethernet
 product ASIX AX88772B		0x772b	AX88772B USB 2.0 10/100 Ethernet
 product ASIX AX88772B_1		0x7e2b	AX88772B USB 2.0 10/100 Ethernet
 
 /* ASUS products */
 product ASUS2 USBN11		0x0b05	USB-N11
 product ASUS RT2570		0x1706	RT2500USB Wireless Adapter
 product ASUS WL167G		0x1707	WL-167g Wireless Adapter
 product ASUS WL159G		0x170c	WL-159g
 product ASUS A9T_WIFI		0x171b	A9T wireless
 product ASUS P5B_WIFI		0x171d	P5B wireless
 product ASUS RT2573_1		0x1723	RT2573
 product ASUS RT2573_2		0x1724	RT2573
 product ASUS LCM		0x1726	LCM display
 product ASUS RT2870_1		0x1731	RT2870
 product ASUS RT2870_2		0x1732	RT2870
 product ASUS RT2870_3		0x1742	RT2870
 product ASUS RT2870_4		0x1760	RT2870
 product ASUS RT2870_5		0x1761	RT2870
 product	ASUS USBN13		0x1784	USB-N13
 product ASUS USBN10		0x1786	USB-N10
 product ASUS RT3070_1		0x1790	RT3070
 product ASUS RTL8192SU		0x1791  RTL8192SU
 product ASUS USB_N53		0x179d	ASUS Black Diamond Dual Band USB-N53
 product ASUS RTL8192CU		0x17ab	RTL8192CU
 product ASUS USBN66		0x17ad	USB-N66
 product ASUS USBN10NANO		0x17ba	USB-N10 Nano
 product ASUS USBAC51		0x17d1	USB-AC51
 product ASUS A730W		0x4202	ASUS MyPal A730W
 product ASUS P535		0x420f	ASUS P535 PDA
 product ASUS GMSC		0x422f	ASUS Generic Mass Storage
 
 /* ATen products */
 product ATEN UC1284		0x2001	Parallel printer
 product ATEN UC10T		0x2002	10Mbps Ethernet
 product ATEN UC110T		0x2007	UC-110T Ethernet
 product ATEN UC232A		0x2008	Serial
 product ATEN UC210T		0x2009	UC-210T Ethernet
 product ATEN DSB650C		0x4000	DSB-650C
 
 /* ATP Electronics products */
 product ATP EUSB		0xaf01	ATP IG eUSB SSD
 
 /* Atheros Communications products */
 product ATHEROS AR5523		0x0001	AR5523
 product ATHEROS AR5523_NF	0x0002	AR5523 (no firmware)
 product ATHEROS2 AR5523_1	0x0001	AR5523
 product ATHEROS2 AR5523_1_NF	0x0002	AR5523 (no firmware)
 product ATHEROS2 AR5523_2	0x0003	AR5523
 product ATHEROS2 AR5523_2_NF	0x0004	AR5523 (no firmware)
 product ATHEROS2 AR5523_3	0x0005	AR5523
 product ATHEROS2 AR5523_3_NF	0x0006	AR5523 (no firmware)
 
 /* Atmel Comp. products */
 product ATMEL STK541		0x2109	Zigbee Controller
 product ATMEL UHB124		0x3301	UHB124 hub
 product ATMEL DWL120		0x7603	DWL-120 Wireless Adapter
 product ATMEL BW002		0x7605	BW002 Wireless Adapter
 product ATMEL WL1130USB		0x7613	WL-1130 USB
 product ATMEL AT76C505A		0x7614	AT76c505a Wireless Adapter
 
 /* AuthenTec products */
 product AUTHENTEC AES1610	0x1600	AES1610 Fingerprint Sensor
 
 /* Avision products */
 product AVISION 1200U		0x0268	1200U scanner
 
 /* Axesstel products */
 product AXESSTEL DATAMODEM	0x1000  Data Modem
 
 /* AsureWave products */
 product AZUREWAVE RT2870_1	0x3247	RT2870
 product AZUREWAVE RT2870_2	0x3262	RT2870
 product AZUREWAVE RT3070_1	0x3273	RT3070
 product	AZUREWAVE RT3070_2	0x3284	RT3070
 product	AZUREWAVE RT3070_3	0x3305	RT3070
 product AZUREWAVE RTL8188CU	0x3357	RTL8188CU
 product AZUREWAVE RTL8188CE_1	0x3358  RTL8188CE
 product AZUREWAVE RTL8188CE_2	0x3359  RTL8188CE
 product AZUREWAVE RTL8192SU_1	0x3306	RTL8192SU
 product AZUREWAVE RTL8192SU_2	0x3309	RTL8192SU
 product AZUREWAVE RTL8192SU_3	0x3310	RTL8192SU
 product AZUREWAVE RTL8192SU_4	0x3311	RTL8192SU
 product AZUREWAVE RTL8192SU_5	0x3325	RTL8192SU
 
 /* Baltech products */
 product BALTECH CARDREADER	0x9999	Card reader
 
 /* Bayer products */
 product BAYER CONTOUR_CABLE	0x6001	FTDI compatible adapter
 
 /* B&B Electronics products */
 product BBELECTRONICS USOTL4	0xAC01	RS-422/485
 product BBELECTRONICS 232USB9M		0xac27	FTDI compatible adapter
 product BBELECTRONICS 485USB9F_2W	0xac25	FTDI compatible adapter
 product BBELECTRONICS 485USB9F_4W	0xac26	FTDI compatible adapter
 product BBELECTRONICS 485USBTB_2W	0xac33	FTDI compatible adapter
 product BBELECTRONICS 485USBTB_4W	0xac34	FTDI compatible adapter
 product BBELECTRONICS TTL3USB9M		0xac50	FTDI compatible adapter
 product BBELECTRONICS TTL5USB9M		0xac49	FTDI compatible adapter
 product BBELECTRONICS USO9ML2		0xac03	FTDI compatible adapter
 product BBELECTRONICS USO9ML2DR		0xac17	FTDI compatible adapter
 product BBELECTRONICS USO9ML2DR_2	0xac16	FTDI compatible adapter
 product BBELECTRONICS USOPTL4		0xac11	FTDI compatible adapter
 product BBELECTRONICS USOPTL4DR		0xac19	FTDI compatible adapter
 product BBELECTRONICS USOPTL4DR2	0xac18	FTDI compatible adapter
 product BBELECTRONICS USPTL4		0xac12	FTDI compatible adapter
 product BBELECTRONICS USTL4		0xac02	FTDI compatible adapter
 product BBELECTRONICS ZZ_PROG1_USB	0xba02	FTDI compatible adapter
 
 /* Belkin products */
 /*product BELKIN F5U111		0x????	F5U111 Ethernet*/
 product BELKIN F5D6050		0x0050	F5D6050 802.11b Wireless Adapter
 product BELKIN FBT001V		0x0081	FBT001v2 Bluetooth
 product BELKIN FBT003V		0x0084	FBT003v2 Bluetooth
 product BELKIN F5U103		0x0103	F5U103 Serial
 product BELKIN F5U109		0x0109	F5U109 Serial
 product BELKIN USB2SCSI		0x0115	USB to SCSI
 product BELKIN F8T012		0x0121	F8T012xx1 Bluetooth USB Adapter
 product BELKIN USB2LAN		0x0121	USB to LAN
 product BELKIN F5U208		0x0208	F5U208 VideoBus II
 product BELKIN F5U237		0x0237	F5U237 USB 2.0 7-Port Hub
 product BELKIN F5U257		0x0257	F5U257 Serial
 product BELKIN F5U409		0x0409	F5U409 Serial
 product BELKIN F6C550AVR	0x0551	F6C550-AVR UPS
 product BELKIN F5U120		0x1203	F5U120-PC Hub
 product BELKIN RTL8188CU	0x1102  RTL8188CU Wireless Adapter
 product BELKIN F9L1103		0x1103	F9L1103 Wireless Adapter
 product BELKIN RTL8192CU	0x2102	RTL8192CU Wireless Adapter
 product BELKIN F7D2102		0x2103  F7D2102 Wireless Adapter
 product BELKIN ZD1211B		0x4050	ZD1211B
 product BELKIN F5D5055		0x5055	F5D5055
 product BELKIN F5D7050		0x7050	F5D7050 Wireless Adapter
 product BELKIN F5D7051		0x7051	F5D7051 54g USB Network Adapter
 product BELKIN F5D7050A		0x705a	F5D7050A Wireless Adapter
 /* Also sold as 'Ativa 802.11g wireless card' */
 product BELKIN F5D7050_V4000	0x705c	F5D7050 v4000 Wireless Adapter
 product BELKIN F5D7050E		0x705e	F5D7050E Wireless Adapter
 product BELKIN RT2870_1		0x8053	RT2870
 product BELKIN RT2870_2		0x805c	RT2870
 product BELKIN F5D8053V3	0x815c	F5D8053 v3
 product BELKIN RTL8192SU_1	0x815f	RTL8192SU
 product BELKIN RTL8192SU_2	0x845a	RTL8192SU
 product BELKIN RTL8192SU_3	0x945a	RTL8192SU
 product BELKIN F5D8055		0x825a	F5D8055
 product BELKIN F5D8055V2	0x825b	F5D8055 v2
 product BELKIN F5D9050V3	0x905b	F5D9050 ver 3 Wireless Adapter
 product BELKIN2 F5U002		0x0002	F5U002 Parallel printer
 product BELKIN F6D4050V1	0x935a	F6D4050 v1
 product BELKIN F6D4050V2	0x935b	F6D4050 v2
 
 /* Billionton products */
 product BILLIONTON USB100	0x0986	USB100N 10/100 FastEthernet
 product BILLIONTON USBLP100	0x0987	USB100LP
 product BILLIONTON USBEL100	0x0988	USB100EL
 product BILLIONTON USBE100	0x8511	USBE100
 product BILLIONTON USB2AR	0x90ff	USB2AR Ethernet
 
 /* Broadcom products */
 product BROADCOM BCM2033	0x2033	BCM2033 Bluetooth USB dongle
 
 /* Brother Industries products */
 product BROTHER HL1050		0x0002	HL-1050 laser printer
 product BROTHER MFC8600_9650	0x0100	MFC8600/9650 multifunction device
 
 /* Behavior Technology Computer products */
 product BTC BTC6100		0x5550	6100C Keyboard
 product BTC BTC7932		0x6782	Keyboard with mouse port
 
 /* Canon, Inc. products */
 product CANON N656U		0x2206	CanoScan N656U
 product CANON N1220U		0x2207	CanoScan N1220U
 product CANON D660U		0x2208	CanoScan D660U
 product CANON N676U		0x220d	CanoScan N676U
 product CANON N1240U		0x220e	CanoScan N1240U
 product CANON LIDE25		0x2220	CanoScan LIDE 25
 product CANON S10		0x3041	PowerShot S10
 product CANON S100		0x3045	PowerShot S100
 product CANON S200		0x3065	PowerShot S200
 product CANON REBELXT		0x30ef	Digital Rebel XT
 
 /* CATC products */
 product CATC NETMATE		0x000a	Netmate Ethernet
 product CATC NETMATE2		0x000c	Netmate2 Ethernet
 product CATC CHIEF		0x000d	USB Chief Bus & Protocol Analyzer
 product CATC ANDROMEDA		0x1237	Andromeda hub
 
 /* CASIO products */
 product CASIO QV_DIGICAM	0x1001	QV DigiCam
 product CASIO EXS880		0x1105	Exilim EX-S880
 product CASIO BE300		0x2002	BE-300 PDA
 product CASIO NAMELAND		0x4001	CASIO Nameland EZ-USB
 
 /* CCYU products */
 product CCYU ED1064		0x2136	EasyDisk ED1064
 
 /* Century products */
 product CENTURY EX35QUAT	0x011e	Century USB Disk Enclosure
 product CENTURY EX35SW4_SB4	0x011f	Century USB Disk Enclosure
 
 /* Cherry products */
 product CHERRY MY3000KBD	0x0001	My3000 keyboard
 product CHERRY MY3000HUB	0x0003	My3000 hub
 product CHERRY CYBOARD		0x0004	CyBoard Keyboard
 
 /* Chic Technology products */
 product CHIC MOUSE1		0x0001	mouse
 product CHIC CYPRESS		0x0003	Cypress USB Mouse
 
 /* Chicony products */
 product CHICONY KB8933		0x0001	KB-8933 keyboard
 product CHICONY KU0325		0x0116	KU-0325 keyboard
 product CHICONY CNF7129		0xb071	Notebook Web Camera
 product CHICONY HDUVCCAM	0xb40a	HD UVC WebCam
 product	CHICONY RTL8188CUS_1	0xaff7	RTL8188CUS
 product	CHICONY RTL8188CUS_2	0xaff8	RTL8188CUS
 product	CHICONY RTL8188CUS_3	0xaff9	RTL8188CUS
 product	CHICONY RTL8188CUS_4	0xaffa	RTL8188CUS
 product	CHICONY RTL8188CUS_5	0xaffa	RTL8188CUS
 product CHICONY2 TWINKLECAM	0x600d	TwinkleCam USB camera
 
 /* CH Products */
 product CHPRODUCTS PROTHROTTLE	0x00f1	Pro Throttle
 product CHPRODUCTS PROPEDALS	0x00f2	Pro Pedals
 product CHPRODUCTS FIGHTERSTICK 0x00f3	Fighterstick
 product CHPRODUCTS FLIGHTYOKE	0x00ff	Flight Sim Yoke
 
 /* Cisco-Linksys products */
 product CISCOLINKSYS WUSB54AG	0x000c	WUSB54AG Wireless Adapter
 product CISCOLINKSYS WUSB54G	0x000d	WUSB54G Wireless Adapter
 product CISCOLINKSYS WUSB54GP	0x0011	WUSB54GP Wireless Adapter
 product CISCOLINKSYS USB200MV2	0x0018	USB200M v2
 product CISCOLINKSYS HU200TS	0x001a	HU200TS Wireless Adapter
 product CISCOLINKSYS WUSB54GC	0x0020	WUSB54GC
 product CISCOLINKSYS WUSB54GR	0x0023	WUSB54GR
 product CISCOLINKSYS WUSBF54G	0x0024	WUSBF54G
 product CISCOLINKSYS AE1000	0x002f	AE1000
 product CISCOLINKSYS2 RT3070	0x4001	RT3070
 product CISCOLINKSYS3 RT3070	0x0101	RT3070
 
 /* Clipsal products */
 product CLIPSAL 560884		0x0101	560884 C-Bus Audio Matrix Switch
 product CLIPSAL 5500PACA	0x0201	5500PACA C-Bus Pascal Automation Controller
 product CLIPSAL 5800PC		0x0301	5800PC C-Bus Wireless Interface
 product CLIPSAL 5500PCU		0x0303	5500PCU C-Bus Interface
 product CLIPSAL 5000CT2		0x0304	5000CT2 C-Bus Touch Screen
 product CLIPSAL C5000CT2	0x0305	C5000CT2 C-Bus Touch Screen
 product CLIPSAL L51xx		0x0401	L51xx C-Bus Dimmer
 
 /* CMOTECH products */
 product CMOTECH CNU510		0x5141	CDMA Technologies USB modem
 product CMOTECH CNU550		0x5543	CDMA 2000 1xRTT/1xEVDO USB modem
 product CMOTECH CGU628		0x6006	CGU-628
 product CMOTECH CDMA_MODEM1	0x6280	CDMA Technologies USB modem
 product CMOTECH DISK		0xf000	disk mode
 
 /* Compaq products */
 product COMPAQ IPAQPOCKETPC	0x0003	iPAQ PocketPC
 product COMPAQ PJB100		0x504a	Personal Jukebox PJB100
 product COMPAQ IPAQLINUX	0x505a	iPAQ Linux
 
 /* Composite Corp products looks the same as "TANGTOP" */
 product COMPOSITE USBPS2	0x0001	USB to PS2 Adaptor
 
 /* Conceptronic products */
 product CONCEPTRONIC PRISM_GT	0x3762	PrismGT USB 2.0 WLAN
 product CONCEPTRONIC C11U	0x7100	C11U
 product CONCEPTRONIC WL210	0x7110	WL-210
 product CONCEPTRONIC AR5523_1	0x7801	AR5523
 product CONCEPTRONIC AR5523_1_NF	0x7802	AR5523 (no firmware)
 product CONCEPTRONIC AR5523_2	0x7811	AR5523
 product CONCEPTRONIC AR5523_2_NF	0x7812	AR5523 (no firmware)
 product CONCEPTRONIC2 RTL8192SU_1	0x3300	RTL8192SU
 product CONCEPTRONIC2 RTL8192SU_2	0x3301	RTL8192SU
 product CONCEPTRONIC2 RTL8192SU_3	0x3302	RTL8192SU
 product CONCEPTRONIC2 C54RU	0x3c02	C54RU WLAN
 product CONCEPTRONIC2 C54RU2	0x3c22	C54RU
 product CONCEPTRONIC2 RT3070_1	0x3c08	RT3070
 product CONCEPTRONIC2 RT3070_2	0x3c11	RT3070
 product CONCEPTRONIC2 VIGORN61	0x3c25	VIGORN61
 product CONCEPTRONIC2 RT2870_1	0x3c06	RT2870
 product CONCEPTRONIC2 RT2870_2	0x3c07	RT2870
 product CONCEPTRONIC2 RT2870_7	0x3c09	RT2870
 product CONCEPTRONIC2 RT2870_8	0x3c12	RT2870
 product CONCEPTRONIC2 RT2870_3	0x3c23	RT2870
 product CONCEPTRONIC2 RT2870_4	0x3c25	RT2870
 product CONCEPTRONIC2 RT2870_5	0x3c27	RT2870
 product CONCEPTRONIC2 RT2870_6	0x3c28	RT2870
 
 /* Connectix products */
 product CONNECTIX QUICKCAM	0x0001	QuickCam
 
 /* Conect products */
 product CONTEC COM1USBH		0x8311	FTDI compatible adapter
 
 /* Corega products */
 product COREGA ETHER_USB_T	0x0001	Ether USB-T
 product COREGA FETHER_USB_TX	0x0004	FEther USB-TX
 product COREGA WLAN_USB_USB_11	0x000c	WirelessLAN USB-11
 product COREGA FETHER_USB_TXS	0x000d	FEther USB-TXS
 product COREGA WLANUSB		0x0012	Wireless LAN Stick-11
 product COREGA FETHER_USB2_TX	0x0017	FEther USB2-TX
 product COREGA WLUSB_11_KEY	0x001a	ULUSB-11 Key
 product COREGA CGUSBRS232R	0x002a	CG-USBRS232R
 product COREGA CGWLUSB2GL	0x002d	CG-WLUSB2GL
 product COREGA CGWLUSB2GPX	0x002e	CG-WLUSB2GPX
 product COREGA RT2870_1		0x002f	RT2870
 product COREGA RT2870_2		0x003c	RT2870
 product COREGA RT2870_3		0x003f	RT2870
 product COREGA RT3070		0x0041	RT3070
 product COREGA CGWLUSB300GNM	0x0042	CG-WLUSB300GNM
 product COREGA RTL8192SU        0x0047  RTL8192SU
 product COREGA RTL8192CU	0x0056  RTL8192CU
 
 product COREGA WLUSB_11_STICK	0x7613	WLAN USB Stick 11
 product COREGA FETHER_USB_TXC	0x9601	FEther USB-TXC
 
 /* Corsair products */
 product CORSAIR K60		0x0a60	Corsair Vengeance K60 keyboard
 product CORSAIR K70		0x1b09	Corsair Vengeance K70 keyboard
 
 /* Creative products */
 product CREATIVE NOMAD_II	0x1002	Nomad II MP3 player
 product CREATIVE NOMAD_IIMG	0x4004	Nomad II MG
 product CREATIVE NOMAD		0x4106	Nomad
 product CREATIVE2 VOIP_BLASTER	0x0258	Voip Blaster
 product CREATIVE3 OPTICAL_MOUSE	0x0001	Notebook Optical Mouse
 
 /* Cambridge Silicon Radio Ltd. products */
 product CSR BT_DONGLE		0x0001	Bluetooth USB dongle
 product CSR CSRDFU		0xffff	USB Bluetooth Device in DFU State
 
 /* Chipsbank Microelectronics Co., Ltd */
 product CHIPSBANK USBMEMSTICK	0x6025	CBM2080 Flash drive controller
 product CHIPSBANK USBMEMSTICK1	0x6026	CBM1180 Flash drive controller
 
 /* CTX products */
 product CTX EX1300		0x9999	Ex1300 hub
 
 /* Curitel products */
 product CURITEL HX550C		0x1101	CDMA 2000 1xRTT USB modem (HX-550C)
 product CURITEL HX57XB		0x2101	CDMA 2000 1xRTT USB modem (HX-570/575B/PR-600)
 product CURITEL PC5740		0x3701	Broadband Wireless modem
 product CURITEL UM150		0x3711	EVDO modem
 product CURITEL UM175		0x3714	EVDO modem
 
 /* CyberPower products */
 product CYBERPOWER 1500CAVRLCD	0x0501	1500CAVRLCD
 
 /* CyberTAN Technology products */
 product CYBERTAN TG54USB	0x1666	TG54USB
 product CYBERTAN RT2870		0x1828	RT2870
 
 /* Cypress Semiconductor products */
 product CYPRESS MOUSE		0x0001	mouse
 product CYPRESS THERMO		0x0002	thermometer
 product CYPRESS WISPY1A		0x0bad	MetaGeek Wi-Spy
 product CYPRESS KBDHUB		0x0101	Keyboard/Hub
 product CYPRESS FMRADIO		0x1002	FM Radio
 product CYPRESS IKARILASER	0x121f	Ikari Laser SteelSeries ApS
 
 product CYPRESS USBRS232	0x5500	USB-RS232 Interface
 product CYPRESS SLIM_HUB	0x6560	Slim Hub
 product CYPRESS XX6830XX	0x6830	PATA Storage Device
 product CYPRESS SILVERSHIELD	0xfd13	Gembird Silver Shield PM
 
 /* Daisy Technology products */
 product DAISY DMC		0x6901	USB MultiMedia Reader
 
 /* Dallas Semiconductor products */
 product DALLAS J6502		0x4201	J-6502 speakers
 
 /* DataApex products */
 product DATAAPEX MULTICOM	0xead6	MultiCom
 
 /* Dell products */
 product DELL PORT		0x0058	Port Replicator
 product DELL AIO926		0x5115	Photo AIO Printer 926
 product DELL BC02		0x8000	BC02 Bluetooth USB Adapter
 product DELL PRISM_GT_1		0x8102	PrismGT USB 2.0 WLAN
 product DELL TM350		0x8103	TrueMobile 350 Bluetooth USB Adapter
 product DELL PRISM_GT_2		0x8104	PrismGT USB 2.0 WLAN
 product DELL U5700		0x8114	Dell 5700 3G
 product DELL U5500		0x8115	Dell 5500 3G
 product DELL U5505		0x8116	Dell 5505 3G
 product DELL U5700_2		0x8117	Dell 5700 3G
 product DELL U5510		0x8118	Dell 5510 3G
 product DELL U5700_3		0x8128	Dell 5700 3G
 product DELL U5700_4		0x8129	Dell 5700 3G
 product DELL U5720		0x8133	Dell 5720 3G
 product DELL U5720_2		0x8134	Dell 5720 3G
 product DELL U740		0x8135	Dell U740 CDMA
 product DELL U5520		0x8136	Dell 5520 3G
 product DELL U5520_2		0x8137	Dell 5520 3G
 product DELL U5520_3		0x8138	Dell 5520 3G
 product DELL U5730		0x8180	Dell 5730 3G
 product DELL U5730_2		0x8181	Dell 5730 3G
 product DELL U5730_3		0x8182	Dell 5730 3G
 product DELL DW700		0x9500	Dell DW700 GPS
 
 /* Delorme Paublishing products */
 product DELORME EARTHMATE	0x0100	Earthmate GPS
 
 /* Desknote products */
 product DESKNOTE UCR_61S2B	0x0c55	UCR-61S2B
 
 /* Diamond products */
 product DIAMOND RIO500USB	0x0001	Rio 500 USB
 
 /* Dick Smith Electronics (really C-Net) products */
 product DICKSMITH RT2573	0x9022	RT2573
 product DICKSMITH CWD854F	0x9032	C-Net CWD-854 rev F
 
 /* Digi International products */
 product DIGI ACCELEPORT2	0x0002	AccelePort USB 2
 product DIGI ACCELEPORT4	0x0004	AccelePort USB 4
 product DIGI ACCELEPORT8	0x0008	AccelePort USB 8
 
 /* Digianswer A/S products */
 product DIGIANSWER ZIGBEE802154	0x000a	ZigBee/802.15.4 MAC
 
 /* D-Link products */
 /*product DLINK DSBS25		0x0100	DSB-S25 serial*/
 product DLINK DUBE100		0x1a00	10/100 Ethernet
 product DLINK DUBE100C1		0x1a02	DUB-E100 rev C1
 product DLINK DSB650TX4		0x200c	10/100 Ethernet
 product DLINK DWL120E		0x3200	DWL-120 rev E
 product DLINK DWA125D1		0x330f	DWA-125 rev D1
 product DLINK DWA123D1		0x3310	DWA-123 rev D1
 product DLINK DWL122		0x3700	DWL-122
 product DLINK DWLG120		0x3701	DWL-G120
 product DLINK DWL120F		0x3702	DWL-120 rev F
 product DLINK DWLAG132		0x3a00	DWL-AG132
 product DLINK DWLAG132_NF	0x3a01	DWL-AG132 (no firmware)
 product DLINK DWLG132		0x3a02	DWL-G132
 product DLINK DWLG132_NF	0x3a03	DWL-G132 (no firmware)
 product DLINK DWLAG122		0x3a04	DWL-AG122
 product DLINK DWLAG122_NF	0x3a05	DWL-AG122 (no firmware)
 product DLINK DWLG122		0x3c00	DWL-G122 b1 Wireless Adapter
 product DLINK DUBE100B1		0x3c05	DUB-E100 rev B1
 product DLINK RT2870		0x3c09	RT2870
 product DLINK RT3072		0x3c0a	RT3072
 product DLINK DWA140B3		0x3c15	DWA-140 rev B3
 product DLINK DWA160B2		0x3c1a	DWA-160 rev B2
 product DLINK DWA127		0x3c1b	DWA-127 Wireless Adapter
 product DLINK DWA162		0x3c1f	DWA-162 Wireless Adapter
 product DLINK DWA140D1		0x3c20	DWA-140 rev D1
 product DLINK DSB650C		0x4000	10Mbps Ethernet
 product DLINK DSB650TX1		0x4001	10/100 Ethernet
 product DLINK DSB650TX		0x4002	10/100 Ethernet
 product DLINK DSB650TX_PNA	0x4003	1/10/100 Ethernet
 product DLINK DSB650TX3		0x400b	10/100 Ethernet
 product DLINK DSB650TX2		0x4102	10/100 Ethernet
 product DLINK DUB1312		0x4a00	10/100/1000 Ethernet
 product DLINK DSB650		0xabc1	10/100 Ethernet
 product DLINK DUBH7		0xf103	DUB-H7 USB 2.0 7-Port Hub
 product DLINK DWR510_CD		0xa805	DWR-510 CD-ROM Mode
 product DLINK DWR510		0x7e12	DWR-510
 product DLINK RTL8188CU		0x3308	RTL8188CU
 product	DLINK RTL8192CU_1	0x3307	RTL8192CU
 product	DLINK RTL8192CU_2	0x3309	RTL8192CU
 product	DLINK RTL8192CU_3	0x330a	RTL8192CU
 product	DLINK DWA131B		0x330d	DWA-131 rev B
 product DLINK2 RTL8192SU_1	0x3300	RTL8192SU
 product DLINK2 RTL8192SU_2	0x3302	RTL8192SU
 product DLINK2 DWA131A1		0x3303	DWA-131 A1
 product DLINK2 DWA120		0x3a0c	DWA-120
 product DLINK2 DWA120_NF	0x3a0d	DWA-120 (no firmware)
 product DLINK2 DWLG122C1	0x3c03	DWL-G122 c1
 product DLINK2 WUA1340		0x3c04	WUA-1340
 product DLINK2 DWA111		0x3c06	DWA-111
 product DLINK2 RT2870_1		0x3c09	RT2870
 product DLINK2 DWA110		0x3c07	DWA-110
 product DLINK2 RT3072		0x3c0a	RT3072
 product DLINK2 RT3072_1		0x3c0b	RT3072
 product DLINK2 RT3070_1		0x3c0d	RT3070
 product DLINK2 RT3070_2		0x3c0e	RT3070
 product DLINK2 RT3070_3		0x3c0f	RT3070
 product DLINK2 RT2870_2		0x3c11	RT2870
 product DLINK2 DWA130		0x3c13	DWA-130
 product DLINK2 RT3070_4		0x3c15	RT3070
 product DLINK2 RT3070_5		0x3c16	RT3070
 product DLINK3 DWM652		0x3e04	DWM-652
 
 /* DMI products */
 product DMI CFSM_RW		0xa109	CF/SM Reader/Writer
 product DMI DISK		0x2bcf	Generic Disk
 
 /* DrayTek products */
 product DRAYTEK VIGOR550	0x0550	Vigor550
 
 /* Dream Link products */
 product DREAMLINK DL100B	0x0004	USB Webmail Notifier
 
 /* dresden elektronik products */
 product DRESDENELEKTRONIK SENSORTERMINALBOARD  0x0001 SensorTerminalBoard
 product DRESDENELEKTRONIK WIRELESSHANDHELDTERMINAL  0x0004 Wireless Handheld Terminal
 product DRESDENELEKTRONIK DE_RFNODE                 0x001c deRFnode
 product DRESDENELEKTRONIK LEVELSHIFTERSTICKLOWCOST  0x0022 Levelshifter Stick Low Cost
 
 /* Dynastream Innovations */
 product DYNASTREAM ANTDEVBOARD	0x1003	ANT dev board
 product DYNASTREAM ANT2USB	0x1004	ANT2USB
 product DYNASTREAM ANTDEVBOARD2	0x1006	ANT dev board
 
 /* Edimax products */
 product EDIMAX EW7318USG	0x7318	USB Wireless dongle
 product EDIMAX RTL8192SU_1	0x7611	RTL8192SU
 product EDIMAX RTL8192SU_2	0x7612	RTL8192SU
 product EDIMAX EW7622UMN	0x7622	EW-7622UMn
 product EDIMAX RT2870_1		0x7711	RT2870
 product EDIMAX EW7717		0x7717	EW-7717
 product EDIMAX EW7718		0x7718	EW-7718
 product EDIMAX EW7733UND	0x7733	EW-7733UnD
 product EDIMAX EW7811UN		0x7811	EW-7811Un
 product	EDIMAX RTL8192CU	0x7822	RTL8192CU
 
 /* eGalax Products */
 product EGALAX TPANEL		0x0001	Touch Panel
 product EGALAX TPANEL2		0x0002	Touch Panel
 product EGALAX2 TPANEL		0x0001	Touch Panel
 
 /* EGO Products */
 product EGO DUMMY		0x0000	Dummy Product
 product EGO M4U			0x1020	ESI M4U
 
 /* Eicon Networks */
 product EICON DIVA852		0x4905	Diva 852 ISDN TA
 
 /* EIZO products */
 product EIZO HUB		0x0000	hub
 product EIZO MONITOR		0x0001	monitor
 
 /* ELCON Systemtechnik products */
 product ELCON PLAN		0x0002	Goldpfeil P-LAN
 
 /* Elecom products */
 product ELECOM MOUSE29UO	0x0002	mouse 29UO
 product ELECOM LDUSBTX0		0x200c	LD-USB/TX
 product ELECOM LDUSBTX1		0x4002	LD-USB/TX
 product ELECOM LDUSBLTX		0x4005	LD-USBL/TX
 product ELECOM WDC150SU2M	0x4008	WDC-150SU2M
 product ELECOM LDUSBTX2		0x400b	LD-USB/TX
 product ELECOM LDUSB20		0x4010	LD-USB20
 product ELECOM UCSGT		0x5003	UC-SGT
 product ELECOM UCSGT0		0x5004	UC-SGT
 product ELECOM LDUSBTX3		0xabc1	LD-USB/TX
 
 /* Elektor products */
 product ELEKTOR FT323R		0x0005	FTDI compatible adapter
 
 /* Elsa products */
 product ELSA MODEM1		0x2265	ELSA Modem Board
 product ELSA USB2ETHERNET	0x3000	Microlink USB2Ethernet
 
 /* ELV products */
 product ELV USBI2C		0xe00f	USB-I2C interface
 
 /* EMS products */
 product EMS DUAL_SHOOTER	0x0003	PSX gun controller converter
 
 /* Emtec products */
 product EMTEC RUF2PS 		0x2240 Flash Drive
 
 /* Encore products */
 product ENCORE RT3070_1		0x1480	RT3070
 product ENCORE RT3070_2		0x14a1	RT3070
 product ENCORE RT3070_3		0x14a9	RT3070
 
 /* Entrega products */
 product ENTREGA 1S		0x0001	1S serial
 product ENTREGA 2S		0x0002	2S serial
 product ENTREGA 1S25		0x0003	1S25 serial
 product ENTREGA 4S		0x0004	4S serial
 product ENTREGA E45		0x0005	E45 Ethernet
 product ENTREGA CENTRONICS	0x0006	Parallel Port
 product ENTREGA XX1		0x0008	Ethernet
 product ENTREGA 1S9		0x0093	1S9 serial
 product ENTREGA EZUSB		0x8000	EZ-USB
 /*product ENTREGA SERIAL	0x8001	DB25 Serial*/
 product ENTREGA 2U4S		0x8004	2U4S serial/usb hub
 product ENTREGA XX2		0x8005	Ethernet
 /*product ENTREGA SERIAL_DB9	0x8093	DB9 Serial*/
 
 /* Epson products */
 product EPSON PRINTER1		0x0001	USB Printer
 product EPSON PRINTER2		0x0002	ISD USB Smart Cable for Mac
 product EPSON PRINTER3		0x0003	ISD USB Smart Cable
 product EPSON PRINTER5		0x0005	USB Printer
 product EPSON 636		0x0101	Perfection 636U / 636Photo scanner
 product EPSON 610		0x0103	Perfection 610 scanner
 product EPSON 1200		0x0104	Perfection 1200U / 1200Photo scanner
 product EPSON 1600		0x0107	Expression 1600 scanner
 product EPSON 1640		0x010a	Perfection 1640SU scanner
 product EPSON 1240		0x010b	Perfection 1240U / 1240Photo scanner
 product EPSON 640U		0x010c	Perfection 640U scanner
 product EPSON 1250		0x010f	Perfection 1250U / 1250Photo scanner
 product EPSON 1650		0x0110	Perfection 1650 scanner
 product EPSON GT9700F		0x0112	GT-9700F scanner
 product EPSON GT9300UF		0x011b	GT-9300UF scanner
 product EPSON 3200		0x011c	Perfection 3200 scanner
 product EPSON 1260		0x011d	Perfection 1260 scanner
 product EPSON 1660		0x011e	Perfection 1660 scanner
 product EPSON 1670		0x011f	Perfection 1670 scanner
 product EPSON 1270		0x0120	Perfection 1270 scanner
 product EPSON 2480		0x0121	Perfection 2480 scanner
 product EPSON 3590		0x0122	Perfection 3590 scanner
 product EPSON 4990		0x012a	Perfection 4990 Photo scanner
 product EPSON CRESSI_EDY	0x0521	Cressi Edy diving computer
 product EPSON N2ITION3		0x0522	Zeagle N2iTion3 diving computer
 product EPSON STYLUS_875DC	0x0601	Stylus Photo 875DC Card Reader
 product EPSON STYLUS_895	0x0602	Stylus Photo 895 Card Reader
 product EPSON CX5400		0x0808	CX5400 scanner
 product EPSON 3500		0x080e	CX-3500/3600/3650 MFP
 product EPSON RX425		0x080f	Stylus Photo RX425 scanner
 product EPSON DX3800		0x0818	CX3700/CX3800/DX38x0 MFP scanner
 product EPSON 4800		0x0819	CX4700/CX4800/DX48x0 MFP scanner
 product EPSON 4200		0x0820	CX4100/CX4200/DX4200 MFP scanner
 product EPSON 5000		0x082b  CX4900/CX5000/DX50x0 MFP scanner
 product EPSON 6000		0x082e  CX5900/CX6000/DX60x0 MFP scanner
 product EPSON DX4000		0x082f  DX4000 MFP scanner
 product EPSON DX7400		0x0838	CX7300/CX7400/DX7400 MFP scanner
 product EPSON DX8400		0x0839	CX8300/CX8400/DX8400 MFP scanner
 product EPSON SX100		0x0841	SX100/NX100 MFP scanner
 product EPSON NX300		0x0848	NX300 MFP scanner
 product EPSON SX200		0x0849	SX200/SX205 MFP scanner
 product EPSON SX400		0x084a	SX400/NX400/TX400 MFP scanner
 
 /* e-TEK Labs products */
 product ETEK 1COM		0x8007	Serial
 
 /* Evolution products */
 product EVOLUTION ER1		0x0300	FTDI compatible adapter
 product EVOLUTION HYBRID	0x0302	FTDI compatible adapter
 product EVOLUTION RCM4		0x0303	FTDI compatible adapter
 
 /* Extended Systems products */
 product EXTENDED XTNDACCESS	0x0100	XTNDAccess IrDA
 
 /* Falcom products */
 product FALCOM TWIST		0x0001	USB GSM/GPRS Modem
 product FALCOM SAMBA		0x0005	FTDI compatible adapter
 
 /* FEIYA products */
 product FEIYA DUMMY		0x0000	Dummy product
 product FEIYA 5IN1		0x1132	5-in-1 Card Reader
 product FEIYA ELANGO		0x6200	MicroSDHC Card Reader
 product FEIYA AC110		0x6300	AC-110 Card Reader
 
 /* FeiXun Communication products */
 product FEIXUN RTL8188CU	0x0090	RTL8188CU
 product FEIXUN RTL8192CU	0x0091	RTL8192CU
 
 /* Festo */
 product FESTO CPX_USB		0x0102	CPX-USB
 product FESTO CMSP		0x0501	CMSP
 
 /* Fiberline */
 product FIBERLINE WL430U	0x6003	WL-430U
 
 /* FIC / OpenMoko */
 product FIC NEO1973_DEBUG	0x5118	FTDI compatible adapter
 
 /* Fossil, Inc products */
 product FOSSIL WRISTPDA		0x0002	Wrist PDA
 
 /* Foxconn products */
 product FOXCONN TCOM_TC_300	0xe000	T-Com TC 300
 product FOXCONN PIRELLI_DP_L10	0xe003	Pirelli DP-L10
 
 /* Freecom products */
 product FREECOM DVD		0xfc01	DVD drive
 product FREECOM HDD		0xfc05	Classic SL Hard Drive
 
 /* Fujitsu Siemens Computers products */
 product FSC E5400		0x1009	PrismGT USB 2.0 WLAN
 
 /* Future Technology Devices products */
 product FTDI SERIAL_8U100AX	0x8372	8U100AX Serial
 product FTDI SERIAL_8U232AM	0x6001	8U232AM Serial
 product FTDI SERIAL_8U232AM4	0x6004	8U232AM Serial
 product FTDI SERIAL_232RL	0x6006	FT232RL Serial
 product FTDI SERIAL_2232C	0x6010	FT2232C Dual port Serial
 product FTDI 232H		0x6014	FTDI compatible adapter
 product FTDI 232EX		0x6015	FTDI compatible adapter
 product FTDI SERIAL_2232D	0x9e90	FT2232D Dual port Serial
 product FTDI SERIAL_4232H	0x6011	FT4232H Quad port Serial
 product FTDI XDS100V2		0xa6d0	TI XDS100V1/V2 and early Beaglebones
 product FTDI XDS100V3		0xa6d1	TI XDS100V3
 product FTDI KTLINK		0xbbe2	KT-LINK Embedded Hackers Multitool
 product FTDI TURTELIZER2	0xbdc8	egnite Turtelizer 2 JTAG/RS232 Adapter
 /* Gude Analog- und Digitalsysteme products also uses FTDI's id: */
 product FTDI TACTRIX_OPENPORT_13M 0xcc48 OpenPort 1.3 Mitsubishi
 product FTDI TACTRIX_OPENPORT_13S 0xcc49 OpenPort 1.3 Subaru
 product FTDI TACTRIX_OPENPORT_13U 0xcc4a OpenPort 1.3 Universal
 product FTDI GAMMASCOUT		0xd678	Gamma-Scout
 product FTDI KBS		0xe6c8	Pyramid KBS USB LCD
 product FTDI EISCOU		0xe888	Expert ISDN Control USB
 product FTDI UOPTBR		0xe889	USB-RS232 OptoBridge
 product FTDI EMCU2D		0xe88a	Expert mouseCLOCK USB II
 product FTDI PCMSFU		0xe88b	Precision Clock MSF USB
 product FTDI EMCU2H		0xe88c	Expert mouseCLOCK USB II HBG
 product FTDI MAXSTREAM		0xee18	Maxstream PKG-U
 product FTDI USB_UIRT		0xf850	USB-UIRT
 product FTDI USBSERIAL		0xfa00	Matrix Orbital USB Serial
 product FTDI MX2_3		0xfa01	Matrix Orbital MX2 or MX3
 product FTDI MX4_5		0xfa02	Matrix Orbital MX4 or MX5
 product FTDI LK202		0xfa03	Matrix Orbital VK/LK202 Family
 product FTDI LK204		0xfa04	Matrix Orbital VK/LK204 Family
 product FTDI CFA_632		0xfc08	Crystalfontz CFA-632 USB LCD
 product FTDI CFA_634		0xfc09	Crystalfontz CFA-634 USB LCD
 product FTDI CFA_633		0xfc0b	Crystalfontz CFA-633 USB LCD
 product FTDI CFA_631		0xfc0c	Crystalfontz CFA-631 USB LCD
 product FTDI CFA_635		0xfc0d	Crystalfontz CFA-635 USB LCD
 product FTDI SEMC_DSS20		0xfc82	SEMC DSS-20 SyncStation
 /* Commerzielle und Technische Informationssysteme GmbH products */
 product FTDI CTI_USB_NANO_485	0xf60b	CTI USB-Nano 485
 product FTDI CTI_USB_MINI_485	0xf608	CTI USB-Mini 485
 /* Other products */
 product FTDI 232RL		0xfbfa	FTDI compatible adapter
 product FTDI 4N_GALAXY_DE_1	0xf3c0	FTDI compatible adapter
 product FTDI 4N_GALAXY_DE_2	0xf3c1	FTDI compatible adapter
 product FTDI 4N_GALAXY_DE_3	0xf3c2	FTDI compatible adapter
 product FTDI 8U232AM_ALT	0x6006	FTDI compatible adapter
 product FTDI ACCESSO		0xfad0	FTDI compatible adapter
 product FTDI ACG_HFDUAL		0xdd20	FTDI compatible adapter
 product FTDI ACTIVE_ROBOTS	0xe548	FTDI compatible adapter
 product FTDI ACTZWAVE		0xf2d0	FTDI compatible adapter
 product FTDI AMC232		0xff00	FTDI compatible adapter
 product FTDI ARTEMIS		0xdf28	FTDI compatible adapter
 product FTDI ASK_RDR400		0xc991	FTDI compatible adapter
 product FTDI ATIK_ATK16		0xdf30	FTDI compatible adapter
 product FTDI ATIK_ATK16C	0xdf32	FTDI compatible adapter
 product FTDI ATIK_ATK16HR	0xdf31	FTDI compatible adapter
 product FTDI ATIK_ATK16HRC	0xdf33	FTDI compatible adapter
 product FTDI ATIK_ATK16IC	0xdf35	FTDI compatible adapter
 product FTDI BCS_SE923		0xfb99	FTDI compatible adapter
 product FTDI CANDAPTER		0x9f80	FTDI compatible adapter
 product FTDI CANUSB		0xffa8	FTDI compatible adapter
 product FTDI CCSICDU20_0	0xf9d0	FTDI compatible adapter
 product FTDI CCSICDU40_1	0xf9d1	FTDI compatible adapter
 product FTDI CCSICDU64_4	0xf9d4	FTDI compatible adapter
 product FTDI CCSLOAD_N_GO_3	0xf9d3	FTDI compatible adapter
 product FTDI CCSMACHX_2		0xf9d2	FTDI compatible adapter
 product FTDI CCSPRIME8_5	0xf9d5	FTDI compatible adapter
 product FTDI CHAMSYS_24_MASTER_WING	0xdaf8	FTDI compatible adapter
 product FTDI CHAMSYS_MAXI_WING	0xdafd	FTDI compatible adapter
 product FTDI CHAMSYS_MEDIA_WING	0xdafe	FTDI compatible adapter
 product FTDI CHAMSYS_MIDI_TIMECODE	0xdafb	FTDI compatible adapter
 product FTDI CHAMSYS_MINI_WING	0xdafc	FTDI compatible adapter
 product FTDI CHAMSYS_PC_WING	0xdaf9	FTDI compatible adapter
 product FTDI CHAMSYS_USB_DMX	0xdafa	FTDI compatible adapter
 product FTDI CHAMSYS_WING	0xdaff	FTDI compatible adapter
 product FTDI COM4SM		0xd578	FTDI compatible adapter
 product FTDI CONVERTER_0	0xd388	FTDI compatible adapter
 product FTDI CONVERTER_1	0xd389	FTDI compatible adapter
 product FTDI CONVERTER_2	0xd38a	FTDI compatible adapter
 product FTDI CONVERTER_3	0xd38b	FTDI compatible adapter
 product FTDI CONVERTER_4	0xd38c	FTDI compatible adapter
 product FTDI CONVERTER_5	0xd38d	FTDI compatible adapter
 product FTDI CONVERTER_6	0xd38e	FTDI compatible adapter
 product FTDI CONVERTER_7	0xd38f	FTDI compatible adapter
 product FTDI DMX4ALL		0xc850	FTDI compatible adapter
 product FTDI DOMINTELL_DGQG	0xef50	FTDI compatible adapter
 product FTDI DOMINTELL_DUSB	0xef51	FTDI compatible adapter
 product FTDI DOTEC		0x9868	FTDI compatible adapter
 product FTDI ECLO_COM_1WIRE	0xea90	FTDI compatible adapter
 product FTDI ECO_PRO_CDS	0xe520	FTDI compatible adapter
 product FTDI ELSTER_UNICOM	0xe700	FTDI compatible adapter
 product FTDI ELV_ALC8500	0xf06e	FTDI compatible adapter
 product FTDI ELV_CLI7000	0xfb59	FTDI compatible adapter
 product FTDI ELV_CSI8		0xe0f0	FTDI compatible adapter
 product FTDI ELV_EC3000		0xe006	FTDI compatible adapter
 product FTDI ELV_EM1000DL	0xe0f1	FTDI compatible adapter
 product FTDI ELV_EM1010PC	0xe0ef	FTDI compatible adapter
 product FTDI ELV_FEM		0xe00a	FTDI compatible adapter
 product FTDI ELV_FHZ1000PC	0xf06f	FTDI compatible adapter
 product FTDI ELV_FHZ1300PC	0xe0e8	FTDI compatible adapter
 product FTDI ELV_FM3RX		0xe0ed	FTDI compatible adapter
 product FTDI ELV_FS20SIG	0xe0f4	FTDI compatible adapter
 product FTDI ELV_HS485		0xe0ea	FTDI compatible adapter
 product FTDI ELV_KL100		0xe002	FTDI compatible adapter
 product FTDI ELV_MSM1		0xe001	FTDI compatible adapter
 product FTDI ELV_PCD200		0xf06c	FTDI compatible adapter
 product FTDI ELV_PCK100		0xe0f2	FTDI compatible adapter
 product FTDI ELV_PPS7330	0xfb5c	FTDI compatible adapter
 product FTDI ELV_RFP500		0xe0f3	FTDI compatible adapter
 product FTDI ELV_T1100		0xf06b	FTDI compatible adapter
 product FTDI ELV_TFD128		0xe0ec	FTDI compatible adapter
 product FTDI ELV_TFM100		0xfb5d	FTDI compatible adapter
 product FTDI ELV_TWS550		0xe009	FTDI compatible adapter
 product FTDI ELV_UAD8		0xf068	FTDI compatible adapter
 product FTDI ELV_UDA7		0xf069	FTDI compatible adapter
 product FTDI ELV_UDF77		0xfb5e	FTDI compatible adapter
 product FTDI ELV_UIO88		0xfb5f	FTDI compatible adapter
 product FTDI ELV_ULA200		0xf06d	FTDI compatible adapter
 product FTDI ELV_UM100		0xfb5a	FTDI compatible adapter
 product FTDI ELV_UMS100		0xe0eb	FTDI compatible adapter
 product FTDI ELV_UO100		0xfb5b	FTDI compatible adapter
 product FTDI ELV_UR100		0xfb58	FTDI compatible adapter
 product FTDI ELV_USI2		0xf06a	FTDI compatible adapter
 product FTDI ELV_USR		0xe000	FTDI compatible adapter
 product FTDI ELV_UTP8		0xe0f5	FTDI compatible adapter
 product FTDI ELV_WS300PC	0xe0f6	FTDI compatible adapter
 product FTDI ELV_WS444PC	0xe0f7	FTDI compatible adapter
 product FTDI ELV_WS500		0xe0e9	FTDI compatible adapter
 product FTDI ELV_WS550		0xe004	FTDI compatible adapter
 product FTDI ELV_WS777		0xe0ee	FTDI compatible adapter
 product FTDI ELV_WS888		0xe008	FTDI compatible adapter
 product FTDI FUTURE_0		0xf44a	FTDI compatible adapter
 product FTDI FUTURE_1		0xf44b	FTDI compatible adapter
 product FTDI FUTURE_2		0xf44c	FTDI compatible adapter
 product FTDI GENERIC		0x9378	FTDI compatible adapter
 product FTDI GUDEADS_E808	0xe808	FTDI compatible adapter
 product FTDI GUDEADS_E809	0xe809	FTDI compatible adapter
 product FTDI GUDEADS_E80A	0xe80a	FTDI compatible adapter
 product FTDI GUDEADS_E80B	0xe80b	FTDI compatible adapter
 product FTDI GUDEADS_E80C	0xe80c	FTDI compatible adapter
 product FTDI GUDEADS_E80D	0xe80d	FTDI compatible adapter
 product FTDI GUDEADS_E80E	0xe80e	FTDI compatible adapter
 product FTDI GUDEADS_E80F	0xe80f	FTDI compatible adapter
 product FTDI GUDEADS_E88D	0xe88d	FTDI compatible adapter
 product FTDI GUDEADS_E88E	0xe88e	FTDI compatible adapter
 product FTDI GUDEADS_E88F	0xe88f	FTDI compatible adapter
 product FTDI HD_RADIO		0x937c	FTDI compatible adapter
 product FTDI HO720		0xed72	FTDI compatible adapter
 product FTDI HO730		0xed73	FTDI compatible adapter
 product FTDI HO820		0xed74	FTDI compatible adapter
 product FTDI HO870		0xed71	FTDI compatible adapter
 product FTDI IBS_APP70		0xff3d	FTDI compatible adapter
 product FTDI IBS_PCMCIA		0xff3a	FTDI compatible adapter
 product FTDI IBS_PEDO		0xff3e	FTDI compatible adapter
 product FTDI IBS_PICPRO		0xff39	FTDI compatible adapter
 product FTDI IBS_PK1		0xff3b	FTDI compatible adapter
 product FTDI IBS_PROD		0xff3f	FTDI compatible adapter
 product FTDI IBS_RS232MON	0xff3c	FTDI compatible adapter
 product FTDI IBS_US485		0xff38	FTDI compatible adapter
 product FTDI IPLUS		0xd070	FTDI compatible adapter
 product FTDI IPLUS2		0xd071	FTDI compatible adapter
 product FTDI IRTRANS		0xfc60	FTDI compatible adapter
 product FTDI LENZ_LIUSB		0xd780	FTDI compatible adapter
 product FTDI LM3S_DEVEL_BOARD	0xbcd8	FTDI compatible adapter
 product FTDI LM3S_EVAL_BOARD	0xbcd9	FTDI compatible adapter
 product FTDI LM3S_ICDI_B_BOARD	0xbcda	FTDI compatible adapter
 product FTDI MASTERDEVEL2	0xf449	FTDI compatible adapter
 product FTDI MHAM_DB9		0xeeed	FTDI compatible adapter
 product FTDI MHAM_IC		0xeeec	FTDI compatible adapter
 product FTDI MHAM_KW		0xeee8	FTDI compatible adapter
 product FTDI MHAM_RS232		0xeeee	FTDI compatible adapter
 product FTDI MHAM_Y6		0xeeea	FTDI compatible adapter
 product FTDI MHAM_Y8		0xeeeb	FTDI compatible adapter
 product FTDI MHAM_Y9		0xeeef	FTDI compatible adapter
 product FTDI MHAM_YS		0xeee9	FTDI compatible adapter
 product FTDI MICRO_CHAMELEON	0xcaa0	FTDI compatible adapter
 product FTDI MTXORB_5		0xfa05	FTDI compatible adapter
 product FTDI MTXORB_6		0xfa06	FTDI compatible adapter
 product FTDI NXTCAM		0xabb8	FTDI compatible adapter
 product FTDI OCEANIC		0xf460	FTDI compatible adapter
 product FTDI OOCDLINK		0xbaf8	FTDI compatible adapter
 product FTDI OPENDCC		0xbfd8	FTDI compatible adapter
 product FTDI OPENDCC_GATEWAY	0xbfdb	FTDI compatible adapter
 product FTDI OPENDCC_GBM	0xbfdc	FTDI compatible adapter
 product FTDI OPENDCC_SNIFFER	0xbfd9	FTDI compatible adapter
 product FTDI OPENDCC_THROTTLE	0xbfda	FTDI compatible adapter
 product FTDI PCDJ_DAC2		0xfa88	FTDI compatible adapter
 product FTDI PERLE_ULTRAPORT	0xf0c0	FTDI compatible adapter
 product FTDI PHI_FISCO		0xe40b	FTDI compatible adapter
 product FTDI PIEGROUP		0xf208	FTDI compatible adapter
 product FTDI PROPOX_JTAGCABLEII	0xd738	FTDI compatible adapter
 product FTDI R2000KU_TRUE_RNG	0xfb80	FTDI compatible adapter
 product FTDI R2X0		0xfc71	FTDI compatible adapter
 product FTDI RELAIS		0xfa10	FTDI compatible adapter
 product FTDI REU_TINY		0xed22	FTDI compatible adapter
 product FTDI RMP200		0xe729	FTDI compatible adapter
 product FTDI RM_CANVIEW		0xfd60	FTDI compatible adapter
 product FTDI RRCIRKITS_LOCOBUFFER	0xc7d0	FTDI compatible adapter
 product FTDI SCIENCESCOPE_HS_LOGBOOK	0xff1d	FTDI compatible adapter
 product FTDI SCIENCESCOPE_LOGBOOKML	0xff18	FTDI compatible adapter
 product FTDI SCIENCESCOPE_LS_LOGBOOK	0xff1c	FTDI compatible adapter
 product FTDI SCS_DEVICE_0	0xd010	FTDI compatible adapter
 product FTDI SCS_DEVICE_1	0xd011	FTDI compatible adapter
 product FTDI SCS_DEVICE_2	0xd012	FTDI compatible adapter
 product FTDI SCS_DEVICE_3	0xd013	FTDI compatible adapter
 product FTDI SCS_DEVICE_4	0xd014	FTDI compatible adapter
 product FTDI SCS_DEVICE_5	0xd015	FTDI compatible adapter
 product FTDI SCS_DEVICE_6	0xd016	FTDI compatible adapter
 product FTDI SCS_DEVICE_7	0xd017	FTDI compatible adapter
 product FTDI SDMUSBQSS		0xf448	FTDI compatible adapter
 product FTDI SIGNALYZER_SH2	0xbca2	FTDI compatible adapter
 product FTDI SIGNALYZER_SH4	0xbca4	FTDI compatible adapter
 product FTDI SIGNALYZER_SLITE	0xbca1	FTDI compatible adapter
 product FTDI SIGNALYZER_ST	0xbca0	FTDI compatible adapter
 product FTDI SPECIAL_1		0xfc70	FTDI compatible adapter
 product FTDI SPECIAL_3		0xfc72	FTDI compatible adapter
 product FTDI SPECIAL_4		0xfc73	FTDI compatible adapter
 product FTDI SPROG_II		0xf0c8	FTDI compatible adapter
 product FTDI SR_RADIO		0x9379	FTDI compatible adapter
 product FTDI SUUNTO_SPORTS	0xf680	FTDI compatible adapter
 product FTDI TAVIR_STK500	0xfa33	FTDI compatible adapter
 product FTDI TERATRONIK_D2XX	0xec89	FTDI compatible adapter
 product FTDI TERATRONIK_VCP	0xec88	FTDI compatible adapter
 product FTDI THORLABS		0xfaf0	FTDI compatible adapter
 product FTDI TNC_X		0xebe0	FTDI compatible adapter
 product FTDI TTUSB		0xff20	FTDI compatible adapter
 product FTDI USBX_707		0xf857	FTDI compatible adapter
 product FTDI USINT_CAT		0xb810	FTDI compatible adapter
 product FTDI USINT_RS232	0xb812	FTDI compatible adapter
 product FTDI USINT_WKEY		0xb811	FTDI compatible adapter
 product FTDI VARDAAN		0xf070	FTDI compatible adapter
 product FTDI VNHCPCUSB_D	0xfe38	FTDI compatible adapter
 product FTDI WESTREX_MODEL_777		0xdc00	FTDI compatible adapter
 product FTDI WESTREX_MODEL_8900F	0xdc01	FTDI compatible adapter
 product FTDI XF_547		0xfc0a	FTDI compatible adapter
 product FTDI XF_640		0xfc0e	FTDI compatible adapter
 product FTDI XF_642		0xfc0f	FTDI compatible adapter
 product FTDI XM_RADIO		0x937a	FTDI compatible adapter
 product FTDI YEI_SERVOCENTER31	0xe050	FTDI compatible adapter
 
 /* Fuji photo products */
 product FUJIPHOTO MASS0100	0x0100	Mass Storage
 
 /* Fujitsu protducts */
 product FUJITSU AH_F401U	0x105b	AH-F401U Air H device
 
 /* Fujitsu-Siemens protducts */
 product FUJITSUSIEMENS SCR	0x0009	Fujitsu-Siemens SCR USB Reader
 
 /* Garmin products */
 product GARMIN IQUE_3600	0x0004	iQue 3600
 
 /* Gemalto products */
 product GEMALTO PROXPU		0x5501	Prox-PU/CU RFID Card Reader
 
 /* General Instruments (Motorola) products */
 product GENERALINSTMNTS SB5100	0x5100	SURFboard SB5100 Cable modem
 
 /* Genesys Logic products */
 product GENESYS GL620USB	0x0501	GL620USB Host-Host interface
 product GENESYS GL650		0x0604	GL650 HUB
 product GENESYS GL606		0x0606	USB 2.0 HUB
 product GENESYS GL641USB	0x0700	GL641USB CompactFlash Card Reader
 product GENESYS GL641USB2IDE_2	0x0701	GL641USB USB-IDE Bridge No 2
 product GENESYS GL641USB2IDE	0x0702	GL641USB USB-IDE Bridge
 product GENESYS GL641USB_2	0x0760	GL641USB 6-in-1 Card Reader
 
 /* GIGABYTE products */
 product GIGABYTE GN54G		0x8001	GN-54G
 product GIGABYTE GNBR402W	0x8002	GN-BR402W
 product GIGABYTE GNWLBM101	0x8003	GN-WLBM101
 product GIGABYTE GNWBKG		0x8007	GN-WBKG
 product GIGABYTE GNWB01GS	0x8008	GN-WB01GS
 product GIGABYTE GNWI05GS	0x800a	GN-WI05GS
 
 /* Gigaset products */
 product GIGASET WLAN		0x0701	WLAN
 product GIGASET SMCWUSBTG	0x0710	SMCWUSBT-G
 product GIGASET SMCWUSBTG_NF	0x0711	SMCWUSBT-G (no firmware)
 product GIGASET AR5523		0x0712	AR5523
 product GIGASET AR5523_NF	0x0713	AR5523 (no firmware)
 product GIGASET RT2573		0x0722	RT2573
 product GIGASET RT3070_1	0x0740	RT3070
 product GIGASET RT3070_2	0x0744	RT3070
 product GIGABYTE RT2870_1	0x800b	RT2870
 product GIGABYTE GNWB31N	0x800c	GN-WB31N
 product GIGABYTE GNWB32L	0x800d	GN-WB32L
 
 /* Global Sun Technology product */
 product GLOBALSUN AR5523_1	0x7801	AR5523
 product GLOBALSUN AR5523_1_NF	0x7802	AR5523 (no firmware)
 product GLOBALSUN AR5523_2	0x7811	AR5523
 product GLOBALSUN AR5523_2_NF	0x7812	AR5523 (no firmware)
 
 /* Globespan products */
 product GLOBESPAN PRISM_GT_1	0x2000	PrismGT USB 2.0 WLAN
 product GLOBESPAN PRISM_GT_2	0x2002	PrismGT USB 2.0 WLAN
 
 /* G.Mate, Inc products */
 product GMATE YP3X00		0x1001	YP3X00 PDA
 
 /* GN Otometrics */
 product GNOTOMETRICS USB	0x0010	FTDI compatible adapter
 
 /* GoHubs products */
 product GOHUBS GOCOM232		0x1001	GoCOM232 Serial
 
 /* Good Way Technology products */
 product GOODWAY GWUSB2E		0x6200	GWUSB2E
 product GOODWAY RT2573		0xc019	RT2573
 
 /* Google products */
 product GOOGLE NEXUSONE		0x4e11	Nexus One
 
 /* Gravis products */
 product GRAVIS GAMEPADPRO	0x4001	GamePad Pro
 
 /* GREENHOUSE products */
 product GREENHOUSE KANA21	0x0001	CF-writer with MP3
 
 /* Griffin Technology */
 product GRIFFIN IMATE		0x0405	iMate, ADB Adapter
 
 /* Guillemot Corporation */
 product GUILLEMOT DALEADER	0xa300	DA Leader
 product GUILLEMOT HWGUSB254	0xe000	HWGUSB2-54 WLAN
 product GUILLEMOT HWGUSB254LB	0xe010	HWGUSB2-54-LB
 product GUILLEMOT HWGUSB254V2AP	0xe020	HWGUSB2-54V2-AP
 product GUILLEMOT HWNU300	0xe030	HWNU-300
 product GUILLEMOT HWNUM300	0xe031	HWNUm-300
 product GUILLEMOT HWGUN54	0xe032	HWGUn-54
 product GUILLEMOT HWNUP150	0xe033	HWNUP-150
 
 /* Hagiwara products */
 product HAGIWARA FGSM		0x0002	FlashGate SmartMedia Card Reader
 product HAGIWARA FGCF		0x0003	FlashGate CompactFlash Card Reader
 product HAGIWARA FG		0x0005	FlashGate
 
 /* HAL Corporation products */
 product HAL IMR001		0x0011	Crossam2+USB IR commander
 
 /* Handspring, Inc. */
 product HANDSPRING VISOR	0x0100	Handspring Visor
 product HANDSPRING TREO		0x0200	Handspring Treo
 product HANDSPRING TREO600	0x0300	Handspring Treo 600
 
 /* Hauppauge Computer Works */
 product HAUPPAUGE WINTV_USB_FM	0x4d12	WinTV USB FM
 product HAUPPAUGE2 NOVAT500	0x9580	NovaT 500Stick
 
 /* Hawking Technologies products */
 product HAWKING RT2870_1	0x0001	RT2870
 product HAWKING RT2870_2	0x0003	RT2870
 product HAWKING HWUN2		0x0009	HWUN2
 product HAWKING RT3070		0x000b	RT3070
 product HAWKING RTL8192CU	0x0019  RTL8192CU
 product HAWKING UF100		0x400c	10/100 USB Ethernet
 product HAWKING RTL8192SU_1	0x0015	RTL8192SU
 product HAWKING RTL8192SU_2	0x0016	RTL8192SU
 
 /* HID Global GmbH products */
 product HIDGLOBAL CM2020	0x0596	Omnikey Cardman 2020
 product HIDGLOBAL CM6020	0x1784	Omnikey Cardman 6020
 
 /* Hitachi, Ltd. products */
 product HITACHI DVDCAM_DZ_MV100A	0x0004	DVD-CAM DZ-MV100A Camcorder
 product HITACHI DVDCAM_USB	0x001e	DVDCAM USB HS Interface
 
 /* HP products */
 product HP 895C			0x0004	DeskJet 895C
 product HP 4100C		0x0101	Scanjet 4100C
 product HP S20			0x0102	Photosmart S20
 product HP 880C			0x0104	DeskJet 880C
 product HP 4200C		0x0105	ScanJet 4200C
 product HP CDWRITERPLUS		0x0107	CD-Writer Plus
 product HP KBDHUB		0x010c	Multimedia Keyboard Hub
 product HP G55XI		0x0111	OfficeJet G55xi
 product HP HN210W		0x011c	HN210W 802.11b WLAN
 product HP 49GPLUS		0x0121	49g+ graphing calculator
 product HP 6200C		0x0201	ScanJet 6200C
 product HP S20b			0x0202	PhotoSmart S20
 product HP 815C			0x0204	DeskJet 815C
 product HP 3300C		0x0205	ScanJet 3300C
 product HP CDW8200		0x0207	CD-Writer Plus 8200e
 product HP MMKEYB		0x020c	Multimedia keyboard
 product HP 1220C		0x0212	DeskJet 1220C
 product HP UN2420_QDL		0x241d	UN2420 QDL Firmware Loader
 product HP UN2420		0x251d	UN2420 WWAN/GPS Module
 product HP 810C			0x0304	DeskJet 810C/812C
 product HP 4300C		0x0305	Scanjet 4300C
 product HP CDW4E		0x0307	CD-Writer+ CD-4e
 product HP G85XI		0x0311	OfficeJet G85xi
 product HP 1200			0x0317	LaserJet 1200
 product HP 5200C		0x0401	Scanjet 5200C
 product HP 830C			0x0404	DeskJet 830C
 product HP 3400CSE		0x0405	ScanJet 3400cse
 product HP 6300C		0x0601	Scanjet 6300C
 product HP 840C			0x0604	DeskJet 840c
 product HP 2200C		0x0605	ScanJet 2200C
 product HP 5300C		0x0701	Scanjet 5300C
 product HP 4400C		0x0705	Scanjet 4400C
 product	HP 4470C		0x0805	Scanjet 4470C
 product HP 82x0C		0x0b01	Scanjet 82x0C
 product HP 2300D 		0x0b17	Laserjet 2300d
 product HP 970CSE		0x1004	Deskjet 970Cse
 product HP 5400C		0x1005	Scanjet 5400C
 product HP 2215			0x1016	iPAQ 22xx/Jornada 548
 product HP 568J			0x1116	Jornada 568
 product HP 930C			0x1204	DeskJet 930c
 product HP3 RTL8188CU		0x1629	RTL8188CU
 product HP P2000U		0x1801	Inkjet P-2000U
 product HP HS2300		0x1e1d  HS2300 HSDPA (aka MC8775)
 product HP 640C			0x2004	DeskJet 640c
 product HP 4670V		0x3005	ScanJet 4670v
 product HP P1100		0x3102	Photosmart P1100
 product HP LD220		0x3524	LD220 POS Display
 product HP OJ4215		0x3d11	OfficeJet 4215
 product HP HN210E		0x811c	Ethernet HN210E
 product HP2 C500		0x6002	PhotoSmart C500
 product HP EV2200		0x1b1d  ev2200 HSDPA (aka MC5720)
 product HP HS2300		0x1e1d  hs2300 HSDPA (aka MC8775)
 
 /* HTC products */
 product HTC WINMOBILE		0x00ce	HTC USB Sync
 product HTC PPC6700MODEM	0x00cf	PPC6700 Modem
 product HTC SMARTPHONE		0x0a51	SmartPhone USB Sync
 product HTC WIZARD		0x0bce	HTC Wizard USB Sync
 product HTC LEGENDSYNC		0x0c97	HTC Legend USB Sync
 product HTC LEGEND		0x0ff9	HTC Legend
 product HTC LEGENDINTERNET	0x0ffe	HTC Legend Internet Sharing
 
 /* HUAWEI products */
 product HUAWEI MOBILE		0x1001	Huawei Mobile
 product HUAWEI E220		0x1003	HSDPA modem
 product HUAWEI E220BIS		0x1004	HSDPA modem
 product HUAWEI E1401		0x1401	3G modem
 product HUAWEI E1402		0x1402	3G modem
 product HUAWEI E1403		0x1403	3G modem
 product HUAWEI E1404		0x1404	3G modem
 product HUAWEI E1405		0x1405	3G modem
 product HUAWEI E1406		0x1406	3G modem
 product HUAWEI E1407		0x1407	3G modem
 product HUAWEI E1408		0x1408	3G modem
 product HUAWEI E1409		0x1409	3G modem
 product HUAWEI E140A		0x140a	3G modem
 product HUAWEI E140B		0x140b	3G modem
 product HUAWEI E180V		0x140c	E180V
 product HUAWEI E140D		0x140d	3G modem
 product HUAWEI E140E		0x140e	3G modem
 product HUAWEI E140F		0x140f	3G modem
 product HUAWEI E1410		0x1410	3G modem
 product HUAWEI E1411		0x1411	3G modem
 product HUAWEI E1412		0x1412	3G modem
 product HUAWEI E1413		0x1413	3G modem
 product HUAWEI E1414		0x1414	3G modem
 product HUAWEI E1415		0x1415	3G modem
 product HUAWEI E1416		0x1416	3G modem
 product HUAWEI E1417		0x1417	3G modem
 product HUAWEI E1418		0x1418	3G modem
 product HUAWEI E1419		0x1419	3G modem
 product HUAWEI E141A		0x141a	3G modem
 product HUAWEI E141B		0x141b	3G modem
 product HUAWEI E141C		0x141c	3G modem
 product HUAWEI E141D		0x141d	3G modem
 product HUAWEI E141E		0x141e	3G modem
 product HUAWEI E141F		0x141f	3G modem
 product HUAWEI E1420		0x1420	3G modem
 product HUAWEI E1421		0x1421	3G modem
 product HUAWEI E1422		0x1422	3G modem
 product HUAWEI E1423		0x1423	3G modem
 product HUAWEI E1424		0x1424	3G modem
 product HUAWEI E1425		0x1425	3G modem
 product HUAWEI E1426		0x1426	3G modem
 product HUAWEI E1427		0x1427	3G modem
 product HUAWEI E1428		0x1428	3G modem
 product HUAWEI E1429		0x1429	3G modem
 product HUAWEI E142A		0x142a	3G modem
 product HUAWEI E142B		0x142b	3G modem
 product HUAWEI E142C		0x142c	3G modem
 product HUAWEI E142D		0x142d	3G modem
 product HUAWEI E142E		0x142e	3G modem
 product HUAWEI E142F		0x142f	3G modem
 product HUAWEI E1430		0x1430	3G modem
 product HUAWEI E1431		0x1431	3G modem
 product HUAWEI E1432		0x1432	3G modem
 product HUAWEI E1433		0x1433	3G modem
 product HUAWEI E1434		0x1434	3G modem
 product HUAWEI E1435		0x1435	3G modem
 product HUAWEI E1436		0x1436	3G modem
 product HUAWEI E1437		0x1437	3G modem
 product HUAWEI E1438		0x1438	3G modem
 product HUAWEI E1439		0x1439	3G modem
 product HUAWEI E143A		0x143a	3G modem
 product HUAWEI E143B		0x143b	3G modem
 product HUAWEI E143C		0x143c	3G modem
 product HUAWEI E143D		0x143d	3G modem
 product HUAWEI E143E		0x143e	3G modem
 product HUAWEI E143F		0x143f	3G modem
 product HUAWEI E1752		0x1446	3G modem
 product HUAWEI K4505		0x1464	3G modem
 product HUAWEI K3765		0x1465	3G modem
 product HUAWEI E1820		0x14ac	E1820 HSPA+ USB Slider
 product HUAWEI K3770		0x14c9  3G modem
 product HUAWEI K3772		0x14cf	K3772
 product HUAWEI K3770_INIT	0x14d1  K3770 Initial
 product HUAWEI E3131_INIT	0x14fe	3G modem initial
 product HUAWEI E392		0x1505	LTE modem
 product HUAWEI E3131		0x1506	3G modem
 product HUAWEI K3765_INIT	0x1520	K3765 Initial
 product HUAWEI K4505_INIT	0x1521	K4505 Initial
 product HUAWEI K3772_INIT	0x1526	K3772 Initial
 product HUAWEI E3272_INIT	0x155b	LTE modem initial
 product HUAWEI R215_INIT	0x1582	LTE modem initial
 product HUAWEI R215		0x1588	LTE modem
 product HUAWEI ETS2055		0x1803	CDMA modem
 product HUAWEI E173		0x1c05	3G modem
 product HUAWEI E173_INIT	0x1c0b	3G modem initial
 product HUAWEI E3272		0x1c1e	LTE modem
 
 /* HUAWEI 3com products */
 product HUAWEI3COM WUB320G	0x0009	Aolynk WUB320g
 
 /* IBM Corporation */
 product IBM USBCDROMDRIVE	0x4427	USB CD-ROM Drive
 
 /* Icom products */
 product ICOM SP1		0x0004	FTDI compatible adapter
 product ICOM OPC_U_UC		0x0018	FTDI compatible adapter
 product ICOM RP2C1		0x0009	FTDI compatible adapter
 product ICOM RP2C2		0x000a	FTDI compatible adapter
 product ICOM RP2D		0x000b	FTDI compatible adapter
 product ICOM RP2KVR		0x0013	FTDI compatible adapter
 product ICOM RP2KVT		0x0012	FTDI compatible adapter
 product ICOM RP2VR		0x000d	FTDI compatible adapter
 product ICOM RP2VT		0x000c	FTDI compatible adapter
 product ICOM RP4KVR		0x0011	FTDI compatible adapter
 product ICOM RP4KVT		0x0010	FTDI compatible adapter
 
 /* ID-tech products */
 product IDTECH IDT1221U		0x0300	FTDI compatible adapter
 
 /* Imagination Technologies products */
 product IMAGINATION DBX1	0x2107	DBX1 DSP core
 
 /* Initio Corporation products */
 product INITIO DUMMY		0x0000	Dummy product
 product INITIO INIC_1610P	0x1e40	USB to SATA Bridge
 
 /* Inside Out Networks products */
 product INSIDEOUT EDGEPORT4	0x0001	EdgePort/4 serial ports
 
 /* In-System products */
 product INSYSTEM F5U002		0x0002	Parallel printer
 product INSYSTEM ATAPI		0x0031	ATAPI Adapter
 product INSYSTEM ISD110		0x0200	IDE Adapter ISD110
 product INSYSTEM ISD105		0x0202	IDE Adapter ISD105
 product INSYSTEM USBCABLE	0x081a	USB cable
 product INSYSTEM STORAGE_V2	0x5701	USB Storage Adapter V2
 
 /* Intel products */
 product INTEL EASYPC_CAMERA	0x0110	Easy PC Camera
 product INTEL TESTBOARD		0x9890	82930 test board
 product INTEL2 IRMH        	0x0020	Integrated Rate Matching Hub
 product INTEL2 IRMH2        	0x0024	Integrated Rate Matching Hub
 
 /* Interbiometric products */
 product INTERBIOMETRICS IOBOARD		0x1002	FTDI compatible adapter
 product INTERBIOMETRICS MINI_IOBOARD		0x1006	FTDI compatible adapter
 
 /* Intersil products */
 product INTERSIL PRISM_GT	0x1000	PrismGT USB 2.0 WLAN
 product INTERSIL PRISM_2X	0x3642	Prism2.x or Atmel WLAN
 
 /* Interpid Control Systems products */
 product INTREPIDCS VALUECAN	0x0601	ValueCAN CAN bus interface
 product INTREPIDCS NEOVI	0x0701	NeoVI Blue vehicle bus interface
 
 /* I/O DATA products */
 product IODATA IU_CD2		0x0204	DVD Multi-plus unit iU-CD2
 product IODATA DVR_UEH8		0x0206	DVD Multi-plus unit DVR-UEH8
 product IODATA USBSSMRW		0x0314	USB-SSMRW SD-card
 product IODATA USBSDRW		0x031e	USB-SDRW SD-card
 product IODATA USBETT		0x0901	USB ETT
 product IODATA USBETTX		0x0904	USB ETTX
 product IODATA USBETTXS		0x0913	USB ETTX
 product IODATA USBWNB11A	0x0919	USB WN-B11
 product IODATA USBWNB11		0x0922	USB Airport WN-B11
 product IODATA ETGUS2		0x0930	ETG-US2
 product IODATA RT3072_1		0x0944	RT3072
 product IODATA RT3072_2		0x0945	RT3072
 product IODATA RT3072_3		0x0947	RT3072
 product IODATA RT3072_4		0x0948	RT3072
 product IODATA USBRSAQ		0x0a03	Serial USB-RSAQ1
 product IODATA USBRSAQ5		0x0a0e  Serial USB-RSAQ5
 product IODATA2 USB2SC		0x0a09	USB2.0-SCSI Bridge USB2-SC
 
 /* Iomega products */
 product IOMEGA ZIP100		0x0001	Zip 100
 product IOMEGA ZIP250		0x0030	Zip 250
 
 /* Ionic products */
 product IONICS PLUGCOMPUTER		0x0102	FTDI compatible adapter
 
 /* Integrated System Solution Corp. products */
 product ISSC ISSCBTA		0x1001	Bluetooth USB Adapter
 
 /* iTegno products */
 product ITEGNO WM1080A		0x1080	WM1080A GSM/GPRS modem
 product ITEGNO WM2080A		0x2080	WM2080A CDMA modem
 
 /* Ituner networks products */
 product ITUNERNET USBLCD2X20	0x0002	USB-LCD 2x20
 product ITUNERNET USBLCD4X20	0xc001	USB-LCD 4x20
 
 /* Jablotron products */
 product JABLOTRON PC60B		0x0001	PC-60B
 
 /* Jaton products */
 product JATON EDA		0x5704	Ethernet
 
 /* Jeti products */
 product JETI SPC1201		0x04b2	FTDI compatible adapter
 
 /* JMicron products */
 product JMICRON JM20336		0x2336	USB to SATA Bridge
 product JMICRON JM20337		0x2338	USB to ATA/ATAPI Bridge
 
 /* JVC products */
 product JVC GR_DX95		0x000a	GR-DX95
 product JVC MP_PRX1		0x3008	MP-PRX1 Ethernet
 
 /* JRC products */
 product JRC AH_J3001V_J3002V	0x0001	AirH PHONE AH-J3001V/J3002V
 
 /* Kamstrrup products */
 product KAMSTRUP OPTICALEYE	0x0001	Optical Eye/3-wire
 product KAMSTRUP MBUS_250D	0x0005	M-Bus Master MultiPort 250D
 
 /* Kawatsu products */
 product KAWATSU MH4000P		0x0003	MiniHub 4000P
 
 /* Keisokugiken Corp. products */
 product KEISOKUGIKEN USBDAQ	0x0068	HKS-0200 USBDAQ
 
 /* Kensington products */
 product KENSINGTON ORBIT	0x1003	Orbit USB/PS2 trackball
 product KENSINGTON TURBOBALL	0x1005	TurboBall
 
 /* Keyspan products */
 product KEYSPAN USA28_NF	0x0101	USA-28 serial Adapter (no firmware)
 product KEYSPAN USA28X_NF	0x0102	USA-28X serial Adapter (no firmware)
 product KEYSPAN USA19_NF	0x0103	USA-19 serial Adapter (no firmware)
 product KEYSPAN USA18_NF	0x0104	USA-18 serial Adapter (no firmware)
 product KEYSPAN USA18X_NF	0x0105	USA-18X serial Adapter (no firmware)
 product KEYSPAN USA19W_NF	0x0106	USA-19W serial Adapter (no firmware)
 product KEYSPAN USA19		0x0107	USA-19 serial Adapter
 product KEYSPAN USA19W		0x0108	USA-19W serial Adapter
 product KEYSPAN USA49W_NF	0x0109	USA-49W serial Adapter (no firmware)
 product KEYSPAN USA49W		0x010a	USA-49W serial Adapter
 product KEYSPAN USA19QI_NF	0x010b	USA-19QI serial Adapter (no firmware)
 product KEYSPAN USA19QI		0x010c	USA-19QI serial Adapter
 product KEYSPAN USA19Q_NF	0x010d	USA-19Q serial Adapter (no firmware)
 product KEYSPAN USA19Q		0x010e	USA-19Q serial Adapter
 product KEYSPAN USA28		0x010f	USA-28 serial Adapter
 product KEYSPAN USA28XXB	0x0110	USA-28X/XB serial Adapter
 product KEYSPAN USA18		0x0111	USA-18 serial Adapter
 product KEYSPAN USA18X		0x0112	USA-18X serial Adapter
 product KEYSPAN USA28XB_NF	0x0113	USA-28XB serial Adapter (no firmware)
 product KEYSPAN USA28XA_NF	0x0114	USA-28XB serial Adapter (no firmware)
 product KEYSPAN USA28XA		0x0115	USA-28XA serial Adapter
 product KEYSPAN USA18XA_NF	0x0116	USA-18XA serial Adapter (no firmware)
 product KEYSPAN USA18XA		0x0117	USA-18XA serial Adapter
 product KEYSPAN USA19QW_NF	0x0118	USA-19WQ serial Adapter (no firmware)
 product KEYSPAN USA19QW		0x0119	USA-19WQ serial Adapter
 product KEYSPAN USA19HA		0x0121	USA-19HS serial Adapter
 product KEYSPAN UIA10		0x0201	UIA-10 remote control
 product KEYSPAN UIA11		0x0202	UIA-11 remote control
 
 /* Kingston products */
 product KINGSTON XX1		0x0008	Ethernet
 product KINGSTON KNU101TX	0x000a	KNU101TX USB Ethernet
 product KINGSTON HYPERX3_0	0x162b	DT HyperX 3.0
 
 /* Kawasaki products */
 product KLSI DUH3E10BT		0x0008	USB Ethernet
 product KLSI DUH3E10BTN		0x0009	USB Ethernet
 
 /* Kobil products */
 product KOBIL CONV_B1		0x2020	FTDI compatible adapter
 product KOBIL CONV_KAAN		0x2021	FTDI compatible adapter
 
 /* Kodak products */
 product KODAK DC220		0x0100	Digital Science DC220
 product KODAK DC260		0x0110	Digital Science DC260
 product KODAK DC265		0x0111	Digital Science DC265
 product KODAK DC290		0x0112	Digital Science DC290
 product KODAK DC240		0x0120	Digital Science DC240
 product KODAK DC280		0x0130	Digital Science DC280
 
 /* Kontron AG products */
 product KONTRON DM9601		0x8101	USB Ethernet
 product KONTRON JP1082		0x9700	USB Ethernet
 
 /* Konica Corp. Products */
 product KONICA CAMERA		0x0720	Digital Color Camera
 
 /* KYE products */
 product KYE NICHE		0x0001	Niche mouse
 product KYE NETSCROLL		0x0003	Genius NetScroll mouse
 product KYE FLIGHT2000		0x1004	Flight 2000 joystick
 product KYE VIVIDPRO		0x2001	ColorPage Vivid-Pro scanner
 
 /* Kyocera products */
 product KYOCERA FINECAM_S3X	0x0100	Finecam S3x
 product KYOCERA FINECAM_S4	0x0101	Finecam S4
 product KYOCERA FINECAM_S5	0x0103	Finecam S5
 product KYOCERA FINECAM_L3	0x0105	Finecam L3
 product KYOCERA AHK3001V	0x0203	AH-K3001V
 product KYOCERA2 CDMA_MSM_K	0x17da	Qualcomm Kyocera CDMA Technologies MSM
 product KYOCERA2 KPC680		0x180a	Qualcomm Kyocera CDMA Technologies MSM
 
 /* LaCie products */
 product LACIE HD		0xa601	Hard Disk
 product LACIE CDRW		0xa602	CD R/W
 
 /* Lake Shore Cryotronics products */
 product LAKESHORE 121		0x0100	121 Current Source
 product LAKESHORE 218A		0x0200	218A Temperature Monitor
 product LAKESHORE 219		0x0201	219 Temperature Monitor
 product LAKESHORE 233		0x0202	233 Temperature Transmitter
 product LAKESHORE 235		0x0203	235 Temperature Transmitter
 product LAKESHORE 335		0x0300	335 Temperature Controller
 product LAKESHORE 336		0x0301	336 Temperature Controller
 product LAKESHORE 350		0x0302	350 Temperature Controller
 product LAKESHORE 371		0x0303	371 AC Bridge
 product LAKESHORE 411		0x0400	411 Handheld Gaussmeter
 product LAKESHORE 425		0x0401	425 Gaussmeter
 product LAKESHORE 455A		0x0402	455A DSP Gaussmeter
 product LAKESHORE 475A		0x0403	475A DSP Gaussmeter
 product LAKESHORE 465		0x0404	465 Gaussmeter
 product LAKESHORE 625A		0x0600	625A Magnet PSU
 product LAKESHORE 642A		0x0601	642A Magnet PSU
 product LAKESHORE 648		0x0602	648 Magnet PSU
 product LAKESHORE 737		0x0700	737 VSM Controller
 product LAKESHORE 776		0x0701	776 Matrix Switch
 
 /* Larsen and Brusgaard products */
 product LARSENBRUSGAARD ALTITRACK	0x0001	FTDI compatible adapter
 
 /* Leadtek products */
 product LEADTEK 9531		0x2101	9531 GPS
 
 /* Lenovo products */
 product LENOVO ETHERNET		0x7203	USB 2.0 Ethernet
 
 /* Lexar products */
 product LEXAR JUMPSHOT		0x0001	jumpSHOT CompactFlash Reader
 product LEXAR CF_READER		0xb002	USB CF Reader
 
 /* Lexmark products */
 product LEXMARK S2450		0x0009	Optra S 2450
 
 /* Liebert products */
 product LIEBERT POWERSURE_PXT	0xffff	PowerSure Personal XT
 product LIEBERT2 PSI1000	0x0004	UPS PSI 1000 FW:08
 
 /* Link Instruments Inc. products */
 product	LINKINSTRUMENTS MSO19	0xf190	Link Instruments MSO-19
 product	LINKINSTRUMENTS MSO28	0xf280	Link Instruments MSO-28
 product	LINKINSTRUMENTS MSO28_2	0xf281	Link Instruments MSO-28
 
 /* Linksys products */
 product LINKSYS MAUSB2		0x0105	Camedia MAUSB-2
 product LINKSYS USB10TX1	0x200c	USB10TX
 product LINKSYS USB10T		0x2202	USB10T Ethernet
 product LINKSYS USB100TX	0x2203	USB100TX Ethernet
 product LINKSYS USB100H1	0x2204	USB100H1 Ethernet/HPNA
 product LINKSYS USB10TA		0x2206	USB10TA Ethernet
 product LINKSYS USB10TX2	0x400b	USB10TX
 product LINKSYS2 WUSB11		0x2219	WUSB11 Wireless Adapter
 product LINKSYS2 USB200M	0x2226	USB 2.0 10/100 Ethernet
 product LINKSYS3 WUSB11v28	0x2233	WUSB11 v2.8 Wireless Adapter
 product LINKSYS4 USB1000	0x0039	USB1000
 product LINKSYS4 WUSB100	0x0070	WUSB100
 product LINKSYS4 WUSB600N	0x0071	WUSB600N
 product LINKSYS4 WUSB54GCV2	0x0073	WUSB54GC v2
 product LINKSYS4 WUSB54GCV3	0x0077	WUSB54GC v3
 product LINKSYS4 RT3070		0x0078	RT3070
 product LINKSYS4 WUSB600NV2	0x0079	WUSB600N v2
 
 /* Logilink products */
 product LOGILINK DUMMY		0x0000	Dummy product
 product LOGILINK U2M		0x0101	LogiLink USB MIDI Cable
 
 /* Logitech products */
 product LOGITECH M2452		0x0203	M2452 keyboard
 product LOGITECH M4848		0x0301	M4848 mouse
 product LOGITECH PAGESCAN	0x040f	PageScan
 product LOGITECH QUICKCAMWEB	0x0801	QuickCam Web
 product LOGITECH QUICKCAMPRO	0x0810	QuickCam Pro
 product LOGITECH WEBCAMC100	0X0817	Webcam C100
 product LOGITECH QUICKCAMEXP	0x0840	QuickCam Express
 product LOGITECH QUICKCAM	0x0850	QuickCam
 product LOGITECH QUICKCAMPRO3	0x0990	QuickCam Pro 9000
 product LOGITECH N43		0xc000	N43
 product LOGITECH N48		0xc001	N48 mouse
 product LOGITECH MBA47		0xc002	M-BA47 mouse
 product LOGITECH WMMOUSE	0xc004	WingMan Gaming Mouse
 product LOGITECH BD58		0xc00c	BD58 mouse
 product LOGITECH UN58A		0xc030	iFeel Mouse
 product LOGITECH UN53B		0xc032	iFeel MouseMan
 product LOGITECH WMPAD		0xc208	WingMan GamePad Extreme
 product LOGITECH WMRPAD		0xc20a	WingMan RumblePad
 product LOGITECH WMJOY		0xc281	WingMan Force joystick
 product LOGITECH BB13		0xc401	USB-PS/2 Trackball
 product LOGITECH RK53		0xc501	Cordless mouse
 product LOGITECH RB6		0xc503	Cordless keyboard
 product LOGITECH MX700		0xc506	Cordless optical mouse
 product LOGITECH QUICKCAMPRO2	0xd001	QuickCam Pro
 
 /* Logitec Corp. products */
 product LOGITEC LDR_H443SU2	0x0033	DVD Multi-plus unit LDR-H443SU2
 product LOGITEC LDR_H443U2	0x00b3	DVD Multi-plus unit LDR-H443U2
 product LOGITEC LAN_GTJU2A	0x0160	LAN-GTJ/U2A Ethernet
 product LOGITEC RT2870_1	0x0162	RT2870
 product LOGITEC RT2870_2	0x0163	RT2870
 product LOGITEC RT2870_3	0x0164	RT2870
 product LOGITEC LANW300NU2	0x0166  LAN-W300N/U2
 product LOGITEC LANW150NU2	0x0168  LAN-W150N/U2
 product LOGITEC LANW300NU2S	0x0169	LAN-W300N/U2S
 
 /* Longcheer Holdings, Ltd. products */
 product LONGCHEER WM66		0x6061	Longcheer WM66 HSDPA
 product LONGCHEER W14		0x9603	Mobilcom W14
 product LONGCHEER DISK		0xf000	Driver disk
 product LONGCHEER XSSTICK	0x9605	4G Systems XSStick P14
 
 
 /* Lucent products */
 product LUCENT EVALKIT		0x1001	USS-720 evaluation kit
 
 /* Luwen products */
 product LUWEN EASYDISK		0x0005	EasyDisc
 
 /* Macally products */
 product MACALLY MOUSE1		0x0101	mouse
 
 /* Mag-Tek products */
 product MAGTEK USBSWIPE		0x0002	USB Mag Stripe Swipe Reader
 
 /* Marvell Technology Group, Ltd. products */
 product MARVELL SHEEVAPLUG	0x9e8f	SheevaPlug serial interface
 
 /* Matrix Orbital products */
 product MATRIXORBITAL FTDI_RANGE_0100		0x0100	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0101		0x0101	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0102		0x0102	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0103		0x0103	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0104		0x0104	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0105		0x0105	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0106		0x0106	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0107		0x0107	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0108		0x0108	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0109		0x0109	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_010A		0x010a	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_010B		0x010b	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_010C		0x010c	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_010D		0x010d	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_010E		0x010e	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_010F		0x010f	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0110		0x0110	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0111		0x0111	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0112		0x0112	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0113		0x0113	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0114		0x0114	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0115		0x0115	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0116		0x0116	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0117		0x0117	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0118		0x0118	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0119		0x0119	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_011A		0x011a	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_011B		0x011b	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_011C		0x011c	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_011D		0x011d	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_011E		0x011e	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_011F		0x011f	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0120		0x0120	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0121		0x0121	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0122		0x0122	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0123		0x0123	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0124		0x0124	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0125		0x0125	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0126		0x0126	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0128		0x0128	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0129		0x0129	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_012A		0x012a	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_012B		0x012b	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_012D		0x012d	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_012E		0x012e	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_012F		0x012f	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0130		0x0130	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0131		0x0131	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0132		0x0132	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0133		0x0133	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0134		0x0134	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0135		0x0135	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0136		0x0136	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0137		0x0137	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0138		0x0138	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0139		0x0139	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_013A		0x013a	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_013B		0x013b	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_013C		0x013c	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_013D		0x013d	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_013E		0x013e	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_013F		0x013f	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0140		0x0140	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0141		0x0141	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0142		0x0142	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0143		0x0143	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0144		0x0144	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0145		0x0145	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0146		0x0146	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0147		0x0147	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0148		0x0148	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0149		0x0149	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_014A		0x014a	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_014B		0x014b	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_014C		0x014c	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_014D		0x014d	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_014E		0x014e	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_014F		0x014f	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0150		0x0150	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0151		0x0151	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0152		0x0152	FTDI compatible adapter
 product MATRIXORBITAL MOUA	0x0153	Martrix Orbital MOU-Axxxx LCD displays
 product MATRIXORBITAL FTDI_RANGE_0159		0x0159	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_015A		0x015a	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_015B		0x015b	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_015C		0x015c	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_015D		0x015d	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_015E		0x015e	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_015F		0x015f	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0160		0x0160	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0161		0x0161	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0162		0x0162	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0163		0x0163	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0164		0x0164	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0165		0x0165	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0166		0x0166	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0167		0x0167	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0168		0x0168	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0169		0x0169	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_016A		0x016a	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_016B		0x016b	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_016C		0x016c	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_016D		0x016d	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_016E		0x016e	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_016F		0x016f	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0170		0x0170	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0171		0x0171	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0172		0x0172	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0173		0x0173	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0174		0x0174	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0175		0x0175	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0176		0x0176	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0177		0x0177	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0178		0x0178	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0179		0x0179	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_017A		0x017a	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_017B		0x017b	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_017C		0x017c	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_017D		0x017d	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_017E		0x017e	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_017F		0x017f	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0180		0x0180	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0181		0x0181	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0182		0x0182	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0183		0x0183	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0184		0x0184	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0185		0x0185	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0186		0x0186	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0187		0x0187	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0188		0x0188	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0189		0x0189	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_018A		0x018a	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_018B		0x018b	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_018C		0x018c	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_018D		0x018d	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_018E		0x018e	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_018F		0x018f	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0190		0x0190	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0191		0x0191	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0192		0x0192	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0193		0x0193	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0194		0x0194	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0195		0x0195	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0196		0x0196	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0197		0x0197	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0198		0x0198	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_0199		0x0199	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_019A		0x019a	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_019B		0x019b	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_019C		0x019c	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_019D		0x019d	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_019E		0x019e	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_019F		0x019f	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01A0		0x01a0	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01A1		0x01a1	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01A2		0x01a2	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01A3		0x01a3	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01A4		0x01a4	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01A5		0x01a5	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01A6		0x01a6	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01A7		0x01a7	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01A8		0x01a8	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01A9		0x01a9	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01AA		0x01aa	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01AB		0x01ab	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01AC		0x01ac	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01AD		0x01ad	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01AE		0x01ae	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01AF		0x01af	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01B0		0x01b0	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01B1		0x01b1	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01B2		0x01b2	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01B3		0x01b3	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01B4		0x01b4	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01B5		0x01b5	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01B6		0x01b6	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01B7		0x01b7	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01B8		0x01b8	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01B9		0x01b9	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01BA		0x01ba	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01BB		0x01bb	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01BC		0x01bc	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01BD		0x01bd	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01BE		0x01be	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01BF		0x01bf	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01C0		0x01c0	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01C1		0x01c1	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01C2		0x01c2	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01C3		0x01c3	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01C4		0x01c4	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01C5		0x01c5	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01C6		0x01c6	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01C7		0x01c7	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01C8		0x01c8	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01C9		0x01c9	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01CA		0x01ca	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01CB		0x01cb	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01CC		0x01cc	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01CD		0x01cd	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01CE		0x01ce	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01CF		0x01cf	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01D0		0x01d0	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01D1		0x01d1	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01D2		0x01d2	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01D3		0x01d3	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01D4		0x01d4	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01D5		0x01d5	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01D6		0x01d6	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01D7		0x01d7	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01D8		0x01d8	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01D9		0x01d9	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01DA		0x01da	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01DB		0x01db	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01DC		0x01dc	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01DD		0x01dd	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01DE		0x01de	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01DF		0x01df	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01E0		0x01e0	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01E1		0x01e1	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01E2		0x01e2	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01E3		0x01e3	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01E4		0x01e4	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01E5		0x01e5	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01E6		0x01e6	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01E7		0x01e7	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01E8		0x01e8	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01E9		0x01e9	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01EA		0x01ea	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01EB		0x01eb	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01EC		0x01ec	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01ED		0x01ed	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01EE		0x01ee	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01EF		0x01ef	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01F0		0x01f0	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01F1		0x01f1	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01F2		0x01f2	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01F3		0x01f3	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01F4		0x01f4	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01F5		0x01f5	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01F6		0x01f6	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01F7		0x01f7	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01F8		0x01f8	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01F9		0x01f9	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01FA		0x01fa	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01FB		0x01fb	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01FC		0x01fc	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01FD		0x01fd	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01FE		0x01fe	FTDI compatible adapter
 product MATRIXORBITAL FTDI_RANGE_01FF		0x01ff	FTDI compatible adapter
 
 /* MCT Corp. */
 product MCT HUB0100		0x0100	Hub
 product MCT DU_H3SP_USB232	0x0200	D-Link DU-H3SP USB BAY Hub
 product MCT USB232		0x0210	USB-232 Interface
 product MCT SITECOM_USB232	0x0230	Sitecom USB-232 Products
 
 /* Medeli */
 product MEDELI DD305		0x5011	DD305 Digital Drum Set
 
 /* MediaTek, Inc. */
 product MEDIATEK MTK3329	0x3329	MTK II GPS Receiver
 
 /* Meizu Electronics */
 product MEIZU M6_SL		0x0140	MiniPlayer M6 (SL)
 
 /* Melco, Inc products */
 product MELCO LUATX1		0x0001	LUA-TX Ethernet
 product MELCO LUATX5		0x0005	LUA-TX Ethernet
 product MELCO LUA2TX5		0x0009	LUA2-TX Ethernet
 product MELCO LUAKTX		0x0012	LUA-KTX Ethernet
 product MELCO DUBPXXG		0x001c	DUB-PxxG
 product MELCO LUAU2KTX		0x003d	LUA-U2-KTX Ethernet
 product MELCO KG54YB		0x005e	WLI-U2-KG54-YB WLAN
 product MELCO KG54		0x0066	WLI-U2-KG54 WLAN
 product MELCO KG54AI		0x0067	WLI-U2-KG54-AI WLAN
 product MELCO LUA3U2AGT		0x006e	LUA3-U2-AGT
 product MELCO NINWIFI		0x008b	Nintendo Wi-Fi
 product MELCO PCOPRS1		0x00b3	PC-OP-RS1 RemoteStation
 product MELCO SG54HP		0x00d8	WLI-U2-SG54HP
 product MELCO G54HP		0x00d9	WLI-U2-G54HP
 product MELCO KG54L		0x00da	WLI-U2-KG54L
 product MELCO WLIUCG300N	0x00e8	WLI-UC-G300N
 product MELCO SG54HG		0x00f4	WLI-U2-SG54HG
 product MELCO WLRUCG		0x0116	WLR-UC-G
 product MELCO WLRUCGAOSS	0x0119	WLR-UC-G-AOSS
 product MELCO WLIUCAG300N	0x012e	WLI-UC-AG300N
 product MELCO WLIUCG		0x0137	WLI-UC-G
 product MELCO RT2870_1		0x0148	RT2870
 product MELCO RT2870_2		0x0150	RT2870
 product MELCO WLIUCGN		0x015d	WLI-UC-GN
 product MELCO WLIUCG301N	0x016f	WLI-UC-G301N
 product MELCO WLIUCGNM		0x01a2	WLI-UC-GNM
 product MELCO WLIUCGNM2		0x01ee	WLI-UC-GNM2
 
 /* Merlin products */
 product MERLIN V620             0x1110  Merlin V620
 
 /* MetaGeek products */
 product METAGEEK TELLSTICK	0x0c30	FTDI compatible adapter
 product METAGEEK WISPY1B	0x083e	MetaGeek Wi-Spy
 product METAGEEK WISPY24X	0x083f	MetaGeek Wi-Spy 2.4x
 product METAGEEK2 WISPYDBX	0x5000	MetaGeek Wi-Spy DBx
 
 /* Metricom products */
 product METRICOM RICOCHET_GS	0x0001	Ricochet GS
 
 /* MGE UPS Systems */
 product MGE UPS1		0x0001	MGE UPS SYSTEMS PROTECTIONCENTER 1
 product MGE UPS2		0xffff	MGE UPS SYSTEMS PROTECTIONCENTER 2
 
 /* MEI products */
 product MEI CASHFLOW_SC		0x1100	Cashflow-SC Cash Acceptor
 product MEI S2000		0x1101	Series 2000 Combo Acceptor
 
 /* Micro Star International products */
 product MSI BT_DONGLE		0x1967	Bluetooth USB dongle
 product MSI RT3070_1		0x3820	RT3070
 product MSI RT3070_2		0x3821	RT3070
 product MSI RT3070_8		0x3822	RT3070
 product MSI RT3070_3		0x3870	RT3070
 product MSI RT3070_9		0x3871	RT3070
 product MSI UB11B		0x6823	UB11B
 product MSI RT2570		0x6861	RT2570
 product MSI RT2570_2		0x6865	RT2570
 product MSI RT2570_3		0x6869	RT2570
 product MSI RT2573_1		0x6874	RT2573
 product MSI RT2573_2		0x6877	RT2573
 product MSI RT3070_4		0x6899	RT3070
 product MSI RT3070_5		0x821a	RT3070
 product MSI RT3070_10		0x822a	RT3070
 product MSI RT3070_6		0x870a	RT3070
 product MSI RT3070_11		0x871a	RT3070
 product MSI RT3070_7		0x899a	RT3070
 product MSI RT2573_3		0xa861	RT2573
 product MSI RT2573_4		0xa874	RT2573
 
 /* Micron products */
 product MICRON REALSSD		0x0655	Real SSD eUSB
 
 /* Microsoft products */
 product MICROSOFT SIDEPREC	0x0008	SideWinder Precision Pro
 product MICROSOFT INTELLIMOUSE	0x0009	IntelliMouse
 product MICROSOFT NATURALKBD	0x000b	Natural Keyboard Elite
 product MICROSOFT DDS80		0x0014	Digital Sound System 80
 product MICROSOFT SIDEWINDER	0x001a	Sidewinder Precision Racing Wheel
 product MICROSOFT INETPRO	0x001c	Internet Keyboard Pro
 product MICROSOFT TBEXPLORER	0x0024	Trackball Explorer
 product MICROSOFT INTELLIEYE	0x0025	IntelliEye mouse
 product MICROSOFT INETPRO2	0x002b	Internet Keyboard Pro
 product MICROSOFT INTELLIMOUSE5	0x0039	IntelliMouse 1.1 5-Button Mouse
 product MICROSOFT WHEELMOUSE	0x0040	Wheel Mouse Optical
 product MICROSOFT MN510		0x006e	MN510 Wireless
 product MICROSOFT 700WX		0x0079	Palm 700WX
 product MICROSOFT MN110		0x007a	10/100 USB NIC
 product MICROSOFT WLINTELLIMOUSE 0x008c	Wireless Optical IntelliMouse
 product MICROSOFT WLNOTEBOOK	0x00b9	Wireless Optical Mouse (Model 1023)
 product MICROSOFT COMFORT3000	0x00d1	Comfort Optical Mouse 3000 (Model 1043)
 product MICROSOFT WLNOTEBOOK3	0x00d2	Wireless Optical Mouse 3000 (Model 1049)
 product MICROSOFT NATURAL4000	0x00db	Natural Ergonomic Keyboard 4000
 product MICROSOFT WLNOTEBOOK2	0x00e1	Wireless Optical Mouse 3000 (Model 1056)
 product MICROSOFT XBOX360	0x0292	XBOX 360 WLAN
 
 /* Microtech products */
 product MICROTECH SCSIDB25	0x0004	USB-SCSI-DB25
 product MICROTECH SCSIHD50	0x0005	USB-SCSI-HD50
 product MICROTECH DPCM		0x0006	USB CameraMate
 product MICROTECH FREECOM	0xfc01	Freecom USB-IDE
 
 /* Microtek products */
 product MICROTEK 336CX		0x0094	Phantom 336CX - C3 scanner
 product MICROTEK X6U		0x0099	ScanMaker X6 - X6U
 product MICROTEK C6		0x009a	Phantom C6 scanner
 product MICROTEK 336CX2		0x00a0	Phantom 336CX - C3 scanner
 product MICROTEK V6USL		0x00a3	ScanMaker V6USL
 product MICROTEK V6USL2		0x80a3	ScanMaker V6USL
 product MICROTEK V6UL		0x80ac	ScanMaker V6UL
 
 /* Microtune, Inc. products */
 product MICROTUNE BT_DONGLE	0x1000	Bluetooth USB dongle
 
 /* Midiman products */
 product MAUDIO MIDISPORT2X2	0x1001	Midisport 2x2
 product MAUDIO FASTTRACKULTRA	0x2080	Fast Track Ultra
 product MAUDIO FASTTRACKULTRA8R	0x2081	Fast Track Ultra 8R
 
 /* MindsAtWork products */
 product MINDSATWORK WALLET	0x0001	Digital Wallet
 
 /* Minolta Co., Ltd. */
 product MINOLTA 2300		0x4001	Dimage 2300
 product MINOLTA S304		0x4007	Dimage S304
 product MINOLTA X		0x4009	Dimage X
 product MINOLTA 5400		0x400e	Dimage 5400
 product MINOLTA F300		0x4011	Dimage F300
 product MINOLTA E223		0x4017	Dimage E223
 
 /* Mitsumi products */
 product MITSUMI CDRRW		0x0000	CD-R/RW Drive
 product MITSUMI BT_DONGLE	0x641f	Bluetooth USB dongle
 product MITSUMI FDD		0x6901	USB FDD
 
 /* Mobile Action products */
 product MOBILEACTION MA620	0x0620	MA-620 Infrared Adapter
 
 /* Mobility products */
 product MOBILITY USB_SERIAL	0x0202	FTDI compatible adapter
 product MOBILITY EA		0x0204	Ethernet
 product MOBILITY EASIDOCK	0x0304	EasiDock Ethernet
 
 /* MosChip products */
 product MOSCHIP MCS7703		0x7703	MCS7703 Serial Port Adapter
 product MOSCHIP MCS7730		0x7730  MCS7730 Ethernet
 product MOSCHIP MCS7820		0x7820	MCS7820 Serial Port Adapter
 product MOSCHIP MCS7830		0x7830	MCS7830 Ethernet
 product MOSCHIP MCS7832		0x7832	MCS7832 Ethernet
 product MOSCHIP MCS7840		0x7840	MCS7840 Serial Port Adapter
 
 /* Motorola products */
 product MOTOROLA MC141555	0x1555	MC141555 hub controller
 product MOTOROLA SB4100		0x4100	SB4100 USB Cable Modem
 product MOTOROLA2 T720C		0x2822	T720c
 product MOTOROLA2 A41XV32X	0x2a22	A41x/V32x Mobile Phones
 product MOTOROLA2 E398		0x4810	E398 Mobile Phone
 product MOTOROLA2 USBLAN	0x600c	USBLAN
 product MOTOROLA2 USBLAN2	0x6027	USBLAN
 product MOTOROLA2 MB886		0x710f	MB886 Mobile Phone (Atria HD)
 product MOTOROLA4 RT2770	0x9031	RT2770
 product MOTOROLA4 RT3070	0x9032	RT3070
 
 /* MpMan products */
 product MPMAN MPF400_2		0x25a8	MPF400 Music Player 2Go
 product MPMAN MPF400_1		0x36d0	MPF400 Music Player 1Go
 
 /* MultiTech products */
 product MULTITECH ATLAS		0xf101	MT5634ZBA-USB modem
 
 /* Mustek products */
 product MUSTEK 1200CU		0x0001	1200 CU scanner
 product MUSTEK 600CU		0x0002	600 CU scanner
 product MUSTEK 1200USB		0x0003	1200 USB scanner
 product MUSTEK 1200UB		0x0006	1200 UB scanner
 product MUSTEK 1200USBPLUS	0x0007	1200 USB Plus scanner
 product MUSTEK 1200CUPLUS	0x0008	1200 CU Plus scanner
 product MUSTEK BEARPAW1200F	0x0010	BearPaw 1200F scanner
 product MUSTEK BEARPAW2400TA	0x0218	BearPaw 2400TA scanner
 product MUSTEK BEARPAW1200TA	0x021e	BearPaw 1200TA scanner
 product MUSTEK 600USB		0x0873	600 USB scanner
 product MUSTEK MDC800		0xa800	MDC-800 digital camera
 
 /* M-Systems products */
 product MSYSTEMS DISKONKEY	0x0010	DiskOnKey
 product MSYSTEMS DISKONKEY2	0x0011	DiskOnKey
 
 /* Myson products */
 product MYSON HEDEN_8813	0x8813	USB-IDE
 product MYSON HEDEN		0x8818	USB-IDE
 product MYSON HUBREADER		0x8819	COMBO Card reader with USB HUB
 product MYSON STARREADER	0x9920	USB flash card adapter
 
 /* National Semiconductor */
 product NATIONAL BEARPAW1200	0x1000	BearPaw 1200
 product NATIONAL BEARPAW2400	0x1001	BearPaw 2400
 
 /* NEC products */
 product NEC HUB_0050		0x0050	USB 2.0 7-Port Hub
 product NEC HUB_005A		0x005a	USB 2.0 4-Port Hub
 product NEC HUB			0x55aa	hub
 product NEC HUB_B		0x55ab	hub
 
 /* NEODIO products */
 product NEODIO ND3260		0x3260	8-in-1 Multi-format Flash Controller
 product NEODIO ND5010		0x5010	Multi-format Flash Controller
 
 /* Neotel products */
 product NEOTEL PRIME		0x4000	Prime USB modem
 
 /* Netac products */
 product NETAC CF_CARD		0x1060	USB-CF-Card
 product NETAC ONLYDISK		0x0003	OnlyDisk
 
 /* NetChip Technology Products */
 product NETCHIP TURBOCONNECT	0x1080	Turbo-Connect
 product NETCHIP CLIK_40		0xa140	USB Clik! 40
 product NETCHIP GADGETZERO	0xa4a0	Linux Gadget Zero
 product NETCHIP ETHERNETGADGET	0xa4a2	Linux Ethernet/RNDIS gadget on pxa210/25x/26x
 product NETCHIP POCKETBOOK	0xa4a5	PocketBook
 
 /* Netgear products */
 product NETGEAR EA101		0x1001	Ethernet
 product NETGEAR EA101X		0x1002	Ethernet
 product NETGEAR FA101		0x1020	Ethernet 10/100, USB1.1
 product NETGEAR FA120		0x1040	USB 2.0 Ethernet
 product NETGEAR M4100		0x1100	M4100/M5300/M7100 series switch
 product NETGEAR WG111V2_2	0x4240	PrismGT USB 2.0 WLAN
 product NETGEAR WG111V3		0x4260	WG111v3
 product NETGEAR WG111U		0x4300	WG111U
 product NETGEAR WG111U_NF	0x4301	WG111U (no firmware)
 product NETGEAR WG111V2		0x6a00	WG111V2
 product NETGEAR RTL8192CU	0x9021	RTL8192CU
 product NETGEAR WNA1000M	0x9041	WNA1000M
 product NETGEAR2 MA101		0x4100	MA101
 product NETGEAR2 MA101B		0x4102	MA101 Rev B
 product NETGEAR3 WG111T		0x4250	WG111T
 product NETGEAR3 WG111T_NF	0x4251	WG111T (no firmware)
 product NETGEAR3 WPN111		0x5f00	WPN111
 product NETGEAR3 WPN111_NF	0x5f01	WPN111 (no firmware)
 product NETGEAR3 WPN111_2	0x5f02	WPN111
 product NETGEAR4 RTL8188CU	0x9041	RTL8188CU
 
 /* NetIndex products */
 product NETINDEX WS002IN	0x2001	Willcom WS002IN
 
 /* NEWlink */
 product NEWLINK USB2IDEBRIDGE	0x00ff	USB 2.0 Hard Drive Enclosure
 
 /* Nikon products */
 product NIKON E990		0x0102	Digital Camera E990
 product NIKON LS40		0x4000	CoolScan LS40 ED
 product NIKON D300		0x041a  Digital Camera D300
 
 /* NovaTech Products */
 product NOVATECH NV902		0x9020	NovaTech NV-902W
 product NOVATECH RT2573		0x9021	RT2573
 product NOVATECH RTL8188CU	0x9071	RTL8188CU
 
 /* Nokia products */
 product NOKIA N958GB		0x0070	Nokia N95 8GBc
 product NOKIA2 CA42		0x1234	CA-42 cable
 
 /* Novatel Wireless products */
 product NOVATEL V640		0x1100	Merlin V620
 product NOVATEL CDMA_MODEM	0x1110	Novatel Wireless Merlin CDMA
 product NOVATEL V620		0x1110	Merlin V620
 product NOVATEL V740		0x1120	Merlin V740
 product NOVATEL V720		0x1130	Merlin V720
 product NOVATEL U740		0x1400	Merlin U740
 product NOVATEL U740_2		0x1410	Merlin U740
 product NOVATEL U870		0x1420	Merlin U870
 product NOVATEL XU870		0x1430	Merlin XU870
 product NOVATEL X950D		0x1450	Merlin X950D
 product NOVATEL ES620		0x2100	Expedite ES620
 product NOVATEL E725		0x2120	Expedite E725
 product NOVATEL ES620_2		0x2130	Expedite ES620
 product NOVATEL ES620		0x2100	ES620 CDMA
 product NOVATEL U720		0x2110	Merlin U720
 product NOVATEL EU730		0x2400	Expedite EU730
 product NOVATEL EU740		0x2410	Expedite EU740
 product NOVATEL EU870D		0x2420	Expedite EU870D
 product NOVATEL U727		0x4100	Merlin U727 CDMA
 product NOVATEL MC950D		0x4400	Novatel MC950D HSUPA
 product NOVATEL MC990D		0x7001	Novatel MC990D
 product NOVATEL ZEROCD		0x5010	Novatel ZeroCD
 product NOVATEL MIFI2200V	0x5020	Novatel MiFi 2200 CDMA Virgin Mobile
 product NOVATEL ZEROCD2		0x5030	Novatel ZeroCD
 product NOVATEL MIFI2200	0x5041	Novatel MiFi 2200 CDMA
 product NOVATEL U727_2		0x5100	Merlin U727 CDMA
 product NOVATEL U760		0x6000	Novatel U760
 product NOVATEL MC760		0x6002	Novatel MC760
 product NOVATEL MC547		0x7042	Novatel MC547
 product NOVATEL MC679		0x7031	Novatel MC679
 product NOVATEL2 FLEXPACKGPS	0x0100	NovAtel FlexPack GPS receiver
 
 /* Merlin products */
 product	MERLIN V620		0x1110	Merlin V620
 
 /* O2Micro products */
 product O2MICRO OZ776_HUB	0x7761	OZ776 hub
 product O2MICRO OZ776_CCID_SC	0x7772	OZ776 CCID SC Reader
 
 /* Olimex products */
 product OLIMEX ARM_USB_OCD		0x0003	FTDI compatible adapter
 product OLIMEX ARM_USB_OCD_H		0x002b	FTDI compatible adapter
 
 /* Olympus products */
 product OLYMPUS C1		0x0102	C-1 Digital Camera
 product OLYMPUS C700		0x0105	C-700 Ultra Zoom
 
 /* OmniVision Technologies, Inc. products */
 product OMNIVISION OV511	0x0511	OV511 Camera
 product OMNIVISION OV511PLUS	0xa511	OV511+ Camera
 
 /* OnSpec Electronic, Inc. */
 product ONSPEC SDS_HOTFIND_D	0x0400	SDS-infrared.com Hotfind-D Infrared Camera
 product ONSPEC MDCFE_B_CF_READER	0xa000	MDCFE-B USB CF Reader
 product ONSPEC CFMS_RW		0xa001	SIIG/Datafab Memory Stick+CF Reader/Writer
 product ONSPEC READER		0xa003	Datafab-based Reader
 product ONSPEC CFSM_READER	0xa005	PNY/Datafab CF+SM Reader
 product ONSPEC CFSM_READER2	0xa006	Simple Tech/Datafab CF+SM Reader
 product ONSPEC MDSM_B_READER	0xa103	MDSM-B reader
 product ONSPEC CFSM_COMBO	0xa109	USB to CF + SM Combo (LC1)
 product ONSPEC UCF100		0xa400	FlashLink UCF-100 CompactFlash Reader
 product ONSPEC2 IMAGEMATE_SDDR55	0xa103	ImageMate SDDR55
 
 /* Option products */
 product OPTION VODAFONEMC3G	0x5000	Vodafone Mobile Connect 3G datacard
 product OPTION GT3G		0x6000	GlobeTrotter 3G datacard
 product OPTION GT3GQUAD		0x6300	GlobeTrotter 3G QUAD datacard
 product OPTION GT3GPLUS		0x6600	GlobeTrotter 3G+ datacard
 product OPTION GTICON322	0xd033	GlobeTrotter Icon322 storage
 product OPTION GTMAX36		0x6701	GlobeTrotter Max 3.6 Modem
 product OPTION GTMAX72		0x6711	GlobeTrotter Max 7.2 HSDPA
 product OPTION GTHSDPA		0x6971	GlobeTrotter HSDPA
 product OPTION GTMAXHSUPA	0x7001	GlobeTrotter HSUPA
 product OPTION GTMAXHSUPAE	0x6901	GlobeTrotter HSUPA PCIe
 product OPTION GTMAX380HSUPAE	0x7211	GlobeTrotter 380HSUPA PCIe
 product OPTION GT3G_1		0x6050	3G modem
 product OPTION GT3G_2		0x6100	3G modem
 product OPTION GT3G_3		0x6150	3G modem
 product OPTION GT3G_4		0x6200	3G modem
 product OPTION GT3G_5		0x6250	3G modem
 product OPTION GT3G_6		0x6350	3G modem
 product OPTION E6500		0x6500	3G modem
 product OPTION E6501		0x6501	3G modem
 product OPTION E6601		0x6601	3G modem
 product OPTION E6721		0x6721	3G modem
 product OPTION E6741		0x6741	3G modem
 product OPTION E6761		0x6761	3G modem
 product OPTION E6800		0x6800	3G modem
 product OPTION E7021		0x7021	3G modem
 product OPTION E7041		0x7041	3G modem
 product OPTION E7061		0x7061	3G modem
 product OPTION E7100		0x7100	3G modem
 product OPTION GTM380		0x7201	3G modem
 product OPTION GE40X		0x7601	Globetrotter HSUPA
 product OPTION GSICON72		0x6911	GlobeSurfer iCON
 product OPTION GSICONHSUPA	0x7251	Globetrotter HSUPA
 product OPTION ICON401		0x7401	GlobeSurfer iCON 401
 product OPTION GTHSUPA		0x7011	Globetrotter HSUPA
 product OPTION GMT382		0x7501	Globetrotter HSUPA
 product OPTION GE40X_1		0x7301	Globetrotter HSUPA
 product OPTION GE40X_2		0x7361	Globetrotter HSUPA
 product OPTION GE40X_3		0x7381	Globetrotter HSUPA
 product OPTION GTM661W		0x9000  GTM661W
 product OPTION ICONEDGE		0xc031	GlobeSurfer iCON EDGE
 product OPTION MODHSXPA		0xd013	Globetrotter HSUPA
 product OPTION ICON321		0xd031	Globetrotter HSUPA
 product OPTION ICON505		0xd055	Globetrotter iCON 505
 product OPTION ICON452		0x7901	Globetrotter iCON 452
 
 /* Optoelectronics Co., Ltd */
 product OPTO BARCODE		0x0001	Barcode Reader
 product OPTO OPTICONCODE	0x0009	Opticon Code Reader
 product OPTO BARCODE_1		0xa002	Barcode Reader
 product OPTO CRD7734		0xc000	USB Cradle CRD-7734-RU
 product OPTO CRD7734_1		0xc001	USB Cradle CRD-7734-RU
 
 /* OvisLink product */
 product OVISLINK RT3072		0x3072	RT3072
 
 /* OQO */
 product OQO WIFI01		0x0002	model 01 WiFi interface
 product OQO BT01		0x0003	model 01 Bluetooth interface
 product OQO ETHER01PLUS		0x7720	model 01+ Ethernet
 product OQO ETHER01		0x8150	model 01 Ethernet interface
 
 /* Ours Technology Inc. */
 product OTI DKU5		0x6858	DKU-5 Serial
 
 /* Owen.ru products */
 product OWEN AC4		0x0004	AC4 USB-RS485 converter
 
 /* OWL producs */
 product OWL CM_160		0xca05	OWL CM-160 power monitor
 
 /* Palm Computing, Inc. product */
 product PALM SERIAL		0x0080	USB Serial
 product PALM M500		0x0001	Palm m500
 product PALM M505		0x0002	Palm m505
 product PALM M515		0x0003	Palm m515
 product PALM I705		0x0020	Palm i705
 product PALM TUNGSTEN_Z		0x0031	Palm Tungsten Z
 product PALM M125		0x0040	Palm m125
 product PALM M130		0x0050	Palm m130
 product PALM TUNGSTEN_T		0x0060	Palm Tungsten T
 product PALM ZIRE31		0x0061	Palm Zire 31
 product PALM ZIRE		0x0070	Palm Zire
 
 /* Panasonic products */
 product PANASONIC LS120CAM	0x0901	LS-120 Camera
 product PANASONIC KXL840AN	0x0d01	CD-R Drive KXL-840AN
 product PANASONIC KXLRW32AN	0x0d09	CD-R Drive KXL-RW32AN
 product PANASONIC KXLCB20AN	0x0d0a	CD-R Drive KXL-CB20AN
 product PANASONIC KXLCB35AN	0x0d0e	DVD-ROM & CD-R/RW
 product PANASONIC SDCAAE	0x1b00	MultiMediaCard
 product PANASONIC TYTP50P6S	0x3900	TY-TP50P6-S 50in Touch Panel
 
 /* Papouch products */
 product PAPOUCH AD4USB		0x8003	FTDI compatible adapter
 product PAPOUCH AP485		0x0101	FTDI compatible adapter
 product PAPOUCH AP485_2		0x0104	FTDI compatible adapter
 product PAPOUCH DRAK5		0x0700	FTDI compatible adapter
 product PAPOUCH DRAK6		0x1000	FTDI compatible adapter
 product PAPOUCH GMSR		0x8005	FTDI compatible adapter
 product PAPOUCH GMUX		0x8004	FTDI compatible adapter
 product PAPOUCH IRAMP		0x0500	FTDI compatible adapter
 product PAPOUCH LEC		0x0300	FTDI compatible adapter
 product PAPOUCH MU		0x8001	FTDI compatible adapter
 product PAPOUCH QUIDO10X1		0x0b00	FTDI compatible adapter
 product PAPOUCH QUIDO2X16		0x0e00	FTDI compatible adapter
 product PAPOUCH QUIDO2X2		0x0a00	FTDI compatible adapter
 product PAPOUCH QUIDO30X3		0x0c00	FTDI compatible adapter
 product PAPOUCH QUIDO3X32		0x0f00	FTDI compatible adapter
 product PAPOUCH QUIDO4X4		0x0900	FTDI compatible adapter
 product PAPOUCH QUIDO60X3		0x0d00	FTDI compatible adapter
 product PAPOUCH QUIDO8X8		0x0800	FTDI compatible adapter
 product PAPOUCH SB232		0x0301	FTDI compatible adapter
 product PAPOUCH SB422		0x0102	FTDI compatible adapter
 product PAPOUCH SB422_2		0x0105	FTDI compatible adapter
 product PAPOUCH SB485		0x0100	FTDI compatible adapter
 product PAPOUCH SB485C		0x0107	FTDI compatible adapter
 product PAPOUCH SB485S		0x0106	FTDI compatible adapter
 product PAPOUCH SB485_2		0x0103	FTDI compatible adapter
 product PAPOUCH SIMUKEY		0x8002	FTDI compatible adapter
 product PAPOUCH TMU		0x0400	FTDI compatible adapter
 product PAPOUCH UPSUSB		0x8000	FTDI compatible adapter
 
 /* PARA Industrial products */
 product PARA RT3070		0x8888	RT3070
 
 /* Simtec Electronics products */
 product SIMTEC ENTROPYKEY	0x0001	Entropy Key
 
 /* Pegatron products */
 product PEGATRON RT2870		0x0002	RT2870
 product PEGATRON RT3070		0x000c	RT3070
 product PEGATRON RT3070_2	0x000e	RT3070
 product PEGATRON RT3070_3	0x0010	RT3070
 
 /* Peracom products */
 product PERACOM SERIAL1		0x0001	Serial
 product PERACOM ENET		0x0002	Ethernet
 product PERACOM ENET3		0x0003	At Home Ethernet
 product PERACOM ENET2		0x0005	Ethernet
 
 /* Philips products */
 product PHILIPS DSS350		0x0101	DSS 350 Digital Speaker System
 product PHILIPS DSS		0x0104	DSS XXX Digital Speaker System
 product PHILIPS HUB		0x0201	hub
 product PHILIPS PCA646VC	0x0303	PCA646VC PC Camera
 product PHILIPS PCVC680K	0x0308	PCVC680K Vesta Pro PC Camera
 product PHILIPS DSS150		0x0471	DSS 150 Digital Speaker System
 product PHILIPS ACE1001		0x066a	AKTAKOM ACE-1001 cable
 product PHILIPS SPE3030CC	0x083a	USB 2.0 External Disk
 product PHILIPS SNU5600		0x1236	SNU5600
 product PHILIPS UM10016		0x1552	ISP 1581 Hi-Speed USB MPEG2 Encoder Reference Kit
 product PHILIPS DIVAUSB		0x1801	DIVA USB mp3 player
 product PHILIPS RT2870		0x200f	RT2870
 
 /* Philips Semiconductor products */
 product PHILIPSSEMI HUB1122	0x1122	HUB
 
 /* Megatec */
 product MEGATEC UPS		0x5161	Phoenixtec protocol based UPS
 
 /* P.I. Engineering products */
 product PIENGINEERING PS2USB	0x020b	PS2 to Mac USB Adapter
 
 /* Planex Communications products */
 product PLANEX GW_US11H		0x14ea	GW-US11H WLAN
 product PLANEX2 RTL8188CUS	0x1201	RTL8188CUS
 product PLANEX2 GW_US11S	0x3220	GW-US11S WLAN
 product PLANEX2 GW_US54GXS	0x5303	GW-US54GXS WLAN
 product PLANEX2 RTL8188CU_1	0xab2a	RTL8188CU
 product PLANEX2 RTL8188CU_2	0xed17	RTL8188CU
 product PLANEX2 RTL8188CU_3	0x4902	RTL8188CU
 product PLANEX2 RTL8188CU_4	0xab2e	RTL8188CU
 product PLANEX2 RTL8192CU	0xab2b	RTL8192CU
 product PLANEX2 GWUS54HP	0xab01	GW-US54HP
 product PLANEX2 GWUS300MINIS	0xab24	GW-US300MiniS
 product PLANEX2	RT3070		0xab25	RT3070
 product PLANEX2 MZKUE150N	0xab2f	MZK-UE150N
 product PLANEX2 GWUS54MINI2	0xab50	GW-US54Mini2
 product PLANEX2 GWUS54SG	0xc002	GW-US54SG
 product PLANEX2 GWUS54GZL	0xc007	GW-US54GZL
 product PLANEX2 GWUS54GD	0xed01	GW-US54GD
 product PLANEX2 GWUSMM		0xed02	GW-USMM
 product PLANEX2 RT2870		0xed06	RT2870
 product PLANEX2 GWUSMICRON	0xed14	GW-USMicroN
 product PLANEX2 GWUSVALUEEZ	0xed17	GW-USValue-EZ
 product PLANEX3 GWUS54GZ	0xab10	GW-US54GZ
 product PLANEX3 GU1000T		0xab11	GU-1000T
 product PLANEX3 GWUS54MINI	0xab13	GW-US54Mini
 product PLANEX2 GWUSNANO	0xab28	GW-USNano
 
 /* Plextor Corp. */
 product PLEXTOR 40_12_40U	0x0011	PlexWriter 40/12/40U
 
 /* PLX products */
 product PLX TESTBOARD		0x9060	test board
 product PLX CA42		0xac70	CA-42
 
 /* PNY products */
 product PNY ATTACHE2		0x0010	USB 2.0 Flash Drive
 
 /* PortGear products */
 product PORTGEAR EA8		0x0008	Ethernet
 product PORTGEAR EA9		0x0009	Ethernet
 
 /* Portsmith products */
 product PORTSMITH EEA		0x3003	Express Ethernet
 
 /* Posiflex products */
 product POSIFLEX PP7000		0x0300	FTDI compatible adapter
 
 /* Primax products */
 product PRIMAX G2X300		0x0300	G2-200 scanner
 product PRIMAX G2E300		0x0301	G2E-300 scanner
 product PRIMAX G2300		0x0302	G2-300 scanner
 product PRIMAX G2E3002		0x0303	G2E-300 scanner
 product PRIMAX 9600		0x0340	Colorado USB 9600 scanner
 product PRIMAX 600U		0x0341	Colorado 600u scanner
 product PRIMAX 6200		0x0345	Visioneer 6200 scanner
 product PRIMAX 19200		0x0360	Colorado USB 19200 scanner
 product PRIMAX 1200U		0x0361	Colorado 1200u scanner
 product PRIMAX G600		0x0380	G2-600 scanner
 product PRIMAX 636I		0x0381	ReadyScan 636i
 product PRIMAX G2600		0x0382	G2-600 scanner
 product PRIMAX G2E600		0x0383	G2E-600 scanner
 product PRIMAX COMFORT		0x4d01	Comfort
 product PRIMAX MOUSEINABOX	0x4d02	Mouse-in-a-Box
 product PRIMAX PCGAUMS1		0x4d04	Sony PCGA-UMS1
 product PRIMAX HP_RH304AA	0x4d17	HP RH304AA mouse
 
 /* Prolific products */
 product PROLIFIC PL2301		0x0000	PL2301 Host-Host interface
 product PROLIFIC PL2302		0x0001	PL2302 Host-Host interface
 product PROLIFIC MOTOROLA	0x0307	Motorola Cable
 product PROLIFIC RSAQ2		0x04bb	PL2303 Serial (IODATA USB-RSAQ2)
 product PROLIFIC ALLTRONIX_GPRS	0x0609	Alltronix ACM003U00 modem
 product PROLIFIC ALDIGA_AL11U	0x0611	AlDiga AL-11U modem
 product PROLIFIC MICROMAX_610U	0x0612	Micromax 610U
 product PROLIFIC DCU11		0x1234	DCU-11 Phone Cable
 product PROLIFIC UIC_MSR206	0x206a	UIC MSR206 Card Reader
 product PROLIFIC PL2303		0x2303	PL2303 Serial (ATEN/IOGEAR UC232A)
 product PROLIFIC PL2305		0x2305	Parallel printer
 product PROLIFIC ATAPI4		0x2307	ATAPI-4 Controller
 product PROLIFIC PL2501		0x2501	PL2501 Host-Host interface
 product PROLIFIC PL2506		0x2506	PL2506 USB to IDE Bridge
 product PROLIFIC HCR331		0x331a	HCR331 Hybrid Card Reader
 product PROLIFIC PHAROS		0xaaa0	Prolific Pharos
 product PROLIFIC RSAQ3		0xaaa2	PL2303 Serial Adapter (IODATA USB-RSAQ3)
 product PROLIFIC2 PL2303	0x2303	PL2303 Serial Adapter
 
 /* Putercom products */
 product PUTERCOM UPA100		0x047e	USB-1284 BRIDGE
 
 /* Qcom products */
 product QCOM RT2573		0x6196	RT2573
 product QCOM RT2573_2		0x6229	RT2573
 product QCOM RT2573_3		0x6238	RT2573
 product QCOM RT2870		0x6259	RT2870
 
 /* QI-hardware */
 product QIHARDWARE JTAGSERIAL		0x0713	FTDI compatible adapter
 
 /* Qisda products */
 product QISDA H21_1		0x4512	3G modem
 product QISDA H21_2		0x4523	3G modem
 product QISDA H20_1		0x4515	3G modem
 product QISDA H20_2		0x4519	3G modem
 
 /* Qualcomm products */
 product QUALCOMM CDMA_MSM	0x6000	CDMA Technologies MSM phone
 product QUALCOMM NTT_L02C_MODEM	0x618f	NTT DOCOMO L-02C
 product QUALCOMM NTT_L02C_STORAGE	0x61dd	NTT DOCOMO L-02C
 product QUALCOMM2 MF330		0x6613	MF330
 product QUALCOMM2 RWT_FCT	0x3100	RWT FCT-CDMA 2000 1xRTT modem
 product QUALCOMM2 CDMA_MSM	0x3196	CDMA Technologies MSM modem
 product QUALCOMM2 AC8700	0x6000	AC8700
 product QUALCOMM2 VW110L	0x1000	Vertex Wireless 110L modem
 product QUALCOMM2 SIM5218	0x9000	SIM5218
 product QUALCOMM2 GOBI2000_QDL	0x9204	Qualcomm Gobi 2000 QDL
 product QUALCOMM2 GOBI2000	0x9205	Qualcomm Gobi 2000 modem
 product QUALCOMMINC CDMA_MSM	0x0001	CDMA Technologies MSM modem
 product QUALCOMMINC E0002	0x0002	3G modem
 product QUALCOMMINC E0003	0x0003	3G modem
 product QUALCOMMINC E0004	0x0004	3G modem
 product QUALCOMMINC E0005	0x0005	3G modem
 product QUALCOMMINC E0006	0x0006	3G modem
 product QUALCOMMINC E0007	0x0007	3G modem
 product QUALCOMMINC E0008	0x0008	3G modem
 product QUALCOMMINC E0009	0x0009	3G modem
 product QUALCOMMINC E000A	0x000a	3G modem
 product QUALCOMMINC E000B	0x000b	3G modem
 product QUALCOMMINC E000C	0x000c	3G modem
 product QUALCOMMINC E000D	0x000d	3G modem
 product QUALCOMMINC E000E	0x000e	3G modem
 product QUALCOMMINC E000F	0x000f	3G modem
 product QUALCOMMINC E0010	0x0010	3G modem
 product QUALCOMMINC E0011	0x0011	3G modem
 product QUALCOMMINC E0012	0x0012	3G modem
 product QUALCOMMINC E0013	0x0013	3G modem
 product QUALCOMMINC E0014	0x0014	3G modem
 product QUALCOMMINC MF628	0x0015	3G modem
 product QUALCOMMINC MF633R	0x0016	ZTE WCDMA modem
 product QUALCOMMINC E0017	0x0017	3G modem
 product QUALCOMMINC E0018	0x0018	3G modem
 product QUALCOMMINC E0019	0x0019	3G modem
 product QUALCOMMINC E0020	0x0020	3G modem
 product QUALCOMMINC E0021	0x0021	3G modem
 product QUALCOMMINC E0022	0x0022	3G modem
 product QUALCOMMINC E0023	0x0023	3G modem
 product QUALCOMMINC E0024	0x0024	3G modem
 product QUALCOMMINC E0025	0x0025	3G modem
 product QUALCOMMINC E0026	0x0026	3G modem
 product QUALCOMMINC E0027	0x0027	3G modem
 product QUALCOMMINC E0028	0x0028	3G modem
 product QUALCOMMINC E0029	0x0029	3G modem
 product QUALCOMMINC E0030	0x0030	3G modem
 product QUALCOMMINC MF626	0x0031	3G modem
 product QUALCOMMINC E0032	0x0032	3G modem
 product QUALCOMMINC E0033	0x0033	3G modem
 product QUALCOMMINC E0037	0x0037	3G modem
 product QUALCOMMINC E0039	0x0039	3G modem
 product QUALCOMMINC E0042	0x0042	3G modem
 product QUALCOMMINC E0043	0x0043	3G modem
 product QUALCOMMINC E0048	0x0048	3G modem
 product QUALCOMMINC E0049	0x0049	3G modem
 product QUALCOMMINC E0051	0x0051	3G modem
 product QUALCOMMINC E0052	0x0052	3G modem
 product QUALCOMMINC ZTE_STOR2	0x0053	USB ZTE Storage
 product QUALCOMMINC E0054	0x0054	3G modem
 product QUALCOMMINC E0055	0x0055	3G modem
 product QUALCOMMINC E0057	0x0057	3G modem
 product QUALCOMMINC E0058	0x0058	3G modem
 product QUALCOMMINC E0059	0x0059	3G modem
 product QUALCOMMINC E0060	0x0060	3G modem
 product QUALCOMMINC E0061	0x0061	3G modem
 product QUALCOMMINC E0062	0x0062	3G modem
 product QUALCOMMINC E0063	0x0063	3G modem
 product QUALCOMMINC E0064	0x0064	3G modem
 product QUALCOMMINC E0066	0x0066	3G modem
 product QUALCOMMINC E0069	0x0069	3G modem
 product QUALCOMMINC E0070	0x0070	3G modem
 product QUALCOMMINC E0073	0x0073	3G modem
 product QUALCOMMINC E0076	0x0076	3G modem
 product QUALCOMMINC E0078	0x0078	3G modem
 product QUALCOMMINC E0082	0x0082	3G modem
 product QUALCOMMINC E0086	0x0086	3G modem
 product QUALCOMMINC SURFSTICK	0x0117	1&1 Surf Stick
 product QUALCOMMINC K3772_Z_INIT	0x1179	K3772-Z Initial
 product QUALCOMMINC K3772_Z	0x1181	K3772-Z
+product QUALCOMMINC MF195E_INIT	0x1514	MF195E initial
+product QUALCOMMINC MF195E	0x1516	MF195E
 product QUALCOMMINC ZTE_STOR	0x2000	USB ZTE Storage
 product QUALCOMMINC E2002	0x2002	3G modem
 product QUALCOMMINC E2003	0x2003	3G modem
 product QUALCOMMINC AC8710	0xfff1	3G modem
 product QUALCOMMINC AC2726	0xfff5	3G modem
 product QUALCOMMINC AC8700	0xfffe	CDMA 1xEVDO USB modem
 
 /* Quanta products */
 product QUANTA RW6815_1		0x00ce	HP iPAQ rw6815
 product QUANTA RT3070		0x0304	RT3070
 product QUANTA Q101_STOR	0x1000	USB Q101 Storage
 product QUANTA Q101		0xea02	HSDPA modem
 product QUANTA Q111		0xea03	HSDPA modem
 product QUANTA GLX		0xea04	HSDPA modem
 product QUANTA GKE		0xea05	HSDPA modem
 product QUANTA GLE		0xea06	HSDPA modem
 product QUANTA RW6815R		0xf003	HP iPAQ rw6815 RNDIS
 
 /* Qtronix products */
 product QTRONIX 980N		0x2011	Scorpion-980N keyboard
 
 /* Quickshot products */
 product QUICKSHOT STRIKEPAD	0x6238	USB StrikePad
 
 /* Radio Shack */
 product RADIOSHACK USBCABLE	0x4026	USB to Serial Cable
 
 /* Rainbow Technologies products */
 product RAINBOW IKEY2000	0x1200	i-Key 2000
 
 /* Ralink Technology products */
 product RALINK RT2570		0x1706	RT2500USB Wireless Adapter
 product RALINK RT2070		0x2070	RT2070
 product RALINK RT2570_2		0x2570	RT2500USB Wireless Adapter
 product RALINK RT2573		0x2573	RT2501USB Wireless Adapter
 product RALINK RT2671		0x2671	RT2601USB Wireless Adapter
 product RALINK RT2770		0x2770	RT2770
 product RALINK RT2870		0x2870	RT2870
 product RALINK RT_STOR		0x2878	USB Storage
 product RALINK RT3070		0x3070	RT3070
 product RALINK RT3071		0x3071	RT3071
 product RALINK RT3072		0x3072	RT3072
 product RALINK RT3370		0x3370	RT3370
 product RALINK RT3572		0x3572	RT3572
 product RALINK RT3573		0x3573	RT3573
 product RALINK RT5370		0x5370	RT5370
 product RALINK RT5572		0x5572	RT5572
 product RALINK RT8070		0x8070	RT8070
 product RALINK RT2570_3		0x9020	RT2500USB Wireless Adapter
 product RALINK RT2573_2		0x9021	RT2501USB Wireless Adapter
 
 /* RATOC Systems products */
 product RATOC REXUSB60		0xb000	USB serial adapter REX-USB60
 product RATOC REXUSB60F		0xb020	USB serial adapter REX-USB60F
 
 /* Realtek products */
 /* Green House and CompUSA OEM this part */
 product REALTEK DUMMY		0x0000	Dummy product
 product REALTEK USB20CRW	0x0158	USB20CRW Card Reader
 product REALTEK RTL8188ETV	0x0179	RTL8188ETV
 product REALTEK RTL8188CTV      0x018a  RTL8188CTV
 product REALTEK USBKR100	0x8150	USBKR100 USB Ethernet
 product REALTEK RTL8188CE_0     0x8170  RTL8188CE
 product REALTEK RTL8171		0x8171	RTL8171
 product REALTEK RTL8172		0x8172	RTL8172
 product REALTEK RTL8173		0x8173	RTL8173
 product REALTEK RTL8174		0x8174	RTL8174
 product REALTEK RTL8188CU_0	0x8176	RTL8188CU
 product REALTEK RTL8188EU	0x8179	RTL8188EU
 product REALTEK RTL8188CE_1	0x817e	RTL8188CE
 product REALTEK RTL8188CU_1	0x817a	RTL8188CU
 product REALTEK RTL8188CU_2	0x817b	RTL8188CU
 product REALTEK RTL8187		0x8187	RTL8187 Wireless Adapter
 product REALTEK RTL8187B_0	0x8189	RTL8187B Wireless Adapter
 product REALTEK RTL8196EU	0x8196	RTL8196EU
 product REALTEK RTL8187B_1	0x8197	RTL8187B Wireless Adapter
 product REALTEK RTL8187B_2	0x8198	RTL8187B Wireless Adapter
 product REALTEK RTL8188CUS	0x818a	RTL8188CUS
 product REALTEK RTL8188CU_COMBO	0x8754	RTL8188CU
 product REALTEK RTL8191CU	0x8177	RTL8191CU
 product REALTEK RTL8192CU	0x8178	RTL8192CU
 product REALTEK RTL8192CE	0x817c	RTL8192CE
 product REALTEK RTL8188RU_1	0x817d	RTL8188RU
 product REALTEK RTL8188RU_3	0x817f	RTL8188RU
 product REALTEK RTL8712		0x8712	RTL8712
 product REALTEK RTL8713		0x8712	RTL8713
 product REALTEK RTL8188RU_2	0x317f	RTL8188RU
 product REALTEK RTL8192SU	0xc512	RTL8192SU
 
 /* RedOctane products */
 product REDOCTANE DUMMY		0x0000	Dummy product
 product REDOCTANE GHMIDI	0x474b	GH MIDI INTERFACE
 
 /* Renesas products */
 product RENESAS RX610		0x0053	RX610 RX-Stick
 
 /* Ricoh products */
 product RICOH VGPVCC2		0x1830	VGP-VCC2 Camera
 product RICOH VGPVCC3		0x1832	VGP-VCC3 Camera
 product RICOH VGPVCC2_2		0x1833	VGP-VCC2 Camera
 product RICOH VGPVCC2_3		0x1834	VGP-VCC2 Camera
 product RICOH VGPVCC7		0x183a	VGP-VCC7 Camera
 product RICOH VGPVCC8		0x183b	VGP-VCC8 Camera
 
 /* Reiner-SCT products */
 product REINERSCT CYBERJACK_ECOM	0x0100	e-com cyberJack
 
 /* Roland products */
 product ROLAND UA100		0x0000	UA-100 Audio I/F
 product ROLAND UM4		0x0002	UM-4 MIDI I/F
 product ROLAND SC8850		0x0003	SC-8850 MIDI Synth
 product ROLAND U8		0x0004	U-8 Audio I/F
 product ROLAND UM2		0x0005	UM-2 MIDI I/F
 product ROLAND SC8820		0x0007	SC-8820 MIDI Synth
 product ROLAND PC300		0x0008	PC-300 MIDI Keyboard
 product ROLAND UM1		0x0009	UM-1 MIDI I/F
 product ROLAND SK500		0x000b	SK-500 MIDI Keyboard
 product ROLAND SCD70		0x000c	SC-D70 MIDI Synth
 product ROLAND UM880N		0x0014	EDIROL UM-880 MIDI I/F (native)
 product ROLAND UM880G		0x0015	EDIROL UM-880 MIDI I/F (generic)
 product ROLAND SD90		0x0016	SD-90 MIDI Synth
 product ROLAND UM550		0x0023	UM-550 MIDI I/F
 product ROLAND SD20		0x0027	SD-20 MIDI Synth
 product ROLAND SD80		0x0029	SD-80 MIDI Synth
 product ROLAND UA700		0x002b	UA-700 Audio I/F
 
 /* Rockfire products */
 product ROCKFIRE GAMEPAD	0x2033	gamepad 203USB
 
 /* RATOC Systems products */
 product RATOC REXUSB60		0xb000	REX-USB60
 product RATOC REXUSB60F		0xb020	REX-USB60F
 
 /* RT system products */
 product RTSYSTEMS CT29B		0x9e54	FTDI compatible adapter
 product RTSYSTEMS SERIAL_VX7		0x9e52	FTDI compatible adapter
 
 /* Sagem products */
 product SAGEM USBSERIAL		0x0027	USB-Serial Controller
 product SAGEM XG760A		0x004a	XG-760A
 product SAGEM XG76NA		0x0062	XG-76NA
 
 /* Samsung products */
 product SAMSUNG WIS09ABGN	0x2018	WIS09ABGN Wireless LAN adapter
 product SAMSUNG ML6060		0x3008	ML-6060 laser printer
 product SAMSUNG YP_U2		0x5050	YP-U2 MP3 Player
 product SAMSUNG YP_U4		0x5092	YP-U4 MP3 Player
 product SAMSUNG I500		0x6601	I500 Palm USB Phone
 product SAMSUNG I330		0x8001	I330 phone cradle
 product SAMSUNG2 RT2870_1	0x2018	RT2870
 
 /* Samsung Techwin products */
 product SAMSUNG_TECHWIN DIGIMAX_410	0x000a	Digimax 410
 
 /* SanDisk products */
 product SANDISK SDDR05A		0x0001	ImageMate SDDR-05a
 product SANDISK SDDR31		0x0002	ImageMate SDDR-31
 product SANDISK SDDR05		0x0005	ImageMate SDDR-05
 product SANDISK SDDR12		0x0100	ImageMate SDDR-12
 product SANDISK SDDR09		0x0200	ImageMate SDDR-09
 product SANDISK SDDR75		0x0810	ImageMate SDDR-75
 product SANDISK SDCZ2_128	0x7100	Cruzer Mini 128MB
 product SANDISK SDCZ2_256	0x7104	Cruzer Mini 256MB
 product SANDISK SDCZ4_128	0x7112	Cruzer Micro 128MB
 product SANDISK SDCZ4_256	0x7113	Cruzer Micro 256MB
 product SANDISK IMAGEMATE_SDDR289	0xb6ba	ImageMate SDDR-289
 
 /* Sanwa Electric Instrument Co., Ltd. products */
 product SANWA KB_USB2 		0x0701	KB-USB2 multimeter cable
 
 /* Sanyo Electric products */
 product SANYO SCP4900 		0x0701	Sanyo SCP-4900 USB Phone
 
 /* ScanLogic products */
 product SCANLOGIC SL11R		0x0002	SL11R IDE Adapter
 product SCANLOGIC 336CX		0x0300	Phantom 336CX - C3 scanner
 
 /* Schweitzer Engineering Laboratories products */
 product SEL C662		0x0001	C662 Cable
 
 /* Sealevel products */
 product SEALEVEL 2101		0x2101	FTDI compatible adapter
 product SEALEVEL 2102		0x2102	FTDI compatible adapter
 product SEALEVEL 2103		0x2103	FTDI compatible adapter
 product SEALEVEL 2104		0x2104	FTDI compatible adapter
 product SEALEVEL 2106		0x9020	FTDI compatible adapter
 product SEALEVEL 2201_1		0x2211	FTDI compatible adapter
 product SEALEVEL 2201_2		0x2221	FTDI compatible adapter
 product SEALEVEL 2202_1		0x2212	FTDI compatible adapter
 product SEALEVEL 2202_2		0x2222	FTDI compatible adapter
 product SEALEVEL 2203_1		0x2213	FTDI compatible adapter
 product SEALEVEL 2203_2		0x2223	FTDI compatible adapter
 product SEALEVEL 2401_1		0x2411	FTDI compatible adapter
 product SEALEVEL 2401_2		0x2421	FTDI compatible adapter
 product SEALEVEL 2401_3		0x2431	FTDI compatible adapter
 product SEALEVEL 2401_4		0x2441	FTDI compatible adapter
 product SEALEVEL 2402_1		0x2412	FTDI compatible adapter
 product SEALEVEL 2402_2		0x2422	FTDI compatible adapter
 product SEALEVEL 2402_3		0x2432	FTDI compatible adapter
 product SEALEVEL 2402_4		0x2442	FTDI compatible adapter
 product SEALEVEL 2403_1		0x2413	FTDI compatible adapter
 product SEALEVEL 2403_2		0x2423	FTDI compatible adapter
 product SEALEVEL 2403_3		0x2433	FTDI compatible adapter
 product SEALEVEL 2403_4		0x2443	FTDI compatible adapter
 product SEALEVEL 2801_1		0x2811	FTDI compatible adapter
 product SEALEVEL 2801_2		0x2821	FTDI compatible adapter
 product SEALEVEL 2801_3		0x2831	FTDI compatible adapter
 product SEALEVEL 2801_4		0x2841	FTDI compatible adapter
 product SEALEVEL 2801_5		0x2851	FTDI compatible adapter
 product SEALEVEL 2801_6		0x2861	FTDI compatible adapter
 product SEALEVEL 2801_7		0x2871	FTDI compatible adapter
 product SEALEVEL 2801_8		0x2881	FTDI compatible adapter
 product SEALEVEL 2802_1		0x2812	FTDI compatible adapter
 product SEALEVEL 2802_2		0x2822	FTDI compatible adapter
 product SEALEVEL 2802_3		0x2832	FTDI compatible adapter
 product SEALEVEL 2802_4		0x2842	FTDI compatible adapter
 product SEALEVEL 2802_5		0x2852	FTDI compatible adapter
 product SEALEVEL 2802_6		0x2862	FTDI compatible adapter
 product SEALEVEL 2802_7		0x2872	FTDI compatible adapter
 product SEALEVEL 2802_8		0x2882	FTDI compatible adapter
 product SEALEVEL 2803_1		0x2813	FTDI compatible adapter
 product SEALEVEL 2803_2		0x2823	FTDI compatible adapter
 product SEALEVEL 2803_3		0x2833	FTDI compatible adapter
 product SEALEVEL 2803_4		0x2843	FTDI compatible adapter
 product SEALEVEL 2803_5		0x2853	FTDI compatible adapter
 product SEALEVEL 2803_6		0x2863	FTDI compatible adapter
 product SEALEVEL 2803_7		0x2873	FTDI compatible adapter
 product SEALEVEL 2803_8		0x2883	FTDI compatible adapter
 
 /* Senao products */
 product SENAO RT2870_3		0x0605	RT2870
 product SENAO RT2870_4		0x0615	RT2870
 product SENAO NUB8301		0x2000	NUB-8301
 product SENAO RT2870_1		0x9701	RT2870
 product SENAO RT2870_2		0x9702	RT2870
 product SENAO RT3070		0x9703	RT3070
 product SENAO RT3071		0x9705	RT3071
 product SENAO RT3072_1		0x9706	RT3072
 product SENAO RT3072_2		0x9707	RT3072
 product SENAO RT3072_3		0x9708	RT3072
 product SENAO RT3072_4		0x9709	RT3072
 product SENAO RT3072_5		0x9801	RT3072
 product SENAO RTL8192SU_1	0x9603	RTL8192SU
 product SENAO RTL8192SU_2	0x9605	RTL8192SU
 
 /* ShanTou products */
 product SHANTOU ST268		0x0268	ST268
 product SHANTOU DM9601		0x9601	DM 9601
 product SHANTOU ADM8515		0x8515	ADM8515
 
 /* Shark products */
 product SHARK PA		0x0400	Pocket Adapter
 
 /* Sharp products */
 product SHARP SL5500		0x8004	Zaurus SL-5500 PDA
 product SHARP SLA300		0x8005	Zaurus SL-A300 PDA
 product SHARP SL5600		0x8006	Zaurus SL-5600 PDA
 product SHARP SLC700		0x8007	Zaurus SL-C700 PDA
 product SHARP SLC750		0x9031	Zaurus SL-C750 PDA
 product SHARP WZERO3ES		0x9123	W-ZERO3 ES Smartphone
 product SHARP WZERO3ADES	0x91ac	Advanced W-ZERO3 ES Smartphone
 product SHARP WILLCOM03		0x9242	WILLCOM03
 
 /* Shuttle Technology products */
 product SHUTTLE EUSB		0x0001	E-USB Bridge
 product SHUTTLE EUSCSI		0x0002	eUSCSI Bridge
 product SHUTTLE SDDR09		0x0003	ImageMate SDDR09
 product SHUTTLE EUSBCFSM	0x0005	eUSB SmartMedia / CompactFlash Adapter
 product SHUTTLE ZIOMMC		0x0006	eUSB MultiMediaCard Adapter
 product SHUTTLE HIFD		0x0007	Sony Hifd
 product SHUTTLE EUSBATAPI	0x0009	eUSB ATA/ATAPI Adapter
 product SHUTTLE CF		0x000a	eUSB CompactFlash Adapter
 product SHUTTLE EUSCSI_B	0x000b	eUSCSI Bridge
 product SHUTTLE EUSCSI_C	0x000c	eUSCSI Bridge
 product SHUTTLE CDRW		0x0101	CD-RW Device
 product SHUTTLE EUSBORCA	0x0325	eUSB ORCA Quad Reader
 
 /* Siemens products */
 product SIEMENS SPEEDSTREAM	0x1001	SpeedStream
 product SIEMENS SPEEDSTREAM22	0x1022	SpeedStream 1022
 product SIEMENS2 WLL013		0x001b	WLL013
 product SIEMENS2 ES75		0x0034  GSM module MC35
 product SIEMENS2 WL54G		0x3c06	54g USB Network Adapter
 product SIEMENS3 SX1		0x0001	SX1
 product SIEMENS3 X65		0x0003	X65
 product SIEMENS3 X75		0x0004	X75
 product SIEMENS3 EF81 		0x0005	EF81
 
 /* Sierra Wireless products */
 product SIERRA EM5625		0x0017	EM5625
 product SIERRA MC5720_2		0x0018	MC5720
 product SIERRA MC5725		0x0020	MC5725
 product SIERRA AIRCARD580	0x0112	Sierra Wireless AirCard 580
 product SIERRA AIRCARD595	0x0019	Sierra Wireless AirCard 595
 product SIERRA AC595U		0x0120	Sierra Wireless AirCard 595U
 product SIERRA AC597E		0x0021	Sierra Wireless AirCard 597E
 product SIERRA EM5725		0x0022	EM5725
 product SIERRA C597		0x0023	Sierra Wireless Compass 597
 product SIERRA MC5727		0x0024	MC5727
 product SIERRA T598		0x0025	T598
 product SIERRA T11		0x0026	T11
 product SIERRA AC402		0x0027	AC402
 product SIERRA MC5728		0x0028	MC5728
 product SIERRA E0029		0x0029	E0029
 product SIERRA AIRCARD580	0x0112	Sierra Wireless AirCard 580
 product SIERRA AC595U		0x0120	Sierra Wireless AirCard 595U
 product SIERRA MC5720		0x0218	MC5720 Wireless Modem
 product SIERRA MINI5725		0x0220	Sierra Wireless miniPCI 5275
 product SIERRA MC5727_2		0x0224	MC5727
 product SIERRA MC8755_2		0x6802	MC8755
 product SIERRA MC8765		0x6803	MC8765
 product SIERRA MC8755		0x6804	MC8755
 product SIERRA MC8765_2		0x6805	MC8765
 product SIERRA MC8755_4		0x6808	MC8755
 product SIERRA MC8765_3		0x6809	MC8765
 product SIERRA AC875U		0x6812	AC875U HSDPA USB Modem
 product SIERRA MC8755_3		0x6813	MC8755 HSDPA
 product SIERRA MC8775_2		0x6815	MC8775
 product SIERRA MC8775		0x6816	MC8775
 product SIERRA AC875		0x6820	Sierra Wireless AirCard 875
 product SIERRA AC875U_2		0x6821	AC875U
 product SIERRA AC875E		0x6822	AC875E
 product SIERRA MC8780		0x6832	MC8780
 product SIERRA MC8781		0x6833	MC8781
 product SIERRA MC8780_2		0x6834	MC8780
 product SIERRA MC8781_2		0x6835	MC8781
 product SIERRA MC8780_3		0x6838	MC8780
 product SIERRA MC8781_3		0x6839	MC8781
 product SIERRA MC8785		0x683A	MC8785
 product SIERRA MC8785_2		0x683B	MC8785
 product SIERRA MC8790		0x683C	MC8790
 product SIERRA MC8791		0x683D	MC8791
 product SIERRA MC8792		0x683E	MC8792
 product SIERRA AC880		0x6850	Sierra Wireless AirCard 880
 product SIERRA AC881		0x6851	Sierra Wireless AirCard 881
 product SIERRA AC880E		0x6852	Sierra Wireless AirCard 880E
 product SIERRA AC881E		0x6853	Sierra Wireless AirCard 881E
 product SIERRA AC880U		0x6855	Sierra Wireless AirCard 880U
 product SIERRA AC881U		0x6856	Sierra Wireless AirCard 881U
 product SIERRA AC885E		0x6859	AC885E
 product SIERRA AC885E_2		0x685A	AC885E
 product SIERRA AC885U		0x6880	Sierra Wireless AirCard 885U
 product SIERRA C888		0x6890	C888
 product SIERRA C22		0x6891	C22
 product SIERRA E6892		0x6892	E6892
 product SIERRA E6893		0x6893	E6893
 product SIERRA MC8700		0x68A3	MC8700
 product SIERRA AIRCARD875	0x6820	Aircard 875 HSDPA
 product SIERRA AC313U		0x68aa	Sierra Wireless AirCard 313U
 product SIERRA TRUINSTALL	0x0fff	Aircard Tru Installer
 
 /* Sigmatel products */
 product SIGMATEL WBT_3052	0x4200	WBT-3052 IrDA/USB Bridge
 product SIGMATEL I_BEAD100	0x8008	i-Bead 100 MP3 Player
 
 /* SIIG products */
 /* Also: Omnidirectional Control Technology products */
 product SIIG DIGIFILMREADER	0x0004	DigiFilm-Combo Reader
 product SIIG WINTERREADER	0x0330	WINTERREADER Reader
 product SIIG2 DK201		0x0103	FTDI compatible adapter
 product SIIG2 USBTOETHER	0x0109	USB TO Ethernet
 product SIIG2 US2308		0x0421	Serial
 
 /* Silicom products */
 product SILICOM U2E		0x0001	U2E
 product SILICOM GPE		0x0002	Psion Gold Port Ethernet
 
 /* SI Labs */
 product SILABS	VSTABI		0x0f91	VStabi Controller
 product SILABS	ARKHAM_DS101_M	0x1101	Arkham DS101 Monitor
 product SILABS	ARKHAM_DS101_A	0x1601	Arkham DS101 Adapter
 product SILABS	BSM7DUSB	0x800a	SPORTident BSM7-D USB
 product SILABS	POLOLU		0x803b	Pololu Serial
 product SILABS	CYGNAL_DEBUG	0x8044	Cygnal Debug Adapter
 product SILABS	SB_PARAMOUNT_ME	0x8043	Software Bisque Paramount ME
 product SILABS	SAEL		0x8053	SA-EL USB
 product SILABS	GSM2228		0x8054	Enfora GSM2228 USB
 product SILABS	ARGUSISP	0x8066	Argussoft ISP
 product SILABS	IMS_USB_RS422	0x806f	IMS USB-RS422
 product SILABS	CRUMB128	0x807a	Crumb128 board
 product SILABS	OPTRIS_MSPRO	0x80c4	Optris MSpro LT Thermometer
 product SILABS	DEGREE		0x80ca	Degree Controls Inc
 product SILABS	TRACIENT	0x80dd	Tracient RFID
 product SILABS	TRAQMATE	0x80ed	Track Systems Traqmate
 product SILABS	SUUNTO		0x80f6	Suunto Sports Instrument
 product SILABS	ARYGON_MIFARE	0x8115	Arygon Mifare RFID reader
 product SILABS	BURNSIDE	0x813d	Burnside Telecon Deskmobile
 product SILABS	TAMSMASTER	0x813f	Tams Master Easy Control
 product SILABS	WMRBATT		0x814a	WMR RIGblaster Plug&Play
 product SILABS	WMRRIGBLASTER	0x814a	WMR RIGblaster Plug&Play
 product SILABS	WMRRIGTALK	0x814b	WMR RIGtalk RT1
 product SILABS	B_G_H3000	0x8156	B&G H3000 Data Cable
 product SILABS	HELICOM		0x815e	Helicomm IP-Link 1220-DVM
 product SILABS	HAMLINKUSB	0x815f	Timewave HamLinkUSB
 product SILABS	AVIT_USB_TTL	0x818b	AVIT Research USB-TTL
 product SILABS	MJS_TOSLINK	0x819f	MJS USB-TOSLINK
 product SILABS	WAVIT		0x81a6	ThinkOptics WavIt
 product SILABS	MULTIPLEX_RC	0x81a9	Multiplex RC adapter
 product SILABS	MSD_DASHHAWK	0x81ac	MSD DashHawk
 product SILABS	INSYS_MODEM	0x81ad	INSYS Modem
 product SILABS	LIPOWSKY_JTAG	0x81c8	Lipowsky Baby-JTAG
 product SILABS	LIPOWSKY_LIN	0x81e2	Lipowsky Baby-LIN
 product SILABS	AEROCOMM	0x81e7	Aerocomm Radio
 product SILABS	ZEPHYR_BIO	0x81e8	Zephyr Bioharness
 product SILABS	EMS_C1007	0x81f2	EMS C1007 HF RFID controller
 product SILABS	LIPOWSKY_HARP	0x8218	Lipowsky HARP-1
 product SILABS	C2_EDGE_MODEM	0x822b	Commander 2 EDGE(GSM) Modem
 product SILABS	CYGNAL_GPS	0x826b	Cygnal Fasttrax GPS
 product SILABS	TELEGESIS_ETRX2	0x8293	Telegesis ETRX2USB
 product SILABS	PROCYON_AVS	0x82f9	Procyon AVS
 product SILABS	MC35PU		0x8341	MC35pu
 product SILABS	CYGNAL		0x8382	Cygnal
 product SILABS	AMBER_AMB2560	0x83a8	Amber Wireless AMB2560
 product SILABS	DEKTEK_DTAPLUS	0x83d8	DekTec DTA Plus VHF/UHF Booster
 product SILABS	KYOCERA_GPS	0x8411	Kyocera GPS
 product SILABS	IRZ_SG10	0x8418	IRZ SG-10 GSM/GPRS Modem
 product SILABS	BEI_VCP		0x846e	BEI USB Sensor (VCP)
 product SILABS	BALLUFF_RFID	0x8477	Balluff RFID reader
 product SILABS	AC_SERV_IBUS	0x85ea	AC-Services IBUS Interface
 product SILABS	AC_SERV_CIS	0x85eb	AC-Services CIS-IBUS
 product SILABS	V_PREON32	0x85f8	Virtenio Preon32
 product SILABS	AC_SERV_CAN	0x8664	AC-Services CAN Interface
 product SILABS	AC_SERV_OBD	0x8665	AC-Services OBD Interface
 product SILABS	MMB_ZIGBEE	0x88a4	MMB Networks ZigBee
 product SILABS	INGENI_ZIGBEE	0x88a5	Planet Innovation Ingeni ZigBee
 product SILABS	CP2102		0xea60	SILABS USB UART
 product SILABS	CP210X_2	0xea61	CP210x Serial
 product SILABS	CP210X_3	0xea70	CP210x Serial
 product SILABS	CP210X_4	0xea80	CP210x Serial
 product SILABS	INFINITY_MIC	0xea71	Infinity GPS-MIC-1 Radio Monophone
 product SILABS	USBSCOPE50	0xf001	USBscope50
 product SILABS	USBWAVE12	0xf002	USBwave12
 product SILABS	USBPULSE100	0xf003	USBpulse100
 product SILABS	USBCOUNT50	0xf004	USBcount50
 product SILABS2 DCU11CLONE	0xaa26	DCU-11 clone
 product SILABS3 GPRS_MODEM	0xea61	GPRS Modem
 product SILABS4 100EU_MODEM	0xea61	GPRS Modem 100EU
 
 /* Silicon Portals Inc. */
 product SILICONPORTALS YAPPH_NF	0x0200	YAP Phone (no firmware)
 product SILICONPORTALS YAPPHONE	0x0201	YAP Phone
 
 /* Sirius Technologies products */
 product SIRIUS ROADSTER		0x0001	NetComm Roadster II 56 USB
 
 /* Sitecom products */
 product SITECOM LN029		0x182d	USB 2.0 Ethernet
 product SITECOM SERIAL		0x2068	USB to serial cable (v2)
 product SITECOM2 WL022		0x182d	WL-022
 
 /* Sitecom Europe products */
 product SITECOMEU RT2870_1	0x0017	RT2870
 product SITECOMEU WL168V1	0x000d	WL-168 v1
 product SITECOMEU LN030  	0x0021	MCS7830
 product SITECOMEU WL168V4	0x0028	WL-168 v4
 product SITECOMEU RT2870_2	0x002b	RT2870
 product SITECOMEU RT2870_3	0x002c	RT2870
 product SITECOMEU RT2870_4	0x002d	RT2870
 product SITECOMEU RT2770	0x0039	RT2770
 product SITECOMEU RT3070_2	0x003b	RT3070
 product SITECOMEU RT3070_3	0x003c	RT3070
 product SITECOMEU RT3070_4	0x003d	RT3070
 product SITECOMEU RT3070	0x003e	RT3070
 product SITECOMEU WL608		0x003f	WL-608
 product SITECOMEU RT3071	0x0040	RT3071
 product SITECOMEU RT3072_1	0x0041	RT3072
 product SITECOMEU RT3072_2	0x0042	RT3072
 product SITECOMEU WL353		0x0045	WL-353
 product SITECOMEU RT3072_3	0x0047	RT3072
 product SITECOMEU RT3072_4	0x0048	RT3072
 product SITECOMEU RT3072_5	0x004a	RT3072
 product SITECOMEU WL349V1	0x004b	WL-349 v1
 product SITECOMEU RT3072_6	0x004d	RT3072
 product SITECOMEU RTL8188CU_1	0x0052	RTL8188CU
 product SITECOMEU RTL8188CU_2	0x005c	RTL8188CU
 product SITECOMEU RTL8192CU	0x0061	RTL8192CU
 product SITECOMEU LN032		0x0072	LN-032
 product SITECOMEU LN028		0x061c	LN-028
 product SITECOMEU WL113		0x9071	WL-113
 product SITECOMEU ZD1211B	0x9075	ZD1211B
 product SITECOMEU WL172		0x90ac	WL-172
 product SITECOMEU WL113R2	0x9712	WL-113 rev 2
 
 /* Skanhex Technology products */
 product SKANHEX MD_7425		0x410a	MD 7425 Camera
 product SKANHEX SX_520Z		0x5200	SX 520z Camera
 
 /* Smart Technologies products */
 product SMART PL2303		0x2303	Serial adapter
 
 /* SmartBridges products */
 product SMARTBRIDGES SMARTLINK	0x0001	SmartLink USB Ethernet
 product SMARTBRIDGES SMARTNIC	0x0003	smartNIC 2 PnP Ethernet
 
 /* SMC products */
 product SMC 2102USB		0x0100	10Mbps Ethernet
 product SMC 2202USB		0x0200	10/100 Ethernet
 product SMC 2206USB		0x0201	EZ Connect USB Ethernet
 product SMC 2862WG		0xee13	EZ Connect Wireless Adapter
 product SMC2 2020HUB		0x2020	USB Hub
 product SMC2 2514HUB		0x2514	USB Hub
 product SMC3 2662WUSB		0xa002	2662W-AR Wireless
 product SMC2 LAN9500_ETH	0x9500	USB/Ethernet
 product SMC2 LAN9505_ETH	0x9505	USB/Ethernet
 product SMC2 LAN9530_ETH	0x9530	USB/Ethernet
 product SMC2 LAN9730_ETH	0x9730	USB/Ethernet
 product SMC2 LAN9500_SAL10	0x9900	USB/Ethernet
 product SMC2 LAN9505_SAL10	0x9901	USB/Ethernet
 product SMC2 LAN9500A_SAL10	0x9902	USB/Ethernet
 product SMC2 LAN9505A_SAL10	0x9903	USB/Ethernet
 product SMC2 LAN9514_SAL10	0x9904	USB/Ethernet
 product SMC2 LAN9500A_HAL	0x9905	USB/Ethernet
 product SMC2 LAN9505A_HAL	0x9906	USB/Ethernet
 product SMC2 LAN9500_ETH_2	0x9907	USB/Ethernet
 product SMC2 LAN9500A_ETH_2	0x9908	USB/Ethernet
 product SMC2 LAN9514_ETH_2	0x9909	USB/Ethernet
 product SMC2 LAN9500A_ETH	0x9e00	USB/Ethernet
 product SMC2 LAN9505A_ETH	0x9e01	USB/Ethernet
 product SMC2 LAN89530_ETH	0x9e08	USB/Ethernet
 product SMC2 LAN9514_ETH	0xec00	USB/Ethernet
 
 /* SOHOware products */
 product SOHOWARE NUB100		0x9100	10/100 USB Ethernet
 product SOHOWARE NUB110		0x9110	10/100 USB Ethernet
 
 /* SOLID YEAR products */
 product SOLIDYEAR KEYBOARD	0x2101	Solid Year USB keyboard
 
 /* SONY products */
 product SONY DSC		0x0010	DSC cameras
 product SONY MS_NW_MS7		0x0025	Memorystick NW-MS7
 product SONY PORTABLE_HDD_V2	0x002b	Portable USB Harddrive V2
 product SONY MSACUS1		0x002d	Memorystick MSAC-US1
 product SONY HANDYCAM		0x002e	Handycam
 product SONY MSC		0x0032	MSC memory stick slot
 product SONY CLIE_35		0x0038	Sony Clie v3.5
 product SONY MS_PEG_N760C	0x0058	PEG N760c Memorystick
 product SONY CLIE_40		0x0066	Sony Clie v4.0
 product SONY MS_MSC_U03		0x0069	Memorystick MSC-U03
 product SONY CLIE_40_MS		0x006d	Sony Clie v4.0 Memory Stick slot
 product SONY CLIE_S360		0x0095	Sony Clie s360
 product SONY CLIE_41_MS		0x0099	Sony Clie v4.1 Memory Stick slot
 product SONY CLIE_41		0x009a	Sony Clie v4.1
 product SONY CLIE_NX60		0x00da	Sony Clie nx60
 product SONY CLIE_TH55		0x0144	Sony Clie th55
 product SONY CLIE_TJ37		0x0169	Sony Clie tj37
 product SONY RF_RECEIVER	0x01db	Sony RF mouse/kbd Receiver VGP-WRC1
 product SONY QN3		0x0437	Sony QN3 CMD-Jxx phone cable
 
 /* Sony Ericsson products */
 product SONYERICSSON DCU10	0x0528	DCU-10 Phone Data Cable
 product SONYERICSSON DATAPILOT	0x2003	Datapilot Phone Cable
 
 /* SOURCENEXT products */
 product SOURCENEXT KEIKAI8	0x039f	KeikaiDenwa 8
 product SOURCENEXT KEIKAI8_CHG	0x012e	KeikaiDenwa 8 with charger
 
 /* SparkLAN products */
 product SPARKLAN RT2573		0x0004	RT2573
 product SPARKLAN RT2870_1	0x0006	RT2870
 product SPARKLAN RT3070		0x0010	RT3070
 
 /* Soundgraph products */
 product SOUNDGRAPH IMON_VFD	0x0044	Antec Veris Elite VFD Panel, Knob, and Remote
 product SOUNDGRAPH SSTONE_LC16	0xffdc	Silverstone LC16 VFD Panel, Knob, and Remote
 
 /* Speed Dragon Multimedia products */
 product SPEEDDRAGON MS3303H	0x110b	MS3303H Serial
 
 /* Sphairon Access Systems GmbH products */
 product SPHAIRON UB801R		0x0110	UB801R
 
 /* Stelera Wireless products */
 product STELERA ZEROCD		0x1000	Zerocd Installer
 product STELERA C105		0x1002	Stelera/Bandrish C105 USB
 product STELERA E1003		0x1003	3G modem
 product STELERA E1004		0x1004	3G modem
 product STELERA E1005		0x1005	3G modem
 product STELERA E1006		0x1006	3G modem
 product STELERA E1007		0x1007	3G modem
 product STELERA E1008		0x1008	3G modem
 product STELERA E1009		0x1009	3G modem
 product STELERA E100A		0x100a	3G modem
 product STELERA E100B		0x100b	3G modem
 product STELERA E100C		0x100c	3G modem
 product STELERA E100D		0x100d	3G modem
 product STELERA E100E		0x100e	3G modem
 product STELERA E100F		0x100f	3G modem
 product STELERA E1010		0x1010	3G modem
 product STELERA E1011		0x1011	3G modem
 product STELERA E1012		0x1012	3G modem
 
 /* STMicroelectronics products */
 product STMICRO BIOCPU		0x2016	Biometric Coprocessor
 product STMICRO COMMUNICATOR	0x7554	USB Communicator
 product STMICRO ST72682		0xfada  USB 2.0 Flash drive controller
 
 /* STSN products */
 product STSN STSN0001		0x0001	Internet Access Device
 
 /* SUN Corporation products */
 product SUNTAC DS96L		0x0003	SUNTAC U-Cable type D2
 product SUNTAC PS64P1		0x0005	SUNTAC U-Cable type P1
 product SUNTAC VS10U		0x0009	SUNTAC Slipper U
 product SUNTAC IS96U		0x000a	SUNTAC Ir-Trinity
 product SUNTAC AS64LX		0x000b	SUNTAC U-Cable type A3
 product SUNTAC AS144L4		0x0011	SUNTAC U-Cable type A4
 
 /* Sun Microsystems products */
 product SUN KEYBOARD_TYPE_6	0x0005	Type 6 USB keyboard
 product SUN KEYBOARD_TYPE_7	0x00a2	Type 7 USB keyboard
 /* XXX The above is a North American PC style keyboard possibly */
 product SUN MOUSE		0x0100	Type 6 USB mouse
 product SUN KBD_HUB		0x100e	Kbd Hub
 
 /* Sunplus Innovation Technology Inc. products */
 product SUNPLUS USBMOUSE	0x0007	USB Optical Mouse
 
 /* Super Top products */
 product	SUPERTOP IDE		0x6600	USB-IDE
 product	SUPERTOP FLASHDRIVE	0x121c	extrememory Snippy
 
 /* Syntech products */
 product SYNTECH CPT8001C	0x0001	CPT-8001C Barcode scanner
 product	SYNTECH CYPHERLAB100	0x1000	CipherLab USB Barcode Scanner
 
 /* Teclast products */
 product TECLAST TLC300		0x3203	USB Media Player
 
 /* Testo products */
 product TESTO USB_INTERFACE	0x0001	FTDI compatible adapter
 
 /* TexTech products */
 product TEXTECH DUMMY		0x0000	Dummy product
 product TEXTECH U2M_1		0x0101	Textech USB MIDI cable
 product TEXTECH U2M_2		0x1806	Textech USB MIDI cable
 
 /* The Mobility Lab products */
 product TML USB_SERIAL		0x0064	FTDI compatible adapter
 
 /* Thurlby Thandar Instrument products */
 product TTI QL355P		0x03e8	FTDI compatible adapter
 
 /* Supra products */
 product DIAMOND2 SUPRAEXPRESS56K 0x07da	Supra Express 56K modem
 product DIAMOND2 SUPRA2890	0x0b4a	SupraMax 2890 56K Modem
 product DIAMOND2 RIO600USB	0x5001	Rio 600 USB
 product DIAMOND2 RIO800USB	0x5002	Rio 800 USB
 
 /* Surecom Technology products */
 product SURECOM EP9001G2A	0x11f2	EP-9001-G rev 2A
 product SURECOM RT2570		0x11f3	RT2570
 product SURECOM RT2573		0x31f3	RT2573
 
 /* Sweex products */
 product SWEEX ZD1211		0x1809	ZD1211
 product SWEEX2 LW153		0x0153	LW153
 product SWEEX2 LW154		0x0154	LW154
 product SWEEX2 LW303		0x0302	LW303
 product SWEEX2 LW313		0x0313	LW313
 
 /* System TALKS, Inc. */
 product SYSTEMTALKS SGCX2UL	0x1920	SGC-X2UL
 
 /* Tapwave products */
 product TAPWAVE ZODIAC		0x0100	Zodiac
 
 /* Taugagreining products */
 product TAUGA CAMERAMATE	0x0005	CameraMate (DPCM_USB)
 
 /* TCTMobile products */
 product TCTMOBILE X060S		0x0000	X060S 3G modem
 product TCTMOBILE X080S		0xf000	X080S 3G modem
 
 /* TDK products */
 product TDK UPA9664		0x0115	USB-PDC Adapter UPA9664
 product TDK UCA1464		0x0116	USB-cdmaOne Adapter UCA1464
 product TDK UHA6400		0x0117	USB-PHS Adapter UHA6400
 product TDK UPA6400		0x0118	USB-PHS Adapter UPA6400
 product TDK BT_DONGLE		0x0309	Bluetooth USB dongle
 
 /* TEAC products */
 product TEAC FD05PUB		0x0000	FD-05PUB floppy
 
 /* Tekram Technology products */
 product TEKRAM QUICKWLAN	0x1630	QuickWLAN
 product TEKRAM ZD1211_1		0x5630	ZD1211
 product TEKRAM ZD1211_2		0x6630	ZD1211
 
 /* Telex Communications products */
 product TELEX MIC1		0x0001	Enhanced USB Microphone
 
 /* Telit products */
 product TELIT UC864E		0x1003	UC864E 3G modem
 product TELIT UC864G		0x1004	UC864G 3G modem
 
 /* Ten X Technology, Inc. */
 product TENX UAUDIO0		0xf211	USB audio headset
 
 /* Texas Intel products */
 product TI UTUSB41		0x1446	UT-USB41 hub
 product TI TUSB2046		0x2046	TUSB2046 hub
 
 /* Thrustmaster products */
 product THRUST FUSION_PAD	0xa0a3	Fusion Digital Gamepad
 
 /* TLayTech products */
 product TLAYTECH TEU800		0x1682	TEU800 3G modem
 
 /* Topre Corporation products */
 product TOPRE HHKB		0x0100	HHKB Professional
 
 /* Toshiba Corporation products */
 product TOSHIBA POCKETPC_E740	0x0706	PocketPC e740
 product TOSHIBA RT3070		0x0a07	RT3070
 product TOSHIBA G450		0x0d45	G450 modem
 product TOSHIBA HSDPA		0x1302	G450 modem
 product TOSHIBA TRANSMEMORY	0x6545	USB ThumbDrive
 
 /* Trek Technology products */
 product TREK THUMBDRIVE		0x1111	ThumbDrive
 product TREK MEMKEY		0x8888	IBM USB Memory Key
 product TREK THUMBDRIVE_8MB	0x9988	ThumbDrive_8MB
 
 /* TRENDnet products */
 product TRENDNET RTL8192CU	0x624d	RTL8192CU
 product TRENDNET RTL8188CU	0x648b	RTL8188CU
 
 /* Tripp-Lite products */
 product TRIPPLITE U209		0x2008	Serial
 
 /* Trumpion products */
 product TRUMPION T33520		0x1001	T33520 USB Flash Card Controller
 product TRUMPION C3310		0x1100	Comotron C3310 MP3 player
 product TRUMPION MP3		0x1200	MP3 player
 
 /* TwinMOS */
 product TWINMOS G240		0xa006	G240
 product TWINMOS MDIV		0x1325	Memory Disk IV
 
 /* Ubiquam products */
 product UBIQUAM UALL		0x3100	CDMA 1xRTT USB Modem (U-100/105/200/300/520)
 
 /* Ultima products */
 product ULTIMA 1200UBPLUS	0x4002	1200 UB Plus scanner
 
 /* UMAX products */
 product UMAX ASTRA1236U		0x0002	Astra 1236U Scanner
 product UMAX ASTRA1220U		0x0010	Astra 1220U Scanner
 product UMAX ASTRA2000U		0x0030	Astra 2000U Scanner
 product UMAX ASTRA2100U		0x0130	Astra 2100U Scanner
 product UMAX ASTRA2200U		0x0230	Astra 2200U Scanner
 product UMAX ASTRA3400		0x0060	Astra 3400 Scanner
 
 /* U-MEDIA Communications products */
 product UMEDIA TEW444UBEU	0x3006	TEW-444UB EU
 product UMEDIA TEW444UBEU_NF	0x3007	TEW-444UB EU (no firmware)
 product UMEDIA TEW429UB_A	0x300a	TEW-429UB_A
 product UMEDIA TEW429UB		0x300b	TEW-429UB
 product UMEDIA TEW429UBC1	0x300d	TEW-429UB C1
 product UMEDIA RT2870_1		0x300e	RT2870
 product UMEDIA ALL0298V2	0x3204	ALL0298 v2
 product UMEDIA AR5523_2		0x3205	AR5523
 product UMEDIA AR5523_2_NF	0x3206	AR5523 (no firmware)
 
 /* Universal Access products */
 product UNIACCESS PANACHE	0x0101	Panache Surf USB ISDN Adapter
 
 /* Unknown products */
 product UNKNOWN4 NF_RIC		0x0001	FTDI compatible adapter
 
 /* USI products */
 product USI MC60		0x10c5	MC60 Serial
 
 /* U.S. Robotics products */
 product USR USR5422		0x0118	USR5422 WLAN
 product USR USR5423		0x0121	USR5423 WLAN
 
 /* VIA Technologies products */
 product VIA USB2IDEBRIDGE	0x6204	USB 2.0 IDE Bridge
 
 /* VIA Labs */
 product VIALABS USB30SATABRIDGE	0x0700	USB 3.0 SATA Bridge
 
 /* Vaisala products */
 product VAISALA CABLE		0x0200	USB Interface cable
 
 /* Vertex products */
 product VERTEX VW110L		0x0100	Vertex VW110L modem
 
 /* VidzMedia products */
 product VIDZMEDIA MONSTERTV	0x4fb1	MonsterTV P2H
 
 /* Vision products */
 product VISION VC6452V002	0x0002	CPiA Camera
 
 /* Visioneer products */
 product VISIONEER 7600		0x0211	OneTouch 7600
 product VISIONEER 5300		0x0221	OneTouch 5300
 product VISIONEER 3000		0x0224	Scanport 3000
 product VISIONEER 6100		0x0231	OneTouch 6100
 product VISIONEER 6200		0x0311	OneTouch 6200
 product VISIONEER 8100		0x0321	OneTouch 8100
 product VISIONEER 8600		0x0331	OneTouch 8600
 
 /* Vivitar products */
 product VIVITAR 35XX		0x0003	Vivicam 35Xx
 
 /* VTech products */
 product VTECH RT2570		0x3012	RT2570
 product VTECH ZD1211B		0x3014	ZD1211B
 
 /* Wacom products */
 product WACOM CT0405U		0x0000	CT-0405-U Tablet
 product WACOM GRAPHIRE		0x0010	Graphire
 product WACOM GRAPHIRE3_4X5	0x0013	Graphire 3 4x5
 product WACOM INTUOSA5		0x0021	Intuos A5
 product WACOM GD0912U		0x0022	Intuos 9x12 Graphics Tablet
 
 /* WAGO Kontakttechnik GmbH products */
 product WAGO SERVICECABLE	0x07a6	USB Service Cable 750-923
 
 /* WaveSense products */
 product WAVESENSE JAZZ		0xaaaa	Jazz blood glucose meter
 
 /* WCH products */
 product WCH CH341SER		0x5523	CH341/CH340 USB-Serial Bridge
 product WCH2 DUMMY		0x0000	Dummy product
 product WCH2 CH341SER_2		0x5523	CH341/CH340 USB-Serial Bridge
 product WCH2 CH341SER		0x7523	CH341/CH340 USB-Serial Bridge
 product WCH2 U2M		0X752d	CH345 USB2.0-MIDI
 
 /* West Mountain Radio products */
 product WESTMOUNTAIN RIGBLASTER_ADVANTAGE	0x0003	RIGblaster Advantage
 
 /* Western Digital products */
 product WESTERN COMBO		0x0200	Firewire USB Combo
 product WESTERN EXTHDD		0x0400	External HDD
 product WESTERN HUB		0x0500	USB HUB
 product WESTERN MYBOOK		0x0901	MyBook External HDD
 product WESTERN MYPASSWORD	0x0704	MyPassword External HDD
 product WESTERN MYPASSPORT	0x0748	MyPassport External HDD
 
 /* WeTelecom products */
 product WETELECOM WM_D200	0x6801	WM-D200
 
 /* WIENER Plein & Baus GmbH products */
 product WIENERPLEINBAUS PL512	0x0010	PL512 PSU
 product WIENERPLEINBAUS RCM	0x0011	RCM Remote Control
 product WIENERPLEINBAUS MPOD	0x0012	MPOD PSU
 product WIENERPLEINBAUS CML	0x0015	CML Data Logger
 
 /* Windbond Electronics */
 product WINBOND UH104		0x5518	4-port USB Hub
 
 /* WinMaxGroup products */
 product WINMAXGROUP FLASH64MC	0x6660	USB Flash Disk 64M-C
 
 /* Wistron NeWeb products */
 product WISTRONNEWEB UR045G	0x0427	PrismGT USB 2.0 WLAN
 product WISTRONNEWEB UR055G	0x0711	UR055G
 product WISTRONNEWEB AR5523_1	0x0826	AR5523
 product WISTRONNEWEB AR5523_1_NF	0x0827	AR5523 (no firmware)
 product WISTRONNEWEB AR5523_2	0x082a	AR5523
 product WISTRONNEWEB AR5523_2_NF	0x0829	AR5523 (no firmware)
 
 /* Xerox products */
 product XEROX WCM15		0xffef	WorkCenter M15
 
 /* Xirlink products */
 product XIRLINK PCCAM		0x8080	IBM PC Camera
 
 /* Xyratex products */
 product XYRATEX PRISM_GT_1	0x2000	PrismGT USB 2.0 WLAN
 product XYRATEX PRISM_GT_2	0x2002	PrismGT USB 2.0 WLAN
 
 /* Yamaha products */
 product YAMAHA UX256		0x1000	UX256 MIDI I/F
 product YAMAHA UX96		0x1008	UX96 MIDI I/F
 product YAMAHA RPU200		0x3104	RP-U200
 product YAMAHA RTA54I		0x4000	NetVolante RTA54i Broadband&ISDN Router
 product YAMAHA RTW65B		0x4001	NetVolante RTW65b Broadband Wireless Router
 product YAMAHA RTW65I		0x4002	NetVolante RTW65i Broadband&ISDN Wireless Router
 product YAMAHA RTA55I		0x4004	NetVolante RTA55i Broadband VoIP Router
 
 /* Yano products */
 product YANO U640MO		0x0101	U640MO-03
 product YANO FW800HD		0x05fc	METALWEAR-HDD
 
 /* Y.C. Cable products */
 product YCCABLE PL2303		0x0fba	PL2303 Serial
 
 /* Y-E Data products */
 product YEDATA FLASHBUSTERU	0x0000	Flashbuster-U
 
 /* Yiso Wireless Co. products */
 product	YISO C893		0xc893	CDMA 2000 1xEVDO PC Card
 
 /* Z-Com products */
 product ZCOM M4Y750		0x0001	M4Y-750
 product ZCOM XI725		0x0002	XI-725/726
 product ZCOM XI735		0x0005	XI-735
 product ZCOM XG703A		0x0008	PrismGT USB 2.0 WLAN
 product ZCOM ZD1211		0x0011	ZD1211
 product ZCOM AR5523		0x0012	AR5523
 product ZCOM AR5523_NF		0x0013	AR5523 driver (no firmware)
 product ZCOM XM142		0x0015	XM-142
 product ZCOM ZD1211B		0x001a	ZD1211B
 product ZCOM RT2870_1		0x0022	RT2870
 product ZCOM RT2870_2		0x0025	RT2870
 
 /* Zinwell products */
 product ZINWELL RT2570		0x0260	RT2570
 product ZINWELL RT2870_1	0x0280	RT2870
 product ZINWELL RT2870_2	0x0282	RT2870
 product ZINWELL RT3072_1	0x0283	RT3072
 product ZINWELL RT3072_2	0x0284	RT3072
 product ZINWELL RT3070		0x5257	RT3070
 
 /* Zoom Telephonics, Inc. products */
 product ZOOM 2986L		0x9700	2986L Fax modem
 
 /* Zoran Microelectronics products */
 product ZORAN EX20DSC		0x4343	Digital Camera EX-20 DSC
 
 /* Zydas Technology Corporation products */
 product ZYDAS ZD1211		0x1211	ZD1211 WLAN abg
 product ZYDAS ZD1211B		0x1215	ZD1211B
 
 /* ZyXEL Communication Co. products */
 product ZYXEL OMNI56K		0x1500	Omni 56K Plus
 product ZYXEL 980N		0x2011	Scorpion-980N keyboard
 product ZYXEL ZYAIRG220		0x3401	ZyAIR G-220
 product ZYXEL G200V2		0x3407	G-200 v2
 product ZYXEL AG225H		0x3409	AG-225H
 product ZYXEL M202		0x340a	M-202
 product ZYXEL G220V2		0x340f	G-220 v2
 product ZYXEL G202		0x3410	G-202
 product ZYXEL RT2870_1		0x3416	RT2870
 product ZYXEL RT2870_2		0x341a	RT2870
 product ZYXEL RT3070		0x341e	NWD2105
 product ZYXEL RTL8192CU		0x341f	RTL8192CU
 product ZYXEL NWD2705		0x3421	NWD2705
Index: user/dchagin/lemul/sys/net/bpf.c
===================================================================
--- user/dchagin/lemul/sys/net/bpf.c	(revision 274240)
+++ user/dchagin/lemul/sys/net/bpf.c	(revision 274241)
@@ -1,2934 +1,2934 @@
 /*-
  * Copyright (c) 1990, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from the Stanford/CMU enet packet filter,
  * (net/enet.c) distributed as part of 4.3BSD, and code contributed
  * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
  * Berkeley Laboratory.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *      @(#)bpf.c	8.4 (Berkeley) 1/9/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_bpf.h"
 #include "opt_compat.h"
 #include "opt_netgraph.h"
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/jail.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/time.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/signalvar.h>
 #include <sys/filio.h>
 #include <sys/sockio.h>
 #include <sys/ttycom.h>
 #include <sys/uio.h>
 
 #include <sys/event.h>
 #include <sys/file.h>
 #include <sys/poll.h>
 #include <sys/proc.h>
 
 #include <sys/socket.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #define	BPF_INTERNAL
 #include <net/bpf.h>
 #include <net/bpf_buffer.h>
 #ifdef BPF_JITTER
 #include <net/bpf_jitter.h>
 #endif
 #include <net/bpf_zerocopy.h>
 #include <net/bpfdesc.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 
 #include <net80211/ieee80211_freebsd.h>
 
 #include <security/mac/mac_framework.h>
 
 MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
 
 #if defined(DEV_BPF) || defined(NETGRAPH_BPF)
 
 #define PRINET  26			/* interruptible */
 
 #define	SIZEOF_BPF_HDR(type)	\
     (offsetof(type, bh_hdrlen) + sizeof(((type *)0)->bh_hdrlen))
 
 #ifdef COMPAT_FREEBSD32
 #include <sys/mount.h>
 #include <compat/freebsd32/freebsd32.h>
 #define BPF_ALIGNMENT32 sizeof(int32_t)
 #define BPF_WORDALIGN32(x) (((x)+(BPF_ALIGNMENT32-1))&~(BPF_ALIGNMENT32-1))
 
 #ifndef BURN_BRIDGES
 /*
  * 32-bit version of structure prepended to each packet.  We use this header
  * instead of the standard one for 32-bit streams.  We mark the a stream as
  * 32-bit the first time we see a 32-bit compat ioctl request.
  */
 struct bpf_hdr32 {
 	struct timeval32 bh_tstamp;	/* time stamp */
 	uint32_t	bh_caplen;	/* length of captured portion */
 	uint32_t	bh_datalen;	/* original length of packet */
 	uint16_t	bh_hdrlen;	/* length of bpf header (this struct
 					   plus alignment padding) */
 };
 #endif
 
 struct bpf_program32 {
 	u_int bf_len;
 	uint32_t bf_insns;
 };
 
 struct bpf_dltlist32 {
 	u_int	bfl_len;
 	u_int	bfl_list;
 };
 
 #define	BIOCSETF32	_IOW('B', 103, struct bpf_program32)
 #define	BIOCSRTIMEOUT32	_IOW('B', 109, struct timeval32)
 #define	BIOCGRTIMEOUT32	_IOR('B', 110, struct timeval32)
 #define	BIOCGDLTLIST32	_IOWR('B', 121, struct bpf_dltlist32)
 #define	BIOCSETWF32	_IOW('B', 123, struct bpf_program32)
 #define	BIOCSETFNR32	_IOW('B', 130, struct bpf_program32)
 #endif
 
 /*
  * bpf_iflist is a list of BPF interface structures, each corresponding to a
  * specific DLT.  The same network interface might have several BPF interface
  * structures registered by different layers in the stack (i.e., 802.11
  * frames, ethernet frames, etc).
  */
 static LIST_HEAD(, bpf_if)	bpf_iflist, bpf_freelist;
 static struct mtx	bpf_mtx;		/* bpf global lock */
 static int		bpf_bpfd_cnt;
 
 static void	bpf_attachd(struct bpf_d *, struct bpf_if *);
 static void	bpf_detachd(struct bpf_d *);
 static void	bpf_detachd_locked(struct bpf_d *);
 static void	bpf_freed(struct bpf_d *);
 static int	bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **,
 		    struct sockaddr *, int *, struct bpf_insn *);
 static int	bpf_setif(struct bpf_d *, struct ifreq *);
 static void	bpf_timed_out(void *);
 static __inline void
 		bpf_wakeup(struct bpf_d *);
 static void	catchpacket(struct bpf_d *, u_char *, u_int, u_int,
 		    void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int),
 		    struct bintime *);
 static void	reset_d(struct bpf_d *);
 static int	bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd);
 static int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
 static int	bpf_setdlt(struct bpf_d *, u_int);
 static void	filt_bpfdetach(struct knote *);
 static int	filt_bpfread(struct knote *, long);
 static void	bpf_drvinit(void *);
 static int	bpf_stats_sysctl(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl");
 int bpf_maxinsns = BPF_MAXINSNS;
 SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW,
     &bpf_maxinsns, 0, "Maximum bpf program instructions");
 static int bpf_zerocopy_enable = 0;
 SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW,
     &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions");
 static SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_MPSAFE | CTLFLAG_RW,
     bpf_stats_sysctl, "bpf statistics portal");
 
 static VNET_DEFINE(int, bpf_optimize_writers) = 0;
 #define	V_bpf_optimize_writers VNET(bpf_optimize_writers)
-SYSCTL_VNET_INT(_net_bpf, OID_AUTO, optimize_writers,
-    CTLFLAG_RW, &VNET_NAME(bpf_optimize_writers), 0,
+SYSCTL_INT(_net_bpf, OID_AUTO, optimize_writers, CTLFLAG_VNET | CTLFLAG_RW,
+    &VNET_NAME(bpf_optimize_writers), 0,
     "Do not send packets until BPF program is set");
 
 static	d_open_t	bpfopen;
 static	d_read_t	bpfread;
 static	d_write_t	bpfwrite;
 static	d_ioctl_t	bpfioctl;
 static	d_poll_t	bpfpoll;
 static	d_kqfilter_t	bpfkqfilter;
 
 static struct cdevsw bpf_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_open =	bpfopen,
 	.d_read =	bpfread,
 	.d_write =	bpfwrite,
 	.d_ioctl =	bpfioctl,
 	.d_poll =	bpfpoll,
 	.d_name =	"bpf",
 	.d_kqfilter =	bpfkqfilter,
 };
 
 static struct filterops bpfread_filtops = {
 	.f_isfd = 1,
 	.f_detach = filt_bpfdetach,
 	.f_event = filt_bpfread,
 };
 
 eventhandler_tag	bpf_ifdetach_cookie = NULL;
 
 /*
  * LOCKING MODEL USED BY BPF:
  * Locks:
  * 1) global lock (BPF_LOCK). Mutex, used to protect interface addition/removal,
  * some global counters and every bpf_if reference.
  * 2) Interface lock. Rwlock, used to protect list of BPF descriptors and their filters.
  * 3) Descriptor lock. Mutex, used to protect BPF buffers and various structure fields
  *   used by bpf_mtap code.
  *
  * Lock order:
  *
  * Global lock, interface lock, descriptor lock
  *
  * We have to acquire interface lock before descriptor main lock due to BPF_MTAP[2]
  * working model. In many places (like bpf_detachd) we start with BPF descriptor
  * (and we need to at least rlock it to get reliable interface pointer). This
  * gives us potential LOR. As a result, we use global lock to protect from bpf_if
  * change in every such place.
  *
  * Changing d->bd_bif is protected by 1) global lock, 2) interface lock and
  * 3) descriptor main wlock.
  * Reading bd_bif can be protected by any of these locks, typically global lock.
  *
  * Changing read/write BPF filter is protected by the same three locks,
  * the same applies for reading.
  *
  * Sleeping in global lock is not allowed due to bpfdetach() using it.
  */
 
 /*
  * Wrapper functions for various buffering methods.  If the set of buffer
  * modes expands, we will probably want to introduce a switch data structure
  * similar to protosw, et.
  */
 static void
 bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
     u_int len)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	switch (d->bd_bufmode) {
 	case BPF_BUFMODE_BUFFER:
 		return (bpf_buffer_append_bytes(d, buf, offset, src, len));
 
 	case BPF_BUFMODE_ZBUF:
 		d->bd_zcopy++;
 		return (bpf_zerocopy_append_bytes(d, buf, offset, src, len));
 
 	default:
 		panic("bpf_buf_append_bytes");
 	}
 }
 
 static void
 bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
     u_int len)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	switch (d->bd_bufmode) {
 	case BPF_BUFMODE_BUFFER:
 		return (bpf_buffer_append_mbuf(d, buf, offset, src, len));
 
 	case BPF_BUFMODE_ZBUF:
 		d->bd_zcopy++;
 		return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len));
 
 	default:
 		panic("bpf_buf_append_mbuf");
 	}
 }
 
 /*
  * This function gets called when the free buffer is re-assigned.
  */
 static void
 bpf_buf_reclaimed(struct bpf_d *d)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	switch (d->bd_bufmode) {
 	case BPF_BUFMODE_BUFFER:
 		return;
 
 	case BPF_BUFMODE_ZBUF:
 		bpf_zerocopy_buf_reclaimed(d);
 		return;
 
 	default:
 		panic("bpf_buf_reclaimed");
 	}
 }
 
 /*
  * If the buffer mechanism has a way to decide that a held buffer can be made
  * free, then it is exposed via the bpf_canfreebuf() interface.  (1) is
  * returned if the buffer can be discarded, (0) is returned if it cannot.
  */
 static int
 bpf_canfreebuf(struct bpf_d *d)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	switch (d->bd_bufmode) {
 	case BPF_BUFMODE_ZBUF:
 		return (bpf_zerocopy_canfreebuf(d));
 	}
 	return (0);
 }
 
 /*
  * Allow the buffer model to indicate that the current store buffer is
  * immutable, regardless of the appearance of space.  Return (1) if the
  * buffer is writable, and (0) if not.
  */
 static int
 bpf_canwritebuf(struct bpf_d *d)
 {
 	BPFD_LOCK_ASSERT(d);
 
 	switch (d->bd_bufmode) {
 	case BPF_BUFMODE_ZBUF:
 		return (bpf_zerocopy_canwritebuf(d));
 	}
 	return (1);
 }
 
 /*
  * Notify buffer model that an attempt to write to the store buffer has
  * resulted in a dropped packet, in which case the buffer may be considered
  * full.
  */
 static void
 bpf_buffull(struct bpf_d *d)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	switch (d->bd_bufmode) {
 	case BPF_BUFMODE_ZBUF:
 		bpf_zerocopy_buffull(d);
 		break;
 	}
 }
 
 /*
  * Notify the buffer model that a buffer has moved into the hold position.
  */
 void
 bpf_bufheld(struct bpf_d *d)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	switch (d->bd_bufmode) {
 	case BPF_BUFMODE_ZBUF:
 		bpf_zerocopy_bufheld(d);
 		break;
 	}
 }
 
 static void
 bpf_free(struct bpf_d *d)
 {
 
 	switch (d->bd_bufmode) {
 	case BPF_BUFMODE_BUFFER:
 		return (bpf_buffer_free(d));
 
 	case BPF_BUFMODE_ZBUF:
 		return (bpf_zerocopy_free(d));
 
 	default:
 		panic("bpf_buf_free");
 	}
 }
 
 static int
 bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio)
 {
 
 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
 		return (EOPNOTSUPP);
 	return (bpf_buffer_uiomove(d, buf, len, uio));
 }
 
 static int
 bpf_ioctl_sblen(struct bpf_d *d, u_int *i)
 {
 
 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
 		return (EOPNOTSUPP);
 	return (bpf_buffer_ioctl_sblen(d, i));
 }
 
 static int
 bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
 {
 
 	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
 		return (EOPNOTSUPP);
 	return (bpf_zerocopy_ioctl_getzmax(td, d, i));
 }
 
 static int
 bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
 {
 
 	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
 		return (EOPNOTSUPP);
 	return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz));
 }
 
 static int
 bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
 {
 
 	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
 		return (EOPNOTSUPP);
 	return (bpf_zerocopy_ioctl_setzbuf(td, d, bz));
 }
 
 /*
  * General BPF functions.
  */
 static int
 bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp,
     struct sockaddr *sockp, int *hdrlen, struct bpf_insn *wfilter)
 {
 	const struct ieee80211_bpf_params *p;
 	struct ether_header *eh;
 	struct mbuf *m;
 	int error;
 	int len;
 	int hlen;
 	int slen;
 
 	/*
 	 * Build a sockaddr based on the data link layer type.
 	 * We do this at this level because the ethernet header
 	 * is copied directly into the data field of the sockaddr.
 	 * In the case of SLIP, there is no header and the packet
 	 * is forwarded as is.
 	 * Also, we are careful to leave room at the front of the mbuf
 	 * for the link level header.
 	 */
 	switch (linktype) {
 
 	case DLT_SLIP:
 		sockp->sa_family = AF_INET;
 		hlen = 0;
 		break;
 
 	case DLT_EN10MB:
 		sockp->sa_family = AF_UNSPEC;
 		/* XXX Would MAXLINKHDR be better? */
 		hlen = ETHER_HDR_LEN;
 		break;
 
 	case DLT_FDDI:
 		sockp->sa_family = AF_IMPLINK;
 		hlen = 0;
 		break;
 
 	case DLT_RAW:
 		sockp->sa_family = AF_UNSPEC;
 		hlen = 0;
 		break;
 
 	case DLT_NULL:
 		/*
 		 * null interface types require a 4 byte pseudo header which
 		 * corresponds to the address family of the packet.
 		 */
 		sockp->sa_family = AF_UNSPEC;
 		hlen = 4;
 		break;
 
 	case DLT_ATM_RFC1483:
 		/*
 		 * en atm driver requires 4-byte atm pseudo header.
 		 * though it isn't standard, vpi:vci needs to be
 		 * specified anyway.
 		 */
 		sockp->sa_family = AF_UNSPEC;
 		hlen = 12;	/* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */
 		break;
 
 	case DLT_PPP:
 		sockp->sa_family = AF_UNSPEC;
 		hlen = 4;	/* This should match PPP_HDRLEN */
 		break;
 
 	case DLT_IEEE802_11:		/* IEEE 802.11 wireless */
 		sockp->sa_family = AF_IEEE80211;
 		hlen = 0;
 		break;
 
 	case DLT_IEEE802_11_RADIO:	/* IEEE 802.11 wireless w/ phy params */
 		sockp->sa_family = AF_IEEE80211;
 		sockp->sa_len = 12;	/* XXX != 0 */
 		hlen = sizeof(struct ieee80211_bpf_params);
 		break;
 
 	default:
 		return (EIO);
 	}
 
 	len = uio->uio_resid;
 	if (len < hlen || len - hlen > ifp->if_mtu)
 		return (EMSGSIZE);
 
 	m = m_get2(len, M_WAITOK, MT_DATA, M_PKTHDR);
 	if (m == NULL)
 		return (EIO);
 	m->m_pkthdr.len = m->m_len = len;
 	*mp = m;
 
 	error = uiomove(mtod(m, u_char *), len, uio);
 	if (error)
 		goto bad;
 
 	slen = bpf_filter(wfilter, mtod(m, u_char *), len, len);
 	if (slen == 0) {
 		error = EPERM;
 		goto bad;
 	}
 
 	/* Check for multicast destination */
 	switch (linktype) {
 	case DLT_EN10MB:
 		eh = mtod(m, struct ether_header *);
 		if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
 			if (bcmp(ifp->if_broadcastaddr, eh->ether_dhost,
 			    ETHER_ADDR_LEN) == 0)
 				m->m_flags |= M_BCAST;
 			else
 				m->m_flags |= M_MCAST;
 		}
 		break;
 	}
 
 	/*
 	 * Make room for link header, and copy it to sockaddr
 	 */
 	if (hlen != 0) {
 		if (sockp->sa_family == AF_IEEE80211) {
 			/*
 			 * Collect true length from the parameter header
 			 * NB: sockp is known to be zero'd so if we do a
 			 *     short copy unspecified parameters will be
 			 *     zero.
 			 * NB: packet may not be aligned after stripping
 			 *     bpf params
 			 * XXX check ibp_vers
 			 */
 			p = mtod(m, const struct ieee80211_bpf_params *);
 			hlen = p->ibp_len;
 			if (hlen > sizeof(sockp->sa_data)) {
 				error = EINVAL;
 				goto bad;
 			}
 		}
 		bcopy(mtod(m, const void *), sockp->sa_data, hlen);
 	}
 	*hdrlen = hlen;
 
 	return (0);
 bad:
 	m_freem(m);
 	return (error);
 }
 
 /*
  * Attach file to the bpf interface, i.e. make d listen on bp.
  */
 static void
 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
 {
 	int op_w;
 
 	BPF_LOCK_ASSERT();
 
 	/*
 	 * Save sysctl value to protect from sysctl change
 	 * between reads
 	 */
 	op_w = V_bpf_optimize_writers;
 
 	if (d->bd_bif != NULL)
 		bpf_detachd_locked(d);
 	/*
 	 * Point d at bp, and add d to the interface's list.
 	 * Since there are many applicaiotns using BPF for
 	 * sending raw packets only (dhcpd, cdpd are good examples)
 	 * we can delay adding d to the list of active listeners until
 	 * some filter is configured.
 	 */
 
 	BPFIF_WLOCK(bp);
 	BPFD_LOCK(d);
 
 	d->bd_bif = bp;
 
 	if (op_w != 0) {
 		/* Add to writers-only list */
 		LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next);
 		/*
 		 * We decrement bd_writer on every filter set operation.
 		 * First BIOCSETF is done by pcap_open_live() to set up
 		 * snap length. After that appliation usually sets its own filter
 		 */
 		d->bd_writer = 2;
 	} else
 		LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
 
 	BPFD_UNLOCK(d);
 	BPFIF_WUNLOCK(bp);
 
 	bpf_bpfd_cnt++;
 
 	CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list",
 	    __func__, d->bd_pid, d->bd_writer ? "writer" : "active");
 
 	if (op_w == 0)
 		EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1);
 }
 
 /*
  * Check if we need to upgrade our descriptor @d from write-only mode.
  */
 static int
 bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode, int flen)
 {
 	int is_snap, need_upgrade;
 
 	/*
 	 * Check if we've already upgraded or new filter is empty.
 	 */
 	if (d->bd_writer == 0 || fcode == NULL)
 		return (0);
 
 	need_upgrade = 0;
 
 	/*
 	 * Check if cmd looks like snaplen setting from
 	 * pcap_bpf.c:pcap_open_live().
 	 * Note we're not checking .k value here:
 	 * while pcap_open_live() definitely sets to to non-zero value,
 	 * we'd prefer to treat k=0 (deny ALL) case the same way: e.g.
 	 * do not consider upgrading immediately
 	 */
 	if (cmd == BIOCSETF && flen == 1 && fcode[0].code == (BPF_RET | BPF_K))
 		is_snap = 1;
 	else
 		is_snap = 0;
 
 	if (is_snap == 0) {
 		/*
 		 * We're setting first filter and it doesn't look like
 		 * setting snaplen.  We're probably using bpf directly.
 		 * Upgrade immediately.
 		 */
 		need_upgrade = 1;
 	} else {
 		/*
 		 * Do not require upgrade by first BIOCSETF
 		 * (used to set snaplen) by pcap_open_live().
 		 */
 
 		if (--d->bd_writer == 0) {
 			/*
 			 * First snaplen filter has already
 			 * been set. This is probably catch-all
 			 * filter
 			 */
 			need_upgrade = 1;
 		}
 	}
 
 	CTR5(KTR_NET,
 	    "%s: filter function set by pid %d, "
 	    "bd_writer counter %d, snap %d upgrade %d",
 	    __func__, d->bd_pid, d->bd_writer,
 	    is_snap, need_upgrade);
 
 	return (need_upgrade);
 }
 
 /*
  * Add d to the list of active bp filters.
  * Reuqires bpf_attachd() to be called before
  */
 static void
 bpf_upgraded(struct bpf_d *d)
 {
 	struct bpf_if *bp;
 
 	BPF_LOCK_ASSERT();
 
 	bp = d->bd_bif;
 
 	/*
 	 * Filter can be set several times without specifying interface.
 	 * Mark d as reader and exit.
 	 */
 	if (bp == NULL) {
 		BPFD_LOCK(d);
 		d->bd_writer = 0;
 		BPFD_UNLOCK(d);
 		return;
 	}
 
 	BPFIF_WLOCK(bp);
 	BPFD_LOCK(d);
 
 	/* Remove from writers-only list */
 	LIST_REMOVE(d, bd_next);
 	LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
 	/* Mark d as reader */
 	d->bd_writer = 0;
 
 	BPFD_UNLOCK(d);
 	BPFIF_WUNLOCK(bp);
 
 	CTR2(KTR_NET, "%s: upgrade required by pid %d", __func__, d->bd_pid);
 
 	EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1);
 }
 
 /*
  * Detach a file from its interface.
  */
 static void
 bpf_detachd(struct bpf_d *d)
 {
 	BPF_LOCK();
 	bpf_detachd_locked(d);
 	BPF_UNLOCK();
 }
 
 static void
 bpf_detachd_locked(struct bpf_d *d)
 {
 	int error;
 	struct bpf_if *bp;
 	struct ifnet *ifp;
 
 	CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid);
 
 	BPF_LOCK_ASSERT();
 
 	/* Check if descriptor is attached */
 	if ((bp = d->bd_bif) == NULL)
 		return;
 
 	BPFIF_WLOCK(bp);
 	BPFD_LOCK(d);
 
 	/* Save bd_writer value */
 	error = d->bd_writer;
 
 	/*
 	 * Remove d from the interface's descriptor list.
 	 */
 	LIST_REMOVE(d, bd_next);
 
 	ifp = bp->bif_ifp;
 	d->bd_bif = NULL;
 	BPFD_UNLOCK(d);
 	BPFIF_WUNLOCK(bp);
 
 	bpf_bpfd_cnt--;
 
 	/* Call event handler iff d is attached */
 	if (error == 0)
 		EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0);
 
 	/*
 	 * Check if this descriptor had requested promiscuous mode.
 	 * If so, turn it off.
 	 */
 	if (d->bd_promisc) {
 		d->bd_promisc = 0;
 		CURVNET_SET(ifp->if_vnet);
 		error = ifpromisc(ifp, 0);
 		CURVNET_RESTORE();
 		if (error != 0 && error != ENXIO) {
 			/*
 			 * ENXIO can happen if a pccard is unplugged
 			 * Something is really wrong if we were able to put
 			 * the driver into promiscuous mode, but can't
 			 * take it out.
 			 */
 			if_printf(bp->bif_ifp,
 				"bpf_detach: ifpromisc failed (%d)\n", error);
 		}
 	}
 }
 
 /*
  * Close the descriptor by detaching it from its interface,
  * deallocating its buffers, and marking it free.
  */
 static void
 bpf_dtor(void *data)
 {
 	struct bpf_d *d = data;
 
 	BPFD_LOCK(d);
 	if (d->bd_state == BPF_WAITING)
 		callout_stop(&d->bd_callout);
 	d->bd_state = BPF_IDLE;
 	BPFD_UNLOCK(d);
 	funsetown(&d->bd_sigio);
 	bpf_detachd(d);
 #ifdef MAC
 	mac_bpfdesc_destroy(d);
 #endif /* MAC */
 	seldrain(&d->bd_sel);
 	knlist_destroy(&d->bd_sel.si_note);
 	callout_drain(&d->bd_callout);
 	bpf_freed(d);
 	free(d, M_BPF);
 }
 
 /*
  * Open ethernet device.  Returns ENXIO for illegal minor device number,
  * EBUSY if file is open by another process.
  */
 /* ARGSUSED */
 static	int
 bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
 	struct bpf_d *d;
 	int error, size;
 
 	d = malloc(sizeof(*d), M_BPF, M_WAITOK | M_ZERO);
 	error = devfs_set_cdevpriv(d, bpf_dtor);
 	if (error != 0) {
 		free(d, M_BPF);
 		return (error);
 	}
 
 	/*
 	 * For historical reasons, perform a one-time initialization call to
 	 * the buffer routines, even though we're not yet committed to a
 	 * particular buffer method.
 	 */
 	bpf_buffer_init(d);
 	d->bd_hbuf_in_use = 0;
 	d->bd_bufmode = BPF_BUFMODE_BUFFER;
 	d->bd_sig = SIGIO;
 	d->bd_direction = BPF_D_INOUT;
 	BPF_PID_REFRESH(d, td);
 #ifdef MAC
 	mac_bpfdesc_init(d);
 	mac_bpfdesc_create(td->td_ucred, d);
 #endif
 	mtx_init(&d->bd_lock, devtoname(dev), "bpf cdev lock", MTX_DEF);
 	callout_init_mtx(&d->bd_callout, &d->bd_lock, 0);
 	knlist_init_mtx(&d->bd_sel.si_note, &d->bd_lock);
 
 	/* Allocate default buffers */
 	size = d->bd_bufsize;
 	bpf_buffer_ioctl_sblen(d, &size);
 
 	return (0);
 }
 
 /*
  *  bpfread - read next chunk of packets from buffers
  */
 static	int
 bpfread(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	struct bpf_d *d;
 	int error;
 	int non_block;
 	int timed_out;
 
 	error = devfs_get_cdevpriv((void **)&d);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Restrict application to use a buffer the same size as
 	 * as kernel buffers.
 	 */
 	if (uio->uio_resid != d->bd_bufsize)
 		return (EINVAL);
 
 	non_block = ((ioflag & O_NONBLOCK) != 0);
 
 	BPFD_LOCK(d);
 	BPF_PID_REFRESH_CUR(d);
 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER) {
 		BPFD_UNLOCK(d);
 		return (EOPNOTSUPP);
 	}
 	if (d->bd_state == BPF_WAITING)
 		callout_stop(&d->bd_callout);
 	timed_out = (d->bd_state == BPF_TIMED_OUT);
 	d->bd_state = BPF_IDLE;
 	while (d->bd_hbuf_in_use) {
 		error = mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock,
 		    PRINET|PCATCH, "bd_hbuf", 0);
 		if (error != 0) {
 			BPFD_UNLOCK(d);
 			return (error);
 		}
 	}
 	/*
 	 * If the hold buffer is empty, then do a timed sleep, which
 	 * ends when the timeout expires or when enough packets
 	 * have arrived to fill the store buffer.
 	 */
 	while (d->bd_hbuf == NULL) {
 		if (d->bd_slen != 0) {
 			/*
 			 * A packet(s) either arrived since the previous
 			 * read or arrived while we were asleep.
 			 */
 			if (d->bd_immediate || non_block || timed_out) {
 				/*
 				 * Rotate the buffers and return what's here
 				 * if we are in immediate mode, non-blocking
 				 * flag is set, or this descriptor timed out.
 				 */
 				ROTATE_BUFFERS(d);
 				break;
 			}
 		}
 
 		/*
 		 * No data is available, check to see if the bpf device
 		 * is still pointed at a real interface.  If not, return
 		 * ENXIO so that the userland process knows to rebind
 		 * it before using it again.
 		 */
 		if (d->bd_bif == NULL) {
 			BPFD_UNLOCK(d);
 			return (ENXIO);
 		}
 
 		if (non_block) {
 			BPFD_UNLOCK(d);
 			return (EWOULDBLOCK);
 		}
 		error = msleep(d, &d->bd_lock, PRINET|PCATCH,
 		     "bpf", d->bd_rtout);
 		if (error == EINTR || error == ERESTART) {
 			BPFD_UNLOCK(d);
 			return (error);
 		}
 		if (error == EWOULDBLOCK) {
 			/*
 			 * On a timeout, return what's in the buffer,
 			 * which may be nothing.  If there is something
 			 * in the store buffer, we can rotate the buffers.
 			 */
 			if (d->bd_hbuf)
 				/*
 				 * We filled up the buffer in between
 				 * getting the timeout and arriving
 				 * here, so we don't need to rotate.
 				 */
 				break;
 
 			if (d->bd_slen == 0) {
 				BPFD_UNLOCK(d);
 				return (0);
 			}
 			ROTATE_BUFFERS(d);
 			break;
 		}
 	}
 	/*
 	 * At this point, we know we have something in the hold slot.
 	 */
 	d->bd_hbuf_in_use = 1;
 	BPFD_UNLOCK(d);
 
 	/*
 	 * Move data from hold buffer into user space.
 	 * We know the entire buffer is transferred since
 	 * we checked above that the read buffer is bpf_bufsize bytes.
   	 *
 	 * We do not have to worry about simultaneous reads because
 	 * we waited for sole access to the hold buffer above.
 	 */
 	error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio);
 
 	BPFD_LOCK(d);
 	KASSERT(d->bd_hbuf != NULL, ("bpfread: lost bd_hbuf"));
 	d->bd_fbuf = d->bd_hbuf;
 	d->bd_hbuf = NULL;
 	d->bd_hlen = 0;
 	bpf_buf_reclaimed(d);
 	d->bd_hbuf_in_use = 0;
 	wakeup(&d->bd_hbuf_in_use);
 	BPFD_UNLOCK(d);
 
 	return (error);
 }
 
 /*
  * If there are processes sleeping on this descriptor, wake them up.
  */
 static __inline void
 bpf_wakeup(struct bpf_d *d)
 {
 
 	BPFD_LOCK_ASSERT(d);
 	if (d->bd_state == BPF_WAITING) {
 		callout_stop(&d->bd_callout);
 		d->bd_state = BPF_IDLE;
 	}
 	wakeup(d);
 	if (d->bd_async && d->bd_sig && d->bd_sigio)
 		pgsigio(&d->bd_sigio, d->bd_sig, 0);
 
 	selwakeuppri(&d->bd_sel, PRINET);
 	KNOTE_LOCKED(&d->bd_sel.si_note, 0);
 }
 
 static void
 bpf_timed_out(void *arg)
 {
 	struct bpf_d *d = (struct bpf_d *)arg;
 
 	BPFD_LOCK_ASSERT(d);
 
 	if (callout_pending(&d->bd_callout) || !callout_active(&d->bd_callout))
 		return;
 	if (d->bd_state == BPF_WAITING) {
 		d->bd_state = BPF_TIMED_OUT;
 		if (d->bd_slen != 0)
 			bpf_wakeup(d);
 	}
 }
 
 static int
 bpf_ready(struct bpf_d *d)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	if (!bpf_canfreebuf(d) && d->bd_hlen != 0)
 		return (1);
 	if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
 	    d->bd_slen != 0)
 		return (1);
 	return (0);
 }
 
 static int
 bpfwrite(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	struct bpf_d *d;
 	struct ifnet *ifp;
 	struct mbuf *m, *mc;
 	struct sockaddr dst;
 	int error, hlen;
 
 	error = devfs_get_cdevpriv((void **)&d);
 	if (error != 0)
 		return (error);
 
 	BPF_PID_REFRESH_CUR(d);
 	d->bd_wcount++;
 	/* XXX: locking required */
 	if (d->bd_bif == NULL) {
 		d->bd_wdcount++;
 		return (ENXIO);
 	}
 
 	ifp = d->bd_bif->bif_ifp;
 
 	if ((ifp->if_flags & IFF_UP) == 0) {
 		d->bd_wdcount++;
 		return (ENETDOWN);
 	}
 
 	if (uio->uio_resid == 0) {
 		d->bd_wdcount++;
 		return (0);
 	}
 
 	bzero(&dst, sizeof(dst));
 	m = NULL;
 	hlen = 0;
 	/* XXX: bpf_movein() can sleep */
 	error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp,
 	    &m, &dst, &hlen, d->bd_wfilter);
 	if (error) {
 		d->bd_wdcount++;
 		return (error);
 	}
 	d->bd_wfcount++;
 	if (d->bd_hdrcmplt)
 		dst.sa_family = pseudo_AF_HDRCMPLT;
 
 	if (d->bd_feedback) {
 		mc = m_dup(m, M_NOWAIT);
 		if (mc != NULL)
 			mc->m_pkthdr.rcvif = ifp;
 		/* Set M_PROMISC for outgoing packets to be discarded. */
 		if (d->bd_direction == BPF_D_INOUT)
 			m->m_flags |= M_PROMISC;
 	} else
 		mc = NULL;
 
 	m->m_pkthdr.len -= hlen;
 	m->m_len -= hlen;
 	m->m_data += hlen;	/* XXX */
 
 	CURVNET_SET(ifp->if_vnet);
 #ifdef MAC
 	BPFD_LOCK(d);
 	mac_bpfdesc_create_mbuf(d, m);
 	if (mc != NULL)
 		mac_bpfdesc_create_mbuf(d, mc);
 	BPFD_UNLOCK(d);
 #endif
 
 	error = (*ifp->if_output)(ifp, m, &dst, NULL);
 	if (error)
 		d->bd_wdcount++;
 
 	if (mc != NULL) {
 		if (error == 0)
 			(*ifp->if_input)(ifp, mc);
 		else
 			m_freem(mc);
 	}
 	CURVNET_RESTORE();
 
 	return (error);
 }
 
 /*
  * Reset a descriptor by flushing its packet buffer and clearing the receive
  * and drop counts.  This is doable for kernel-only buffers, but with
  * zero-copy buffers, we can't write to (or rotate) buffers that are
  * currently owned by userspace.  It would be nice if we could encapsulate
  * this logic in the buffer code rather than here.
  */
 static void
 reset_d(struct bpf_d *d)
 {
 
 	BPFD_LOCK_ASSERT(d);
 
 	while (d->bd_hbuf_in_use)
 		mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET,
 		    "bd_hbuf", 0);
 	if ((d->bd_hbuf != NULL) &&
 	    (d->bd_bufmode != BPF_BUFMODE_ZBUF || bpf_canfreebuf(d))) {
 		/* Free the hold buffer. */
 		d->bd_fbuf = d->bd_hbuf;
 		d->bd_hbuf = NULL;
 		d->bd_hlen = 0;
 		bpf_buf_reclaimed(d);
 	}
 	if (bpf_canwritebuf(d))
 		d->bd_slen = 0;
 	d->bd_rcount = 0;
 	d->bd_dcount = 0;
 	d->bd_fcount = 0;
 	d->bd_wcount = 0;
 	d->bd_wfcount = 0;
 	d->bd_wdcount = 0;
 	d->bd_zcopy = 0;
 }
 
 /*
  *  FIONREAD		Check for read packet available.
  *  SIOCGIFADDR		Get interface address - convenient hook to driver.
  *  BIOCGBLEN		Get buffer len [for read()].
  *  BIOCSETF		Set read filter.
  *  BIOCSETFNR		Set read filter without resetting descriptor.
  *  BIOCSETWF		Set write filter.
  *  BIOCFLUSH		Flush read packet buffer.
  *  BIOCPROMISC		Put interface into promiscuous mode.
  *  BIOCGDLT		Get link layer type.
  *  BIOCGETIF		Get interface name.
  *  BIOCSETIF		Set interface.
  *  BIOCSRTIMEOUT	Set read timeout.
  *  BIOCGRTIMEOUT	Get read timeout.
  *  BIOCGSTATS		Get packet stats.
  *  BIOCIMMEDIATE	Set immediate mode.
  *  BIOCVERSION		Get filter language version.
  *  BIOCGHDRCMPLT	Get "header already complete" flag
  *  BIOCSHDRCMPLT	Set "header already complete" flag
  *  BIOCGDIRECTION	Get packet direction flag
  *  BIOCSDIRECTION	Set packet direction flag
  *  BIOCGTSTAMP		Get time stamp format and resolution.
  *  BIOCSTSTAMP		Set time stamp format and resolution.
  *  BIOCLOCK		Set "locked" flag
  *  BIOCFEEDBACK	Set packet feedback mode.
  *  BIOCSETZBUF		Set current zero-copy buffer locations.
  *  BIOCGETZMAX		Get maximum zero-copy buffer size.
  *  BIOCROTZBUF		Force rotation of zero-copy buffer
  *  BIOCSETBUFMODE	Set buffer mode.
  *  BIOCGETBUFMODE	Get current buffer mode.
  */
 /* ARGSUSED */
 static	int
 bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
     struct thread *td)
 {
 	struct bpf_d *d;
 	int error;
 
 	error = devfs_get_cdevpriv((void **)&d);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Refresh PID associated with this descriptor.
 	 */
 	BPFD_LOCK(d);
 	BPF_PID_REFRESH(d, td);
 	if (d->bd_state == BPF_WAITING)
 		callout_stop(&d->bd_callout);
 	d->bd_state = BPF_IDLE;
 	BPFD_UNLOCK(d);
 
 	if (d->bd_locked == 1) {
 		switch (cmd) {
 		case BIOCGBLEN:
 		case BIOCFLUSH:
 		case BIOCGDLT:
 		case BIOCGDLTLIST:
 #ifdef COMPAT_FREEBSD32
 		case BIOCGDLTLIST32:
 #endif
 		case BIOCGETIF:
 		case BIOCGRTIMEOUT:
 #if defined(COMPAT_FREEBSD32) && !defined(__mips__)
 		case BIOCGRTIMEOUT32:
 #endif
 		case BIOCGSTATS:
 		case BIOCVERSION:
 		case BIOCGRSIG:
 		case BIOCGHDRCMPLT:
 		case BIOCSTSTAMP:
 		case BIOCFEEDBACK:
 		case FIONREAD:
 		case BIOCLOCK:
 		case BIOCSRTIMEOUT:
 #if defined(COMPAT_FREEBSD32) && !defined(__mips__)
 		case BIOCSRTIMEOUT32:
 #endif
 		case BIOCIMMEDIATE:
 		case TIOCGPGRP:
 		case BIOCROTZBUF:
 			break;
 		default:
 			return (EPERM);
 		}
 	}
 #ifdef COMPAT_FREEBSD32
 	/*
 	 * If we see a 32-bit compat ioctl, mark the stream as 32-bit so
 	 * that it will get 32-bit packet headers.
 	 */
 	switch (cmd) {
 	case BIOCSETF32:
 	case BIOCSETFNR32:
 	case BIOCSETWF32:
 	case BIOCGDLTLIST32:
 	case BIOCGRTIMEOUT32:
 	case BIOCSRTIMEOUT32:
 		BPFD_LOCK(d);
 		d->bd_compat32 = 1;
 		BPFD_UNLOCK(d);
 	}
 #endif
 
 	CURVNET_SET(TD_TO_VNET(td));
 	switch (cmd) {
 
 	default:
 		error = EINVAL;
 		break;
 
 	/*
 	 * Check for read packet available.
 	 */
 	case FIONREAD:
 		{
 			int n;
 
 			BPFD_LOCK(d);
 			n = d->bd_slen;
 			while (d->bd_hbuf_in_use)
 				mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock,
 				    PRINET, "bd_hbuf", 0);
 			if (d->bd_hbuf)
 				n += d->bd_hlen;
 			BPFD_UNLOCK(d);
 
 			*(int *)addr = n;
 			break;
 		}
 
 	case SIOCGIFADDR:
 		{
 			struct ifnet *ifp;
 
 			if (d->bd_bif == NULL)
 				error = EINVAL;
 			else {
 				ifp = d->bd_bif->bif_ifp;
 				error = (*ifp->if_ioctl)(ifp, cmd, addr);
 			}
 			break;
 		}
 
 	/*
 	 * Get buffer len [for read()].
 	 */
 	case BIOCGBLEN:
 		BPFD_LOCK(d);
 		*(u_int *)addr = d->bd_bufsize;
 		BPFD_UNLOCK(d);
 		break;
 
 	/*
 	 * Set buffer length.
 	 */
 	case BIOCSBLEN:
 		error = bpf_ioctl_sblen(d, (u_int *)addr);
 		break;
 
 	/*
 	 * Set link layer read filter.
 	 */
 	case BIOCSETF:
 	case BIOCSETFNR:
 	case BIOCSETWF:
 #ifdef COMPAT_FREEBSD32
 	case BIOCSETF32:
 	case BIOCSETFNR32:
 	case BIOCSETWF32:
 #endif
 		error = bpf_setf(d, (struct bpf_program *)addr, cmd);
 		break;
 
 	/*
 	 * Flush read packet buffer.
 	 */
 	case BIOCFLUSH:
 		BPFD_LOCK(d);
 		reset_d(d);
 		BPFD_UNLOCK(d);
 		break;
 
 	/*
 	 * Put interface into promiscuous mode.
 	 */
 	case BIOCPROMISC:
 		if (d->bd_bif == NULL) {
 			/*
 			 * No interface attached yet.
 			 */
 			error = EINVAL;
 			break;
 		}
 		if (d->bd_promisc == 0) {
 			error = ifpromisc(d->bd_bif->bif_ifp, 1);
 			if (error == 0)
 				d->bd_promisc = 1;
 		}
 		break;
 
 	/*
 	 * Get current data link type.
 	 */
 	case BIOCGDLT:
 		BPF_LOCK();
 		if (d->bd_bif == NULL)
 			error = EINVAL;
 		else
 			*(u_int *)addr = d->bd_bif->bif_dlt;
 		BPF_UNLOCK();
 		break;
 
 	/*
 	 * Get a list of supported data link types.
 	 */
 #ifdef COMPAT_FREEBSD32
 	case BIOCGDLTLIST32:
 		{
 			struct bpf_dltlist32 *list32;
 			struct bpf_dltlist dltlist;
 
 			list32 = (struct bpf_dltlist32 *)addr;
 			dltlist.bfl_len = list32->bfl_len;
 			dltlist.bfl_list = PTRIN(list32->bfl_list);
 			BPF_LOCK();
 			if (d->bd_bif == NULL)
 				error = EINVAL;
 			else {
 				error = bpf_getdltlist(d, &dltlist);
 				if (error == 0)
 					list32->bfl_len = dltlist.bfl_len;
 			}
 			BPF_UNLOCK();
 			break;
 		}
 #endif
 
 	case BIOCGDLTLIST:
 		BPF_LOCK();
 		if (d->bd_bif == NULL)
 			error = EINVAL;
 		else
 			error = bpf_getdltlist(d, (struct bpf_dltlist *)addr);
 		BPF_UNLOCK();
 		break;
 
 	/*
 	 * Set data link type.
 	 */
 	case BIOCSDLT:
 		BPF_LOCK();
 		if (d->bd_bif == NULL)
 			error = EINVAL;
 		else
 			error = bpf_setdlt(d, *(u_int *)addr);
 		BPF_UNLOCK();
 		break;
 
 	/*
 	 * Get interface name.
 	 */
 	case BIOCGETIF:
 		BPF_LOCK();
 		if (d->bd_bif == NULL)
 			error = EINVAL;
 		else {
 			struct ifnet *const ifp = d->bd_bif->bif_ifp;
 			struct ifreq *const ifr = (struct ifreq *)addr;
 
 			strlcpy(ifr->ifr_name, ifp->if_xname,
 			    sizeof(ifr->ifr_name));
 		}
 		BPF_UNLOCK();
 		break;
 
 	/*
 	 * Set interface.
 	 */
 	case BIOCSETIF:
 		BPF_LOCK();
 		error = bpf_setif(d, (struct ifreq *)addr);
 		BPF_UNLOCK();
 		break;
 
 	/*
 	 * Set read timeout.
 	 */
 	case BIOCSRTIMEOUT:
 #if defined(COMPAT_FREEBSD32) && !defined(__mips__)
 	case BIOCSRTIMEOUT32:
 #endif
 		{
 			struct timeval *tv = (struct timeval *)addr;
 #if defined(COMPAT_FREEBSD32) && !defined(__mips__)
 			struct timeval32 *tv32;
 			struct timeval tv64;
 
 			if (cmd == BIOCSRTIMEOUT32) {
 				tv32 = (struct timeval32 *)addr;
 				tv = &tv64;
 				tv->tv_sec = tv32->tv_sec;
 				tv->tv_usec = tv32->tv_usec;
 			} else
 #endif
 				tv = (struct timeval *)addr;
 
 			/*
 			 * Subtract 1 tick from tvtohz() since this isn't
 			 * a one-shot timer.
 			 */
 			if ((error = itimerfix(tv)) == 0)
 				d->bd_rtout = tvtohz(tv) - 1;
 			break;
 		}
 
 	/*
 	 * Get read timeout.
 	 */
 	case BIOCGRTIMEOUT:
 #if defined(COMPAT_FREEBSD32) && !defined(__mips__)
 	case BIOCGRTIMEOUT32:
 #endif
 		{
 			struct timeval *tv;
 #if defined(COMPAT_FREEBSD32) && !defined(__mips__)
 			struct timeval32 *tv32;
 			struct timeval tv64;
 
 			if (cmd == BIOCGRTIMEOUT32)
 				tv = &tv64;
 			else
 #endif
 				tv = (struct timeval *)addr;
 
 			tv->tv_sec = d->bd_rtout / hz;
 			tv->tv_usec = (d->bd_rtout % hz) * tick;
 #if defined(COMPAT_FREEBSD32) && !defined(__mips__)
 			if (cmd == BIOCGRTIMEOUT32) {
 				tv32 = (struct timeval32 *)addr;
 				tv32->tv_sec = tv->tv_sec;
 				tv32->tv_usec = tv->tv_usec;
 			}
 #endif
 
 			break;
 		}
 
 	/*
 	 * Get packet stats.
 	 */
 	case BIOCGSTATS:
 		{
 			struct bpf_stat *bs = (struct bpf_stat *)addr;
 
 			/* XXXCSJP overflow */
 			bs->bs_recv = d->bd_rcount;
 			bs->bs_drop = d->bd_dcount;
 			break;
 		}
 
 	/*
 	 * Set immediate mode.
 	 */
 	case BIOCIMMEDIATE:
 		BPFD_LOCK(d);
 		d->bd_immediate = *(u_int *)addr;
 		BPFD_UNLOCK(d);
 		break;
 
 	case BIOCVERSION:
 		{
 			struct bpf_version *bv = (struct bpf_version *)addr;
 
 			bv->bv_major = BPF_MAJOR_VERSION;
 			bv->bv_minor = BPF_MINOR_VERSION;
 			break;
 		}
 
 	/*
 	 * Get "header already complete" flag
 	 */
 	case BIOCGHDRCMPLT:
 		BPFD_LOCK(d);
 		*(u_int *)addr = d->bd_hdrcmplt;
 		BPFD_UNLOCK(d);
 		break;
 
 	/*
 	 * Set "header already complete" flag
 	 */
 	case BIOCSHDRCMPLT:
 		BPFD_LOCK(d);
 		d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
 		BPFD_UNLOCK(d);
 		break;
 
 	/*
 	 * Get packet direction flag
 	 */
 	case BIOCGDIRECTION:
 		BPFD_LOCK(d);
 		*(u_int *)addr = d->bd_direction;
 		BPFD_UNLOCK(d);
 		break;
 
 	/*
 	 * Set packet direction flag
 	 */
 	case BIOCSDIRECTION:
 		{
 			u_int	direction;
 
 			direction = *(u_int *)addr;
 			switch (direction) {
 			case BPF_D_IN:
 			case BPF_D_INOUT:
 			case BPF_D_OUT:
 				BPFD_LOCK(d);
 				d->bd_direction = direction;
 				BPFD_UNLOCK(d);
 				break;
 			default:
 				error = EINVAL;
 			}
 		}
 		break;
 
 	/*
 	 * Get packet timestamp format and resolution.
 	 */
 	case BIOCGTSTAMP:
 		BPFD_LOCK(d);
 		*(u_int *)addr = d->bd_tstamp;
 		BPFD_UNLOCK(d);
 		break;
 
 	/*
 	 * Set packet timestamp format and resolution.
 	 */
 	case BIOCSTSTAMP:
 		{
 			u_int	func;
 
 			func = *(u_int *)addr;
 			if (BPF_T_VALID(func))
 				d->bd_tstamp = func;
 			else
 				error = EINVAL;
 		}
 		break;
 
 	case BIOCFEEDBACK:
 		BPFD_LOCK(d);
 		d->bd_feedback = *(u_int *)addr;
 		BPFD_UNLOCK(d);
 		break;
 
 	case BIOCLOCK:
 		BPFD_LOCK(d);
 		d->bd_locked = 1;
 		BPFD_UNLOCK(d);
 		break;
 
 	case FIONBIO:		/* Non-blocking I/O */
 		break;
 
 	case FIOASYNC:		/* Send signal on receive packets */
 		BPFD_LOCK(d);
 		d->bd_async = *(int *)addr;
 		BPFD_UNLOCK(d);
 		break;
 
 	case FIOSETOWN:
 		/*
 		 * XXX: Add some sort of locking here?
 		 * fsetown() can sleep.
 		 */
 		error = fsetown(*(int *)addr, &d->bd_sigio);
 		break;
 
 	case FIOGETOWN:
 		BPFD_LOCK(d);
 		*(int *)addr = fgetown(&d->bd_sigio);
 		BPFD_UNLOCK(d);
 		break;
 
 	/* This is deprecated, FIOSETOWN should be used instead. */
 	case TIOCSPGRP:
 		error = fsetown(-(*(int *)addr), &d->bd_sigio);
 		break;
 
 	/* This is deprecated, FIOGETOWN should be used instead. */
 	case TIOCGPGRP:
 		*(int *)addr = -fgetown(&d->bd_sigio);
 		break;
 
 	case BIOCSRSIG:		/* Set receive signal */
 		{
 			u_int sig;
 
 			sig = *(u_int *)addr;
 
 			if (sig >= NSIG)
 				error = EINVAL;
 			else {
 				BPFD_LOCK(d);
 				d->bd_sig = sig;
 				BPFD_UNLOCK(d);
 			}
 			break;
 		}
 	case BIOCGRSIG:
 		BPFD_LOCK(d);
 		*(u_int *)addr = d->bd_sig;
 		BPFD_UNLOCK(d);
 		break;
 
 	case BIOCGETBUFMODE:
 		BPFD_LOCK(d);
 		*(u_int *)addr = d->bd_bufmode;
 		BPFD_UNLOCK(d);
 		break;
 
 	case BIOCSETBUFMODE:
 		/*
 		 * Allow the buffering mode to be changed as long as we
 		 * haven't yet committed to a particular mode.  Our
 		 * definition of commitment, for now, is whether or not a
 		 * buffer has been allocated or an interface attached, since
 		 * that's the point where things get tricky.
 		 */
 		switch (*(u_int *)addr) {
 		case BPF_BUFMODE_BUFFER:
 			break;
 
 		case BPF_BUFMODE_ZBUF:
 			if (bpf_zerocopy_enable)
 				break;
 			/* FALLSTHROUGH */
 
 		default:
 			CURVNET_RESTORE();
 			return (EINVAL);
 		}
 
 		BPFD_LOCK(d);
 		if (d->bd_sbuf != NULL || d->bd_hbuf != NULL ||
 		    d->bd_fbuf != NULL || d->bd_bif != NULL) {
 			BPFD_UNLOCK(d);
 			CURVNET_RESTORE();
 			return (EBUSY);
 		}
 		d->bd_bufmode = *(u_int *)addr;
 		BPFD_UNLOCK(d);
 		break;
 
 	case BIOCGETZMAX:
 		error = bpf_ioctl_getzmax(td, d, (size_t *)addr);
 		break;
 
 	case BIOCSETZBUF:
 		error = bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr);
 		break;
 
 	case BIOCROTZBUF:
 		error = bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr);
 		break;
 	}
 	CURVNET_RESTORE();
 	return (error);
 }
 
 /*
  * Set d's packet filter program to fp.  If this file already has a filter,
  * free it and replace it.  Returns EINVAL for bogus requests.
  *
  * Note we need global lock here to serialize bpf_setf() and bpf_setif() calls
  * since reading d->bd_bif can't be protected by d or interface lock due to
  * lock order.
  *
  * Additionally, we have to acquire interface write lock due to bpf_mtap() uses
  * interface read lock to read all filers.
  *
  */
 static int
 bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
 {
 #ifdef COMPAT_FREEBSD32
 	struct bpf_program fp_swab;
 	struct bpf_program32 *fp32;
 #endif
 	struct bpf_insn *fcode, *old;
 #ifdef BPF_JITTER
 	bpf_jit_filter *jfunc, *ofunc;
 #endif
 	size_t size;
 	u_int flen;
 	int need_upgrade;
 
 #ifdef COMPAT_FREEBSD32
 	switch (cmd) {
 	case BIOCSETF32:
 	case BIOCSETWF32:
 	case BIOCSETFNR32:
 		fp32 = (struct bpf_program32 *)fp;
 		fp_swab.bf_len = fp32->bf_len;
 		fp_swab.bf_insns = (struct bpf_insn *)(uintptr_t)fp32->bf_insns;
 		fp = &fp_swab;
 		switch (cmd) {
 		case BIOCSETF32:
 			cmd = BIOCSETF;
 			break;
 		case BIOCSETWF32:
 			cmd = BIOCSETWF;
 			break;
 		}
 		break;
 	}
 #endif
 
 	fcode = NULL;
 #ifdef BPF_JITTER
 	jfunc = ofunc = NULL;
 #endif
 	need_upgrade = 0;
 
 	/*
 	 * Check new filter validness before acquiring any locks.
 	 * Allocate memory for new filter, if needed.
 	 */
 	flen = fp->bf_len;
 	if (flen > bpf_maxinsns || (fp->bf_insns == NULL && flen != 0))
 		return (EINVAL);
 	size = flen * sizeof(*fp->bf_insns);
 	if (size > 0) {
 		/* We're setting up new filter.  Copy and check actual data. */
 		fcode = malloc(size, M_BPF, M_WAITOK);
 		if (copyin(fp->bf_insns, fcode, size) != 0 ||
 		    !bpf_validate(fcode, flen)) {
 			free(fcode, M_BPF);
 			return (EINVAL);
 		}
 #ifdef BPF_JITTER
 		/* Filter is copied inside fcode and is perfectly valid. */
 		jfunc = bpf_jitter(fcode, flen);
 #endif
 	}
 
 	BPF_LOCK();
 
 	/*
 	 * Set up new filter.
 	 * Protect filter change by interface lock.
 	 * Additionally, we are protected by global lock here.
 	 */
 	if (d->bd_bif != NULL)
 		BPFIF_WLOCK(d->bd_bif);
 	BPFD_LOCK(d);
 	if (cmd == BIOCSETWF) {
 		old = d->bd_wfilter;
 		d->bd_wfilter = fcode;
 	} else {
 		old = d->bd_rfilter;
 		d->bd_rfilter = fcode;
 #ifdef BPF_JITTER
 		ofunc = d->bd_bfilter;
 		d->bd_bfilter = jfunc;
 #endif
 		if (cmd == BIOCSETF)
 			reset_d(d);
 
 		need_upgrade = bpf_check_upgrade(cmd, d, fcode, flen);
 	}
 	BPFD_UNLOCK(d);
 	if (d->bd_bif != NULL)
 		BPFIF_WUNLOCK(d->bd_bif);
 	if (old != NULL)
 		free(old, M_BPF);
 #ifdef BPF_JITTER
 	if (ofunc != NULL)
 		bpf_destroy_jit_filter(ofunc);
 #endif
 
 	/* Move d to active readers list. */
 	if (need_upgrade != 0)
 		bpf_upgraded(d);
 
 	BPF_UNLOCK();
 	return (0);
 }
 
 /*
  * Detach a file from its current interface (if attached at all) and attach
  * to the interface indicated by the name stored in ifr.
  * Return an errno or 0.
  */
 static int
 bpf_setif(struct bpf_d *d, struct ifreq *ifr)
 {
 	struct bpf_if *bp;
 	struct ifnet *theywant;
 
 	BPF_LOCK_ASSERT();
 
 	theywant = ifunit(ifr->ifr_name);
 	if (theywant == NULL || theywant->if_bpf == NULL)
 		return (ENXIO);
 
 	bp = theywant->if_bpf;
 
 	/* Check if interface is not being detached from BPF */
 	BPFIF_RLOCK(bp);
 	if (bp->flags & BPFIF_FLAG_DYING) {
 		BPFIF_RUNLOCK(bp);
 		return (ENXIO);
 	}
 	BPFIF_RUNLOCK(bp);
 
 	/*
 	 * Behavior here depends on the buffering model.  If we're using
 	 * kernel memory buffers, then we can allocate them here.  If we're
 	 * using zero-copy, then the user process must have registered
 	 * buffers by the time we get here.  If not, return an error.
 	 */
 	switch (d->bd_bufmode) {
 	case BPF_BUFMODE_BUFFER:
 	case BPF_BUFMODE_ZBUF:
 		if (d->bd_sbuf == NULL)
 			return (EINVAL);
 		break;
 
 	default:
 		panic("bpf_setif: bufmode %d", d->bd_bufmode);
 	}
 	if (bp != d->bd_bif)
 		bpf_attachd(d, bp);
 	BPFD_LOCK(d);
 	reset_d(d);
 	BPFD_UNLOCK(d);
 	return (0);
 }
 
 /*
  * Support for select() and poll() system calls
  *
  * Return true iff the specific operation will not block indefinitely.
  * Otherwise, return false but make a note that a selwakeup() must be done.
  */
 static int
 bpfpoll(struct cdev *dev, int events, struct thread *td)
 {
 	struct bpf_d *d;
 	int revents;
 
 	if (devfs_get_cdevpriv((void **)&d) != 0 || d->bd_bif == NULL)
 		return (events &
 		    (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM));
 
 	/*
 	 * Refresh PID associated with this descriptor.
 	 */
 	revents = events & (POLLOUT | POLLWRNORM);
 	BPFD_LOCK(d);
 	BPF_PID_REFRESH(d, td);
 	if (events & (POLLIN | POLLRDNORM)) {
 		if (bpf_ready(d))
 			revents |= events & (POLLIN | POLLRDNORM);
 		else {
 			selrecord(td, &d->bd_sel);
 			/* Start the read timeout if necessary. */
 			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
 				callout_reset(&d->bd_callout, d->bd_rtout,
 				    bpf_timed_out, d);
 				d->bd_state = BPF_WAITING;
 			}
 		}
 	}
 	BPFD_UNLOCK(d);
 	return (revents);
 }
 
 /*
  * Support for kevent() system call.  Register EVFILT_READ filters and
  * reject all others.
  */
 int
 bpfkqfilter(struct cdev *dev, struct knote *kn)
 {
 	struct bpf_d *d;
 
 	if (devfs_get_cdevpriv((void **)&d) != 0 ||
 	    kn->kn_filter != EVFILT_READ)
 		return (1);
 
 	/*
 	 * Refresh PID associated with this descriptor.
 	 */
 	BPFD_LOCK(d);
 	BPF_PID_REFRESH_CUR(d);
 	kn->kn_fop = &bpfread_filtops;
 	kn->kn_hook = d;
 	knlist_add(&d->bd_sel.si_note, kn, 1);
 	BPFD_UNLOCK(d);
 
 	return (0);
 }
 
 static void
 filt_bpfdetach(struct knote *kn)
 {
 	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
 
 	knlist_remove(&d->bd_sel.si_note, kn, 0);
 }
 
 static int
 filt_bpfread(struct knote *kn, long hint)
 {
 	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
 	int ready;
 
 	BPFD_LOCK_ASSERT(d);
 	ready = bpf_ready(d);
 	if (ready) {
 		kn->kn_data = d->bd_slen;
 		while (d->bd_hbuf_in_use)
 			mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock,
 			    PRINET, "bd_hbuf", 0);
 		if (d->bd_hbuf)
 			kn->kn_data += d->bd_hlen;
 	} else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
 		callout_reset(&d->bd_callout, d->bd_rtout,
 		    bpf_timed_out, d);
 		d->bd_state = BPF_WAITING;
 	}
 
 	return (ready);
 }
 
 #define	BPF_TSTAMP_NONE		0
 #define	BPF_TSTAMP_FAST		1
 #define	BPF_TSTAMP_NORMAL	2
 #define	BPF_TSTAMP_EXTERN	3
 
 static int
 bpf_ts_quality(int tstype)
 {
 
 	if (tstype == BPF_T_NONE)
 		return (BPF_TSTAMP_NONE);
 	if ((tstype & BPF_T_FAST) != 0)
 		return (BPF_TSTAMP_FAST);
 
 	return (BPF_TSTAMP_NORMAL);
 }
 
 static int
 bpf_gettime(struct bintime *bt, int tstype, struct mbuf *m)
 {
 	struct m_tag *tag;
 	int quality;
 
 	quality = bpf_ts_quality(tstype);
 	if (quality == BPF_TSTAMP_NONE)
 		return (quality);
 
 	if (m != NULL) {
 		tag = m_tag_locate(m, MTAG_BPF, MTAG_BPF_TIMESTAMP, NULL);
 		if (tag != NULL) {
 			*bt = *(struct bintime *)(tag + 1);
 			return (BPF_TSTAMP_EXTERN);
 		}
 	}
 	if (quality == BPF_TSTAMP_NORMAL)
 		binuptime(bt);
 	else
 		getbinuptime(bt);
 
 	return (quality);
 }
 
 /*
  * Incoming linkage from device drivers.  Process the packet pkt, of length
  * pktlen, which is stored in a contiguous buffer.  The packet is parsed
  * by each process' filter, and if accepted, stashed into the corresponding
  * buffer.
  */
 void
 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
 {
 	struct bintime bt;
 	struct bpf_d *d;
 #ifdef BPF_JITTER
 	bpf_jit_filter *bf;
 #endif
 	u_int slen;
 	int gottime;
 
 	gottime = BPF_TSTAMP_NONE;
 
 	BPFIF_RLOCK(bp);
 
 	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
 		/*
 		 * We are not using any locks for d here because:
 		 * 1) any filter change is protected by interface
 		 * write lock
 		 * 2) destroying/detaching d is protected by interface
 		 * write lock, too
 		 */
 
 		/* XXX: Do not protect counter for the sake of performance. */
 		++d->bd_rcount;
 		/*
 		 * NB: We dont call BPF_CHECK_DIRECTION() here since there is no
 		 * way for the caller to indiciate to us whether this packet
 		 * is inbound or outbound.  In the bpf_mtap() routines, we use
 		 * the interface pointers on the mbuf to figure it out.
 		 */
 #ifdef BPF_JITTER
 		bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
 		if (bf != NULL)
 			slen = (*(bf->func))(pkt, pktlen, pktlen);
 		else
 #endif
 		slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen);
 		if (slen != 0) {
 			/*
 			 * Filter matches. Let's to acquire write lock.
 			 */
 			BPFD_LOCK(d);
 
 			d->bd_fcount++;
 			if (gottime < bpf_ts_quality(d->bd_tstamp))
 				gottime = bpf_gettime(&bt, d->bd_tstamp, NULL);
 #ifdef MAC
 			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
 #endif
 				catchpacket(d, pkt, pktlen, slen,
 				    bpf_append_bytes, &bt);
 			BPFD_UNLOCK(d);
 		}
 	}
 	BPFIF_RUNLOCK(bp);
 }
 
 #define	BPF_CHECK_DIRECTION(d, r, i)				\
 	    (((d)->bd_direction == BPF_D_IN && (r) != (i)) ||	\
 	    ((d)->bd_direction == BPF_D_OUT && (r) == (i)))
 
 /*
  * Incoming linkage from device drivers, when packet is in an mbuf chain.
  * Locking model is explained in bpf_tap().
  */
 void
 bpf_mtap(struct bpf_if *bp, struct mbuf *m)
 {
 	struct bintime bt;
 	struct bpf_d *d;
 #ifdef BPF_JITTER
 	bpf_jit_filter *bf;
 #endif
 	u_int pktlen, slen;
 	int gottime;
 
 	/* Skip outgoing duplicate packets. */
 	if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) {
 		m->m_flags &= ~M_PROMISC;
 		return;
 	}
 
 	pktlen = m_length(m, NULL);
 	gottime = BPF_TSTAMP_NONE;
 
 	BPFIF_RLOCK(bp);
 
 	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
 		if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp))
 			continue;
 		++d->bd_rcount;
 #ifdef BPF_JITTER
 		bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
 		/* XXX We cannot handle multiple mbufs. */
 		if (bf != NULL && m->m_next == NULL)
 			slen = (*(bf->func))(mtod(m, u_char *), pktlen, pktlen);
 		else
 #endif
 		slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0);
 		if (slen != 0) {
 			BPFD_LOCK(d);
 
 			d->bd_fcount++;
 			if (gottime < bpf_ts_quality(d->bd_tstamp))
 				gottime = bpf_gettime(&bt, d->bd_tstamp, m);
 #ifdef MAC
 			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
 #endif
 				catchpacket(d, (u_char *)m, pktlen, slen,
 				    bpf_append_mbuf, &bt);
 			BPFD_UNLOCK(d);
 		}
 	}
 	BPFIF_RUNLOCK(bp);
 }
 
 /*
  * Incoming linkage from device drivers, when packet is in
  * an mbuf chain and to be prepended by a contiguous header.
  */
 void
 bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
 {
 	struct bintime bt;
 	struct mbuf mb;
 	struct bpf_d *d;
 	u_int pktlen, slen;
 	int gottime;
 
 	/* Skip outgoing duplicate packets. */
 	if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) {
 		m->m_flags &= ~M_PROMISC;
 		return;
 	}
 
 	pktlen = m_length(m, NULL);
 	/*
 	 * Craft on-stack mbuf suitable for passing to bpf_filter.
 	 * Note that we cut corners here; we only setup what's
 	 * absolutely needed--this mbuf should never go anywhere else.
 	 */
 	mb.m_next = m;
 	mb.m_data = data;
 	mb.m_len = dlen;
 	pktlen += dlen;
 
 	gottime = BPF_TSTAMP_NONE;
 
 	BPFIF_RLOCK(bp);
 
 	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
 		if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp))
 			continue;
 		++d->bd_rcount;
 		slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0);
 		if (slen != 0) {
 			BPFD_LOCK(d);
 
 			d->bd_fcount++;
 			if (gottime < bpf_ts_quality(d->bd_tstamp))
 				gottime = bpf_gettime(&bt, d->bd_tstamp, m);
 #ifdef MAC
 			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
 #endif
 				catchpacket(d, (u_char *)&mb, pktlen, slen,
 				    bpf_append_mbuf, &bt);
 			BPFD_UNLOCK(d);
 		}
 	}
 	BPFIF_RUNLOCK(bp);
 }
 
 #undef	BPF_CHECK_DIRECTION
 
 #undef	BPF_TSTAMP_NONE
 #undef	BPF_TSTAMP_FAST
 #undef	BPF_TSTAMP_NORMAL
 #undef	BPF_TSTAMP_EXTERN
 
 static int
 bpf_hdrlen(struct bpf_d *d)
 {
 	int hdrlen;
 
 	hdrlen = d->bd_bif->bif_hdrlen;
 #ifndef BURN_BRIDGES
 	if (d->bd_tstamp == BPF_T_NONE ||
 	    BPF_T_FORMAT(d->bd_tstamp) == BPF_T_MICROTIME)
 #ifdef COMPAT_FREEBSD32
 		if (d->bd_compat32)
 			hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr32);
 		else
 #endif
 			hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr);
 	else
 #endif
 		hdrlen += SIZEOF_BPF_HDR(struct bpf_xhdr);
 #ifdef COMPAT_FREEBSD32
 	if (d->bd_compat32)
 		hdrlen = BPF_WORDALIGN32(hdrlen);
 	else
 #endif
 		hdrlen = BPF_WORDALIGN(hdrlen);
 
 	return (hdrlen - d->bd_bif->bif_hdrlen);
 }
 
 static void
 bpf_bintime2ts(struct bintime *bt, struct bpf_ts *ts, int tstype)
 {
 	struct bintime bt2;
 	struct timeval tsm;
 	struct timespec tsn;
 
 	if ((tstype & BPF_T_MONOTONIC) == 0) {
 		bt2 = *bt;
 		bintime_add(&bt2, &boottimebin);
 		bt = &bt2;
 	}
 	switch (BPF_T_FORMAT(tstype)) {
 	case BPF_T_MICROTIME:
 		bintime2timeval(bt, &tsm);
 		ts->bt_sec = tsm.tv_sec;
 		ts->bt_frac = tsm.tv_usec;
 		break;
 	case BPF_T_NANOTIME:
 		bintime2timespec(bt, &tsn);
 		ts->bt_sec = tsn.tv_sec;
 		ts->bt_frac = tsn.tv_nsec;
 		break;
 	case BPF_T_BINTIME:
 		ts->bt_sec = bt->sec;
 		ts->bt_frac = bt->frac;
 		break;
 	}
 }
 
 /*
  * Move the packet data from interface memory (pkt) into the
  * store buffer.  "cpfn" is the routine called to do the actual data
  * transfer.  bcopy is passed in to copy contiguous chunks, while
  * bpf_append_mbuf is passed in to copy mbuf chains.  In the latter case,
  * pkt is really an mbuf.
  */
 static void
 catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
     void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int),
     struct bintime *bt)
 {
 	struct bpf_xhdr hdr;
 #ifndef BURN_BRIDGES
 	struct bpf_hdr hdr_old;
 #ifdef COMPAT_FREEBSD32
 	struct bpf_hdr32 hdr32_old;
 #endif
 #endif
 	int caplen, curlen, hdrlen, totlen;
 	int do_wakeup = 0;
 	int do_timestamp;
 	int tstype;
 
 	BPFD_LOCK_ASSERT(d);
 
 	/*
 	 * Detect whether user space has released a buffer back to us, and if
 	 * so, move it from being a hold buffer to a free buffer.  This may
 	 * not be the best place to do it (for example, we might only want to
 	 * run this check if we need the space), but for now it's a reliable
 	 * spot to do it.
 	 */
 	if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) {
 		while (d->bd_hbuf_in_use)
 			mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock,
 			    PRINET, "bd_hbuf", 0);
 		d->bd_fbuf = d->bd_hbuf;
 		d->bd_hbuf = NULL;
 		d->bd_hlen = 0;
 		bpf_buf_reclaimed(d);
 	}
 
 	/*
 	 * Figure out how many bytes to move.  If the packet is
 	 * greater or equal to the snapshot length, transfer that
 	 * much.  Otherwise, transfer the whole packet (unless
 	 * we hit the buffer size limit).
 	 */
 	hdrlen = bpf_hdrlen(d);
 	totlen = hdrlen + min(snaplen, pktlen);
 	if (totlen > d->bd_bufsize)
 		totlen = d->bd_bufsize;
 
 	/*
 	 * Round up the end of the previous packet to the next longword.
 	 *
 	 * Drop the packet if there's no room and no hope of room
 	 * If the packet would overflow the storage buffer or the storage
 	 * buffer is considered immutable by the buffer model, try to rotate
 	 * the buffer and wakeup pending processes.
 	 */
 #ifdef COMPAT_FREEBSD32
 	if (d->bd_compat32)
 		curlen = BPF_WORDALIGN32(d->bd_slen);
 	else
 #endif
 		curlen = BPF_WORDALIGN(d->bd_slen);
 	if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) {
 		if (d->bd_fbuf == NULL) {
 			/*
 			 * There's no room in the store buffer, and no
 			 * prospect of room, so drop the packet.  Notify the
 			 * buffer model.
 			 */
 			bpf_buffull(d);
 			++d->bd_dcount;
 			return;
 		}
 		while (d->bd_hbuf_in_use)
 			mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock,
 			    PRINET, "bd_hbuf", 0);
 		ROTATE_BUFFERS(d);
 		do_wakeup = 1;
 		curlen = 0;
 	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
 		/*
 		 * Immediate mode is set, or the read timeout has already
 		 * expired during a select call.  A packet arrived, so the
 		 * reader should be woken up.
 		 */
 		do_wakeup = 1;
 	caplen = totlen - hdrlen;
 	tstype = d->bd_tstamp;
 	do_timestamp = tstype != BPF_T_NONE;
 #ifndef BURN_BRIDGES
 	if (tstype == BPF_T_NONE || BPF_T_FORMAT(tstype) == BPF_T_MICROTIME) {
 		struct bpf_ts ts;
 		if (do_timestamp)
 			bpf_bintime2ts(bt, &ts, tstype);
 #ifdef COMPAT_FREEBSD32
 		if (d->bd_compat32) {
 			bzero(&hdr32_old, sizeof(hdr32_old));
 			if (do_timestamp) {
 				hdr32_old.bh_tstamp.tv_sec = ts.bt_sec;
 				hdr32_old.bh_tstamp.tv_usec = ts.bt_frac;
 			}
 			hdr32_old.bh_datalen = pktlen;
 			hdr32_old.bh_hdrlen = hdrlen;
 			hdr32_old.bh_caplen = caplen;
 			bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr32_old,
 			    sizeof(hdr32_old));
 			goto copy;
 		}
 #endif
 		bzero(&hdr_old, sizeof(hdr_old));
 		if (do_timestamp) {
 			hdr_old.bh_tstamp.tv_sec = ts.bt_sec;
 			hdr_old.bh_tstamp.tv_usec = ts.bt_frac;
 		}
 		hdr_old.bh_datalen = pktlen;
 		hdr_old.bh_hdrlen = hdrlen;
 		hdr_old.bh_caplen = caplen;
 		bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr_old,
 		    sizeof(hdr_old));
 		goto copy;
 	}
 #endif
 
 	/*
 	 * Append the bpf header.  Note we append the actual header size, but
 	 * move forward the length of the header plus padding.
 	 */
 	bzero(&hdr, sizeof(hdr));
 	if (do_timestamp)
 		bpf_bintime2ts(bt, &hdr.bh_tstamp, tstype);
 	hdr.bh_datalen = pktlen;
 	hdr.bh_hdrlen = hdrlen;
 	hdr.bh_caplen = caplen;
 	bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr));
 
 	/*
 	 * Copy the packet data into the store buffer and update its length.
 	 */
 #ifndef BURN_BRIDGES
 copy:
 #endif
 	(*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, caplen);
 	d->bd_slen = curlen + totlen;
 
 	if (do_wakeup)
 		bpf_wakeup(d);
 }
 
 /*
  * Free buffers currently in use by a descriptor.
  * Called on close.
  */
 static void
 bpf_freed(struct bpf_d *d)
 {
 
 	/*
 	 * We don't need to lock out interrupts since this descriptor has
 	 * been detached from its interface and it yet hasn't been marked
 	 * free.
 	 */
 	bpf_free(d);
 	if (d->bd_rfilter != NULL) {
 		free((caddr_t)d->bd_rfilter, M_BPF);
 #ifdef BPF_JITTER
 		if (d->bd_bfilter != NULL)
 			bpf_destroy_jit_filter(d->bd_bfilter);
 #endif
 	}
 	if (d->bd_wfilter != NULL)
 		free((caddr_t)d->bd_wfilter, M_BPF);
 	mtx_destroy(&d->bd_lock);
 }
 
 /*
  * Attach an interface to bpf.  dlt is the link layer type; hdrlen is the
  * fixed size of the link header (variable length headers not yet supported).
  */
 void
 bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
 {
 
 	bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
 }
 
 /*
  * Attach an interface to bpf.  ifp is a pointer to the structure
  * defining the interface to be attached, dlt is the link layer type,
  * and hdrlen is the fixed size of the link header (variable length
  * headers are not yet supporrted).
  */
 void
 bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
 {
 	struct bpf_if *bp;
 
 	bp = malloc(sizeof(*bp), M_BPF, M_NOWAIT | M_ZERO);
 	if (bp == NULL)
 		panic("bpfattach");
 
 	LIST_INIT(&bp->bif_dlist);
 	LIST_INIT(&bp->bif_wlist);
 	bp->bif_ifp = ifp;
 	bp->bif_dlt = dlt;
 	rw_init(&bp->bif_lock, "bpf interface lock");
 	KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized"));
 	*driverp = bp;
 
 	BPF_LOCK();
 	LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next);
 	BPF_UNLOCK();
 
 	bp->bif_hdrlen = hdrlen;
 
 	if (bootverbose)
 		if_printf(ifp, "bpf attached\n");
 }
 
 /*
  * Detach bpf from an interface. This involves detaching each descriptor
  * associated with the interface. Notify each descriptor as it's detached
  * so that any sleepers wake up and get ENXIO.
  */
 void
 bpfdetach(struct ifnet *ifp)
 {
 	struct bpf_if	*bp, *bp_temp;
 	struct bpf_d	*d;
 	int ndetached;
 
 	ndetached = 0;
 
 	BPF_LOCK();
 	/* Find all bpf_if struct's which reference ifp and detach them. */
 	LIST_FOREACH_SAFE(bp, &bpf_iflist, bif_next, bp_temp) {
 		if (ifp != bp->bif_ifp)
 			continue;
 
 		LIST_REMOVE(bp, bif_next);
 		/* Add to to-be-freed list */
 		LIST_INSERT_HEAD(&bpf_freelist, bp, bif_next);
 
 		ndetached++;
 		/*
 		 * Delay freeing bp till interface is detached
 		 * and all routes through this interface are removed.
 		 * Mark bp as detached to restrict new consumers.
 		 */
 		BPFIF_WLOCK(bp);
 		bp->flags |= BPFIF_FLAG_DYING;
 		BPFIF_WUNLOCK(bp);
 
 		CTR4(KTR_NET, "%s: sheduling free for encap %d (%p) for if %p",
 		    __func__, bp->bif_dlt, bp, ifp);
 
 		/* Free common descriptors */
 		while ((d = LIST_FIRST(&bp->bif_dlist)) != NULL) {
 			bpf_detachd_locked(d);
 			BPFD_LOCK(d);
 			bpf_wakeup(d);
 			BPFD_UNLOCK(d);
 		}
 
 		/* Free writer-only descriptors */
 		while ((d = LIST_FIRST(&bp->bif_wlist)) != NULL) {
 			bpf_detachd_locked(d);
 			BPFD_LOCK(d);
 			bpf_wakeup(d);
 			BPFD_UNLOCK(d);
 		}
 	}
 	BPF_UNLOCK();
 
 #ifdef INVARIANTS
 	if (ndetached == 0)
 		printf("bpfdetach: %s was not attached\n", ifp->if_xname);
 #endif
 }
 
 /*
  * Interface departure handler.
  * Note departure event does not guarantee interface is going down.
  * Interface renaming is currently done via departure/arrival event set.
  *
  * Departure handled is called after all routes pointing to
  * given interface are removed and interface is in down state
  * restricting any packets to be sent/received. We assume it is now safe
  * to free data allocated by BPF.
  */
 static void
 bpf_ifdetach(void *arg __unused, struct ifnet *ifp)
 {
 	struct bpf_if *bp, *bp_temp;
 	int nmatched = 0;
 
 	BPF_LOCK();
 	/*
 	 * Find matching entries in free list.
 	 * Nothing should be found if bpfdetach() was not called.
 	 */
 	LIST_FOREACH_SAFE(bp, &bpf_freelist, bif_next, bp_temp) {
 		if (ifp != bp->bif_ifp)
 			continue;
 
 		CTR3(KTR_NET, "%s: freeing BPF instance %p for interface %p",
 		    __func__, bp, ifp);
 
 		LIST_REMOVE(bp, bif_next);
 
 		rw_destroy(&bp->bif_lock);
 		free(bp, M_BPF);
 
 		nmatched++;
 	}
 	BPF_UNLOCK();
 
 	/*
 	 * Note that we cannot zero other pointers to
 	 * custom DLTs possibly used by given interface.
 	 */
 	if (nmatched != 0)
 		ifp->if_bpf = NULL;
 }
 
 /*
  * Get a list of available data link type of the interface.
  */
 static int
 bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
 {
 	int n, error;
 	struct ifnet *ifp;
 	struct bpf_if *bp;
 
 	BPF_LOCK_ASSERT();
 
 	ifp = d->bd_bif->bif_ifp;
 	n = 0;
 	error = 0;
 	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
 		if (bp->bif_ifp != ifp)
 			continue;
 		if (bfl->bfl_list != NULL) {
 			if (n >= bfl->bfl_len)
 				return (ENOMEM);
 			error = copyout(&bp->bif_dlt,
 			    bfl->bfl_list + n, sizeof(u_int));
 		}
 		n++;
 	}
 	bfl->bfl_len = n;
 	return (error);
 }
 
 /*
  * Set the data link type of a BPF instance.
  */
 static int
 bpf_setdlt(struct bpf_d *d, u_int dlt)
 {
 	int error, opromisc;
 	struct ifnet *ifp;
 	struct bpf_if *bp;
 
 	BPF_LOCK_ASSERT();
 
 	if (d->bd_bif->bif_dlt == dlt)
 		return (0);
 	ifp = d->bd_bif->bif_ifp;
 
 	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
 		if (bp->bif_ifp == ifp && bp->bif_dlt == dlt)
 			break;
 	}
 
 	if (bp != NULL) {
 		opromisc = d->bd_promisc;
 		bpf_attachd(d, bp);
 		BPFD_LOCK(d);
 		reset_d(d);
 		BPFD_UNLOCK(d);
 		if (opromisc) {
 			error = ifpromisc(bp->bif_ifp, 1);
 			if (error)
 				if_printf(bp->bif_ifp,
 					"bpf_setdlt: ifpromisc failed (%d)\n",
 					error);
 			else
 				d->bd_promisc = 1;
 		}
 	}
 	return (bp == NULL ? EINVAL : 0);
 }
 
 static void
 bpf_drvinit(void *unused)
 {
 	struct cdev *dev;
 
 	mtx_init(&bpf_mtx, "bpf global lock", NULL, MTX_DEF);
 	LIST_INIT(&bpf_iflist);
 	LIST_INIT(&bpf_freelist);
 
 	dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf");
 	/* For compatibility */
 	make_dev_alias(dev, "bpf0");
 
 	/* Register interface departure handler */
 	bpf_ifdetach_cookie = EVENTHANDLER_REGISTER(
 		    ifnet_departure_event, bpf_ifdetach, NULL,
 		    EVENTHANDLER_PRI_ANY);
 }
 
 /*
  * Zero out the various packet counters associated with all of the bpf
  * descriptors.  At some point, we will probably want to get a bit more
  * granular and allow the user to specify descriptors to be zeroed.
  */
 static void
 bpf_zero_counters(void)
 {
 	struct bpf_if *bp;
 	struct bpf_d *bd;
 
 	BPF_LOCK();
 	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
 		BPFIF_RLOCK(bp);
 		LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
 			BPFD_LOCK(bd);
 			bd->bd_rcount = 0;
 			bd->bd_dcount = 0;
 			bd->bd_fcount = 0;
 			bd->bd_wcount = 0;
 			bd->bd_wfcount = 0;
 			bd->bd_zcopy = 0;
 			BPFD_UNLOCK(bd);
 		}
 		BPFIF_RUNLOCK(bp);
 	}
 	BPF_UNLOCK();
 }
 
 /*
  * Fill filter statistics
  */
 static void
 bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd)
 {
 
 	bzero(d, sizeof(*d));
 	BPFD_LOCK_ASSERT(bd);
 	d->bd_structsize = sizeof(*d);
 	/* XXX: reading should be protected by global lock */
 	d->bd_immediate = bd->bd_immediate;
 	d->bd_promisc = bd->bd_promisc;
 	d->bd_hdrcmplt = bd->bd_hdrcmplt;
 	d->bd_direction = bd->bd_direction;
 	d->bd_feedback = bd->bd_feedback;
 	d->bd_async = bd->bd_async;
 	d->bd_rcount = bd->bd_rcount;
 	d->bd_dcount = bd->bd_dcount;
 	d->bd_fcount = bd->bd_fcount;
 	d->bd_sig = bd->bd_sig;
 	d->bd_slen = bd->bd_slen;
 	d->bd_hlen = bd->bd_hlen;
 	d->bd_bufsize = bd->bd_bufsize;
 	d->bd_pid = bd->bd_pid;
 	strlcpy(d->bd_ifname,
 	    bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ);
 	d->bd_locked = bd->bd_locked;
 	d->bd_wcount = bd->bd_wcount;
 	d->bd_wdcount = bd->bd_wdcount;
 	d->bd_wfcount = bd->bd_wfcount;
 	d->bd_zcopy = bd->bd_zcopy;
 	d->bd_bufmode = bd->bd_bufmode;
 }
 
 /*
  * Handle `netstat -B' stats request
  */
 static int
 bpf_stats_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	static const struct xbpf_d zerostats;
 	struct xbpf_d *xbdbuf, *xbd, tempstats;
 	int index, error;
 	struct bpf_if *bp;
 	struct bpf_d *bd;
 
 	/*
 	 * XXX This is not technically correct. It is possible for non
 	 * privileged users to open bpf devices. It would make sense
 	 * if the users who opened the devices were able to retrieve
 	 * the statistics for them, too.
 	 */
 	error = priv_check(req->td, PRIV_NET_BPF);
 	if (error)
 		return (error);
 	/*
 	 * Check to see if the user is requesting that the counters be
 	 * zeroed out.  Explicitly check that the supplied data is zeroed,
 	 * as we aren't allowing the user to set the counters currently.
 	 */
 	if (req->newptr != NULL) {
 		if (req->newlen != sizeof(tempstats))
 			return (EINVAL);
 		memset(&tempstats, 0, sizeof(tempstats));
 		error = SYSCTL_IN(req, &tempstats, sizeof(tempstats));
 		if (error)
 			return (error);
 		if (bcmp(&tempstats, &zerostats, sizeof(tempstats)) != 0)
 			return (EINVAL);
 		bpf_zero_counters();
 		return (0);
 	}
 	if (req->oldptr == NULL)
 		return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd)));
 	if (bpf_bpfd_cnt == 0)
 		return (SYSCTL_OUT(req, 0, 0));
 	xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK);
 	BPF_LOCK();
 	if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) {
 		BPF_UNLOCK();
 		free(xbdbuf, M_BPF);
 		return (ENOMEM);
 	}
 	index = 0;
 	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
 		BPFIF_RLOCK(bp);
 		/* Send writers-only first */
 		LIST_FOREACH(bd, &bp->bif_wlist, bd_next) {
 			xbd = &xbdbuf[index++];
 			BPFD_LOCK(bd);
 			bpfstats_fill_xbpf(xbd, bd);
 			BPFD_UNLOCK(bd);
 		}
 		LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
 			xbd = &xbdbuf[index++];
 			BPFD_LOCK(bd);
 			bpfstats_fill_xbpf(xbd, bd);
 			BPFD_UNLOCK(bd);
 		}
 		BPFIF_RUNLOCK(bp);
 	}
 	BPF_UNLOCK();
 	error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd));
 	free(xbdbuf, M_BPF);
 	return (error);
 }
 
 SYSINIT(bpfdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,bpf_drvinit,NULL);
 
 #else /* !DEV_BPF && !NETGRAPH_BPF */
 /*
  * NOP stubs to allow bpf-using drivers to load and function.
  *
  * A 'better' implementation would allow the core bpf functionality
  * to be loaded at runtime.
  */
 static struct bpf_if bp_null;
 
 void
 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
 {
 }
 
 void
 bpf_mtap(struct bpf_if *bp, struct mbuf *m)
 {
 }
 
 void
 bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m)
 {
 }
 
 void
 bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
 {
 
 	bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
 }
 
 void
 bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
 {
 
 	*driverp = &bp_null;
 }
 
 void
 bpfdetach(struct ifnet *ifp)
 {
 }
 
 u_int
 bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen)
 {
 	return -1;	/* "no filter" behaviour */
 }
 
 int
 bpf_validate(const struct bpf_insn *f, int len)
 {
 	return 0;		/* false */
 }
 
 #endif /* !DEV_BPF && !NETGRAPH_BPF */
Index: user/dchagin/lemul/sys/net/flowtable.c
===================================================================
--- user/dchagin/lemul/sys/net/flowtable.c	(revision 274240)
+++ user/dchagin/lemul/sys/net/flowtable.c	(revision 274241)
@@ -1,1189 +1,1189 @@
 /*-
  * Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org>
  * Copyright (c) 2008-2010, BitGravity Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  *  1. Redistributions of source code must retain the above copyright notice,
  *     this list of conditions and the following disclaimer.
  *
  *  2. Neither the name of the BitGravity Corporation nor the names of its
  *     contributors may be used to endorse or promote products derived from
  *     this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "opt_route.h"
 #include "opt_mpath.h"
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/bitstring.h>
 #include <sys/condvar.h>
 #include <sys/callout.h>
 #include <sys/hash.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_llatbl.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/flowtable.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif
 #ifdef FLOWTABLE_HASH_ALL
 #include <netinet/tcp.h>
 #include <netinet/udp.h>
 #include <netinet/sctp.h>
 #endif
 
 #include <ddb/ddb.h>
 
 #ifdef	FLOWTABLE_HASH_ALL
 #define	KEY_PORTS	(sizeof(uint16_t) * 2)
 #define	KEY_ADDRS	2
 #else
 #define	KEY_PORTS	0
 #define	KEY_ADDRS	1
 #endif
 
 #ifdef	INET6
 #define	KEY_ADDR_LEN	sizeof(struct in6_addr)
 #else
 #define	KEY_ADDR_LEN	sizeof(struct in_addr)
 #endif
 
 #define	KEYLEN	((KEY_ADDR_LEN * KEY_ADDRS + KEY_PORTS) / sizeof(uint32_t))
 
 struct flentry {
 	uint32_t		f_hash;		/* hash flowing forward */
 	uint32_t		f_key[KEYLEN];	/* address(es and ports) */
 	uint32_t		f_uptime;	/* uptime at last access */
 	uint16_t		f_fibnum;	/* fib index */
 #ifdef FLOWTABLE_HASH_ALL
 	uint8_t			f_proto;	/* protocol */
 	uint8_t			f_flags;	/* stale? */
 #define FL_STALE 		1
 #endif
 	SLIST_ENTRY(flentry)	f_next;		/* pointer to collision entry */
 	struct rtentry		*f_rt;		/* rtentry for flow */
 	struct llentry		*f_lle;		/* llentry for flow */
 };
 #undef KEYLEN
 
 SLIST_HEAD(flist, flentry);
 /* Make sure we can use pcpu_zone_ptr for struct flist. */
 CTASSERT(sizeof(struct flist) == sizeof(void *));
 
 struct flowtable {
 	counter_u64_t	*ft_stat;
 	int 		ft_size;
 	/*
 	 * ft_table is a malloc(9)ed array of pointers.  Pointers point to
 	 * memory from UMA_ZONE_PCPU zone.
 	 * ft_masks is per-cpu pointer itself.  Each instance points
 	 * to a malloc(9)ed bitset, that is private to corresponding CPU.
 	 */
 	struct flist	**ft_table;
 	bitstr_t 	**ft_masks;
 	bitstr_t	*ft_tmpmask;
 };
 
 #define	FLOWSTAT_ADD(ft, name, v)	\
 	counter_u64_add((ft)->ft_stat[offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v))
 #define	FLOWSTAT_INC(ft, name)	FLOWSTAT_ADD(ft, name, 1)
 
 static struct proc *flowcleanerproc;
 static uint32_t flow_hashjitter;
 
 static struct cv 	flowclean_f_cv;
 static struct cv 	flowclean_c_cv;
 static struct mtx	flowclean_lock;
 static uint32_t		flowclean_cycles;
 
 /*
  * TODO:
  * - add sysctls to resize && flush flow tables
  * - Add per flowtable sysctls for statistics and configuring timeouts
  * - add saturation counter to rtentry to support per-packet load-balancing
  *   add flag to indicate round-robin flow, add list lookup from head
      for flows
  * - add sysctl / device node / syscall to support exporting and importing
  *   of flows with flag to indicate that a flow was imported so should
  *   not be considered for auto-cleaning
  * - support explicit connection state (currently only ad-hoc for DSR)
  * - idetach() cleanup for options VIMAGE builds.
  */
 #ifdef INET
 static VNET_DEFINE(struct flowtable, ip4_ft);
 #define	V_ip4_ft	VNET(ip4_ft)
 #endif
 #ifdef INET6
 static VNET_DEFINE(struct flowtable, ip6_ft);
 #define	V_ip6_ft	VNET(ip6_ft)
 #endif
 
 static uma_zone_t flow_zone;
 
 static VNET_DEFINE(int, flowtable_enable) = 1;
 #define	V_flowtable_enable		VNET(flowtable_enable)
 
 static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
     "flowtable");
-SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_RW,
+SYSCTL_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
 SYSCTL_UMA_MAX(_net_flowtable, OID_AUTO, maxflows, CTLFLAG_RW,
     &flow_zone, "Maximum number of flows allowed");
 
 static MALLOC_DEFINE(M_FTABLE, "flowtable", "flowtable hashes and bitstrings");
 
 static struct flentry *
 flowtable_lookup_common(struct flowtable *, uint32_t *, int, uint32_t);
 
 #ifdef INET
 static struct flentry *
 flowtable_lookup_ipv4(struct mbuf *m, struct route *ro)
 {
 	struct flentry *fle;
 	struct sockaddr_in *sin;
 	struct ip *ip;
 	uint32_t fibnum;
 #ifdef FLOWTABLE_HASH_ALL
 	uint32_t key[3];
 	int iphlen;
 	uint16_t sport, dport;
 	uint8_t proto;
 #endif
 
 	ip = mtod(m, struct ip *);
 
 	if (ip->ip_src.s_addr == ip->ip_dst.s_addr ||
 	    (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
 	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
 		return (NULL);
 
 	fibnum = M_GETFIB(m);
 
 #ifdef FLOWTABLE_HASH_ALL
 	iphlen = ip->ip_hl << 2;
 	proto = ip->ip_p;
 
 	switch (proto) {
 	case IPPROTO_TCP: {
 		struct tcphdr *th;
 
 		th = (struct tcphdr *)((char *)ip + iphlen);
 		sport = th->th_sport;
 		dport = th->th_dport;
 		if (th->th_flags & (TH_RST|TH_FIN))
 			fibnum |= (FL_STALE << 24);
 		break;
 	}
 	case IPPROTO_UDP: {
 		struct udphdr *uh;
 
 		uh = (struct udphdr *)((char *)ip + iphlen);
 		sport = uh->uh_sport;
 		dport = uh->uh_dport;
 		break;
 	}
 	case IPPROTO_SCTP: {
 		struct sctphdr *sh;
 
 		sh = (struct sctphdr *)((char *)ip + iphlen);
 		sport = sh->src_port;
 		dport = sh->dest_port;
 		/* XXXGL: handle stale? */
 		break;
 	}
 	default:
 		sport = dport = 0;
 		break;
 	}
 
 	key[0] = ip->ip_dst.s_addr;
 	key[1] = ip->ip_src.s_addr;
 	key[2] = (dport << 16) | sport;
 	fibnum |= proto << 16;
 
 	fle = flowtable_lookup_common(&V_ip4_ft, key, 3 * sizeof(uint32_t),
 	    fibnum);
 
 #else	/* !FLOWTABLE_HASH_ALL */
 
 	fle = flowtable_lookup_common(&V_ip4_ft, (uint32_t *)&ip->ip_dst,
 	    sizeof(struct in_addr), fibnum);
 
 #endif	/* FLOWTABLE_HASH_ALL */
 
 	if (fle == NULL)
 		return (NULL);
 
 	sin = (struct sockaddr_in *)&ro->ro_dst;
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(*sin);
 	sin->sin_addr = ip->ip_dst;
 
 	return (fle);
 }
 #endif /* INET */
 
 #ifdef INET6
 /*
  * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
  * then it sets p to point at the offset "len" in the mbuf. WARNING: the
  * pointer might become stale after other pullups (but we never use it
  * this way).
  */
 #define PULLUP_TO(_len, p, T)						\
 do {									\
 	int x = (_len) + sizeof(T);					\
 	if ((m)->m_len < x)						\
 		return (NULL);						\
 	p = (mtod(m, char *) + (_len));					\
 } while (0)
 
 #define	TCP(p)		((struct tcphdr *)(p))
 #define	SCTP(p)		((struct sctphdr *)(p))
 #define	UDP(p)		((struct udphdr *)(p))
 
 static struct flentry *
 flowtable_lookup_ipv6(struct mbuf *m, struct route *ro)
 {
 	struct flentry *fle;
 	struct sockaddr_in6 *sin6;
 	struct ip6_hdr *ip6;
 	uint32_t fibnum;
 #ifdef FLOWTABLE_HASH_ALL
 	uint32_t key[9];
 	void *ulp;
 	int hlen;
 	uint16_t sport, dport;
 	u_short offset;
 	uint8_t proto;
 #else
 	uint32_t key[4];
 #endif
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (in6_localaddr(&ip6->ip6_dst))
 		return (NULL);
 
 	fibnum = M_GETFIB(m);
 
 #ifdef	FLOWTABLE_HASH_ALL
 	hlen = sizeof(struct ip6_hdr);
 	proto = ip6->ip6_nxt;
 	offset = sport = dport = 0;
 	ulp = NULL;
 	while (ulp == NULL) {
 		switch (proto) {
 		case IPPROTO_ICMPV6:
 		case IPPROTO_OSPFIGP:
 		case IPPROTO_PIM:
 		case IPPROTO_CARP:
 		case IPPROTO_ESP:
 		case IPPROTO_NONE:
 			ulp = ip6;
 			break;
 		case IPPROTO_TCP:
 			PULLUP_TO(hlen, ulp, struct tcphdr);
 			dport = TCP(ulp)->th_dport;
 			sport = TCP(ulp)->th_sport;
 			if (TCP(ulp)->th_flags & (TH_RST|TH_FIN))
 				fibnum |= (FL_STALE << 24);
 			break;
 		case IPPROTO_SCTP:
 			PULLUP_TO(hlen, ulp, struct sctphdr);
 			dport = SCTP(ulp)->src_port;
 			sport = SCTP(ulp)->dest_port;
 			/* XXXGL: handle stale? */
 			break;
 		case IPPROTO_UDP:
 			PULLUP_TO(hlen, ulp, struct udphdr);
 			dport = UDP(ulp)->uh_dport;
 			sport = UDP(ulp)->uh_sport;
 			break;
 		case IPPROTO_HOPOPTS:	/* RFC 2460 */
 			PULLUP_TO(hlen, ulp, struct ip6_hbh);
 			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
 			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
 			ulp = NULL;
 			break;
 		case IPPROTO_ROUTING:	/* RFC 2460 */
 			PULLUP_TO(hlen, ulp, struct ip6_rthdr);
 			hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
 			proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
 			ulp = NULL;
 			break;
 		case IPPROTO_FRAGMENT:	/* RFC 2460 */
 			PULLUP_TO(hlen, ulp, struct ip6_frag);
 			hlen += sizeof (struct ip6_frag);
 			proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
 			offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
 			    IP6F_OFF_MASK;
 			ulp = NULL;
 			break;
 		case IPPROTO_DSTOPTS:	/* RFC 2460 */
 			PULLUP_TO(hlen, ulp, struct ip6_hbh);
 			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
 			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
 			ulp = NULL;
 			break;
 		case IPPROTO_AH:	/* RFC 2402 */
 			PULLUP_TO(hlen, ulp, struct ip6_ext);
 			hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
 			proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
 			ulp = NULL;
 			break;
 		default:
 			PULLUP_TO(hlen, ulp, struct ip6_ext);
 			break;
 		}
 	}
 
 	bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
 	bcopy(&ip6->ip6_src, &key[4], sizeof(struct in6_addr));
 	key[8] = (dport << 16) | sport;
 	fibnum |= proto << 16;
 
 	fle = flowtable_lookup_common(&V_ip6_ft, key, 9 * sizeof(uint32_t),
 	    fibnum);
 #else	/* !FLOWTABLE_HASH_ALL */
 	bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
 	fle = flowtable_lookup_common(&V_ip6_ft, key, sizeof(struct in6_addr),
 	    fibnum);
 #endif	/* FLOWTABLE_HASH_ALL */
 
 	if (fle == NULL)
 		return (NULL);
 
 	sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
 	sin6->sin6_family = AF_INET6;
 	sin6->sin6_len = sizeof(*sin6);
 	bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(struct in6_addr));
 
 	return (fle);
 }
 #endif /* INET6 */
 
 static bitstr_t *
 flowtable_mask(struct flowtable *ft)
 {
 
 	/*
 	 * flowtable_free_stale() calls w/o critical section, but
 	 * with sched_bind(). Since pointer is stable throughout
 	 * ft lifetime, it is safe, otherwise...
 	 *
 	 * CRITICAL_ASSERT(curthread);
 	 */
 
 	return (*(bitstr_t **)zpcpu_get(ft->ft_masks));
 }
 
 static struct flist *
 flowtable_list(struct flowtable *ft, uint32_t hash)
 {
 
 	CRITICAL_ASSERT(curthread);
 	return (zpcpu_get(ft->ft_table[hash % ft->ft_size]));
 }
 
 static int
 flow_stale(struct flowtable *ft, struct flentry *fle, int maxidle)
 {
 
 	if (((fle->f_rt->rt_flags & RTF_HOST) &&
 	    ((fle->f_rt->rt_flags & (RTF_UP)) != (RTF_UP))) ||
 	    (fle->f_rt->rt_ifp == NULL) ||
 	    !RT_LINK_IS_UP(fle->f_rt->rt_ifp) ||
 	    (fle->f_lle->la_flags & LLE_VALID) == 0)
 		return (1);
 
 	if (time_uptime - fle->f_uptime > maxidle)
 		return (1);
 
 #ifdef FLOWTABLE_HASH_ALL
 	if (fle->f_flags & FL_STALE)
 		return (1);
 #endif
 
 	return (0);
 }
 
 static int
 flow_full(void)
 {
 	int count, max;
 
 	count = uma_zone_get_cur(flow_zone);
 	max = uma_zone_get_max(flow_zone);
 
 	return (count > (max - (max >> 3)));
 }
 
 static int
 flow_matches(struct flentry *fle, uint32_t *key, int keylen, uint32_t fibnum)
 {
 #ifdef FLOWTABLE_HASH_ALL
 	uint8_t proto;
 
 	proto = (fibnum >> 16) & 0xff;
 	fibnum &= 0xffff;
 #endif
 
 	CRITICAL_ASSERT(curthread);
 
 	/* Microoptimization for IPv4: don't use bcmp(). */
 	if (((keylen == sizeof(uint32_t) && (fle->f_key[0] != key[0])) ||
 	    (bcmp(fle->f_key, key, keylen) == 0)) &&
 	    fibnum == fle->f_fibnum &&
 #ifdef FLOWTABLE_HASH_ALL
 	    proto == fle->f_proto &&
 #endif
 	    (fle->f_rt->rt_flags & RTF_UP) &&
 	    fle->f_rt->rt_ifp != NULL &&
 	    (fle->f_lle->la_flags & LLE_VALID))
 		return (1);
 
 	return (0);
 }
 
 static struct flentry *
 flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
     int keylen, uint32_t fibnum0)
 {
 #ifdef INET6
 	struct route_in6 sro6;
 #endif
 #ifdef INET
 	struct route sro;
 #endif
 	struct route *ro = NULL;
 	struct rtentry *rt;
 	struct lltable *lt = NULL;
 	struct llentry *lle;
 	struct sockaddr_storage *l3addr;
 	struct ifnet *ifp;
 	struct flist *flist;
 	struct flentry *fle, *iter;
 	bitstr_t *mask;
 	uint16_t fibnum = fibnum0;
 #ifdef FLOWTABLE_HASH_ALL
 	uint8_t proto;
 
 	proto = (fibnum0 >> 16) & 0xff;
 	fibnum = fibnum0 & 0xffff;
 #endif
 
 	/*
 	 * This bit of code ends up locking the
 	 * same route 3 times (just like ip_output + ether_output)
 	 * - at lookup
 	 * - in rt_check when called by arpresolve
 	 * - dropping the refcount for the rtentry
 	 *
 	 * This could be consolidated to one if we wrote a variant
 	 * of arpresolve with an rt_check variant that expected to
 	 * receive the route locked
 	 */
 #ifdef INET
 	if (ft == &V_ip4_ft) {
 		struct sockaddr_in *sin;
 
 		ro = &sro;
 		bzero(&sro.ro_dst, sizeof(sro.ro_dst));
 
 		sin = (struct sockaddr_in *)&sro.ro_dst;
 		sin->sin_family = AF_INET;
 		sin->sin_len = sizeof(*sin);
 		sin->sin_addr.s_addr = key[0];
 	}
 #endif
 #ifdef INET6
 	if (ft == &V_ip6_ft) {
 		struct sockaddr_in6 *sin6;
 
 		ro = (struct route *)&sro6;
 		sin6 = &sro6.ro_dst;
 
 		bzero(sin6, sizeof(*sin6));
 		sin6->sin6_family = AF_INET6;
 		sin6->sin6_len = sizeof(*sin6);
 		bcopy(key, &sin6->sin6_addr, sizeof(struct in6_addr));
 	}
 #endif
 
 	ro->ro_rt = NULL;
 #ifdef RADIX_MPATH
 	rtalloc_mpath_fib(ro, hash, fibnum);
 #else
 	rtalloc_ign_fib(ro, 0, fibnum);
 #endif
 	if (ro->ro_rt == NULL)
 		return (NULL);
 
 	rt = ro->ro_rt;
 	ifp = rt->rt_ifp;
 
 	if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
 		RTFREE(rt);
 		return (NULL);
 	}
 
 #ifdef INET
 	if (ft == &V_ip4_ft)
 		lt = LLTABLE(ifp);
 #endif
 #ifdef INET6
 	if (ft == &V_ip6_ft)
 		lt = LLTABLE6(ifp);
 #endif
 
 	if (rt->rt_flags & RTF_GATEWAY)
 		l3addr = (struct sockaddr_storage *)rt->rt_gateway;
 	else
 		l3addr = (struct sockaddr_storage *)&ro->ro_dst;
 	lle = llentry_alloc(ifp, lt, l3addr);
 
 	if (lle == NULL) {
 		RTFREE(rt);
 		return (NULL);
 	}
 
 	/* Don't insert the entry if the ARP hasn't yet finished resolving. */
 	if ((lle->la_flags & LLE_VALID) == 0) {
 		RTFREE(rt);
 		LLE_FREE(lle);
 		FLOWSTAT_INC(ft, ft_fail_lle_invalid);
 		return (NULL);
 	}
 
 	fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO);
 	if (fle == NULL) {
 		RTFREE(rt);
 		LLE_FREE(lle);
 		return (NULL);
 	}
 
 	fle->f_hash = hash;
 	bcopy(key, &fle->f_key, keylen);
 	fle->f_rt = rt;
 	fle->f_lle = lle;
 	fle->f_fibnum = fibnum;
 	fle->f_uptime = time_uptime;
 #ifdef FLOWTABLE_HASH_ALL
 	fle->f_proto = proto;
 	fle->f_flags = fibnum0 >> 24;
 #endif
 
 	critical_enter();
 	mask = flowtable_mask(ft);
 	flist = flowtable_list(ft, hash);
 
 	if (SLIST_EMPTY(flist)) {
 		bit_set(mask, (hash % ft->ft_size));
 		SLIST_INSERT_HEAD(flist, fle, f_next);
 		goto skip;
 	}
 
 	/*
 	 * find end of list and make sure that we were not
 	 * preempted by another thread handling this flow
 	 */
 	SLIST_FOREACH(iter, flist, f_next) {
 		KASSERT(iter->f_hash % ft->ft_size == hash % ft->ft_size,
 		    ("%s: wrong hash", __func__));
 		if (flow_matches(iter, key, keylen, fibnum)) {
 			/*
 			 * We probably migrated to an other CPU after
 			 * lookup in flowtable_lookup_common() failed.
 			 * It appeared that this CPU already has flow
 			 * entry.
 			 */
 			iter->f_uptime = time_uptime;
 #ifdef FLOWTABLE_HASH_ALL
 			iter->f_flags |= fibnum >> 24;
 #endif
 			critical_exit();
 			FLOWSTAT_INC(ft, ft_collisions);
 			uma_zfree(flow_zone, fle);
 			return (iter);
 		}
 	}
 
 	SLIST_INSERT_HEAD(flist, fle, f_next);
 skip:
 	critical_exit();
 	FLOWSTAT_INC(ft, ft_inserts);
 
 	return (fle);
 }
 
 int
 flowtable_lookup(sa_family_t sa, struct mbuf *m, struct route *ro)
 {
 	struct flentry *fle;
 
 	if (V_flowtable_enable == 0)
 		return (ENXIO);
 
 	switch (sa) {
 #ifdef INET
 	case AF_INET:
 		fle = flowtable_lookup_ipv4(m, ro);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		fle = flowtable_lookup_ipv6(m, ro);
 		break;
 #endif
 	default:
 		panic("%s: sa %d", __func__, sa);
 	}
 
 	if (fle == NULL)
 		return (EHOSTUNREACH);
 
 	if (!(m->m_flags & M_FLOWID)) {
 		m->m_flags |= M_FLOWID;
 		m->m_pkthdr.flowid = fle->f_hash;
 	}
 
 	ro->ro_rt = fle->f_rt;
 	ro->ro_lle = fle->f_lle;
 	ro->ro_flags |= RT_NORTREF;
 
 	return (0);
 }
 
 static struct flentry *
 flowtable_lookup_common(struct flowtable *ft, uint32_t *key, int keylen,
     uint32_t fibnum)
 {
 	struct flist *flist;
 	struct flentry *fle;
 	uint32_t hash;
 
 	FLOWSTAT_INC(ft, ft_lookups);
 
 	hash = jenkins_hash32(key, keylen / sizeof(uint32_t), flow_hashjitter);
 
 	critical_enter();
 	flist = flowtable_list(ft, hash);
 	SLIST_FOREACH(fle, flist, f_next) {
 		KASSERT(fle->f_hash % ft->ft_size == hash % ft->ft_size,
 		    ("%s: wrong hash", __func__));
 		if (flow_matches(fle, key, keylen, fibnum)) {
 			fle->f_uptime = time_uptime;
 #ifdef FLOWTABLE_HASH_ALL
 			fle->f_flags |= fibnum >> 24;
 #endif
 			critical_exit();
 			FLOWSTAT_INC(ft, ft_hits);
 			return (fle);
 		}
 	}
 	critical_exit();
 
 	FLOWSTAT_INC(ft, ft_misses);
 
 	return (flowtable_insert(ft, hash, key, keylen, fibnum));
 }
 
 /*
  * used by the bit_alloc macro
  */
 #define calloc(count, size) malloc((count)*(size), M_FTABLE, M_WAITOK | M_ZERO)
 static void
 flowtable_alloc(struct flowtable *ft)
 {
 
 	ft->ft_table = malloc(ft->ft_size * sizeof(struct flist),
 	    M_FTABLE, M_WAITOK);
 	for (int i = 0; i < ft->ft_size; i++)
 		ft->ft_table[i] = uma_zalloc(pcpu_zone_ptr, M_WAITOK | M_ZERO);
 
 	ft->ft_masks = uma_zalloc(pcpu_zone_ptr, M_WAITOK);
 	for (int i = 0; i < mp_ncpus; i++) {
 		bitstr_t **b;
 
 		b = zpcpu_get_cpu(ft->ft_masks, i);
 		*b = bit_alloc(ft->ft_size);
 	}
 	ft->ft_tmpmask = bit_alloc(ft->ft_size);
 }
 #undef calloc
 
 static void
 flowtable_free_stale(struct flowtable *ft, struct rtentry *rt, int maxidle)
 {
 	struct flist *flist, freelist;
 	struct flentry *fle, *fle1, *fleprev;
 	bitstr_t *mask, *tmpmask;
 	int curbit, tmpsize;
 
 	SLIST_INIT(&freelist);
 	mask = flowtable_mask(ft);
 	tmpmask = ft->ft_tmpmask;
 	tmpsize = ft->ft_size;
 	memcpy(tmpmask, mask, ft->ft_size/8);
 	curbit = 0;
 	fleprev = NULL; /* pacify gcc */
 	/*
 	 * XXX Note to self, bit_ffs operates at the byte level
 	 * and thus adds gratuitous overhead
 	 */
 	bit_ffs(tmpmask, ft->ft_size, &curbit);
 	while (curbit != -1) {
 		if (curbit >= ft->ft_size || curbit < -1) {
 			log(LOG_ALERT,
 			    "warning: bad curbit value %d \n",
 			    curbit);
 			break;
 		}
 
 		FLOWSTAT_INC(ft, ft_free_checks);
 
 		critical_enter();
 		flist = flowtable_list(ft, curbit);
 #ifdef DIAGNOSTIC
 		if (SLIST_EMPTY(flist) && curbit > 0) {
 			log(LOG_ALERT,
 			    "warning bit=%d set, but no fle found\n",
 			    curbit);
 		}
 #endif
 		SLIST_FOREACH_SAFE(fle, flist, f_next, fle1) {
 			if (rt != NULL && fle->f_rt != rt) {
 				fleprev = fle;
 				continue;
 			}
 			if (!flow_stale(ft, fle, maxidle)) {
 				fleprev = fle;
 				continue;
 			}
 
 			if (fle == SLIST_FIRST(flist))
 				SLIST_REMOVE_HEAD(flist, f_next);
 			else
 				SLIST_REMOVE_AFTER(fleprev, f_next);
 			SLIST_INSERT_HEAD(&freelist, fle, f_next);
 		}
 		if (SLIST_EMPTY(flist))
 			bit_clear(mask, curbit);
 		critical_exit();
 
 		bit_clear(tmpmask, curbit);
 		tmpmask += (curbit / 8);
 		tmpsize -= (curbit / 8) * 8;
 		bit_ffs(tmpmask, tmpsize, &curbit);
 	}
 
 	SLIST_FOREACH_SAFE(fle, &freelist, f_next, fle1) {
 		FLOWSTAT_INC(ft, ft_frees);
 		if (fle->f_rt != NULL)
 			RTFREE(fle->f_rt);
 		if (fle->f_lle != NULL)
 			LLE_FREE(fle->f_lle);
 		uma_zfree(flow_zone, fle);
 	}
 }
 
 static void
 flowtable_clean_vnet(struct flowtable *ft, struct rtentry *rt, int maxidle)
 {
 	int i;
 
 	CPU_FOREACH(i) {
 		if (smp_started == 1) {
 			thread_lock(curthread);
 			sched_bind(curthread, i);
 			thread_unlock(curthread);
 		}
 
 		flowtable_free_stale(ft, rt, maxidle);
 
 		if (smp_started == 1) {
 			thread_lock(curthread);
 			sched_unbind(curthread);
 			thread_unlock(curthread);
 		}
 	}
 }
 
 void
 flowtable_route_flush(sa_family_t sa, struct rtentry *rt)
 {
 	struct flowtable *ft;
 
 	switch (sa) {
 #ifdef INET
 	case AF_INET:
 		ft = &V_ip4_ft;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		ft = &V_ip6_ft;
 		break;
 #endif
 	default:
 		panic("%s: sa %d", __func__, sa);
 	}
 
 	flowtable_clean_vnet(ft, rt, 0);
 }
 
 static void
 flowtable_cleaner(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 	struct thread *td;
 
 	if (bootverbose)
 		log(LOG_INFO, "flowtable cleaner started\n");
 	td = curthread;
 	while (1) {
 		uint32_t flowclean_freq, maxidle;
 
 		/*
 		 * The maximum idle time, as well as frequency are arbitrary.
 		 */
 		if (flow_full())
 			maxidle = 5;
 		else
 			maxidle = 30;
 
 		VNET_LIST_RLOCK();
 		VNET_FOREACH(vnet_iter) {
 			CURVNET_SET(vnet_iter);
 #ifdef INET
 			flowtable_clean_vnet(&V_ip4_ft, NULL, maxidle);
 #endif
 #ifdef INET6
 			flowtable_clean_vnet(&V_ip6_ft, NULL, maxidle);
 #endif
 			CURVNET_RESTORE();
 		}
 		VNET_LIST_RUNLOCK();
 
 		if (flow_full())
 			flowclean_freq = 4*hz;
 		else
 			flowclean_freq = 20*hz;
 		mtx_lock(&flowclean_lock);
 		thread_lock(td);
 		sched_prio(td, PPAUSE);
 		thread_unlock(td);
 		flowclean_cycles++;
 		cv_broadcast(&flowclean_f_cv);
 		cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq);
 		mtx_unlock(&flowclean_lock);
 	}
 }
 
 static void
 flowtable_flush(void *unused __unused)
 {
 	uint64_t start;
 
 	mtx_lock(&flowclean_lock);
 	start = flowclean_cycles;
 	while (start == flowclean_cycles) {
 		cv_broadcast(&flowclean_c_cv);
 		cv_wait(&flowclean_f_cv, &flowclean_lock);
 	}
 	mtx_unlock(&flowclean_lock);
 }
 
 static struct kproc_desc flow_kp = {
 	"flowcleaner",
 	flowtable_cleaner,
 	&flowcleanerproc
 };
 SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
 
 static int
 flowtable_get_size(char *name)
 {
 	int size;
 
 	if (TUNABLE_INT_FETCH(name, &size)) {
 		if (size < 256)
 			size = 256;
 		if (!powerof2(size)) {
 			printf("%s must be power of 2\n", name);
 			size = 2048;
 		}
 	} else {
 		/*
 		 * round up to the next power of 2
 		 */
 		size = 1 << fls((1024 + maxusers * 64) - 1);
 	}
 
 	return (size);
 }
 
 static void
 flowtable_init(const void *unused __unused)
 {
 
 	flow_hashjitter = arc4random();
 
 	flow_zone = uma_zcreate("flows", sizeof(struct flentry),
 	    NULL, NULL, NULL, NULL, (64-1), UMA_ZONE_MAXBUCKET);
 	uma_zone_set_max(flow_zone, 1024 + maxusers * 64 * mp_ncpus);
 
 	cv_init(&flowclean_c_cv, "c_flowcleanwait");
 	cv_init(&flowclean_f_cv, "f_flowcleanwait");
 	mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
 	EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
 	    EVENTHANDLER_PRI_ANY);
 }
 SYSINIT(flowtable_init, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
     flowtable_init, NULL);
 
 #ifdef INET
 static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip4, CTLFLAG_RD, NULL,
     "Flowtable for IPv4");
 
 static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip4_ftstat);
 VNET_PCPUSTAT_SYSINIT(ip4_ftstat);
 VNET_PCPUSTAT_SYSUNINIT(ip4_ftstat);
 SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip4, OID_AUTO, stat, struct flowtable_stat,
     ip4_ftstat, "Flowtable statistics for IPv4 "
     "(struct flowtable_stat, net/flowtable.h)");
 
 static void
 flowtable_init_vnet_v4(const void *unused __unused)
 {
 
 	V_ip4_ft.ft_size = flowtable_get_size("net.flowtable.ip4.size");
 	V_ip4_ft.ft_stat = VNET(ip4_ftstat);
 	flowtable_alloc(&V_ip4_ft);
 }
 VNET_SYSINIT(ft_vnet_v4, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     flowtable_init_vnet_v4, NULL);
 #endif /* INET */
 
 #ifdef INET6
 static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip6, CTLFLAG_RD, NULL,
     "Flowtable for IPv6");
 
 static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip6_ftstat);
 VNET_PCPUSTAT_SYSINIT(ip6_ftstat);
 VNET_PCPUSTAT_SYSUNINIT(ip6_ftstat);
 SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip6, OID_AUTO, stat, struct flowtable_stat,
     ip6_ftstat, "Flowtable statistics for IPv6 "
     "(struct flowtable_stat, net/flowtable.h)");
 
 static void
 flowtable_init_vnet_v6(const void *unused __unused)
 {
 
 	V_ip6_ft.ft_size = flowtable_get_size("net.flowtable.ip6.size");
 	V_ip6_ft.ft_stat = VNET(ip6_ftstat);
 	flowtable_alloc(&V_ip6_ft);
 }
 VNET_SYSINIT(flowtable_init_vnet_v6, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     flowtable_init_vnet_v6, NULL);
 #endif /* INET6 */
 
 #ifdef DDB
 static bitstr_t *
 flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
 {
 
 	return (zpcpu_get_cpu(*ft->ft_masks, cpuid));
 }
 
 static struct flist *
 flowtable_list_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
 {
 
 	return (zpcpu_get_cpu(&ft->ft_table[hash % ft->ft_size], cpuid));
 }
 
 static void
 flow_show(struct flowtable *ft, struct flentry *fle)
 {
 	int idle_time;
 	int rt_valid, ifp_valid;
 	volatile struct rtentry *rt;
 	struct ifnet *ifp = NULL;
 	uint32_t *hashkey = fle->f_key;
 
 	idle_time = (int)(time_uptime - fle->f_uptime);
 	rt = fle->f_rt;
 	rt_valid = rt != NULL;
 	if (rt_valid)
 		ifp = rt->rt_ifp;
 	ifp_valid = ifp != NULL;
 
 #ifdef INET
 	if (ft == &V_ip4_ft) {
 		char daddr[4*sizeof "123"];
 #ifdef FLOWTABLE_HASH_ALL
 		char saddr[4*sizeof "123"];
 		uint16_t sport, dport;
 #endif
 
 		inet_ntoa_r(*(struct in_addr *) &hashkey[0], daddr);
 #ifdef FLOWTABLE_HASH_ALL
 		inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
 		dport = ntohs((uint16_t)(hashkey[2] >> 16));
 		sport = ntohs((uint16_t)(hashkey[2] & 0xffff));
 		db_printf("%s:%d->%s:%d", saddr, sport, daddr, dport);
 #else
 		db_printf("%s ", daddr);
 #endif
 	}
 #endif /* INET */
 #ifdef INET6
 	if (ft == &V_ip6_ft) {
 #ifdef FLOWTABLE_HASH_ALL
 		db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
 		    hashkey[0], hashkey[1], hashkey[2],
 		    hashkey[3], hashkey[4], hashkey[5],
 		    hashkey[6], hashkey[7], hashkey[8]);
 #else
 		db_printf("\n\tkey=%08x:%08x:%08x ",
 		    hashkey[0], hashkey[1], hashkey[2]);
 #endif
 	}
 #endif /* INET6 */
 
 	db_printf("hash=%08x idle_time=%03d"
 	    "\n\tfibnum=%02d rt=%p",
 	    fle->f_hash, idle_time, fle->f_fibnum, fle->f_rt);
 
 #ifdef FLOWTABLE_HASH_ALL
 	if (fle->f_flags & FL_STALE)
 		db_printf(" FL_STALE ");
 #endif
 	if (rt_valid) {
 		if (rt->rt_flags & RTF_UP)
 			db_printf(" RTF_UP ");
 	}
 	if (ifp_valid) {
 		if (ifp->if_flags & IFF_LOOPBACK)
 			db_printf(" IFF_LOOPBACK ");
 		if (ifp->if_flags & IFF_UP)
 			db_printf(" IFF_UP ");
 		if (ifp->if_flags & IFF_POINTOPOINT)
 			db_printf(" IFF_POINTOPOINT ");
 	}
 	db_printf("\n");
 }
 
 static void
 flowtable_show(struct flowtable *ft, int cpuid)
 {
 	int curbit = 0;
 	bitstr_t *mask, *tmpmask;
 
 	if (cpuid != -1)
 		db_printf("cpu: %d\n", cpuid);
 	mask = flowtable_mask_pcpu(ft, cpuid);
 	tmpmask = ft->ft_tmpmask;
 	memcpy(tmpmask, mask, ft->ft_size/8);
 	/*
 	 * XXX Note to self, bit_ffs operates at the byte level
 	 * and thus adds gratuitous overhead
 	 */
 	bit_ffs(tmpmask, ft->ft_size, &curbit);
 	while (curbit != -1) {
 		struct flist *flist;
 		struct flentry *fle;
 
 		if (curbit >= ft->ft_size || curbit < -1) {
 			db_printf("warning: bad curbit value %d \n",
 			    curbit);
 			break;
 		}
 
 		flist = flowtable_list_pcpu(ft, curbit, cpuid);
 
 		SLIST_FOREACH(fle, flist, f_next)
 			flow_show(ft, fle);
 		bit_clear(tmpmask, curbit);
 		bit_ffs(tmpmask, ft->ft_size, &curbit);
 	}
 }
 
 static void
 flowtable_show_vnet(struct flowtable *ft)
 {
 
 	int i;
 
 	CPU_FOREACH(i)
 		flowtable_show(ft, i);
 }
 
 DB_SHOW_COMMAND(flowtables, db_show_flowtables)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 #ifdef VIMAGE
 		db_printf("vnet %p\n", vnet_iter);
 #endif
 #ifdef INET
 		printf("IPv4:\n");
 		flowtable_show_vnet(&V_ip4_ft);
 #endif
 #ifdef INET6
 		printf("IPv6:\n");
 		flowtable_show_vnet(&V_ip6_ft);
 #endif
 		CURVNET_RESTORE();
 	}
 }
 #endif
Index: user/dchagin/lemul/sys/net/if.c
===================================================================
--- user/dchagin/lemul/sys/net/if.c	(revision 274240)
+++ user/dchagin/lemul/sys/net/if.c	(revision 274241)
@@ -1,4030 +1,4026 @@
 /*-
  * Copyright (c) 1980, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if.c	8.5 (Berkeley) 1/9/95
  * $FreeBSD$
  */
 
 #include "opt_compat.h"
 #include "opt_inet6.h"
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/conf.h>
 #include <sys/malloc.h>
 #include <sys/sbuf.h>
 #include <sys/bus.h>
 #include <sys/mbuf.h>
 #include <sys/systm.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/refcount.h>
 #include <sys/module.h>
 #include <sys/rwlock.h>
 #include <sys/sockio.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/domain.h>
 #include <sys/jail.h>
 #include <sys/priv.h>
 
 #include <machine/stdarg.h>
 #include <vm/uma.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_arp.h>
 #include <net/if_clone.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
 #include <net/if_vlan_var.h>
 #include <net/radix.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #if defined(INET) || defined(INET6)
 #include <net/ethernet.h>
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_carp.h>
 #ifdef INET
 #include <netinet/if_ether.h>
 #endif /* INET */
 #ifdef INET6
 #include <netinet6/in6_var.h>
 #include <netinet6/in6_ifattach.h>
 #endif /* INET6 */
 #endif /* INET || INET6 */
 
 #include <security/mac/mac_framework.h>
 
 #ifdef COMPAT_FREEBSD32
 #include <sys/mount.h>
 #include <compat/freebsd32/freebsd32.h>
 #endif
 
-struct ifindex_entry {
-	struct  ifnet *ife_ifnet;
-};
-
 SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers");
 SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management");
 
 SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN,
     &ifqmaxlen, 0, "max send queue size");
 
 /* Log link state change events */
 static int log_link_state_change = 1;
 
 SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW,
 	&log_link_state_change, 0,
 	"log interface link state change events");
 
 /* Interface description */
 static unsigned int ifdescr_maxlen = 1024;
 SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW,
 	&ifdescr_maxlen, 0,
 	"administrative maximum length for interface description");
 
 static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions");
 
 /* global sx for non-critical path ifdescr */
 static struct sx ifdescr_sx;
 SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr");
 
 void	(*bridge_linkstate_p)(struct ifnet *ifp);
 void	(*ng_ether_link_state_p)(struct ifnet *ifp, int state);
 void	(*lagg_linkstate_p)(struct ifnet *ifp, int state);
 /* These are external hooks for CARP. */
 void	(*carp_linkstate_p)(struct ifnet *ifp);
 void	(*carp_demote_adj_p)(int, char *);
 int	(*carp_master_p)(struct ifaddr *);
 #if defined(INET) || defined(INET6)
 int	(*carp_forus_p)(struct ifnet *ifp, u_char *dhost);
 int	(*carp_output_p)(struct ifnet *ifp, struct mbuf *m,
     const struct sockaddr *sa);
 int	(*carp_ioctl_p)(struct ifreq *, u_long, struct thread *);   
 int	(*carp_attach_p)(struct ifaddr *, int);
 void	(*carp_detach_p)(struct ifaddr *);
 #endif
 #ifdef INET
 int	(*carp_iamatch_p)(struct ifaddr *, uint8_t **);
 #endif
 #ifdef INET6
 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6);
 caddr_t	(*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m,
     const struct in6_addr *taddr);
 #endif
 
 struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL;
 
 /*
  * XXX: Style; these should be sorted alphabetically, and unprototyped
  * static functions should be prototyped. Currently they are sorted by
  * declaration order.
  */
 static void	if_attachdomain(void *);
 static void	if_attachdomain1(struct ifnet *);
 static int	ifconf(u_long, caddr_t);
 static void	if_freemulti(struct ifmultiaddr *);
 static void	if_init(void *);
 static void	if_grow(void);
 static void	if_route(struct ifnet *, int flag, int fam);
 static int	if_setflag(struct ifnet *, int, int, int *, int);
 static int	if_transmit(struct ifnet *ifp, struct mbuf *m);
 static void	if_unroute(struct ifnet *, int flag, int fam);
 static void	link_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
 static int	if_rtdel(struct radix_node *, void *);
 static int	ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *);
 static int	if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int);
 static void	do_link_state_change(void *, int);
 static int	if_getgroup(struct ifgroupreq *, struct ifnet *);
 static int	if_getgroupmembers(struct ifgroupreq *);
 static void	if_delgroups(struct ifnet *);
 static void	if_attach_internal(struct ifnet *, int);
 static void	if_detach_internal(struct ifnet *, int);
 
 #ifdef INET6
 /*
  * XXX: declare here to avoid to include many inet6 related files..
  * should be more generalized?
  */
 extern void	nd6_setmtu(struct ifnet *);
 #endif
 
 VNET_DEFINE(int, if_index);
 int	ifqmaxlen = IFQ_MAXLEN;
 VNET_DEFINE(struct ifnethead, ifnet);	/* depend on static init XXX */
 VNET_DEFINE(struct ifgrouphead, ifg_head);
 
 static VNET_DEFINE(int, if_indexlim) = 8;
 
 /* Table of ifnet by index. */
-VNET_DEFINE(struct ifindex_entry *, ifindex_table);
+VNET_DEFINE(struct ifnet **, ifindex_table);
 
 #define	V_if_indexlim		VNET(if_indexlim)
 #define	V_ifindex_table		VNET(ifindex_table)
 
 /*
  * The global network interface list (V_ifnet) and related state (such as
  * if_index, if_indexlim, and ifindex_table) are protected by an sxlock and
  * an rwlock.  Either may be acquired shared to stablize the list, but both
  * must be acquired writable to modify the list.  This model allows us to
  * both stablize the interface list during interrupt thread processing, but
  * also to stablize it over long-running ioctls, without introducing priority
  * inversions and deadlocks.
  */
 struct rwlock ifnet_rwlock;
 struct sx ifnet_sxlock;
 
 /*
  * The allocation of network interfaces is a rather non-atomic affair; we
  * need to select an index before we are ready to expose the interface for
  * use, so will use this pointer value to indicate reservation.
  */
 #define	IFNET_HOLD	(void *)(uintptr_t)(-1)
 
 static	if_com_alloc_t *if_com_alloc[256];
 static	if_com_free_t *if_com_free[256];
 
 static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals");
 MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
 MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
 
 struct ifnet *
 ifnet_byindex_locked(u_short idx)
 {
 
 	if (idx > V_if_index)
 		return (NULL);
-	if (V_ifindex_table[idx].ife_ifnet == IFNET_HOLD)
+	if (V_ifindex_table[idx] == IFNET_HOLD)
 		return (NULL);
-	return (V_ifindex_table[idx].ife_ifnet);
+	return (V_ifindex_table[idx]);
 }
 
 struct ifnet *
 ifnet_byindex(u_short idx)
 {
 	struct ifnet *ifp;
 
 	IFNET_RLOCK_NOSLEEP();
 	ifp = ifnet_byindex_locked(idx);
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifp);
 }
 
 struct ifnet *
 ifnet_byindex_ref(u_short idx)
 {
 	struct ifnet *ifp;
 
 	IFNET_RLOCK_NOSLEEP();
 	ifp = ifnet_byindex_locked(idx);
 	if (ifp == NULL || (ifp->if_flags & IFF_DYING)) {
 		IFNET_RUNLOCK_NOSLEEP();
 		return (NULL);
 	}
 	if_ref(ifp);
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifp);
 }
 
 /*
  * Allocate an ifindex array entry; return 0 on success or an error on
  * failure.
  */
 static int
 ifindex_alloc_locked(u_short *idxp)
 {
 	u_short idx;
 
 	IFNET_WLOCK_ASSERT();
 
 retry:
 	/*
 	 * Try to find an empty slot below V_if_index.  If we fail, take the
 	 * next slot.
 	 */
 	for (idx = 1; idx <= V_if_index; idx++) {
-		if (V_ifindex_table[idx].ife_ifnet == NULL)
+		if (V_ifindex_table[idx] == NULL)
 			break;
 	}
 
 	/* Catch if_index overflow. */
 	if (idx >= V_if_indexlim) {
 		if_grow();
 		goto retry;
 	}
 	if (idx > V_if_index)
 		V_if_index = idx;
 	*idxp = idx;
 	return (0);
 }
 
 static void
 ifindex_free_locked(u_short idx)
 {
 
 	IFNET_WLOCK_ASSERT();
 
-	V_ifindex_table[idx].ife_ifnet = NULL;
+	V_ifindex_table[idx] = NULL;
 	while (V_if_index > 0 &&
-	    V_ifindex_table[V_if_index].ife_ifnet == NULL)
+	    V_ifindex_table[V_if_index] == NULL)
 		V_if_index--;
 }
 
 static void
 ifindex_free(u_short idx)
 {
 
 	IFNET_WLOCK();
 	ifindex_free_locked(idx);
 	IFNET_WUNLOCK();
 }
 
 static void
 ifnet_setbyindex_locked(u_short idx, struct ifnet *ifp)
 {
 
 	IFNET_WLOCK_ASSERT();
 
-	V_ifindex_table[idx].ife_ifnet = ifp;
+	V_ifindex_table[idx] = ifp;
 }
 
 static void
 ifnet_setbyindex(u_short idx, struct ifnet *ifp)
 {
 
 	IFNET_WLOCK();
 	ifnet_setbyindex_locked(idx, ifp);
 	IFNET_WUNLOCK();
 }
 
 struct ifaddr *
 ifaddr_byindex(u_short idx)
 {
 	struct ifaddr *ifa;
 
 	IFNET_RLOCK_NOSLEEP();
 	ifa = ifnet_byindex_locked(idx)->if_addr;
 	if (ifa != NULL)
 		ifa_ref(ifa);
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifa);
 }
 
 /*
  * Network interface utility routines.
  *
  * Routines with ifa_ifwith* names take sockaddr *'s as
  * parameters.
  */
 
 static void
 vnet_if_init(const void *unused __unused)
 {
 
 	TAILQ_INIT(&V_ifnet);
 	TAILQ_INIT(&V_ifg_head);
 	IFNET_WLOCK();
 	if_grow();				/* create initial table */
 	IFNET_WUNLOCK();
 	vnet_if_clone_init();
 }
 VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init,
     NULL);
 
 /* ARGSUSED*/
 static void
 if_init(void *dummy __unused)
 {
 
 	IFNET_LOCK_INIT();
 	if_clone_init();
 }
 SYSINIT(interfaces, SI_SUB_INIT_IF, SI_ORDER_FIRST, if_init, NULL);
 
 
 #ifdef VIMAGE
 static void
 vnet_if_uninit(const void *unused __unused)
 {
 
 	VNET_ASSERT(TAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p "
 	    "not empty", __func__, __LINE__, &V_ifnet));
 	VNET_ASSERT(TAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p "
 	    "not empty", __func__, __LINE__, &V_ifg_head));
 
 	free((caddr_t)V_ifindex_table, M_IFNET);
 }
 VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST,
     vnet_if_uninit, NULL);
 #endif
 
 static void
 if_grow(void)
 {
 	int oldlim;
 	u_int n;
-	struct ifindex_entry *e;
+	struct ifnet **e;
 
 	IFNET_WLOCK_ASSERT();
 	oldlim = V_if_indexlim;
 	IFNET_WUNLOCK();
 	n = (oldlim << 1) * sizeof(*e);
 	e = malloc(n, M_IFNET, M_WAITOK | M_ZERO);
 	IFNET_WLOCK();
 	if (V_if_indexlim != oldlim) {
 		free(e, M_IFNET);
 		return;
 	}
 	if (V_ifindex_table != NULL) {
 		memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2);
 		free((caddr_t)V_ifindex_table, M_IFNET);
 	}
 	V_if_indexlim <<= 1;
 	V_ifindex_table = e;
 }
 
 /*
  * Allocate a struct ifnet and an index for an interface.  A layer 2
  * common structure will also be allocated if an allocation routine is
  * registered for the passed type.
  */
 struct ifnet *
 if_alloc(u_char type)
 {
 	struct ifnet *ifp;
 	u_short idx;
 
 	ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO);
 	IFNET_WLOCK();
 	if (ifindex_alloc_locked(&idx) != 0) {
 		IFNET_WUNLOCK();
 		free(ifp, M_IFNET);
 		return (NULL);
 	}
 	ifnet_setbyindex_locked(idx, IFNET_HOLD);
 	IFNET_WUNLOCK();
 	ifp->if_index = idx;
 	ifp->if_type = type;
 	ifp->if_alloctype = type;
 	if (if_com_alloc[type] != NULL) {
 		ifp->if_l2com = if_com_alloc[type](type, ifp);
 		if (ifp->if_l2com == NULL) {
 			free(ifp, M_IFNET);
 			ifindex_free(idx);
 			return (NULL);
 		}
 	}
 
 	IF_ADDR_LOCK_INIT(ifp);
 	TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp);
 	ifp->if_afdata_initialized = 0;
 	IF_AFDATA_LOCK_INIT(ifp);
 	TAILQ_INIT(&ifp->if_addrhead);
 	TAILQ_INIT(&ifp->if_multiaddrs);
 	TAILQ_INIT(&ifp->if_groups);
 #ifdef MAC
 	mac_ifnet_init(ifp);
 #endif
 	ifq_init(&ifp->if_snd, ifp);
 
 	refcount_init(&ifp->if_refcount, 1);	/* Index reference. */
 	for (int i = 0; i < IFCOUNTERS; i++)
 		ifp->if_counters[i] = counter_u64_alloc(M_WAITOK);
 	ifp->if_get_counter = if_get_counter_default;
 	ifnet_setbyindex(ifp->if_index, ifp);
 	return (ifp);
 }
 
 /*
  * Do the actual work of freeing a struct ifnet, and layer 2 common
  * structure.  This call is made when the last reference to an
  * interface is released.
  */
 static void
 if_free_internal(struct ifnet *ifp)
 {
 
 	KASSERT((ifp->if_flags & IFF_DYING),
 	    ("if_free_internal: interface not dying"));
 
 	if (if_com_free[ifp->if_alloctype] != NULL)
 		if_com_free[ifp->if_alloctype](ifp->if_l2com,
 		    ifp->if_alloctype);
 
 #ifdef MAC
 	mac_ifnet_destroy(ifp);
 #endif /* MAC */
 	if (ifp->if_description != NULL)
 		free(ifp->if_description, M_IFDESCR);
 	IF_AFDATA_DESTROY(ifp);
 	IF_ADDR_LOCK_DESTROY(ifp);
 	ifq_delete(&ifp->if_snd);
 
 	for (int i = 0; i < IFCOUNTERS; i++)
 		counter_u64_free(ifp->if_counters[i]);
 
 	free(ifp, M_IFNET);
 }
 
 /*
  * Deregister an interface and free the associated storage.
  */
 void
 if_free(struct ifnet *ifp)
 {
 
 	ifp->if_flags |= IFF_DYING;			/* XXX: Locking */
 
 	CURVNET_SET_QUIET(ifp->if_vnet);
 	IFNET_WLOCK();
 	KASSERT(ifp == ifnet_byindex_locked(ifp->if_index),
 	    ("%s: freeing unallocated ifnet", ifp->if_xname));
 
 	ifindex_free_locked(ifp->if_index);
 	IFNET_WUNLOCK();
 
 	if (refcount_release(&ifp->if_refcount))
 		if_free_internal(ifp);
 	CURVNET_RESTORE();
 }
 
 /*
  * Interfaces to keep an ifnet type-stable despite the possibility of the
  * driver calling if_free().  If there are additional references, we defer
  * freeing the underlying data structure.
  */
 void
 if_ref(struct ifnet *ifp)
 {
 
 	/* We don't assert the ifnet list lock here, but arguably should. */
 	refcount_acquire(&ifp->if_refcount);
 }
 
 void
 if_rele(struct ifnet *ifp)
 {
 
 	if (!refcount_release(&ifp->if_refcount))
 		return;
 	if_free_internal(ifp);
 }
 
 void
 ifq_init(struct ifaltq *ifq, struct ifnet *ifp)
 {
 	
 	mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF);
 
 	if (ifq->ifq_maxlen == 0) 
 		ifq->ifq_maxlen = ifqmaxlen;
 
 	ifq->altq_type = 0;
 	ifq->altq_disc = NULL;
 	ifq->altq_flags &= ALTQF_CANTCHANGE;
 	ifq->altq_tbr  = NULL;
 	ifq->altq_ifp  = ifp;
 }
 
 void
 ifq_delete(struct ifaltq *ifq)
 {
 	mtx_destroy(&ifq->ifq_mtx);
 }
 
 /*
  * Perform generic interface initalization tasks and attach the interface
  * to the list of "active" interfaces.  If vmove flag is set on entry
  * to if_attach_internal(), perform only a limited subset of initialization
  * tasks, given that we are moving from one vnet to another an ifnet which
  * has already been fully initialized.
  *
  * XXX:
  *  - The decision to return void and thus require this function to
  *    succeed is questionable.
  *  - We should probably do more sanity checking.  For instance we don't
  *    do anything to insure if_xname is unique or non-empty.
  */
 void
 if_attach(struct ifnet *ifp)
 {
 
 	if_attach_internal(ifp, 0);
 }
 
 /*
  * Compute the least common TSO limit.
  */
 void
 if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax)
 {
 	/*
 	 * 1) If there is no limit currently, take the limit from
 	 * the network adapter.
 	 *
 	 * 2) If the network adapter has a limit below the current
 	 * limit, apply it.
 	 */
 	if (pmax->tsomaxbytes == 0 || (ifp->if_hw_tsomax != 0 &&
 	    ifp->if_hw_tsomax < pmax->tsomaxbytes)) {
 		pmax->tsomaxbytes = ifp->if_hw_tsomax;
 	}
 	if (pmax->tsomaxsegcount == 0 || (ifp->if_hw_tsomaxsegcount != 0 &&
 	    ifp->if_hw_tsomaxsegcount < pmax->tsomaxsegcount)) {
 		pmax->tsomaxsegcount = ifp->if_hw_tsomaxsegcount;
 	}
 	if (pmax->tsomaxsegsize == 0 || (ifp->if_hw_tsomaxsegsize != 0 &&
 	    ifp->if_hw_tsomaxsegsize < pmax->tsomaxsegsize)) {
 		pmax->tsomaxsegsize = ifp->if_hw_tsomaxsegsize;
 	}
 }
 
 /*
  * Update TSO limit of a network adapter.
  *
  * Returns zero if no change. Else non-zero.
  */
 int
 if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax)
 {
 	int retval = 0;
 	if (ifp->if_hw_tsomax != pmax->tsomaxbytes) {
 		ifp->if_hw_tsomax = pmax->tsomaxbytes;
 		retval++;
 	}
 	if (ifp->if_hw_tsomaxsegsize != pmax->tsomaxsegsize) {
 		ifp->if_hw_tsomaxsegsize = pmax->tsomaxsegsize;
 		retval++;
 	}
 	if (ifp->if_hw_tsomaxsegcount != pmax->tsomaxsegcount) {
 		ifp->if_hw_tsomaxsegcount = pmax->tsomaxsegcount;
 		retval++;
 	}
 	return (retval);
 }
 
 static void
 if_attach_internal(struct ifnet *ifp, int vmove)
 {
 	unsigned socksize, ifasize;
 	int namelen, masklen;
 	struct sockaddr_dl *sdl;
 	struct ifaddr *ifa;
 
 	if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index))
 		panic ("%s: BUG: if_attach called without if_alloc'd input()\n",
 		    ifp->if_xname);
 
 #ifdef VIMAGE
 	ifp->if_vnet = curvnet;
 	if (ifp->if_home_vnet == NULL)
 		ifp->if_home_vnet = curvnet;
 #endif
 
 	if_addgroup(ifp, IFG_ALL);
 
 	getmicrotime(&ifp->if_lastchange);
 	ifp->if_epoch = time_uptime;
 
 	KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) ||
 	    (ifp->if_transmit != NULL && ifp->if_qflush != NULL),
 	    ("transmit and qflush must both either be set or both be NULL"));
 	if (ifp->if_transmit == NULL) {
 		ifp->if_transmit = if_transmit;
 		ifp->if_qflush = if_qflush;
 	}
 
 	if (!vmove) {
 #ifdef MAC
 		mac_ifnet_create(ifp);
 #endif
 
 		/*
 		 * Create a Link Level name for this device.
 		 */
 		namelen = strlen(ifp->if_xname);
 		/*
 		 * Always save enough space for any possiable name so we
 		 * can do a rename in place later.
 		 */
 		masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ;
 		socksize = masklen + ifp->if_addrlen;
 		if (socksize < sizeof(*sdl))
 			socksize = sizeof(*sdl);
 		socksize = roundup2(socksize, sizeof(long));
 		ifasize = sizeof(*ifa) + 2 * socksize;
 		ifa = ifa_alloc(ifasize, M_WAITOK);
 		sdl = (struct sockaddr_dl *)(ifa + 1);
 		sdl->sdl_len = socksize;
 		sdl->sdl_family = AF_LINK;
 		bcopy(ifp->if_xname, sdl->sdl_data, namelen);
 		sdl->sdl_nlen = namelen;
 		sdl->sdl_index = ifp->if_index;
 		sdl->sdl_type = ifp->if_type;
 		ifp->if_addr = ifa;
 		ifa->ifa_ifp = ifp;
 		ifa->ifa_rtrequest = link_rtrequest;
 		ifa->ifa_addr = (struct sockaddr *)sdl;
 		sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
 		ifa->ifa_netmask = (struct sockaddr *)sdl;
 		sdl->sdl_len = masklen;
 		while (namelen != 0)
 			sdl->sdl_data[--namelen] = 0xff;
 		TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link);
 		/* Reliably crash if used uninitialized. */
 		ifp->if_broadcastaddr = NULL;
 
 #if defined(INET) || defined(INET6)
 		/* Use defaults for TSO, if nothing is set */
 		if (ifp->if_hw_tsomax == 0 &&
 		    ifp->if_hw_tsomaxsegcount == 0 &&
 		    ifp->if_hw_tsomaxsegsize == 0) {
 			/*
 			 * The TSO defaults needs to be such that an
 			 * NFS mbuf list of 35 mbufs totalling just
 			 * below 64K works and that a chain of mbufs
 			 * can be defragged into at most 32 segments:
 			 */
 			ifp->if_hw_tsomax = min(IP_MAXPACKET, (32 * MCLBYTES) -
 			    (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
 			ifp->if_hw_tsomaxsegcount = 35;
 			ifp->if_hw_tsomaxsegsize = 2048;	/* 2K */
 
 			/* XXX some drivers set IFCAP_TSO after ethernet attach */
 			if (ifp->if_capabilities & IFCAP_TSO) {
 				if_printf(ifp, "Using defaults for TSO: %u/%u/%u\n",
 				    ifp->if_hw_tsomax,
 				    ifp->if_hw_tsomaxsegcount,
 				    ifp->if_hw_tsomaxsegsize);
 			}
 		}
 		/*
 		 * If the "if_hw_tsomax" limit is set, check if it is
 		 * too small:
 		 */
 		KASSERT(ifp->if_hw_tsomax == 0 ||
 		    ifp->if_hw_tsomax >= (IP_MAXPACKET / 8),
 		    ("%s: if_hw_tsomax is outside of range", __func__));
 #endif
 	}
 #ifdef VIMAGE
 	else {
 		/*
 		 * Update the interface index in the link layer address
 		 * of the interface.
 		 */
 		for (ifa = ifp->if_addr; ifa != NULL;
 		    ifa = TAILQ_NEXT(ifa, ifa_link)) {
 			if (ifa->ifa_addr->sa_family == AF_LINK) {
 				sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 				sdl->sdl_index = ifp->if_index;
 			}
 		}
 	}
 #endif
 
 	IFNET_WLOCK();
 	TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link);
 #ifdef VIMAGE
 	curvnet->vnet_ifcnt++;
 #endif
 	IFNET_WUNLOCK();
 
 	if (domain_init_status >= 2)
 		if_attachdomain1(ifp);
 
 	EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
 	if (IS_DEFAULT_VNET(curvnet))
 		devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);
 
 	/* Announce the interface. */
 	rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
 }
 
 static void
 if_attachdomain(void *dummy)
 {
 	struct ifnet *ifp;
 
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link)
 		if_attachdomain1(ifp);
 }
 SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND,
     if_attachdomain, NULL);
 
 static void
 if_attachdomain1(struct ifnet *ifp)
 {
 	struct domain *dp;
 
 	/*
 	 * Since dp->dom_ifattach calls malloc() with M_WAITOK, we
 	 * cannot lock ifp->if_afdata initialization, entirely.
 	 */
 	if (IF_AFDATA_TRYLOCK(ifp) == 0)
 		return;
 	if (ifp->if_afdata_initialized >= domain_init_status) {
 		IF_AFDATA_UNLOCK(ifp);
 		log(LOG_WARNING, "%s called more than once on %s\n",
 		    __func__, ifp->if_xname);
 		return;
 	}
 	ifp->if_afdata_initialized = domain_init_status;
 	IF_AFDATA_UNLOCK(ifp);
 
 	/* address family dependent data region */
 	bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
 	for (dp = domains; dp; dp = dp->dom_next) {
 		if (dp->dom_ifattach)
 			ifp->if_afdata[dp->dom_family] =
 			    (*dp->dom_ifattach)(ifp);
 	}
 }
 
 /*
  * Remove any unicast or broadcast network addresses from an interface.
  */
 void
 if_purgeaddrs(struct ifnet *ifp)
 {
 	struct ifaddr *ifa, *next;
 
 	TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) {
 		if (ifa->ifa_addr->sa_family == AF_LINK)
 			continue;
 #ifdef INET
 		/* XXX: Ugly!! ad hoc just for INET */
 		if (ifa->ifa_addr->sa_family == AF_INET) {
 			struct ifaliasreq ifr;
 
 			bzero(&ifr, sizeof(ifr));
 			ifr.ifra_addr = *ifa->ifa_addr;
 			if (ifa->ifa_dstaddr)
 				ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
 			if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp,
 			    NULL) == 0)
 				continue;
 		}
 #endif /* INET */
 #ifdef INET6
 		if (ifa->ifa_addr->sa_family == AF_INET6) {
 			in6_purgeaddr(ifa);
 			/* ifp_addrhead is already updated */
 			continue;
 		}
 #endif /* INET6 */
 		TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link);
 		ifa_free(ifa);
 	}
 }
 
 /*
  * Remove any multicast network addresses from an interface when an ifnet
  * is going away.
  */
 static void
 if_purgemaddrs(struct ifnet *ifp)
 {
 	struct ifmultiaddr *ifma;
 	struct ifmultiaddr *next;
 
 	IF_ADDR_WLOCK(ifp);
 	TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next)
 		if_delmulti_locked(ifp, ifma, 1);
 	IF_ADDR_WUNLOCK(ifp);
 }
 
 /*
  * Detach an interface, removing it from the list of "active" interfaces.
  * If vmove flag is set on entry to if_detach_internal(), perform only a
  * limited subset of cleanup tasks, given that we are moving an ifnet from
  * one vnet to another, where it must be fully operational.
  *
  * XXXRW: There are some significant questions about event ordering, and
  * how to prevent things from starting to use the interface during detach.
  */
 void
 if_detach(struct ifnet *ifp)
 {
 
 	CURVNET_SET_QUIET(ifp->if_vnet);
 	if_detach_internal(ifp, 0);
 	CURVNET_RESTORE();
 }
 
 static void
 if_detach_internal(struct ifnet *ifp, int vmove)
 {
 	struct ifaddr *ifa;
 	struct radix_node_head	*rnh;
 	int i, j;
 	struct domain *dp;
  	struct ifnet *iter;
  	int found = 0;
 
 	IFNET_WLOCK();
 	TAILQ_FOREACH(iter, &V_ifnet, if_link)
 		if (iter == ifp) {
 			TAILQ_REMOVE(&V_ifnet, ifp, if_link);
 			found = 1;
 			break;
 		}
 #ifdef VIMAGE
 	if (found)
 		curvnet->vnet_ifcnt--;
 #endif
 	IFNET_WUNLOCK();
 	if (!found) {
 		if (vmove)
 			panic("%s: ifp=%p not on the ifnet tailq %p",
 			    __func__, ifp, &V_ifnet);
 		else
 			return; /* XXX this should panic as well? */
 	}
 
 	/*
 	 * Remove/wait for pending events.
 	 */
 	taskqueue_drain(taskqueue_swi, &ifp->if_linktask);
 
 	/*
 	 * Remove routes and flush queues.
 	 */
 	if_down(ifp);
 #ifdef ALTQ
 	if (ALTQ_IS_ENABLED(&ifp->if_snd))
 		altq_disable(&ifp->if_snd);
 	if (ALTQ_IS_ATTACHED(&ifp->if_snd))
 		altq_detach(&ifp->if_snd);
 #endif
 
 	if_purgeaddrs(ifp);
 
 #ifdef INET
 	in_ifdetach(ifp);
 #endif
 
 #ifdef INET6
 	/*
 	 * Remove all IPv6 kernel structs related to ifp.  This should be done
 	 * before removing routing entries below, since IPv6 interface direct
 	 * routes are expected to be removed by the IPv6-specific kernel API.
 	 * Otherwise, the kernel will detect some inconsistency and bark it.
 	 */
 	in6_ifdetach(ifp);
 #endif
 	if_purgemaddrs(ifp);
 
 	/* Announce that the interface is gone. */
 	rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
 	EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
 	if (IS_DEFAULT_VNET(curvnet))
 		devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);
 
 	if (!vmove) {
 		/*
 		 * Prevent further calls into the device driver via ifnet.
 		 */
 		if_dead(ifp);
 
 		/*
 		 * Remove link ifaddr pointer and maybe decrement if_index.
 		 * Clean up all addresses.
 		 */
 		ifp->if_addr = NULL;
 
 		/* We can now free link ifaddr. */
 		if (!TAILQ_EMPTY(&ifp->if_addrhead)) {
 			ifa = TAILQ_FIRST(&ifp->if_addrhead);
 			TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link);
 			ifa_free(ifa);
 		}
 	}
 
 	/*
 	 * Delete all remaining routes using this interface
 	 * Unfortuneatly the only way to do this is to slog through
 	 * the entire routing table looking for routes which point
 	 * to this interface...oh well...
 	 */
 	for (i = 1; i <= AF_MAX; i++) {
 		for (j = 0; j < rt_numfibs; j++) {
 			rnh = rt_tables_get_rnh(j, i);
 			if (rnh == NULL)
 				continue;
 			RADIX_NODE_HEAD_LOCK(rnh);
 			(void) rnh->rnh_walktree(rnh, if_rtdel, ifp);
 			RADIX_NODE_HEAD_UNLOCK(rnh);
 		}
 	}
 
 	if_delgroups(ifp);
 
 	/*
 	 * We cannot hold the lock over dom_ifdetach calls as they might
 	 * sleep, for example trying to drain a callout, thus open up the
 	 * theoretical race with re-attaching.
 	 */
 	IF_AFDATA_LOCK(ifp);
 	i = ifp->if_afdata_initialized;
 	ifp->if_afdata_initialized = 0;
 	IF_AFDATA_UNLOCK(ifp);
 	for (dp = domains; i > 0 && dp; dp = dp->dom_next) {
 		if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family])
 			(*dp->dom_ifdetach)(ifp,
 			    ifp->if_afdata[dp->dom_family]);
 	}
 }
 
 #ifdef VIMAGE
 /*
  * if_vmove() performs a limited version of if_detach() in current
  * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg.
  * An attempt is made to shrink if_index in current vnet, find an
  * unused if_index in target vnet and calls if_grow() if necessary,
  * and finally find an unused if_xname for the target vnet.
  */
 void
 if_vmove(struct ifnet *ifp, struct vnet *new_vnet)
 {
 	u_short idx;
 
 	/*
 	 * Detach from current vnet, but preserve LLADDR info, do not
 	 * mark as dead etc. so that the ifnet can be reattached later.
 	 */
 	if_detach_internal(ifp, 1);
 
 	/*
 	 * Unlink the ifnet from ifindex_table[] in current vnet, and shrink
 	 * the if_index for that vnet if possible.
 	 *
 	 * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized,
 	 * or we'd lock on one vnet and unlock on another.
 	 */
 	IFNET_WLOCK();
 	ifindex_free_locked(ifp->if_index);
 	IFNET_WUNLOCK();
 
 	/*
 	 * Perform interface-specific reassignment tasks, if provided by
 	 * the driver.
 	 */
 	if (ifp->if_reassign != NULL)
 		ifp->if_reassign(ifp, new_vnet, NULL);
 
 	/*
 	 * Switch to the context of the target vnet.
 	 */
 	CURVNET_SET_QUIET(new_vnet);
 
 	IFNET_WLOCK();
 	if (ifindex_alloc_locked(&idx) != 0) {
 		IFNET_WUNLOCK();
 		panic("if_index overflow");
 	}
 	ifp->if_index = idx;
 	ifnet_setbyindex_locked(ifp->if_index, ifp);
 	IFNET_WUNLOCK();
 
 	if_attach_internal(ifp, 1);
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Move an ifnet to or from another child prison/vnet, specified by the jail id.
  */
 static int
 if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid)
 {
 	struct prison *pr;
 	struct ifnet *difp;
 
 	/* Try to find the prison within our visibility. */
 	sx_slock(&allprison_lock);
 	pr = prison_find_child(td->td_ucred->cr_prison, jid);
 	sx_sunlock(&allprison_lock);
 	if (pr == NULL)
 		return (ENXIO);
 	prison_hold_locked(pr);
 	mtx_unlock(&pr->pr_mtx);
 
 	/* Do not try to move the iface from and to the same prison. */
 	if (pr->pr_vnet == ifp->if_vnet) {
 		prison_free(pr);
 		return (EEXIST);
 	}
 
 	/* Make sure the named iface does not exists in the dst. prison/vnet. */
 	/* XXX Lock interfaces to avoid races. */
 	CURVNET_SET_QUIET(pr->pr_vnet);
 	difp = ifunit(ifname);
 	CURVNET_RESTORE();
 	if (difp != NULL) {
 		prison_free(pr);
 		return (EEXIST);
 	}
 
 	/* Move the interface into the child jail/vnet. */
 	if_vmove(ifp, pr->pr_vnet);
 
 	/* Report the new if_xname back to the userland. */
 	sprintf(ifname, "%s", ifp->if_xname);
 
 	prison_free(pr);
 	return (0);
 }
 
 static int
 if_vmove_reclaim(struct thread *td, char *ifname, int jid)
 {
 	struct prison *pr;
 	struct vnet *vnet_dst;
 	struct ifnet *ifp;
 
 	/* Try to find the prison within our visibility. */
 	sx_slock(&allprison_lock);
 	pr = prison_find_child(td->td_ucred->cr_prison, jid);
 	sx_sunlock(&allprison_lock);
 	if (pr == NULL)
 		return (ENXIO);
 	prison_hold_locked(pr);
 	mtx_unlock(&pr->pr_mtx);
 
 	/* Make sure the named iface exists in the source prison/vnet. */
 	CURVNET_SET(pr->pr_vnet);
 	ifp = ifunit(ifname);		/* XXX Lock to avoid races. */
 	if (ifp == NULL) {
 		CURVNET_RESTORE();
 		prison_free(pr);
 		return (ENXIO);
 	}
 
 	/* Do not try to move the iface from and to the same prison. */
 	vnet_dst = TD_TO_VNET(td);
 	if (vnet_dst == ifp->if_vnet) {
 		CURVNET_RESTORE();
 		prison_free(pr);
 		return (EEXIST);
 	}
 
 	/* Get interface back from child jail/vnet. */
 	if_vmove(ifp, vnet_dst);
 	CURVNET_RESTORE();
 
 	/* Report the new if_xname back to the userland. */
 	sprintf(ifname, "%s", ifp->if_xname);
 
 	prison_free(pr);
 	return (0);
 }
 #endif /* VIMAGE */
 
 /*
  * Add a group to an interface
  */
 int
 if_addgroup(struct ifnet *ifp, const char *groupname)
 {
 	struct ifg_list		*ifgl;
 	struct ifg_group	*ifg = NULL;
 	struct ifg_member	*ifgm;
 	int 			 new = 0;
 
 	if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' &&
 	    groupname[strlen(groupname) - 1] <= '9')
 		return (EINVAL);
 
 	IFNET_WLOCK();
 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
 		if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) {
 			IFNET_WUNLOCK();
 			return (EEXIST);
 		}
 
 	if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP,
 	    M_NOWAIT)) == NULL) {
 	    	IFNET_WUNLOCK();
 		return (ENOMEM);
 	}
 
 	if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member),
 	    M_TEMP, M_NOWAIT)) == NULL) {
 		free(ifgl, M_TEMP);
 		IFNET_WUNLOCK();
 		return (ENOMEM);
 	}
 
 	TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
 		if (!strcmp(ifg->ifg_group, groupname))
 			break;
 
 	if (ifg == NULL) {
 		if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group),
 		    M_TEMP, M_NOWAIT)) == NULL) {
 			free(ifgl, M_TEMP);
 			free(ifgm, M_TEMP);
 			IFNET_WUNLOCK();
 			return (ENOMEM);
 		}
 		strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
 		ifg->ifg_refcnt = 0;
 		TAILQ_INIT(&ifg->ifg_members);
 		TAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next);
 		new = 1;
 	}
 
 	ifg->ifg_refcnt++;
 	ifgl->ifgl_group = ifg;
 	ifgm->ifgm_ifp = ifp;
 
 	IF_ADDR_WLOCK(ifp);
 	TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next);
 	TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next);
 	IF_ADDR_WUNLOCK(ifp);
 
 	IFNET_WUNLOCK();
 
 	if (new)
 		EVENTHANDLER_INVOKE(group_attach_event, ifg);
 	EVENTHANDLER_INVOKE(group_change_event, groupname);
 
 	return (0);
 }
 
 /*
  * Remove a group from an interface
  */
 int
 if_delgroup(struct ifnet *ifp, const char *groupname)
 {
 	struct ifg_list		*ifgl;
 	struct ifg_member	*ifgm;
 
 	IFNET_WLOCK();
 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
 		if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
 			break;
 	if (ifgl == NULL) {
 		IFNET_WUNLOCK();
 		return (ENOENT);
 	}
 
 	IF_ADDR_WLOCK(ifp);
 	TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next);
 	IF_ADDR_WUNLOCK(ifp);
 
 	TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
 		if (ifgm->ifgm_ifp == ifp)
 			break;
 
 	if (ifgm != NULL) {
 		TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next);
 		free(ifgm, M_TEMP);
 	}
 
 	if (--ifgl->ifgl_group->ifg_refcnt == 0) {
 		TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next);
 		IFNET_WUNLOCK();
 		EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group);
 		free(ifgl->ifgl_group, M_TEMP);
 	} else
 		IFNET_WUNLOCK();
 
 	free(ifgl, M_TEMP);
 
 	EVENTHANDLER_INVOKE(group_change_event, groupname);
 
 	return (0);
 }
 
 /*
  * Remove an interface from all groups
  */
 static void
 if_delgroups(struct ifnet *ifp)
 {
 	struct ifg_list		*ifgl;
 	struct ifg_member	*ifgm;
 	char groupname[IFNAMSIZ];
 
 	IFNET_WLOCK();
 	while (!TAILQ_EMPTY(&ifp->if_groups)) {
 		ifgl = TAILQ_FIRST(&ifp->if_groups);
 
 		strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ);
 
 		IF_ADDR_WLOCK(ifp);
 		TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next);
 		IF_ADDR_WUNLOCK(ifp);
 
 		TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
 			if (ifgm->ifgm_ifp == ifp)
 				break;
 
 		if (ifgm != NULL) {
 			TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm,
 			    ifgm_next);
 			free(ifgm, M_TEMP);
 		}
 
 		if (--ifgl->ifgl_group->ifg_refcnt == 0) {
 			TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next);
 			IFNET_WUNLOCK();
 			EVENTHANDLER_INVOKE(group_detach_event,
 			    ifgl->ifgl_group);
 			free(ifgl->ifgl_group, M_TEMP);
 		} else
 			IFNET_WUNLOCK();
 
 		free(ifgl, M_TEMP);
 
 		EVENTHANDLER_INVOKE(group_change_event, groupname);
 
 		IFNET_WLOCK();
 	}
 	IFNET_WUNLOCK();
 }
 
 /*
  * Stores all groups from an interface in memory pointed
  * to by data
  */
 static int
 if_getgroup(struct ifgroupreq *data, struct ifnet *ifp)
 {
 	int			 len, error;
 	struct ifg_list		*ifgl;
 	struct ifg_req		 ifgrq, *ifgp;
 	struct ifgroupreq	*ifgr = data;
 
 	if (ifgr->ifgr_len == 0) {
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
 			ifgr->ifgr_len += sizeof(struct ifg_req);
 		IF_ADDR_RUNLOCK(ifp);
 		return (0);
 	}
 
 	len = ifgr->ifgr_len;
 	ifgp = ifgr->ifgr_groups;
 	/* XXX: wire */
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
 		if (len < sizeof(ifgrq)) {
 			IF_ADDR_RUNLOCK(ifp);
 			return (EINVAL);
 		}
 		bzero(&ifgrq, sizeof ifgrq);
 		strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
 		    sizeof(ifgrq.ifgrq_group));
 		if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) {
 		    	IF_ADDR_RUNLOCK(ifp);
 			return (error);
 		}
 		len -= sizeof(ifgrq);
 		ifgp++;
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	return (0);
 }
 
 /*
  * Stores all members of a group in memory pointed to by data
  */
 static int
 if_getgroupmembers(struct ifgroupreq *data)
 {
 	struct ifgroupreq	*ifgr = data;
 	struct ifg_group	*ifg;
 	struct ifg_member	*ifgm;
 	struct ifg_req		 ifgrq, *ifgp;
 	int			 len, error;
 
 	IFNET_RLOCK();
 	TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
 		if (!strcmp(ifg->ifg_group, ifgr->ifgr_name))
 			break;
 	if (ifg == NULL) {
 		IFNET_RUNLOCK();
 		return (ENOENT);
 	}
 
 	if (ifgr->ifgr_len == 0) {
 		TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next)
 			ifgr->ifgr_len += sizeof(ifgrq);
 		IFNET_RUNLOCK();
 		return (0);
 	}
 
 	len = ifgr->ifgr_len;
 	ifgp = ifgr->ifgr_groups;
 	TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
 		if (len < sizeof(ifgrq)) {
 			IFNET_RUNLOCK();
 			return (EINVAL);
 		}
 		bzero(&ifgrq, sizeof ifgrq);
 		strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
 		    sizeof(ifgrq.ifgrq_member));
 		if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) {
 			IFNET_RUNLOCK();
 			return (error);
 		}
 		len -= sizeof(ifgrq);
 		ifgp++;
 	}
 	IFNET_RUNLOCK();
 
 	return (0);
 }
 
 /*
  * Delete Routes for a Network Interface
  *
  * Called for each routing entry via the rnh->rnh_walktree() call above
  * to delete all route entries referencing a detaching network interface.
  *
  * Arguments:
  *	rn	pointer to node in the routing table
  *	arg	argument passed to rnh->rnh_walktree() - detaching interface
  *
  * Returns:
  *	0	successful
  *	errno	failed - reason indicated
  *
  */
 static int
 if_rtdel(struct radix_node *rn, void *arg)
 {
 	struct rtentry	*rt = (struct rtentry *)rn;
 	struct ifnet	*ifp = arg;
 	int		err;
 
 	if (rt->rt_ifp == ifp) {
 
 		/*
 		 * Protect (sorta) against walktree recursion problems
 		 * with cloned routes
 		 */
 		if ((rt->rt_flags & RTF_UP) == 0)
 			return (0);
 
 		err = rtrequest_fib(RTM_DELETE, rt_key(rt), rt->rt_gateway,
 				rt_mask(rt),
 				rt->rt_flags|RTF_RNH_LOCKED|RTF_PINNED,
 				(struct rtentry **) NULL, rt->rt_fibnum);
 		if (err) {
 			log(LOG_WARNING, "if_rtdel: error %d\n", err);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Return counter values from counter(9)s stored in ifnet.
  */
 uint64_t
 if_get_counter_default(struct ifnet *ifp, ift_counter cnt)
 {
 
 	KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));
 
 	return (counter_u64_fetch(ifp->if_counters[cnt]));
 }
 
 /*
  * Increase an ifnet counter. Usually used for counters shared
  * between the stack and a driver, but function supports them all.
  */
 void
 if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc)
 {
 
 	KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));
 
 	counter_u64_add(ifp->if_counters[cnt], inc);
 }
 
 /*
  * Copy data from ifnet to userland API structure if_data.
  */
 void
 if_data_copy(struct ifnet *ifp, struct if_data *ifd)
 {
 
 	ifd->ifi_type = ifp->if_type;
 	ifd->ifi_physical = 0;
 	ifd->ifi_addrlen = ifp->if_addrlen;
 	ifd->ifi_hdrlen = ifp->if_hdrlen;
 	ifd->ifi_link_state = ifp->if_link_state;
 	ifd->ifi_vhid = 0;
 	ifd->ifi_datalen = sizeof(struct if_data);
 	ifd->ifi_mtu = ifp->if_mtu;
 	ifd->ifi_metric = ifp->if_metric;
 	ifd->ifi_baudrate = ifp->if_baudrate;
 	ifd->ifi_hwassist = ifp->if_hwassist;
 	ifd->ifi_epoch = ifp->if_epoch;
 	ifd->ifi_lastchange = ifp->if_lastchange;
 
 	ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS);
 	ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS);
 	ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS);
 	ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS);
 	ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS);
 	ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES);
 	ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES);
 	ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS);
 	ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS);
 	ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS);
 	ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS);
 	ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO);
 }
 
 /*
  * Wrapper functions for struct ifnet address list locking macros.  These are
  * used by kernel modules to avoid encoding programming interface or binary
  * interface assumptions that may be violated when kernel-internal locking
  * approaches change.
  */
 void
 if_addr_rlock(struct ifnet *ifp)
 {
 
 	IF_ADDR_RLOCK(ifp);
 }
 
 void
 if_addr_runlock(struct ifnet *ifp)
 {
 
 	IF_ADDR_RUNLOCK(ifp);
 }
 
 void
 if_maddr_rlock(if_t ifp)
 {
 
 	IF_ADDR_RLOCK((struct ifnet *)ifp);
 }
 
 void
 if_maddr_runlock(if_t ifp)
 {
 
 	IF_ADDR_RUNLOCK((struct ifnet *)ifp);
 }
 
 /*
  * Initialization, destruction and refcounting functions for ifaddrs.
  */
 struct ifaddr *
 ifa_alloc(size_t size, int flags)
 {
 	struct ifaddr *ifa;
 
 	KASSERT(size >= sizeof(struct ifaddr),
 	    ("%s: invalid size %zu", __func__, size));
 
 	ifa = malloc(size, M_IFADDR, M_ZERO | flags);
 	if (ifa == NULL)
 		return (NULL);
 
 	if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL)
 		goto fail;
 	if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL)
 		goto fail;
 	if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL)
 		goto fail;
 	if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL)
 		goto fail;
 
 	refcount_init(&ifa->ifa_refcnt, 1);
 
 	return (ifa);
 
 fail:
 	/* free(NULL) is okay */
 	counter_u64_free(ifa->ifa_opackets);
 	counter_u64_free(ifa->ifa_ipackets);
 	counter_u64_free(ifa->ifa_obytes);
 	counter_u64_free(ifa->ifa_ibytes);
 	free(ifa, M_IFADDR);
 
 	return (NULL);
 }
 
 void
 ifa_ref(struct ifaddr *ifa)
 {
 
 	refcount_acquire(&ifa->ifa_refcnt);
 }
 
 void
 ifa_free(struct ifaddr *ifa)
 {
 
 	if (refcount_release(&ifa->ifa_refcnt)) {
 		counter_u64_free(ifa->ifa_opackets);
 		counter_u64_free(ifa->ifa_ipackets);
 		counter_u64_free(ifa->ifa_obytes);
 		counter_u64_free(ifa->ifa_ibytes);
 		free(ifa, M_IFADDR);
 	}
 }
 
 int
 ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia)
 {
 	int error = 0;
 	struct rtentry *rt = NULL;
 	struct rt_addrinfo info;
 	static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK};
 
 	bzero(&info, sizeof(info));
 	info.rti_ifp = V_loif;
 	info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC;
 	info.rti_info[RTAX_DST] = ia;
 	info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl;
 	error = rtrequest1_fib(RTM_ADD, &info, &rt, ifa->ifa_ifp->if_fib);
 
 	if (error == 0 && rt != NULL) {
 		RT_LOCK(rt);
 		((struct sockaddr_dl *)rt->rt_gateway)->sdl_type  =
 			ifa->ifa_ifp->if_type;
 		((struct sockaddr_dl *)rt->rt_gateway)->sdl_index =
 			ifa->ifa_ifp->if_index;
 		RT_REMREF(rt);
 		RT_UNLOCK(rt);
 	} else if (error != 0)
 		log(LOG_DEBUG, "%s: insertion failed: %u\n", __func__, error);
 
 	return (error);
 }
 
 int
 ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia)
 {
 	int error = 0;
 	struct rt_addrinfo info;
 	struct sockaddr_dl null_sdl;
 
 	bzero(&null_sdl, sizeof(null_sdl));
 	null_sdl.sdl_len = sizeof(null_sdl);
 	null_sdl.sdl_family = AF_LINK;
 	null_sdl.sdl_type = ifa->ifa_ifp->if_type;
 	null_sdl.sdl_index = ifa->ifa_ifp->if_index;
 	bzero(&info, sizeof(info));
 	info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC;
 	info.rti_info[RTAX_DST] = ia;
 	info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl;
 	error = rtrequest1_fib(RTM_DELETE, &info, NULL, ifa->ifa_ifp->if_fib);
 
 	if (error != 0)
 		log(LOG_DEBUG, "%s: deletion failed: %u\n", __func__, error);
 
 	return (error);
 }
 
 int
 ifa_switch_loopback_route(struct ifaddr *ifa, struct sockaddr *sa, int fib)
 {
 	struct rtentry *rt;
 
 	rt = rtalloc1_fib(sa, 0, 0, fib);
 	if (rt == NULL) {
 		log(LOG_DEBUG, "%s: fail", __func__);
 		return (EHOSTUNREACH);
 	}
 	((struct sockaddr_dl *)rt->rt_gateway)->sdl_type =
 	    ifa->ifa_ifp->if_type;
 	((struct sockaddr_dl *)rt->rt_gateway)->sdl_index =
 	    ifa->ifa_ifp->if_index;
 	RTFREE_LOCKED(rt);
 
 	return (0);
 }
 
 /*
  * XXX: Because sockaddr_dl has deeper structure than the sockaddr
  * structs used to represent other address families, it is necessary
  * to perform a different comparison.
  */
 
 #define	sa_dl_equal(a1, a2)	\
 	((((struct sockaddr_dl *)(a1))->sdl_len ==			\
 	 ((struct sockaddr_dl *)(a2))->sdl_len) &&			\
 	 (bcmp(LLADDR((struct sockaddr_dl *)(a1)),			\
 	       LLADDR((struct sockaddr_dl *)(a2)),			\
 	       ((struct sockaddr_dl *)(a1))->sdl_alen) == 0))
 
 /*
  * Locate an interface based on a complete address.
  */
 /*ARGSUSED*/
 static struct ifaddr *
 ifa_ifwithaddr_internal(struct sockaddr *addr, int getref)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != addr->sa_family)
 				continue;
 			if (sa_equal(addr, ifa->ifa_addr)) {
 				if (getref)
 					ifa_ref(ifa);
 				IF_ADDR_RUNLOCK(ifp);
 				goto done;
 			}
 			/* IP6 doesn't have broadcast */
 			if ((ifp->if_flags & IFF_BROADCAST) &&
 			    ifa->ifa_broadaddr &&
 			    ifa->ifa_broadaddr->sa_len != 0 &&
 			    sa_equal(ifa->ifa_broadaddr, addr)) {
 				if (getref)
 					ifa_ref(ifa);
 				IF_ADDR_RUNLOCK(ifp);
 				goto done;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 	ifa = NULL;
 done:
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifa);
 }
 
 struct ifaddr *
 ifa_ifwithaddr(struct sockaddr *addr)
 {
 
 	return (ifa_ifwithaddr_internal(addr, 1));
 }
 
 int
 ifa_ifwithaddr_check(struct sockaddr *addr)
 {
 
 	return (ifa_ifwithaddr_internal(addr, 0) != NULL);
 }
 
 /*
  * Locate an interface based on the broadcast address.
  */
 /* ARGSUSED */
 struct ifaddr *
 ifa_ifwithbroadaddr(struct sockaddr *addr, int fibnum)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
 			continue;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != addr->sa_family)
 				continue;
 			if ((ifp->if_flags & IFF_BROADCAST) &&
 			    ifa->ifa_broadaddr &&
 			    ifa->ifa_broadaddr->sa_len != 0 &&
 			    sa_equal(ifa->ifa_broadaddr, addr)) {
 				ifa_ref(ifa);
 				IF_ADDR_RUNLOCK(ifp);
 				goto done;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 	ifa = NULL;
 done:
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifa);
 }
 
 /*
  * Locate the point to point interface with a given destination address.
  */
 /*ARGSUSED*/
 struct ifaddr *
 ifa_ifwithdstaddr(struct sockaddr *addr, int fibnum)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if ((ifp->if_flags & IFF_POINTOPOINT) == 0)
 			continue;
 		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
 			continue;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != addr->sa_family)
 				continue;
 			if (ifa->ifa_dstaddr != NULL &&
 			    sa_equal(addr, ifa->ifa_dstaddr)) {
 				ifa_ref(ifa);
 				IF_ADDR_RUNLOCK(ifp);
 				goto done;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 	ifa = NULL;
 done:
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifa);
 }
 
 /*
  * Find an interface on a specific network.  If many, choice
  * is most specific found.
  */
 struct ifaddr *
 ifa_ifwithnet(struct sockaddr *addr, int ignore_ptp, int fibnum)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct ifaddr *ifa_maybe = NULL;
 	u_int af = addr->sa_family;
 	char *addr_data = addr->sa_data, *cplim;
 
 	/*
 	 * AF_LINK addresses can be looked up directly by their index number,
 	 * so do that if we can.
 	 */
 	if (af == AF_LINK) {
 	    struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr;
 	    if (sdl->sdl_index && sdl->sdl_index <= V_if_index)
 		return (ifaddr_byindex(sdl->sdl_index));
 	}
 
 	/*
 	 * Scan though each interface, looking for ones that have addresses
 	 * in this address family and the requested fib.  Maintain a reference
 	 * on ifa_maybe once we find one, as we release the IF_ADDR_RLOCK() that
 	 * kept it stable when we move onto the next interface.
 	 */
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
 			continue;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			char *cp, *cp2, *cp3;
 
 			if (ifa->ifa_addr->sa_family != af)
 next:				continue;
 			if (af == AF_INET && 
 			    ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) {
 				/*
 				 * This is a bit broken as it doesn't
 				 * take into account that the remote end may
 				 * be a single node in the network we are
 				 * looking for.
 				 * The trouble is that we don't know the
 				 * netmask for the remote end.
 				 */
 				if (ifa->ifa_dstaddr != NULL &&
 				    sa_equal(addr, ifa->ifa_dstaddr)) {
 					ifa_ref(ifa);
 					IF_ADDR_RUNLOCK(ifp);
 					goto done;
 				}
 			} else {
 				/*
 				 * Scan all the bits in the ifa's address.
 				 * If a bit dissagrees with what we are
 				 * looking for, mask it with the netmask
 				 * to see if it really matters.
 				 * (A byte at a time)
 				 */
 				if (ifa->ifa_netmask == 0)
 					continue;
 				cp = addr_data;
 				cp2 = ifa->ifa_addr->sa_data;
 				cp3 = ifa->ifa_netmask->sa_data;
 				cplim = ifa->ifa_netmask->sa_len
 					+ (char *)ifa->ifa_netmask;
 				while (cp3 < cplim)
 					if ((*cp++ ^ *cp2++) & *cp3++)
 						goto next; /* next address! */
 				/*
 				 * If the netmask of what we just found
 				 * is more specific than what we had before
 				 * (if we had one), or if the virtual status
 				 * of new prefix is better than of the old one,
 				 * then remember the new one before continuing
 				 * to search for an even better one.
 				 */
 				if (ifa_maybe == NULL ||
 				    ifa_preferred(ifa_maybe, ifa) ||
 				    rn_refines((caddr_t)ifa->ifa_netmask,
 				    (caddr_t)ifa_maybe->ifa_netmask)) {
 					if (ifa_maybe != NULL)
 						ifa_free(ifa_maybe);
 					ifa_maybe = ifa;
 					ifa_ref(ifa_maybe);
 				}
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 	ifa = ifa_maybe;
 	ifa_maybe = NULL;
 done:
 	IFNET_RUNLOCK_NOSLEEP();
 	if (ifa_maybe != NULL)
 		ifa_free(ifa_maybe);
 	return (ifa);
 }
 
 /*
  * Find an interface address specific to an interface best matching
  * a given address.
  */
 struct ifaddr *
 ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp)
 {
 	struct ifaddr *ifa;
 	char *cp, *cp2, *cp3;
 	char *cplim;
 	struct ifaddr *ifa_maybe = NULL;
 	u_int af = addr->sa_family;
 
 	if (af >= AF_MAX)
 		return (NULL);
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != af)
 			continue;
 		if (ifa_maybe == NULL)
 			ifa_maybe = ifa;
 		if (ifa->ifa_netmask == 0) {
 			if (sa_equal(addr, ifa->ifa_addr) ||
 			    (ifa->ifa_dstaddr &&
 			    sa_equal(addr, ifa->ifa_dstaddr)))
 				goto done;
 			continue;
 		}
 		if (ifp->if_flags & IFF_POINTOPOINT) {
 			if (sa_equal(addr, ifa->ifa_dstaddr))
 				goto done;
 		} else {
 			cp = addr->sa_data;
 			cp2 = ifa->ifa_addr->sa_data;
 			cp3 = ifa->ifa_netmask->sa_data;
 			cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
 			for (; cp3 < cplim; cp3++)
 				if ((*cp++ ^ *cp2++) & *cp3)
 					break;
 			if (cp3 == cplim)
 				goto done;
 		}
 	}
 	ifa = ifa_maybe;
 done:
 	if (ifa != NULL)
 		ifa_ref(ifa);
 	IF_ADDR_RUNLOCK(ifp);
 	return (ifa);
 }
 
 /*
  * See whether new ifa is better than current one:
  * 1) A non-virtual one is preferred over virtual.
  * 2) A virtual in master state preferred over any other state.
  *
  * Used in several address selecting functions.
  */
 int
 ifa_preferred(struct ifaddr *cur, struct ifaddr *next)
 {
 
 	return (cur->ifa_carp && (!next->ifa_carp ||
 	    ((*carp_master_p)(next) && !(*carp_master_p)(cur))));
 }
 
 #include <net/if_llatbl.h>
 
 /*
  * Default action when installing a route with a Link Level gateway.
  * Lookup an appropriate real ifa to point to.
  * This should be moved to /sys/net/link.c eventually.
  */
 static void
 link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info)
 {
 	struct ifaddr *ifa, *oifa;
 	struct sockaddr *dst;
 	struct ifnet *ifp;
 
 	if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == 0) ||
 	    ((ifp = ifa->ifa_ifp) == 0) || ((dst = rt_key(rt)) == 0))
 		return;
 	ifa = ifaof_ifpforaddr(dst, ifp);
 	if (ifa) {
 		oifa = rt->rt_ifa;
 		rt->rt_ifa = ifa;
 		ifa_free(oifa);
 		if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest)
 			ifa->ifa_rtrequest(cmd, rt, info);
 	}
 }
 
 struct sockaddr_dl *
 link_alloc_sdl(size_t size, int flags)
 {
 
 	return (malloc(size, M_TEMP, flags));
 }
 
 void
 link_free_sdl(struct sockaddr *sa)
 {
 	free(sa, M_TEMP);
 }
 
 /*
  * Fills in given sdl with interface basic info.
  * Returns pointer to filled sdl.
  */
 struct sockaddr_dl *
 link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype)
 {
 	struct sockaddr_dl *sdl;
 
 	sdl = (struct sockaddr_dl *)paddr;
 	memset(sdl, 0, sizeof(struct sockaddr_dl));
 	sdl->sdl_len = sizeof(struct sockaddr_dl);
 	sdl->sdl_family = AF_LINK;
 	sdl->sdl_index = ifp->if_index;
 	sdl->sdl_type = iftype;
 
 	return (sdl);
 }
 
 /*
  * Mark an interface down and notify protocols of
  * the transition.
  */
 static void
 if_unroute(struct ifnet *ifp, int flag, int fam)
 {
 	struct ifaddr *ifa;
 
 	KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP"));
 
 	ifp->if_flags &= ~flag;
 	getmicrotime(&ifp->if_lastchange);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
 			pfctlinput(PRC_IFDOWN, ifa->ifa_addr);
 	ifp->if_qflush(ifp);
 
 	if (ifp->if_carp)
 		(*carp_linkstate_p)(ifp);
 	rt_ifmsg(ifp);
 }
 
 /*
  * Mark an interface up and notify protocols of
  * the transition.
  */
 static void
 if_route(struct ifnet *ifp, int flag, int fam)
 {
 	struct ifaddr *ifa;
 
 	KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP"));
 
 	ifp->if_flags |= flag;
 	getmicrotime(&ifp->if_lastchange);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
 			pfctlinput(PRC_IFUP, ifa->ifa_addr);
 	if (ifp->if_carp)
 		(*carp_linkstate_p)(ifp);
 	rt_ifmsg(ifp);
 #ifdef INET6
 	in6_if_up(ifp);
 #endif
 }
 
 void	(*vlan_link_state_p)(struct ifnet *);	/* XXX: private from if_vlan */
 void	(*vlan_trunk_cap_p)(struct ifnet *);		/* XXX: private from if_vlan */
 struct ifnet *(*vlan_trunkdev_p)(struct ifnet *);
 struct	ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t);
 int	(*vlan_tag_p)(struct ifnet *, uint16_t *);
 int	(*vlan_setcookie_p)(struct ifnet *, void *);
 void	*(*vlan_cookie_p)(struct ifnet *);
 
 /*
  * Handle a change in the interface link state. To avoid LORs
  * between driver lock and upper layer locks, as well as possible
  * recursions, we post event to taskqueue, and all job
  * is done in static do_link_state_change().
  */
 void
 if_link_state_change(struct ifnet *ifp, int link_state)
 {
 	/* Return if state hasn't changed. */
 	if (ifp->if_link_state == link_state)
 		return;
 
 	ifp->if_link_state = link_state;
 
 	taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask);
 }
 
 static void
 do_link_state_change(void *arg, int pending)
 {
 	struct ifnet *ifp = (struct ifnet *)arg;
 	int link_state = ifp->if_link_state;
 	CURVNET_SET(ifp->if_vnet);
 
 	/* Notify that the link state has changed. */
 	rt_ifmsg(ifp);
 	if (ifp->if_vlantrunk != NULL)
 		(*vlan_link_state_p)(ifp);
 
 	if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) &&
-	    IFP2AC(ifp)->ac_netgraph != NULL)
+	    ifp->if_l2com != NULL)
 		(*ng_ether_link_state_p)(ifp, link_state);
 	if (ifp->if_carp)
 		(*carp_linkstate_p)(ifp);
 	if (ifp->if_bridge)
 		(*bridge_linkstate_p)(ifp);
 	if (ifp->if_lagg)
 		(*lagg_linkstate_p)(ifp, link_state);
 
 	if (IS_DEFAULT_VNET(curvnet))
 		devctl_notify("IFNET", ifp->if_xname,
 		    (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN",
 		    NULL);
 	if (pending > 1)
 		if_printf(ifp, "%d link states coalesced\n", pending);
 	if (log_link_state_change)
 		log(LOG_NOTICE, "%s: link state changed to %s\n", ifp->if_xname,
 		    (link_state == LINK_STATE_UP) ? "UP" : "DOWN" );
 	EVENTHANDLER_INVOKE(ifnet_link_event, ifp, ifp->if_link_state);
 	CURVNET_RESTORE();
 }
 
 /*
  * Mark an interface down and notify protocols of
  * the transition.
  */
 void
 if_down(struct ifnet *ifp)
 {
 
 	if_unroute(ifp, IFF_UP, AF_UNSPEC);
 }
 
 /*
  * Mark an interface up and notify protocols of
  * the transition.
  */
 void
 if_up(struct ifnet *ifp)
 {
 
 	if_route(ifp, IFF_UP, AF_UNSPEC);
 }
 
 /*
  * Flush an interface queue.
  */
 void
 if_qflush(struct ifnet *ifp)
 {
 	struct mbuf *m, *n;
 	struct ifaltq *ifq;
 	
 	ifq = &ifp->if_snd;
 	IFQ_LOCK(ifq);
 #ifdef ALTQ
 	if (ALTQ_IS_ENABLED(ifq))
 		ALTQ_PURGE(ifq);
 #endif
 	n = ifq->ifq_head;
 	while ((m = n) != 0) {
 		n = m->m_nextpkt;
 		m_freem(m);
 	}
 	ifq->ifq_head = 0;
 	ifq->ifq_tail = 0;
 	ifq->ifq_len = 0;
 	IFQ_UNLOCK(ifq);
 }
 
 /*
  * Map interface name to interface structure pointer, with or without
  * returning a reference.
  */
 struct ifnet *
 ifunit_ref(const char *name)
 {
 	struct ifnet *ifp;
 
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 &&
 		    !(ifp->if_flags & IFF_DYING))
 			break;
 	}
 	if (ifp != NULL)
 		if_ref(ifp);
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifp);
 }
 
 struct ifnet *
 ifunit(const char *name)
 {
 	struct ifnet *ifp;
 
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0)
 			break;
 	}
 	IFNET_RUNLOCK_NOSLEEP();
 	return (ifp);
 }
 
 /*
  * Hardware specific interface ioctls.
  */
 static int
 ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
 {
 	struct ifreq *ifr;
 	int error = 0;
 	int new_flags, temp_flags;
 	size_t namelen, onamelen;
 	size_t descrlen;
 	char *descrbuf, *odescrbuf;
 	char new_name[IFNAMSIZ];
 	struct ifaddr *ifa;
 	struct sockaddr_dl *sdl;
 
 	ifr = (struct ifreq *)data;
 	switch (cmd) {
 	case SIOCGIFINDEX:
 		ifr->ifr_index = ifp->if_index;
 		break;
 
 	case SIOCGIFFLAGS:
 		temp_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifr->ifr_flags = temp_flags & 0xffff;
 		ifr->ifr_flagshigh = temp_flags >> 16;
 		break;
 
 	case SIOCGIFCAP:
 		ifr->ifr_reqcap = ifp->if_capabilities;
 		ifr->ifr_curcap = ifp->if_capenable;
 		break;
 
 #ifdef MAC
 	case SIOCGIFMAC:
 		error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp);
 		break;
 #endif
 
 	case SIOCGIFMETRIC:
 		ifr->ifr_metric = ifp->if_metric;
 		break;
 
 	case SIOCGIFMTU:
 		ifr->ifr_mtu = ifp->if_mtu;
 		break;
 
 	case SIOCGIFPHYS:
 		/* XXXGL: did this ever worked? */
 		ifr->ifr_phys = 0;
 		break;
 
 	case SIOCGIFDESCR:
 		error = 0;
 		sx_slock(&ifdescr_sx);
 		if (ifp->if_description == NULL)
 			error = ENOMSG;
 		else {
 			/* space for terminating nul */
 			descrlen = strlen(ifp->if_description) + 1;
 			if (ifr->ifr_buffer.length < descrlen)
 				ifr->ifr_buffer.buffer = NULL;
 			else
 				error = copyout(ifp->if_description,
 				    ifr->ifr_buffer.buffer, descrlen);
 			ifr->ifr_buffer.length = descrlen;
 		}
 		sx_sunlock(&ifdescr_sx);
 		break;
 
 	case SIOCSIFDESCR:
 		error = priv_check(td, PRIV_NET_SETIFDESCR);
 		if (error)
 			return (error);
 
 		/*
 		 * Copy only (length-1) bytes to make sure that
 		 * if_description is always nul terminated.  The
 		 * length parameter is supposed to count the
 		 * terminating nul in.
 		 */
 		if (ifr->ifr_buffer.length > ifdescr_maxlen)
 			return (ENAMETOOLONG);
 		else if (ifr->ifr_buffer.length == 0)
 			descrbuf = NULL;
 		else {
 			descrbuf = malloc(ifr->ifr_buffer.length, M_IFDESCR,
 			    M_WAITOK | M_ZERO);
 			error = copyin(ifr->ifr_buffer.buffer, descrbuf,
 			    ifr->ifr_buffer.length - 1);
 			if (error) {
 				free(descrbuf, M_IFDESCR);
 				break;
 			}
 		}
 
 		sx_xlock(&ifdescr_sx);
 		odescrbuf = ifp->if_description;
 		ifp->if_description = descrbuf;
 		sx_xunlock(&ifdescr_sx);
 
 		getmicrotime(&ifp->if_lastchange);
 		free(odescrbuf, M_IFDESCR);
 		break;
 
 	case SIOCGIFFIB:
 		ifr->ifr_fib = ifp->if_fib;
 		break;
 
 	case SIOCSIFFIB:
 		error = priv_check(td, PRIV_NET_SETIFFIB);
 		if (error)
 			return (error);
 		if (ifr->ifr_fib >= rt_numfibs)
 			return (EINVAL);
 
 		ifp->if_fib = ifr->ifr_fib;
 		break;
 
 	case SIOCSIFFLAGS:
 		error = priv_check(td, PRIV_NET_SETIFFLAGS);
 		if (error)
 			return (error);
 		/*
 		 * Currently, no driver owned flags pass the IFF_CANTCHANGE
 		 * check, so we don't need special handling here yet.
 		 */
 		new_flags = (ifr->ifr_flags & 0xffff) |
 		    (ifr->ifr_flagshigh << 16);
 		if (ifp->if_flags & IFF_UP &&
 		    (new_flags & IFF_UP) == 0) {
 			if_down(ifp);
 		} else if (new_flags & IFF_UP &&
 		    (ifp->if_flags & IFF_UP) == 0) {
 			if_up(ifp);
 		}
 		/* See if permanently promiscuous mode bit is about to flip */
 		if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) {
 			if (new_flags & IFF_PPROMISC)
 				ifp->if_flags |= IFF_PROMISC;
 			else if (ifp->if_pcount == 0)
 				ifp->if_flags &= ~IFF_PROMISC;
 			log(LOG_INFO, "%s: permanently promiscuous mode %s\n",
 			    ifp->if_xname,
 			    (new_flags & IFF_PPROMISC) ? "enabled" : "disabled");
 		}
 		ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
 			(new_flags &~ IFF_CANTCHANGE);
 		if (ifp->if_ioctl) {
 			(void) (*ifp->if_ioctl)(ifp, cmd, data);
 		}
 		getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFCAP:
 		error = priv_check(td, PRIV_NET_SETIFCAP);
 		if (error)
 			return (error);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		if (ifr->ifr_reqcap & ~ifp->if_capabilities)
 			return (EINVAL);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 #ifdef MAC
 	case SIOCSIFMAC:
 		error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp);
 		break;
 #endif
 
 	case SIOCSIFNAME:
 		error = priv_check(td, PRIV_NET_SETIFNAME);
 		if (error)
 			return (error);
 		error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL);
 		if (error != 0)
 			return (error);
 		if (new_name[0] == '\0')
 			return (EINVAL);
 		if (ifunit(new_name) != NULL)
 			return (EEXIST);
 
 		/*
 		 * XXX: Locking.  Nothing else seems to lock if_flags,
 		 * and there are numerous other races with the
 		 * ifunit() checks not being atomic with namespace
 		 * changes (renames, vmoves, if_attach, etc).
 		 */
 		ifp->if_flags |= IFF_RENAMING;
 		
 		/* Announce the departure of the interface. */
 		rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
 		EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
 
 		log(LOG_INFO, "%s: changing name to '%s'\n",
 		    ifp->if_xname, new_name);
 
 		IF_ADDR_WLOCK(ifp);
 		strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
 		ifa = ifp->if_addr;
 		sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 		namelen = strlen(new_name);
 		onamelen = sdl->sdl_nlen;
 		/*
 		 * Move the address if needed.  This is safe because we
 		 * allocate space for a name of length IFNAMSIZ when we
 		 * create this in if_attach().
 		 */
 		if (namelen != onamelen) {
 			bcopy(sdl->sdl_data + onamelen,
 			    sdl->sdl_data + namelen, sdl->sdl_alen);
 		}
 		bcopy(new_name, sdl->sdl_data, namelen);
 		sdl->sdl_nlen = namelen;
 		sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
 		bzero(sdl->sdl_data, onamelen);
 		while (namelen != 0)
 			sdl->sdl_data[--namelen] = 0xff;
 		IF_ADDR_WUNLOCK(ifp);
 
 		EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
 		/* Announce the return of the interface. */
 		rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
 
 		ifp->if_flags &= ~IFF_RENAMING;
 		break;
 
 #ifdef VIMAGE
 	case SIOCSIFVNET:
 		error = priv_check(td, PRIV_NET_SETIFVNET);
 		if (error)
 			return (error);
 		error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid);
 		break;
 #endif
 
 	case SIOCSIFMETRIC:
 		error = priv_check(td, PRIV_NET_SETIFMETRIC);
 		if (error)
 			return (error);
 		ifp->if_metric = ifr->ifr_metric;
 		getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFPHYS:
 		error = priv_check(td, PRIV_NET_SETIFPHYS);
 		if (error)
 			return (error);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFMTU:
 	{
 		u_long oldmtu = ifp->if_mtu;
 
 		error = priv_check(td, PRIV_NET_SETIFMTU);
 		if (error)
 			return (error);
 		if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU)
 			return (EINVAL);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0) {
 			getmicrotime(&ifp->if_lastchange);
 			rt_ifmsg(ifp);
 		}
 		/*
 		 * If the link MTU changed, do network layer specific procedure.
 		 */
 		if (ifp->if_mtu != oldmtu) {
 #ifdef INET6
 			nd6_setmtu(ifp);
 #endif
 		}
 		break;
 	}
 
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		if (cmd == SIOCADDMULTI)
 			error = priv_check(td, PRIV_NET_ADDMULTI);
 		else
 			error = priv_check(td, PRIV_NET_DELMULTI);
 		if (error)
 			return (error);
 
 		/* Don't allow group membership on non-multicast interfaces. */
 		if ((ifp->if_flags & IFF_MULTICAST) == 0)
 			return (EOPNOTSUPP);
 
 		/* Don't let users screw up protocols' entries. */
 		if (ifr->ifr_addr.sa_family != AF_LINK)
 			return (EINVAL);
 
 		if (cmd == SIOCADDMULTI) {
 			struct ifmultiaddr *ifma;
 
 			/*
 			 * Userland is only permitted to join groups once
 			 * via the if_addmulti() KPI, because it cannot hold
 			 * struct ifmultiaddr * between calls. It may also
 			 * lose a race while we check if the membership
 			 * already exists.
 			 */
 			IF_ADDR_RLOCK(ifp);
 			ifma = if_findmulti(ifp, &ifr->ifr_addr);
 			IF_ADDR_RUNLOCK(ifp);
 			if (ifma != NULL)
 				error = EADDRINUSE;
 			else
 				error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
 		} else {
 			error = if_delmulti(ifp, &ifr->ifr_addr);
 		}
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFPHYADDR:
 	case SIOCDIFPHYADDR:
 #ifdef INET6
 	case SIOCSIFPHYADDR_IN6:
 #endif
 	case SIOCSIFMEDIA:
 	case SIOCSIFGENERIC:
 		error = priv_check(td, PRIV_NET_HWIOCTL);
 		if (error)
 			return (error);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCGIFSTATUS:
 	case SIOCGIFPSRCADDR:
 	case SIOCGIFPDSTADDR:
 	case SIOCGIFMEDIA:
 	case SIOCGIFGENERIC:
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		break;
 
 	case SIOCSIFLLADDR:
 		error = priv_check(td, PRIV_NET_SETLLADDR);
 		if (error)
 			return (error);
 		error = if_setlladdr(ifp,
 		    ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len);
 		EVENTHANDLER_INVOKE(iflladdr_event, ifp);
 		break;
 
 	case SIOCAIFGROUP:
 	{
 		struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr;
 
 		error = priv_check(td, PRIV_NET_ADDIFGROUP);
 		if (error)
 			return (error);
 		if ((error = if_addgroup(ifp, ifgr->ifgr_group)))
 			return (error);
 		break;
 	}
 
 	case SIOCGIFGROUP:
 		if ((error = if_getgroup((struct ifgroupreq *)ifr, ifp)))
 			return (error);
 		break;
 
 	case SIOCDIFGROUP:
 	{
 		struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr;
 
 		error = priv_check(td, PRIV_NET_DELIFGROUP);
 		if (error)
 			return (error);
 		if ((error = if_delgroup(ifp, ifgr->ifgr_group)))
 			return (error);
 		break;
 	}
 
 	default:
 		error = ENOIOCTL;
 		break;
 	}
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD32
 struct ifconf32 {
 	int32_t	ifc_len;
 	union {
 		uint32_t	ifcu_buf;
 		uint32_t	ifcu_req;
 	} ifc_ifcu;
 };
 #define	SIOCGIFCONF32	_IOWR('i', 36, struct ifconf32)
 #endif
 
 /*
  * Interface ioctls.
  */
 int
 ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td)
 {
 	struct ifnet *ifp;
 	struct ifreq *ifr;
 	int error;
 	int oif_flags;
 
 	CURVNET_SET(so->so_vnet);
 	switch (cmd) {
 	case SIOCGIFCONF:
 		error = ifconf(cmd, data);
 		CURVNET_RESTORE();
 		return (error);
 
 #ifdef COMPAT_FREEBSD32
 	case SIOCGIFCONF32:
 		{
 			struct ifconf32 *ifc32;
 			struct ifconf ifc;
 
 			ifc32 = (struct ifconf32 *)data;
 			ifc.ifc_len = ifc32->ifc_len;
 			ifc.ifc_buf = PTRIN(ifc32->ifc_buf);
 
 			error = ifconf(SIOCGIFCONF, (void *)&ifc);
 			CURVNET_RESTORE();
 			if (error == 0)
 				ifc32->ifc_len = ifc.ifc_len;
 			return (error);
 		}
 #endif
 	}
 	ifr = (struct ifreq *)data;
 
 	switch (cmd) {
 #ifdef VIMAGE
 	case SIOCSIFRVNET:
 		error = priv_check(td, PRIV_NET_SETIFVNET);
 		if (error == 0)
 			error = if_vmove_reclaim(td, ifr->ifr_name,
 			    ifr->ifr_jid);
 		CURVNET_RESTORE();
 		return (error);
 #endif
 	case SIOCIFCREATE:
 	case SIOCIFCREATE2:
 		error = priv_check(td, PRIV_NET_IFCREATE);
 		if (error == 0)
 			error = if_clone_create(ifr->ifr_name,
 			    sizeof(ifr->ifr_name),
 			    cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL);
 		CURVNET_RESTORE();
 		return (error);
 	case SIOCIFDESTROY:
 		error = priv_check(td, PRIV_NET_IFDESTROY);
 		if (error == 0)
 			error = if_clone_destroy(ifr->ifr_name);
 		CURVNET_RESTORE();
 		return (error);
 
 	case SIOCIFGCLONERS:
 		error = if_clone_list((struct if_clonereq *)data);
 		CURVNET_RESTORE();
 		return (error);
 	case SIOCGIFGMEMB:
 		error = if_getgroupmembers((struct ifgroupreq *)data);
 		CURVNET_RESTORE();
 		return (error);
 #if defined(INET) || defined(INET6)
 	case SIOCSVH:
 	case SIOCGVH:
 		if (carp_ioctl_p == NULL)
 			error = EPROTONOSUPPORT;
 		else
 			error = (*carp_ioctl_p)(ifr, cmd, td);
 		CURVNET_RESTORE();
 		return (error);
 #endif
 	}
 
 	ifp = ifunit_ref(ifr->ifr_name);
 	if (ifp == NULL) {
 		CURVNET_RESTORE();
 		return (ENXIO);
 	}
 
 	error = ifhwioctl(cmd, ifp, data, td);
 	if (error != ENOIOCTL) {
 		if_rele(ifp);
 		CURVNET_RESTORE();
 		return (error);
 	}
 
 	oif_flags = ifp->if_flags;
 	if (so->so_proto == NULL) {
 		if_rele(ifp);
 		CURVNET_RESTORE();
 		return (EOPNOTSUPP);
 	}
 
 	/*
 	 * Pass the request on to the socket control method, and if the
 	 * latter returns EOPNOTSUPP, directly to the interface.
 	 *
 	 * Make an exception for the legacy SIOCSIF* requests.  Drivers
 	 * trust SIOCSIFADDR et al to come from an already privileged
 	 * layer, and do not perform any credentials checks or input
 	 * validation.
 	 */
 	error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data,
 	    ifp, td));
 	if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL &&
 	    cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR &&
 	    cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK)
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 
 	if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
 #ifdef INET6
 		if (ifp->if_flags & IFF_UP)
 			in6_if_up(ifp);
 #endif
 	}
 	if_rele(ifp);
 	CURVNET_RESTORE();
 	return (error);
 }
 
 /*
  * The code common to handling reference counted flags,
  * e.g., in ifpromisc() and if_allmulti().
  * The "pflag" argument can specify a permanent mode flag to check,
  * such as IFF_PPROMISC for promiscuous mode; should be 0 if none.
  *
  * Only to be used on stack-owned flags, not driver-owned flags.
  */
 static int
 if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch)
 {
 	struct ifreq ifr;
 	int error;
 	int oldflags, oldcount;
 
 	/* Sanity checks to catch programming errors */
 	KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0,
 	    ("%s: setting driver-owned flag %d", __func__, flag));
 
 	if (onswitch)
 		KASSERT(*refcount >= 0,
 		    ("%s: increment negative refcount %d for flag %d",
 		    __func__, *refcount, flag));
 	else
 		KASSERT(*refcount > 0,
 		    ("%s: decrement non-positive refcount %d for flag %d",
 		    __func__, *refcount, flag));
 
 	/* In case this mode is permanent, just touch refcount */
 	if (ifp->if_flags & pflag) {
 		*refcount += onswitch ? 1 : -1;
 		return (0);
 	}
 
 	/* Save ifnet parameters for if_ioctl() may fail */
 	oldcount = *refcount;
 	oldflags = ifp->if_flags;
 	
 	/*
 	 * See if we aren't the only and touching refcount is enough.
 	 * Actually toggle interface flag if we are the first or last.
 	 */
 	if (onswitch) {
 		if ((*refcount)++)
 			return (0);
 		ifp->if_flags |= flag;
 	} else {
 		if (--(*refcount))
 			return (0);
 		ifp->if_flags &= ~flag;
 	}
 
 	/* Call down the driver since we've changed interface flags */
 	if (ifp->if_ioctl == NULL) {
 		error = EOPNOTSUPP;
 		goto recover;
 	}
 	ifr.ifr_flags = ifp->if_flags & 0xffff;
 	ifr.ifr_flagshigh = ifp->if_flags >> 16;
 	error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
 	if (error)
 		goto recover;
 	/* Notify userland that interface flags have changed */
 	rt_ifmsg(ifp);
 	return (0);
 
 recover:
 	/* Recover after driver error */
 	*refcount = oldcount;
 	ifp->if_flags = oldflags;
 	return (error);
 }
 
 /*
  * Set/clear promiscuous mode on interface ifp based on the truth value
  * of pswitch.  The calls are reference counted so that only the first
  * "on" request actually has an effect, as does the final "off" request.
  * Results are undefined if the "off" and "on" requests are not matched.
  */
 int
 ifpromisc(struct ifnet *ifp, int pswitch)
 {
 	int error;
 	int oldflags = ifp->if_flags;
 
 	error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC,
 			   &ifp->if_pcount, pswitch);
 	/* If promiscuous mode status has changed, log a message */
 	if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC))
 		log(LOG_INFO, "%s: promiscuous mode %s\n",
 		    ifp->if_xname,
 		    (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled");
 	return (error);
 }
 
 /*
  * Return interface configuration
  * of system.  List may be used
  * in later ioctl's (above) to get
  * other information.
  */
 /*ARGSUSED*/
 static int
 ifconf(u_long cmd, caddr_t data)
 {
 	struct ifconf *ifc = (struct ifconf *)data;
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct ifreq ifr;
 	struct sbuf *sb;
 	int error, full = 0, valid_len, max_len;
 
 	/* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */
 	max_len = MAXPHYS - 1;
 
 	/* Prevent hostile input from being able to crash the system */
 	if (ifc->ifc_len <= 0)
 		return (EINVAL);
 
 again:
 	if (ifc->ifc_len <= max_len) {
 		max_len = ifc->ifc_len;
 		full = 1;
 	}
 	sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN);
 	max_len = 0;
 	valid_len = 0;
 
 	IFNET_RLOCK();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		int addrs;
 
 		/*
 		 * Zero the ifr_name buffer to make sure we don't
 		 * disclose the contents of the stack.
 		 */
 		memset(ifr.ifr_name, 0, sizeof(ifr.ifr_name));
 
 		if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
 		    >= sizeof(ifr.ifr_name)) {
 			sbuf_delete(sb);
 			IFNET_RUNLOCK();
 			return (ENAMETOOLONG);
 		}
 
 		addrs = 0;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			struct sockaddr *sa = ifa->ifa_addr;
 
 			if (prison_if(curthread->td_ucred, sa) != 0)
 				continue;
 			addrs++;
 			if (sa->sa_len <= sizeof(*sa)) {
 				ifr.ifr_addr = *sa;
 				sbuf_bcat(sb, &ifr, sizeof(ifr));
 				max_len += sizeof(ifr);
 			} else {
 				sbuf_bcat(sb, &ifr,
 				    offsetof(struct ifreq, ifr_addr));
 				max_len += offsetof(struct ifreq, ifr_addr);
 				sbuf_bcat(sb, sa, sa->sa_len);
 				max_len += sa->sa_len;
 			}
 
 			if (sbuf_error(sb) == 0)
 				valid_len = sbuf_len(sb);
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		if (addrs == 0) {
 			bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr));
 			sbuf_bcat(sb, &ifr, sizeof(ifr));
 			max_len += sizeof(ifr);
 
 			if (sbuf_error(sb) == 0)
 				valid_len = sbuf_len(sb);
 		}
 	}
 	IFNET_RUNLOCK();
 
 	/*
 	 * If we didn't allocate enough space (uncommon), try again.  If
 	 * we have already allocated as much space as we are allowed,
 	 * return what we've got.
 	 */
 	if (valid_len != max_len && !full) {
 		sbuf_delete(sb);
 		goto again;
 	}
 
 	ifc->ifc_len = valid_len;
 	sbuf_finish(sb);
 	error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len);
 	sbuf_delete(sb);
 	return (error);
 }
 
 /*
  * Just like ifpromisc(), but for all-multicast-reception mode.
  */
 int
 if_allmulti(struct ifnet *ifp, int onswitch)
 {
 
 	return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch));
 }
 
 struct ifmultiaddr *
 if_findmulti(struct ifnet *ifp, struct sockaddr *sa)
 {
 	struct ifmultiaddr *ifma;
 
 	IF_ADDR_LOCK_ASSERT(ifp);
 
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (sa->sa_family == AF_LINK) {
 			if (sa_dl_equal(ifma->ifma_addr, sa))
 				break;
 		} else {
 			if (sa_equal(ifma->ifma_addr, sa))
 				break;
 		}
 	}
 
 	return ifma;
 }
 
 /*
  * Allocate a new ifmultiaddr and initialize based on passed arguments.  We
  * make copies of passed sockaddrs.  The ifmultiaddr will not be added to
  * the ifnet multicast address list here, so the caller must do that and
  * other setup work (such as notifying the device driver).  The reference
  * count is initialized to 1.
  */
 static struct ifmultiaddr *
 if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa,
     int mflags)
 {
 	struct ifmultiaddr *ifma;
 	struct sockaddr *dupsa;
 
 	ifma = malloc(sizeof *ifma, M_IFMADDR, mflags |
 	    M_ZERO);
 	if (ifma == NULL)
 		return (NULL);
 
 	dupsa = malloc(sa->sa_len, M_IFMADDR, mflags);
 	if (dupsa == NULL) {
 		free(ifma, M_IFMADDR);
 		return (NULL);
 	}
 	bcopy(sa, dupsa, sa->sa_len);
 	ifma->ifma_addr = dupsa;
 
 	ifma->ifma_ifp = ifp;
 	ifma->ifma_refcount = 1;
 	ifma->ifma_protospec = NULL;
 
 	if (llsa == NULL) {
 		ifma->ifma_lladdr = NULL;
 		return (ifma);
 	}
 
 	dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags);
 	if (dupsa == NULL) {
 		free(ifma->ifma_addr, M_IFMADDR);
 		free(ifma, M_IFMADDR);
 		return (NULL);
 	}
 	bcopy(llsa, dupsa, llsa->sa_len);
 	ifma->ifma_lladdr = dupsa;
 
 	return (ifma);
 }
 
 /*
  * if_freemulti: free ifmultiaddr structure and possibly attached related
  * addresses.  The caller is responsible for implementing reference
  * counting, notifying the driver, handling routing messages, and releasing
  * any dependent link layer state.
  */
 static void
 if_freemulti(struct ifmultiaddr *ifma)
 {
 
 	KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d",
 	    ifma->ifma_refcount));
 
 	if (ifma->ifma_lladdr != NULL)
 		free(ifma->ifma_lladdr, M_IFMADDR);
 	free(ifma->ifma_addr, M_IFMADDR);
 	free(ifma, M_IFMADDR);
 }
 
 /*
  * Register an additional multicast address with a network interface.
  *
  * - If the address is already present, bump the reference count on the
  *   address and return.
  * - If the address is not link-layer, look up a link layer address.
  * - Allocate address structures for one or both addresses, and attach to the
  *   multicast address list on the interface.  If automatically adding a link
  *   layer address, the protocol address will own a reference to the link
  *   layer address, to be freed when it is freed.
  * - Notify the network device driver of an addition to the multicast address
  *   list.
  *
  * 'sa' points to caller-owned memory with the desired multicast address.
  *
  * 'retifma' will be used to return a pointer to the resulting multicast
  * address reference, if desired.
  */
 int
 if_addmulti(struct ifnet *ifp, struct sockaddr *sa,
     struct ifmultiaddr **retifma)
 {
 	struct ifmultiaddr *ifma, *ll_ifma;
 	struct sockaddr *llsa;
 	struct sockaddr_dl sdl;
 	int error;
 
 	/*
 	 * If the address is already present, return a new reference to it;
 	 * otherwise, allocate storage and set up a new address.
 	 */
 	IF_ADDR_WLOCK(ifp);
 	ifma = if_findmulti(ifp, sa);
 	if (ifma != NULL) {
 		ifma->ifma_refcount++;
 		if (retifma != NULL)
 			*retifma = ifma;
 		IF_ADDR_WUNLOCK(ifp);
 		return (0);
 	}
 
 	/*
 	 * The address isn't already present; resolve the protocol address
 	 * into a link layer address, and then look that up, bump its
 	 * refcount or allocate an ifma for that also.
 	 * Most link layer resolving functions returns address data which
 	 * fits inside default sockaddr_dl structure. However callback
 	 * can allocate another sockaddr structure, in that case we need to
 	 * free it later.
 	 */
 	llsa = NULL;
 	ll_ifma = NULL;
 	if (ifp->if_resolvemulti != NULL) {
 		/* Provide called function with buffer size information */
 		sdl.sdl_len = sizeof(sdl);
 		llsa = (struct sockaddr *)&sdl;
 		error = ifp->if_resolvemulti(ifp, &llsa, sa);
 		if (error)
 			goto unlock_out;
 	}
 
 	/*
 	 * Allocate the new address.  Don't hook it up yet, as we may also
 	 * need to allocate a link layer multicast address.
 	 */
 	ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT);
 	if (ifma == NULL) {
 		error = ENOMEM;
 		goto free_llsa_out;
 	}
 
 	/*
 	 * If a link layer address is found, we'll need to see if it's
 	 * already present in the address list, or allocate is as well.
 	 * When this block finishes, the link layer address will be on the
 	 * list.
 	 */
 	if (llsa != NULL) {
 		ll_ifma = if_findmulti(ifp, llsa);
 		if (ll_ifma == NULL) {
 			ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT);
 			if (ll_ifma == NULL) {
 				--ifma->ifma_refcount;
 				if_freemulti(ifma);
 				error = ENOMEM;
 				goto free_llsa_out;
 			}
 			TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma,
 			    ifma_link);
 		} else
 			ll_ifma->ifma_refcount++;
 		ifma->ifma_llifma = ll_ifma;
 	}
 
 	/*
 	 * We now have a new multicast address, ifma, and possibly a new or
 	 * referenced link layer address.  Add the primary address to the
 	 * ifnet address list.
 	 */
 	TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
 
 	if (retifma != NULL)
 		*retifma = ifma;
 
 	/*
 	 * Must generate the message while holding the lock so that 'ifma'
 	 * pointer is still valid.
 	 */
 	rt_newmaddrmsg(RTM_NEWMADDR, ifma);
 	IF_ADDR_WUNLOCK(ifp);
 
 	/*
 	 * We are certain we have added something, so call down to the
 	 * interface to let them know about it.
 	 */
 	if (ifp->if_ioctl != NULL) {
 		(void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0);
 	}
 
 	if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl))
 		link_free_sdl(llsa);
 
 	return (0);
 
 free_llsa_out:
 	if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl))
 		link_free_sdl(llsa);
 
 unlock_out:
 	IF_ADDR_WUNLOCK(ifp);
 	return (error);
 }
 
 /*
  * Delete a multicast group membership by network-layer group address.
  *
  * Returns ENOENT if the entry could not be found. If ifp no longer
  * exists, results are undefined. This entry point should only be used
  * from subsystems which do appropriate locking to hold ifp for the
  * duration of the call.
  * Network-layer protocol domains must use if_delmulti_ifma().
  */
 int
 if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
 {
 	struct ifmultiaddr *ifma;
 	int lastref;
 #ifdef INVARIANTS
 	struct ifnet *oifp;
 
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(oifp, &V_ifnet, if_link)
 		if (ifp == oifp)
 			break;
 	if (ifp != oifp)
 		ifp = NULL;
 	IFNET_RUNLOCK_NOSLEEP();
 
 	KASSERT(ifp != NULL, ("%s: ifnet went away", __func__));
 #endif
 	if (ifp == NULL)
 		return (ENOENT);
 
 	IF_ADDR_WLOCK(ifp);
 	lastref = 0;
 	ifma = if_findmulti(ifp, sa);
 	if (ifma != NULL)
 		lastref = if_delmulti_locked(ifp, ifma, 0);
 	IF_ADDR_WUNLOCK(ifp);
 
 	if (ifma == NULL)
 		return (ENOENT);
 
 	if (lastref && ifp->if_ioctl != NULL) {
 		(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
 	}
 
 	return (0);
 }
 
 /*
  * Delete all multicast group membership for an interface.
  * Should be used to quickly flush all multicast filters.
  */
 void
 if_delallmulti(struct ifnet *ifp)
 {
 	struct ifmultiaddr *ifma;
 	struct ifmultiaddr *next;
 
 	IF_ADDR_WLOCK(ifp);
 	TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next)
 		if_delmulti_locked(ifp, ifma, 0);
 	IF_ADDR_WUNLOCK(ifp);
 }
 
 /*
  * Delete a multicast group membership by group membership pointer.
  * Network-layer protocol domains must use this routine.
  *
  * It is safe to call this routine if the ifp disappeared.
  */
 void
 if_delmulti_ifma(struct ifmultiaddr *ifma)
 {
 	struct ifnet *ifp;
 	int lastref;
 
 	ifp = ifma->ifma_ifp;
 #ifdef DIAGNOSTIC
 	if (ifp == NULL) {
 		printf("%s: ifma_ifp seems to be detached\n", __func__);
 	} else {
 		struct ifnet *oifp;
 
 		IFNET_RLOCK_NOSLEEP();
 		TAILQ_FOREACH(oifp, &V_ifnet, if_link)
 			if (ifp == oifp)
 				break;
 		if (ifp != oifp) {
 			printf("%s: ifnet %p disappeared\n", __func__, ifp);
 			ifp = NULL;
 		}
 		IFNET_RUNLOCK_NOSLEEP();
 	}
 #endif
 	/*
 	 * If and only if the ifnet instance exists: Acquire the address lock.
 	 */
 	if (ifp != NULL)
 		IF_ADDR_WLOCK(ifp);
 
 	lastref = if_delmulti_locked(ifp, ifma, 0);
 
 	if (ifp != NULL) {
 		/*
 		 * If and only if the ifnet instance exists:
 		 *  Release the address lock.
 		 *  If the group was left: update the hardware hash filter.
 		 */
 		IF_ADDR_WUNLOCK(ifp);
 		if (lastref && ifp->if_ioctl != NULL) {
 			(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
 		}
 	}
 }
 
 /*
  * Perform deletion of network-layer and/or link-layer multicast address.
  *
  * Return 0 if the reference count was decremented.
  * Return 1 if the final reference was released, indicating that the
  * hardware hash filter should be reprogrammed.
  */
 static int
 if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching)
 {
 	struct ifmultiaddr *ll_ifma;
 
 	if (ifp != NULL && ifma->ifma_ifp != NULL) {
 		KASSERT(ifma->ifma_ifp == ifp,
 		    ("%s: inconsistent ifp %p", __func__, ifp));
 		IF_ADDR_WLOCK_ASSERT(ifp);
 	}
 
 	ifp = ifma->ifma_ifp;
 
 	/*
 	 * If the ifnet is detaching, null out references to ifnet,
 	 * so that upper protocol layers will notice, and not attempt
 	 * to obtain locks for an ifnet which no longer exists. The
 	 * routing socket announcement must happen before the ifnet
 	 * instance is detached from the system.
 	 */
 	if (detaching) {
 #ifdef DIAGNOSTIC
 		printf("%s: detaching ifnet instance %p\n", __func__, ifp);
 #endif
 		/*
 		 * ifp may already be nulled out if we are being reentered
 		 * to delete the ll_ifma.
 		 */
 		if (ifp != NULL) {
 			rt_newmaddrmsg(RTM_DELMADDR, ifma);
 			ifma->ifma_ifp = NULL;
 		}
 	}
 
 	if (--ifma->ifma_refcount > 0)
 		return 0;
 
 	/*
 	 * If this ifma is a network-layer ifma, a link-layer ifma may
 	 * have been associated with it. Release it first if so.
 	 */
 	ll_ifma = ifma->ifma_llifma;
 	if (ll_ifma != NULL) {
 		KASSERT(ifma->ifma_lladdr != NULL,
 		    ("%s: llifma w/o lladdr", __func__));
 		if (detaching)
 			ll_ifma->ifma_ifp = NULL;	/* XXX */
 		if (--ll_ifma->ifma_refcount == 0) {
 			if (ifp != NULL) {
 				TAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma,
 				    ifma_link);
 			}
 			if_freemulti(ll_ifma);
 		}
 	}
 
 	if (ifp != NULL)
 		TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
 
 	if_freemulti(ifma);
 
 	/*
 	 * The last reference to this instance of struct ifmultiaddr
 	 * was released; the hardware should be notified of this change.
 	 */
 	return 1;
 }
 
 /*
  * Set the link layer address on an interface.
  *
  * At this time we only support certain types of interfaces,
  * and we don't allow the length of the address to change.
  */
 int
 if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
 {
 	struct sockaddr_dl *sdl;
 	struct ifaddr *ifa;
 	struct ifreq ifr;
 
 	IF_ADDR_RLOCK(ifp);
 	ifa = ifp->if_addr;
 	if (ifa == NULL) {
 		IF_ADDR_RUNLOCK(ifp);
 		return (EINVAL);
 	}
 	ifa_ref(ifa);
 	IF_ADDR_RUNLOCK(ifp);
 	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 	if (sdl == NULL) {
 		ifa_free(ifa);
 		return (EINVAL);
 	}
 	if (len != sdl->sdl_alen) {	/* don't allow length to change */
 		ifa_free(ifa);
 		return (EINVAL);
 	}
 	switch (ifp->if_type) {
 	case IFT_ETHER:
 	case IFT_FDDI:
 	case IFT_XETHER:
 	case IFT_ISO88025:
 	case IFT_L2VLAN:
 	case IFT_BRIDGE:
 	case IFT_ARCNET:
 	case IFT_IEEE8023ADLAG:
 	case IFT_IEEE80211:
 		bcopy(lladdr, LLADDR(sdl), len);
 		ifa_free(ifa);
 		break;
 	default:
 		ifa_free(ifa);
 		return (ENODEV);
 	}
 
 	/*
 	 * If the interface is already up, we need
 	 * to re-init it in order to reprogram its
 	 * address filter.
 	 */
 	if ((ifp->if_flags & IFF_UP) != 0) {
 		if (ifp->if_ioctl) {
 			ifp->if_flags &= ~IFF_UP;
 			ifr.ifr_flags = ifp->if_flags & 0xffff;
 			ifr.ifr_flagshigh = ifp->if_flags >> 16;
 			(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
 			ifp->if_flags |= IFF_UP;
 			ifr.ifr_flags = ifp->if_flags & 0xffff;
 			ifr.ifr_flagshigh = ifp->if_flags >> 16;
 			(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
 		}
 #ifdef INET
 		/*
 		 * Also send gratuitous ARPs to notify other nodes about
 		 * the address change.
 		 */
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family == AF_INET)
 				arp_ifinit(ifp, ifa);
 		}
 #endif
 	}
 	return (0);
 }
 
 /*
  * The name argument must be a pointer to storage which will last as
  * long as the interface does.  For physical devices, the result of
  * device_get_name(dev) is a good choice and for pseudo-devices a
  * static string works well.
  */
 void
 if_initname(struct ifnet *ifp, const char *name, int unit)
 {
 	ifp->if_dname = name;
 	ifp->if_dunit = unit;
 	if (unit != IF_DUNIT_NONE)
 		snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit);
 	else
 		strlcpy(ifp->if_xname, name, IFNAMSIZ);
 }
 
 int
 if_printf(struct ifnet *ifp, const char * fmt, ...)
 {
 	va_list ap;
 	int retval;
 
 	retval = printf("%s: ", ifp->if_xname);
 	va_start(ap, fmt);
 	retval += vprintf(fmt, ap);
 	va_end(ap);
 	return (retval);
 }
 
 void
 if_start(struct ifnet *ifp)
 {
 
 	(*(ifp)->if_start)(ifp);
 }
 
 /*
  * Backwards compatibility interface for drivers 
  * that have not implemented it
  */
 static int
 if_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	int error;
 
 	IFQ_HANDOFF(ifp, m, error);
 	return (error);
 }
 
 int
 if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust)
 {
 	int active = 0;
 
 	IF_LOCK(ifq);
 	if (_IF_QFULL(ifq)) {
 		IF_UNLOCK(ifq);
 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
 		m_freem(m);
 		return (0);
 	}
 	if (ifp != NULL) {
 		if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len + adjust);
 		if (m->m_flags & (M_BCAST|M_MCAST))
 			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
 		active = ifp->if_drv_flags & IFF_DRV_OACTIVE;
 	}
 	_IF_ENQUEUE(ifq, m);
 	IF_UNLOCK(ifq);
 	if (ifp != NULL && !active)
 		(*(ifp)->if_start)(ifp);
 	return (1);
 }
 
 void
 if_register_com_alloc(u_char type,
     if_com_alloc_t *a, if_com_free_t *f)
 {
 	
 	KASSERT(if_com_alloc[type] == NULL,
 	    ("if_register_com_alloc: %d already registered", type));
 	KASSERT(if_com_free[type] == NULL,
 	    ("if_register_com_alloc: %d free already registered", type));
 
 	if_com_alloc[type] = a;
 	if_com_free[type] = f;
 }
 
 void
 if_deregister_com_alloc(u_char type)
 {
 	
 	KASSERT(if_com_alloc[type] != NULL,
 	    ("if_deregister_com_alloc: %d not registered", type));
 	KASSERT(if_com_free[type] != NULL,
 	    ("if_deregister_com_alloc: %d free not registered", type));
 	if_com_alloc[type] = NULL;
 	if_com_free[type] = NULL;
 }
 
 /* API for driver access to network stack owned ifnet.*/
 uint64_t
 if_setbaudrate(struct ifnet *ifp, uint64_t baudrate)
 {
 	uint64_t oldbrate;
 
 	oldbrate = ifp->if_baudrate;
 	ifp->if_baudrate = baudrate;
 	return (oldbrate);
 }
 
 uint64_t
 if_getbaudrate(if_t ifp)
 {
 
 	return (((struct ifnet *)ifp)->if_baudrate);
 }
 
 int
 if_setcapabilities(if_t ifp, int capabilities)
 {
 	((struct ifnet *)ifp)->if_capabilities = capabilities;
 	return (0);
 }
 
 int
 if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit)
 {
 	((struct ifnet *)ifp)->if_capabilities |= setbit;
 	((struct ifnet *)ifp)->if_capabilities &= ~clearbit;
 
 	return (0);
 }
 
 int
 if_getcapabilities(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_capabilities;
 }
 
 int 
 if_setcapenable(if_t ifp, int capabilities)
 {
 	((struct ifnet *)ifp)->if_capenable = capabilities;
 	return (0);
 }
 
 int 
 if_setcapenablebit(if_t ifp, int setcap, int clearcap)
 {
 	if(setcap) 
 		((struct ifnet *)ifp)->if_capenable |= setcap;
 	if(clearcap)
 		((struct ifnet *)ifp)->if_capenable &= ~clearcap;
 
 	return (0);
 }
 
 const char *
 if_getdname(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_dname;
 }
 
 int 
 if_togglecapenable(if_t ifp, int togglecap)
 {
 	((struct ifnet *)ifp)->if_capenable ^= togglecap;
 	return (0);
 }
 
 int
 if_getcapenable(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_capenable;
 }
 
 /*
  * This is largely undesirable because it ties ifnet to a device, but does
  * provide flexiblity for an embedded product vendor. Should be used with
  * the understanding that it violates the interface boundaries, and should be
  * a last resort only.
  */
 int
 if_setdev(if_t ifp, void *dev)
 {
 	return (0);
 }
 
 int
 if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags)
 {
 	((struct ifnet *)ifp)->if_drv_flags |= set_flags;
 	((struct ifnet *)ifp)->if_drv_flags &= ~clear_flags;
 
 	return (0);
 }
 
 int
 if_getdrvflags(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_drv_flags;
 }
  
 int
 if_setdrvflags(if_t ifp, int flags)
 {
 	((struct ifnet *)ifp)->if_drv_flags = flags;
 	return (0);
 }
 
 
 int
 if_setflags(if_t ifp, int flags)
 {
 	((struct ifnet *)ifp)->if_flags = flags;
 	return (0);
 }
 
 int
 if_setflagbits(if_t ifp, int set, int clear)
 {
 	((struct ifnet *)ifp)->if_flags |= set;
 	((struct ifnet *)ifp)->if_flags &= ~clear;
 
 	return (0);
 }
 
 int
 if_getflags(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_flags;
 }
 
 int
 if_clearhwassist(if_t ifp)
 {
 	((struct ifnet *)ifp)->if_hwassist = 0;
 	return (0);
 }
 
 int
 if_sethwassistbits(if_t ifp, int toset, int toclear)
 {
 	((struct ifnet *)ifp)->if_hwassist |= toset;
 	((struct ifnet *)ifp)->if_hwassist &= ~toclear;
 
 	return (0);
 }
 
 int
 if_sethwassist(if_t ifp, int hwassist_bit)
 {
 	((struct ifnet *)ifp)->if_hwassist = hwassist_bit;
 	return (0);
 }
 
 int
 if_gethwassist(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_hwassist;
 }
 
 int
 if_setmtu(if_t ifp, int mtu)
 {
 	((struct ifnet *)ifp)->if_mtu = mtu;
 	return (0);
 }
 
 int
 if_getmtu(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_mtu;
 }
 
 int
 if_getmtu_family(if_t ifp, int family)
 {
 	struct domain *dp;
 
 	for (dp = domains; dp; dp = dp->dom_next) {
 		if (dp->dom_family == family && dp->dom_ifmtu != NULL)
 			return (dp->dom_ifmtu((struct ifnet *)ifp));
 	}
 
 	return (((struct ifnet *)ifp)->if_mtu);
 }
 
 int
 if_setsoftc(if_t ifp, void *softc)
 {
 	((struct ifnet *)ifp)->if_softc = softc;
 	return (0);
 }
 
 void *
 if_getsoftc(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_softc;
 }
 
 void 
 if_setrcvif(struct mbuf *m, if_t ifp)
 {
 	m->m_pkthdr.rcvif = (struct ifnet *)ifp;
 }
 
 void 
 if_setvtag(struct mbuf *m, uint16_t tag)
 {
 	m->m_pkthdr.ether_vtag = tag;	
 }
 
 uint16_t
 if_getvtag(struct mbuf *m)
 {
 
 	return (m->m_pkthdr.ether_vtag);
 }
 
 int
 if_sendq_empty(if_t ifp)
 {
 	return IFQ_DRV_IS_EMPTY(&((struct ifnet *)ifp)->if_snd);
 }
 
 struct ifaddr *
 if_getifaddr(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_addr;
 }
 
 int
 if_getamcount(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_amcount;
 }
 
 
 int
 if_setsendqready(if_t ifp)
 {
 	IFQ_SET_READY(&((struct ifnet *)ifp)->if_snd);
 	return (0);
 }
 
 int
 if_setsendqlen(if_t ifp, int tx_desc_count)
 {
 	IFQ_SET_MAXLEN(&((struct ifnet *)ifp)->if_snd, tx_desc_count);
 	((struct ifnet *)ifp)->if_snd.ifq_drv_maxlen = tx_desc_count;
 
 	return (0);
 }
 
 int
 if_vlantrunkinuse(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_vlantrunk != NULL?1:0;
 }
 
 int
 if_input(if_t ifp, struct mbuf* sendmp)
 {
 	(*((struct ifnet *)ifp)->if_input)((struct ifnet *)ifp, sendmp);
 	return (0);
 
 }
 
 /* XXX */
 #ifndef ETH_ADDR_LEN
 #define ETH_ADDR_LEN 6
 #endif
 
 int 
 if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max)
 {
 	struct ifmultiaddr *ifma;
 	uint8_t *lmta = (uint8_t *)mta;
 	int mcnt = 0;
 
 	TAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_LINK)
 			continue;
 
 		if (mcnt == max)
 			break;
 
 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
 		    &lmta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN);
 		mcnt++;
 	}
 	*cnt = mcnt;
 
 	return (0);
 }
 
 int
 if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max)
 {
 	int error;
 
 	if_maddr_rlock(ifp);
 	error = if_setupmultiaddr(ifp, mta, cnt, max);
 	if_maddr_runlock(ifp);
 	return (error);
 }
 
 int
 if_multiaddr_count(if_t ifp, int max)
 {
 	struct ifmultiaddr *ifma;
 	int count;
 
 	count = 0;
 	if_maddr_rlock(ifp);
 	TAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_LINK)
 			continue;
 		count++;
 		if (count == max)
 			break;
 	}
 	if_maddr_runlock(ifp);
 	return (count);
 }
 
 struct mbuf *
 if_dequeue(if_t ifp)
 {
 	struct mbuf *m;
 	IFQ_DRV_DEQUEUE(&((struct ifnet *)ifp)->if_snd, m);
 
 	return (m);
 }
 
 int
 if_sendq_prepend(if_t ifp, struct mbuf *m)
 {
 	IFQ_DRV_PREPEND(&((struct ifnet *)ifp)->if_snd, m);
 	return (0);
 }
 
 int
 if_setifheaderlen(if_t ifp, int len)
 {
 	((struct ifnet *)ifp)->if_hdrlen = len;
 	return (0);
 }
 
 caddr_t
 if_getlladdr(if_t ifp)
 {
 	return (IF_LLADDR((struct ifnet *)ifp));
 }
 
 void *
 if_gethandle(u_char type)
 {
 	return (if_alloc(type));
 }
 
 void
 if_bpfmtap(if_t ifh, struct mbuf *m)
 {
 	struct ifnet *ifp = (struct ifnet *)ifh;
 
 	BPF_MTAP(ifp, m);
 }
 
 void
 if_etherbpfmtap(if_t ifh, struct mbuf *m)
 {
 	struct ifnet *ifp = (struct ifnet *)ifh;
 
 	ETHER_BPF_MTAP(ifp, m);
 }
 
 void
 if_vlancap(if_t ifh)
 {
 	struct ifnet *ifp = (struct ifnet *)ifh;
 	VLAN_CAPABILITIES(ifp);
 }
 
 void
 if_setinitfn(if_t ifp, void (*init_fn)(void *))
 {
 	((struct ifnet *)ifp)->if_init = init_fn;
 }
 
 void
 if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t))
 {
 	((struct ifnet *)ifp)->if_ioctl = (void *)ioctl_fn;
 }
 
 void
 if_setstartfn(if_t ifp, void (*start_fn)(if_t))
 {
 	((struct ifnet *)ifp)->if_start = (void *)start_fn;
 }
 
 void
 if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn)
 {
 	((struct ifnet *)ifp)->if_transmit = start_fn;
 }
 
 void if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn)
 {
 	((struct ifnet *)ifp)->if_qflush = flush_fn;
 	
 }
 
 void
 if_setgetcounterfn(if_t ifp, if_get_counter_t fn)
 {
 
 	ifp->if_get_counter = fn;
 }
 
 /* Revisit these - These are inline functions originally. */
 int
 drbr_inuse_drv(if_t ifh, struct buf_ring *br)
 {
 	return drbr_inuse_drv(ifh, br);
 }
 
 struct mbuf*
 drbr_dequeue_drv(if_t ifh, struct buf_ring *br)
 {
 	return drbr_dequeue(ifh, br);
 }
 
 int
 drbr_needs_enqueue_drv(if_t ifh, struct buf_ring *br)
 {
 	return drbr_needs_enqueue(ifh, br);
 }
 
 int
 drbr_enqueue_drv(if_t ifh, struct buf_ring *br, struct mbuf *m)
 {
 	return drbr_enqueue(ifh, br, m);
 
 }
Index: user/dchagin/lemul/sys/net/if_arp.h
===================================================================
--- user/dchagin/lemul/sys/net/if_arp.h	(revision 274240)
+++ user/dchagin/lemul/sys/net/if_arp.h	(revision 274241)
@@ -1,147 +1,133 @@
 /*-
  * Copyright (c) 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if_arp.h	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #ifndef _NET_IF_ARP_H_
 #define	_NET_IF_ARP_H_
 
 /*
  * Address Resolution Protocol.
  *
  * See RFC 826 for protocol description.  ARP packets are variable
  * in size; the arphdr structure defines the fixed-length portion.
  * Protocol type values are the same as those for 10 Mb/s Ethernet.
  * It is followed by the variable-sized fields ar_sha, arp_spa,
  * arp_tha and arp_tpa in that order, according to the lengths
  * specified.  Field names used correspond to RFC 826.
  */
 struct	arphdr {
 	u_short	ar_hrd;		/* format of hardware address */
 #define ARPHRD_ETHER 	1	/* ethernet hardware format */
 #define ARPHRD_IEEE802	6	/* token-ring hardware format */
 #define ARPHRD_ARCNET	7	/* arcnet hardware format */
 #define ARPHRD_FRELAY 	15	/* frame relay hardware format */
 #define ARPHRD_IEEE1394	24	/* firewire hardware format */
 #define ARPHRD_INFINIBAND 32	/* infiniband hardware format */
 	u_short	ar_pro;		/* format of protocol address */
 	u_char	ar_hln;		/* length of hardware address */
 	u_char	ar_pln;		/* length of protocol address */
 	u_short	ar_op;		/* one of: */
 #define	ARPOP_REQUEST	1	/* request to resolve address */
 #define	ARPOP_REPLY	2	/* response to previous request */
 #define	ARPOP_REVREQUEST 3	/* request protocol address given hardware */
 #define	ARPOP_REVREPLY	4	/* response giving protocol address */
 #define ARPOP_INVREQUEST 8 	/* request to identify peer */
 #define ARPOP_INVREPLY	9	/* response identifying peer */
 /*
  * The remaining fields are variable in size,
  * according to the sizes above.
  */
 #ifdef COMMENT_ONLY
 	u_char	ar_sha[];	/* sender hardware address */
 	u_char	ar_spa[];	/* sender protocol address */
 	u_char	ar_tha[];	/* target hardware address */
 	u_char	ar_tpa[];	/* target protocol address */
 #endif
 };
 
 #define ar_sha(ap)	(((caddr_t)((ap)+1)) +   0)
 #define ar_spa(ap)	(((caddr_t)((ap)+1)) +   (ap)->ar_hln)
 #define ar_tha(ap)	(((caddr_t)((ap)+1)) +   (ap)->ar_hln + (ap)->ar_pln)
 #define ar_tpa(ap)	(((caddr_t)((ap)+1)) + 2*(ap)->ar_hln + (ap)->ar_pln)
 
 #define arphdr_len2(ar_hln, ar_pln)					\
 	(sizeof(struct arphdr) + 2*(ar_hln) + 2*(ar_pln))
 #define arphdr_len(ap)	(arphdr_len2((ap)->ar_hln, (ap)->ar_pln))
 
 /*
  * ARP ioctl request
  */
 struct arpreq {
 	struct	sockaddr arp_pa;		/* protocol address */
 	struct	sockaddr arp_ha;		/* hardware address */
 	int	arp_flags;			/* flags */
 };
 /*  arp_flags and at_flags field values */
 #define	ATF_INUSE	0x01	/* entry in use */
 #define ATF_COM		0x02	/* completed entry (enaddr valid) */
 #define	ATF_PERM	0x04	/* permanent entry */
 #define	ATF_PUBL	0x08	/* publish entry (respond for other host) */
 #define	ATF_USETRAILERS	0x10	/* has requested trailers */
 
-#ifdef _KERNEL
-/*
- * Structure shared between the ethernet driver modules and
- * the address resolution code.
- */
-struct	arpcom {
-	struct 	ifnet *ac_ifp;		/* network-visible interface */
-	void	*ac_netgraph;		/* ng_ether(4) netgraph node info */
-};
-#define IFP2AC(ifp) ((struct arpcom *)(ifp->if_l2com))
-#define AC2IFP(ac) ((ac)->ac_ifp)
-
-#endif /* _KERNEL */
-
 struct arpstat {
 	/* Normal things that happen: */
 	uint64_t txrequests;	/* # of ARP requests sent by this host. */
 	uint64_t txreplies;	/* # of ARP replies sent by this host. */
 	uint64_t rxrequests;	/* # of ARP requests received by this host. */
 	uint64_t rxreplies;	/* # of ARP replies received by this host. */
 	uint64_t received;	/* # of ARP packets received by this host. */
 
 	uint64_t arp_spares[4];	/* For either the upper or lower half. */
 	/* Abnormal event and error  counting: */
 	uint64_t dropped;	/* # of packets dropped waiting for a reply. */
 	uint64_t timeouts;	/* # of times with entries removed */
 				/* due to timeout. */
 	uint64_t dupips;	/* # of duplicate IPs detected. */
 };
 
 #ifdef _KERNEL
 #include <sys/counter.h>
 #include <net/vnet.h>
 
 VNET_PCPUSTAT_DECLARE(struct arpstat, arpstat);
 /*
  * In-kernel consumers can use these accessor macros directly to update
  * stats.
  */
 #define	ARPSTAT_ADD(name, val)	\
     VNET_PCPUSTAT_ADD(struct arpstat, arpstat, name, (val))
 #define	ARPSTAT_SUB(name, val)	ARPSTAT_ADD(name, -(val))
 #define	ARPSTAT_INC(name)	ARPSTAT_ADD(name, 1)
 #define	ARPSTAT_DEC(name)	ARPSTAT_SUB(name, 1)
 
 #endif /* _KERNEL */
 
 #endif /* !_NET_IF_ARP_H_ */
Index: user/dchagin/lemul/sys/net/if_bridge.c
===================================================================
--- user/dchagin/lemul/sys/net/if_bridge.c	(revision 274240)
+++ user/dchagin/lemul/sys/net/if_bridge.c	(revision 274241)
@@ -1,3562 +1,3562 @@
 /*	$NetBSD: if_bridge.c,v 1.31 2005/06/01 19:45:34 jdc Exp $	*/
 
 /*
  * Copyright 2001 Wasabi Systems, Inc.
  * All rights reserved.
  *
  * Written by Jason R. Thorpe for Wasabi Systems, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed for the NetBSD Project by
  *	Wasabi Systems, Inc.
  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
  *    or promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * Copyright (c) 1999, 2000 Jason L. Wright (jason@thought.net)
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * OpenBSD: if_bridge.c,v 1.60 2001/06/15 03:38:33 itojun Exp
  */
 
 /*
  * Network interface bridge support.
  *
  * TODO:
  *
  *	- Currently only supports Ethernet-like interfaces (Ethernet,
  *	  802.11, VLANs on Ethernet, etc.)  Figure out a nice way
  *	  to bridge other types of interfaces (FDDI-FDDI, and maybe
  *	  consider heterogenous bridges).
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/mbuf.h>
 #include <sys/malloc.h>
 #include <sys/protosw.h>
 #include <sys/systm.h>
 #include <sys/jail.h>
 #include <sys/time.h>
 #include <sys/socket.h> /* for net/if.h */
 #include <sys/sockio.h>
 #include <sys/ctype.h>  /* string functions */
 #include <sys/kernel.h>
 #include <sys/random.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 #include <vm/uma.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 
 #include <net/bpf.h>
 #include <net/if.h>
 #include <net/if_clone.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/pfil.h>
 #include <net/vnet.h>
 
-#include <netinet/in.h> /* for struct arpcom */
+#include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_ifattach.h>
 #endif
 #if defined(INET) || defined(INET6)
 #include <netinet/ip_carp.h>
 #endif
 #include <machine/in_cksum.h>
-#include <netinet/if_ether.h> /* for struct arpcom */
+#include <netinet/if_ether.h>
 #include <net/bridgestp.h>
 #include <net/if_bridgevar.h>
 #include <net/if_llc.h>
 #include <net/if_vlan_var.h>
 
 #include <net/route.h>
 
 /*
  * Size of the route hash table.  Must be a power of two.
  */
 #ifndef BRIDGE_RTHASH_SIZE
 #define	BRIDGE_RTHASH_SIZE		1024
 #endif
 
 #define	BRIDGE_RTHASH_MASK		(BRIDGE_RTHASH_SIZE - 1)
 
 /*
  * Default maximum number of addresses to cache.
  */
 #ifndef BRIDGE_RTABLE_MAX
 #define	BRIDGE_RTABLE_MAX		2000
 #endif
 
 /*
  * Timeout (in seconds) for entries learned dynamically.
  */
 #ifndef BRIDGE_RTABLE_TIMEOUT
 #define	BRIDGE_RTABLE_TIMEOUT		(20 * 60)	/* same as ARP */
 #endif
 
 /*
  * Number of seconds between walks of the route list.
  */
 #ifndef BRIDGE_RTABLE_PRUNE_PERIOD
 #define	BRIDGE_RTABLE_PRUNE_PERIOD	(5 * 60)
 #endif
 
 /*
  * List of capabilities to possibly mask on the member interface.
  */
 #define	BRIDGE_IFCAPS_MASK		(IFCAP_TOE|IFCAP_TSO|IFCAP_TXCSUM)
 
 /*
  * List of capabilities to strip
  */
 #define	BRIDGE_IFCAPS_STRIP		IFCAP_LRO
 
 /*
  * Bridge interface list entry.
  */
 struct bridge_iflist {
 	LIST_ENTRY(bridge_iflist) bif_next;
 	struct ifnet		*bif_ifp;	/* member if */
 	struct bstp_port	bif_stp;	/* STP state */
 	uint32_t		bif_flags;	/* member if flags */
 	int			bif_savedcaps;	/* saved capabilities */
 	uint32_t		bif_addrmax;	/* max # of addresses */
 	uint32_t		bif_addrcnt;	/* cur. # of addresses */
 	uint32_t		bif_addrexceeded;/* # of address violations */
 };
 
 /*
  * Bridge route node.
  */
 struct bridge_rtnode {
 	LIST_ENTRY(bridge_rtnode) brt_hash;	/* hash table linkage */
 	LIST_ENTRY(bridge_rtnode) brt_list;	/* list linkage */
 	struct bridge_iflist	*brt_dst;	/* destination if */
 	unsigned long		brt_expire;	/* expiration time */
 	uint8_t			brt_flags;	/* address flags */
 	uint8_t			brt_addr[ETHER_ADDR_LEN];
 	uint16_t		brt_vlan;	/* vlan id */
 };
 #define	brt_ifp			brt_dst->bif_ifp
 
 /*
  * Software state for each bridge.
  */
 struct bridge_softc {
 	struct ifnet		*sc_ifp;	/* make this an interface */
 	LIST_ENTRY(bridge_softc) sc_list;
 	struct mtx		sc_mtx;
 	struct cv		sc_cv;
 	uint32_t		sc_brtmax;	/* max # of addresses */
 	uint32_t		sc_brtcnt;	/* cur. # of addresses */
 	uint32_t		sc_brttimeout;	/* rt timeout in seconds */
 	struct callout		sc_brcallout;	/* bridge callout */
 	uint32_t		sc_iflist_ref;	/* refcount for sc_iflist */
 	uint32_t		sc_iflist_xcnt;	/* refcount for sc_iflist */
 	LIST_HEAD(, bridge_iflist) sc_iflist;	/* member interface list */
 	LIST_HEAD(, bridge_rtnode) *sc_rthash;	/* our forwarding table */
 	LIST_HEAD(, bridge_rtnode) sc_rtlist;	/* list version of above */
 	uint32_t		sc_rthash_key;	/* key for hash */
 	LIST_HEAD(, bridge_iflist) sc_spanlist;	/* span ports list */
 	struct bstp_state	sc_stp;		/* STP state */
 	uint32_t		sc_brtexceeded;	/* # of cache drops */
 	struct ifnet		*sc_ifaddr;	/* member mac copied from */
 	u_char			sc_defaddr[6];	/* Default MAC address */
 };
 
 static VNET_DEFINE(struct mtx, bridge_list_mtx);
 #define	V_bridge_list_mtx	VNET(bridge_list_mtx)
 eventhandler_tag	bridge_detach_cookie = NULL;
 
 int	bridge_rtable_prune_period = BRIDGE_RTABLE_PRUNE_PERIOD;
 
 uma_zone_t bridge_rtnode_zone;
 
 static int	bridge_clone_create(struct if_clone *, int, caddr_t);
 static void	bridge_clone_destroy(struct ifnet *);
 
 static int	bridge_ioctl(struct ifnet *, u_long, caddr_t);
 static void	bridge_mutecaps(struct bridge_softc *);
 static void	bridge_set_ifcap(struct bridge_softc *, struct bridge_iflist *,
 		    int);
 static void	bridge_ifdetach(void *arg __unused, struct ifnet *);
 static void	bridge_init(void *);
 static void	bridge_dummynet(struct mbuf *, struct ifnet *);
 static void	bridge_stop(struct ifnet *, int);
 static int	bridge_transmit(struct ifnet *, struct mbuf *);
 static void	bridge_qflush(struct ifnet *);
 static struct mbuf *bridge_input(struct ifnet *, struct mbuf *);
 static int	bridge_output(struct ifnet *, struct mbuf *, struct sockaddr *,
 		    struct rtentry *);
 static int	bridge_enqueue(struct bridge_softc *, struct ifnet *,
 		    struct mbuf *);
 static void	bridge_rtdelete(struct bridge_softc *, struct ifnet *ifp, int);
 
 static void	bridge_forward(struct bridge_softc *, struct bridge_iflist *,
 		    struct mbuf *m);
 
 static void	bridge_timer(void *);
 
 static void	bridge_broadcast(struct bridge_softc *, struct ifnet *,
 		    struct mbuf *, int);
 static void	bridge_span(struct bridge_softc *, struct mbuf *);
 
 static int	bridge_rtupdate(struct bridge_softc *, const uint8_t *,
 		    uint16_t, struct bridge_iflist *, int, uint8_t);
 static struct ifnet *bridge_rtlookup(struct bridge_softc *, const uint8_t *,
 		    uint16_t);
 static void	bridge_rttrim(struct bridge_softc *);
 static void	bridge_rtage(struct bridge_softc *);
 static void	bridge_rtflush(struct bridge_softc *, int);
 static int	bridge_rtdaddr(struct bridge_softc *, const uint8_t *,
 		    uint16_t);
 
 static void	bridge_rtable_init(struct bridge_softc *);
 static void	bridge_rtable_fini(struct bridge_softc *);
 
 static int	bridge_rtnode_addr_cmp(const uint8_t *, const uint8_t *);
 static struct bridge_rtnode *bridge_rtnode_lookup(struct bridge_softc *,
 		    const uint8_t *, uint16_t);
 static int	bridge_rtnode_insert(struct bridge_softc *,
 		    struct bridge_rtnode *);
 static void	bridge_rtnode_destroy(struct bridge_softc *,
 		    struct bridge_rtnode *);
 static void	bridge_rtable_expire(struct ifnet *, int);
 static void	bridge_state_change(struct ifnet *, int);
 
 static struct bridge_iflist *bridge_lookup_member(struct bridge_softc *,
 		    const char *name);
 static struct bridge_iflist *bridge_lookup_member_if(struct bridge_softc *,
 		    struct ifnet *ifp);
 static void	bridge_delete_member(struct bridge_softc *,
 		    struct bridge_iflist *, int);
 static void	bridge_delete_span(struct bridge_softc *,
 		    struct bridge_iflist *);
 
 static int	bridge_ioctl_add(struct bridge_softc *, void *);
 static int	bridge_ioctl_del(struct bridge_softc *, void *);
 static int	bridge_ioctl_gifflags(struct bridge_softc *, void *);
 static int	bridge_ioctl_sifflags(struct bridge_softc *, void *);
 static int	bridge_ioctl_scache(struct bridge_softc *, void *);
 static int	bridge_ioctl_gcache(struct bridge_softc *, void *);
 static int	bridge_ioctl_gifs(struct bridge_softc *, void *);
 static int	bridge_ioctl_rts(struct bridge_softc *, void *);
 static int	bridge_ioctl_saddr(struct bridge_softc *, void *);
 static int	bridge_ioctl_sto(struct bridge_softc *, void *);
 static int	bridge_ioctl_gto(struct bridge_softc *, void *);
 static int	bridge_ioctl_daddr(struct bridge_softc *, void *);
 static int	bridge_ioctl_flush(struct bridge_softc *, void *);
 static int	bridge_ioctl_gpri(struct bridge_softc *, void *);
 static int	bridge_ioctl_spri(struct bridge_softc *, void *);
 static int	bridge_ioctl_ght(struct bridge_softc *, void *);
 static int	bridge_ioctl_sht(struct bridge_softc *, void *);
 static int	bridge_ioctl_gfd(struct bridge_softc *, void *);
 static int	bridge_ioctl_sfd(struct bridge_softc *, void *);
 static int	bridge_ioctl_gma(struct bridge_softc *, void *);
 static int	bridge_ioctl_sma(struct bridge_softc *, void *);
 static int	bridge_ioctl_sifprio(struct bridge_softc *, void *);
 static int	bridge_ioctl_sifcost(struct bridge_softc *, void *);
 static int	bridge_ioctl_sifmaxaddr(struct bridge_softc *, void *);
 static int	bridge_ioctl_addspan(struct bridge_softc *, void *);
 static int	bridge_ioctl_delspan(struct bridge_softc *, void *);
 static int	bridge_ioctl_gbparam(struct bridge_softc *, void *);
 static int	bridge_ioctl_grte(struct bridge_softc *, void *);
 static int	bridge_ioctl_gifsstp(struct bridge_softc *, void *);
 static int	bridge_ioctl_sproto(struct bridge_softc *, void *);
 static int	bridge_ioctl_stxhc(struct bridge_softc *, void *);
 static int	bridge_pfil(struct mbuf **, struct ifnet *, struct ifnet *,
 		    int);
 static int	bridge_ip_checkbasic(struct mbuf **mp);
 #ifdef INET6
 static int	bridge_ip6_checkbasic(struct mbuf **mp);
 #endif /* INET6 */
 static int	bridge_fragment(struct ifnet *, struct mbuf *,
 		    struct ether_header *, int, struct llc *);
 static void	bridge_linkstate(struct ifnet *ifp);
 static void	bridge_linkcheck(struct bridge_softc *sc);
 
 extern void (*bridge_linkstate_p)(struct ifnet *ifp);
 
 /* The default bridge vlan is 1 (IEEE 802.1Q-2003 Table 9-2) */
 #define	VLANTAGOF(_m)	\
     (_m->m_flags & M_VLANTAG) ? EVL_VLANOFTAG(_m->m_pkthdr.ether_vtag) : 1
 
 static struct bstp_cb_ops bridge_ops = {
 	.bcb_state = bridge_state_change,
 	.bcb_rtage = bridge_rtable_expire
 };
 
 SYSCTL_DECL(_net_link);
 static SYSCTL_NODE(_net_link, IFT_BRIDGE, bridge, CTLFLAG_RW, 0, "Bridge");
 
 /* only pass IP[46] packets when pfil is enabled */
 static VNET_DEFINE(int, pfil_onlyip) = 1;
 #define	V_pfil_onlyip	VNET(pfil_onlyip)
 SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_onlyip,
     CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_onlyip), 0,
     "Only pass IP packets when pfil is enabled");
 
 /* run pfil hooks on the bridge interface */
 static VNET_DEFINE(int, pfil_bridge) = 1;
 #define	V_pfil_bridge	VNET(pfil_bridge)
 SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_bridge,
     CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_bridge), 0,
     "Packet filter on the bridge interface");
 
 /* layer2 filter with ipfw */
 static VNET_DEFINE(int, pfil_ipfw);
 #define	V_pfil_ipfw	VNET(pfil_ipfw)
 
 /* layer2 ARP filter with ipfw */
 static VNET_DEFINE(int, pfil_ipfw_arp);
 #define	V_pfil_ipfw_arp	VNET(pfil_ipfw_arp)
 SYSCTL_INT(_net_link_bridge, OID_AUTO, ipfw_arp,
     CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_ipfw_arp), 0,
     "Filter ARP packets through IPFW layer2");
 
 /* run pfil hooks on the member interface */
 static VNET_DEFINE(int, pfil_member) = 1;
 #define	V_pfil_member	VNET(pfil_member)
 SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_member,
     CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_member), 0,
     "Packet filter on the member interface");
 
 /* run pfil hooks on the physical interface for locally destined packets */
 static VNET_DEFINE(int, pfil_local_phys);
 #define	V_pfil_local_phys	VNET(pfil_local_phys)
 SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_local_phys,
     CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_local_phys), 0,
     "Packet filter on the physical interface for locally destined packets");
 
 /* log STP state changes */
 static VNET_DEFINE(int, log_stp);
 #define	V_log_stp	VNET(log_stp)
 SYSCTL_INT(_net_link_bridge, OID_AUTO, log_stp,
     CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(log_stp), 0,
     "Log STP state changes");
 
 /* share MAC with first bridge member */
 static VNET_DEFINE(int, bridge_inherit_mac);
 #define	V_bridge_inherit_mac	VNET(bridge_inherit_mac)
 SYSCTL_INT(_net_link_bridge, OID_AUTO, inherit_mac,
     CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(bridge_inherit_mac), 0,
     "Inherit MAC address from the first bridge member");
 
 static VNET_DEFINE(int, allow_llz_overlap) = 0;
 #define	V_allow_llz_overlap	VNET(allow_llz_overlap)
-SYSCTL_VNET_INT(_net_link_bridge, OID_AUTO, allow_llz_overlap,
-    CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(allow_llz_overlap), 0,
+SYSCTL_INT(_net_link_bridge, OID_AUTO, allow_llz_overlap,
+    CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(allow_llz_overlap), 0,
     "Allow overlap of link-local scope "
     "zones of a bridge interface and the member interfaces");
 
 struct bridge_control {
 	int	(*bc_func)(struct bridge_softc *, void *);
 	int	bc_argsize;
 	int	bc_flags;
 };
 
 #define	BC_F_COPYIN		0x01	/* copy arguments in */
 #define	BC_F_COPYOUT		0x02	/* copy arguments out */
 #define	BC_F_SUSER		0x04	/* do super-user check */
 
 const struct bridge_control bridge_control_table[] = {
 	{ bridge_ioctl_add,		sizeof(struct ifbreq),
 	  BC_F_COPYIN|BC_F_SUSER },
 	{ bridge_ioctl_del,		sizeof(struct ifbreq),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_gifflags,	sizeof(struct ifbreq),
 	  BC_F_COPYIN|BC_F_COPYOUT },
 	{ bridge_ioctl_sifflags,	sizeof(struct ifbreq),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_scache,		sizeof(struct ifbrparam),
 	  BC_F_COPYIN|BC_F_SUSER },
 	{ bridge_ioctl_gcache,		sizeof(struct ifbrparam),
 	  BC_F_COPYOUT },
 
 	{ bridge_ioctl_gifs,		sizeof(struct ifbifconf),
 	  BC_F_COPYIN|BC_F_COPYOUT },
 	{ bridge_ioctl_rts,		sizeof(struct ifbaconf),
 	  BC_F_COPYIN|BC_F_COPYOUT },
 
 	{ bridge_ioctl_saddr,		sizeof(struct ifbareq),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_sto,		sizeof(struct ifbrparam),
 	  BC_F_COPYIN|BC_F_SUSER },
 	{ bridge_ioctl_gto,		sizeof(struct ifbrparam),
 	  BC_F_COPYOUT },
 
 	{ bridge_ioctl_daddr,		sizeof(struct ifbareq),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_flush,		sizeof(struct ifbreq),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_gpri,		sizeof(struct ifbrparam),
 	  BC_F_COPYOUT },
 	{ bridge_ioctl_spri,		sizeof(struct ifbrparam),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_ght,		sizeof(struct ifbrparam),
 	  BC_F_COPYOUT },
 	{ bridge_ioctl_sht,		sizeof(struct ifbrparam),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_gfd,		sizeof(struct ifbrparam),
 	  BC_F_COPYOUT },
 	{ bridge_ioctl_sfd,		sizeof(struct ifbrparam),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_gma,		sizeof(struct ifbrparam),
 	  BC_F_COPYOUT },
 	{ bridge_ioctl_sma,		sizeof(struct ifbrparam),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_sifprio,		sizeof(struct ifbreq),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_sifcost,		sizeof(struct ifbreq),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_addspan,		sizeof(struct ifbreq),
 	  BC_F_COPYIN|BC_F_SUSER },
 	{ bridge_ioctl_delspan,		sizeof(struct ifbreq),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_gbparam,		sizeof(struct ifbropreq),
 	  BC_F_COPYOUT },
 
 	{ bridge_ioctl_grte,		sizeof(struct ifbrparam),
 	  BC_F_COPYOUT },
 
 	{ bridge_ioctl_gifsstp,		sizeof(struct ifbpstpconf),
 	  BC_F_COPYIN|BC_F_COPYOUT },
 
 	{ bridge_ioctl_sproto,		sizeof(struct ifbrparam),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_stxhc,		sizeof(struct ifbrparam),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 	{ bridge_ioctl_sifmaxaddr,	sizeof(struct ifbreq),
 	  BC_F_COPYIN|BC_F_SUSER },
 
 };
 const int bridge_control_table_size = nitems(bridge_control_table);
 
 static VNET_DEFINE(LIST_HEAD(, bridge_softc), bridge_list);
 #define	V_bridge_list	VNET(bridge_list)
 #define	BRIDGE_LIST_LOCK_INIT(x)	mtx_init(&V_bridge_list_mtx,	\
 					    "if_bridge list", NULL, MTX_DEF)
 #define	BRIDGE_LIST_LOCK_DESTROY(x)	mtx_destroy(&V_bridge_list_mtx)
 #define	BRIDGE_LIST_LOCK(x)		mtx_lock(&V_bridge_list_mtx)
 #define	BRIDGE_LIST_UNLOCK(x)		mtx_unlock(&V_bridge_list_mtx)
 
 static VNET_DEFINE(struct if_clone *, bridge_cloner);
 #define	V_bridge_cloner	VNET(bridge_cloner)
 
 static const char bridge_name[] = "bridge";
 
 static void
 vnet_bridge_init(const void *unused __unused)
 {
 
 	BRIDGE_LIST_LOCK_INIT();
 	LIST_INIT(&V_bridge_list);
 	V_bridge_cloner = if_clone_simple(bridge_name,
 	    bridge_clone_create, bridge_clone_destroy, 0);
 }
 VNET_SYSINIT(vnet_bridge_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     vnet_bridge_init, NULL);
 
 static void
 vnet_bridge_uninit(const void *unused __unused)
 {
 
 	if_clone_detach(V_bridge_cloner);
 	BRIDGE_LIST_LOCK_DESTROY();
 }
 VNET_SYSUNINIT(vnet_bridge_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     vnet_bridge_uninit, NULL);
 
 static int
 bridge_modevent(module_t mod, int type, void *data)
 {
 
 	switch (type) {
 	case MOD_LOAD:
 		bridge_rtnode_zone = uma_zcreate("bridge_rtnode",
 		    sizeof(struct bridge_rtnode), NULL, NULL, NULL, NULL,
 		    UMA_ALIGN_PTR, 0);
 		bridge_input_p = bridge_input;
 		bridge_output_p = bridge_output;
 		bridge_dn_p = bridge_dummynet;
 		bridge_linkstate_p = bridge_linkstate;
 		bridge_detach_cookie = EVENTHANDLER_REGISTER(
 		    ifnet_departure_event, bridge_ifdetach, NULL,
 		    EVENTHANDLER_PRI_ANY);
 		break;
 	case MOD_UNLOAD:
 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
 		    bridge_detach_cookie);
 		uma_zdestroy(bridge_rtnode_zone);
 		bridge_input_p = NULL;
 		bridge_output_p = NULL;
 		bridge_dn_p = NULL;
 		bridge_linkstate_p = NULL;
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 	return (0);
 }
 
 static moduledata_t bridge_mod = {
 	"if_bridge",
 	bridge_modevent,
 	0
 };
 
 DECLARE_MODULE(if_bridge, bridge_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_DEPEND(if_bridge, bridgestp, 1, 1, 1);
 
 /*
  * handler for net.link.bridge.ipfw
  */
 static int
 sysctl_pfil_ipfw(SYSCTL_HANDLER_ARGS)
 {
 	int enable = V_pfil_ipfw;
 	int error;
 
 	error = sysctl_handle_int(oidp, &enable, 0, req);
 	enable &= 1;
 
 	if (enable != V_pfil_ipfw) {
 		V_pfil_ipfw = enable;
 
 		/*
 		 * Disable pfil so that ipfw doesnt run twice, if the user
 		 * really wants both then they can re-enable pfil_bridge and/or
 		 * pfil_member. Also allow non-ip packets as ipfw can filter by
 		 * layer2 type.
 		 */
 		if (V_pfil_ipfw) {
 			V_pfil_onlyip = 0;
 			V_pfil_bridge = 0;
 			V_pfil_member = 0;
 		}
 	}
 
 	return (error);
 }
 SYSCTL_PROC(_net_link_bridge, OID_AUTO, ipfw,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET,
     &VNET_NAME(pfil_ipfw), 0, &sysctl_pfil_ipfw, "I",
     "Layer2 filter with IPFW");
 
 /*
  * bridge_clone_create:
  *
  *	Create a new bridge instance.
  */
 static int
 bridge_clone_create(struct if_clone *ifc, int unit, caddr_t params)
 {
 	struct bridge_softc *sc, *sc2;
 	struct ifnet *bifp, *ifp;
 	int fb, retry;
 	unsigned long hostid;
 
 	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
 	ifp = sc->sc_ifp = if_alloc(IFT_ETHER);
 	if (ifp == NULL) {
 		free(sc, M_DEVBUF);
 		return (ENOSPC);
 	}
 
 	BRIDGE_LOCK_INIT(sc);
 	sc->sc_brtmax = BRIDGE_RTABLE_MAX;
 	sc->sc_brttimeout = BRIDGE_RTABLE_TIMEOUT;
 
 	/* Initialize our routing table. */
 	bridge_rtable_init(sc);
 
 	callout_init_mtx(&sc->sc_brcallout, &sc->sc_mtx, 0);
 
 	LIST_INIT(&sc->sc_iflist);
 	LIST_INIT(&sc->sc_spanlist);
 
 	ifp->if_softc = sc;
 	if_initname(ifp, bridge_name, unit);
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
 	ifp->if_ioctl = bridge_ioctl;
 	ifp->if_transmit = bridge_transmit;
 	ifp->if_qflush = bridge_qflush;
 	ifp->if_init = bridge_init;
 	ifp->if_type = IFT_BRIDGE;
 
 	/*
 	 * Generate an ethernet address with a locally administered address.
 	 *
 	 * Since we are using random ethernet addresses for the bridge, it is
 	 * possible that we might have address collisions, so make sure that
 	 * this hardware address isn't already in use on another bridge.
 	 * The first try uses the hostid and falls back to arc4rand().
 	 */
 	fb = 0;
 	getcredhostid(curthread->td_ucred, &hostid);
 	do {
 		if (fb || hostid == 0) {
 			arc4rand(sc->sc_defaddr, ETHER_ADDR_LEN, 1);
 			sc->sc_defaddr[0] &= ~1;/* clear multicast bit */
 			sc->sc_defaddr[0] |= 2;	/* set the LAA bit */
 		} else {
 			sc->sc_defaddr[0] = 0x2;
 			sc->sc_defaddr[1] = (hostid >> 24) & 0xff;
 			sc->sc_defaddr[2] = (hostid >> 16) & 0xff;
 			sc->sc_defaddr[3] = (hostid >> 8 ) & 0xff;
 			sc->sc_defaddr[4] =  hostid        & 0xff;
 			sc->sc_defaddr[5] = ifp->if_dunit & 0xff;
 		}
 
 		fb = 1;
 		retry = 0;
 		BRIDGE_LIST_LOCK();
 		LIST_FOREACH(sc2, &V_bridge_list, sc_list) {
 			bifp = sc2->sc_ifp;
 			if (memcmp(sc->sc_defaddr,
 			    IF_LLADDR(bifp), ETHER_ADDR_LEN) == 0) {
 				retry = 1;
 				break;
 			}
 		}
 		BRIDGE_LIST_UNLOCK();
 	} while (retry == 1);
 
 	bstp_attach(&sc->sc_stp, &bridge_ops);
 	ether_ifattach(ifp, sc->sc_defaddr);
 	/* Now undo some of the damage... */
 	ifp->if_baudrate = 0;
 	ifp->if_type = IFT_BRIDGE;
 
 	BRIDGE_LIST_LOCK();
 	LIST_INSERT_HEAD(&V_bridge_list, sc, sc_list);
 	BRIDGE_LIST_UNLOCK();
 
 	return (0);
 }
 
 /*
  * bridge_clone_destroy:
  *
  *	Destroy a bridge instance.
  */
 static void
 bridge_clone_destroy(struct ifnet *ifp)
 {
 	struct bridge_softc *sc = ifp->if_softc;
 	struct bridge_iflist *bif;
 
 	BRIDGE_LOCK(sc);
 
 	bridge_stop(ifp, 1);
 	ifp->if_flags &= ~IFF_UP;
 
 	while ((bif = LIST_FIRST(&sc->sc_iflist)) != NULL)
 		bridge_delete_member(sc, bif, 0);
 
 	while ((bif = LIST_FIRST(&sc->sc_spanlist)) != NULL) {
 		bridge_delete_span(sc, bif);
 	}
 
 	BRIDGE_UNLOCK(sc);
 
 	callout_drain(&sc->sc_brcallout);
 
 	BRIDGE_LIST_LOCK();
 	LIST_REMOVE(sc, sc_list);
 	BRIDGE_LIST_UNLOCK();
 
 	bstp_detach(&sc->sc_stp);
 	ether_ifdetach(ifp);
 	if_free(ifp);
 
 	/* Tear down the routing table. */
 	bridge_rtable_fini(sc);
 
 	BRIDGE_LOCK_DESTROY(sc);
 	free(sc, M_DEVBUF);
 }
 
 /*
  * bridge_ioctl:
  *
  *	Handle a control request from the operator.
  */
 static int
 bridge_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct bridge_softc *sc = ifp->if_softc;
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct bridge_iflist *bif;
 	struct thread *td = curthread;
 	union {
 		struct ifbreq ifbreq;
 		struct ifbifconf ifbifconf;
 		struct ifbareq ifbareq;
 		struct ifbaconf ifbaconf;
 		struct ifbrparam ifbrparam;
 		struct ifbropreq ifbropreq;
 	} args;
 	struct ifdrv *ifd = (struct ifdrv *) data;
 	const struct bridge_control *bc;
 	int error = 0;
 
 	switch (cmd) {
 
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		break;
 
 	case SIOCGDRVSPEC:
 	case SIOCSDRVSPEC:
 		if (ifd->ifd_cmd >= bridge_control_table_size) {
 			error = EINVAL;
 			break;
 		}
 		bc = &bridge_control_table[ifd->ifd_cmd];
 
 		if (cmd == SIOCGDRVSPEC &&
 		    (bc->bc_flags & BC_F_COPYOUT) == 0) {
 			error = EINVAL;
 			break;
 		}
 		else if (cmd == SIOCSDRVSPEC &&
 		    (bc->bc_flags & BC_F_COPYOUT) != 0) {
 			error = EINVAL;
 			break;
 		}
 
 		if (bc->bc_flags & BC_F_SUSER) {
 			error = priv_check(td, PRIV_NET_BRIDGE);
 			if (error)
 				break;
 		}
 
 		if (ifd->ifd_len != bc->bc_argsize ||
 		    ifd->ifd_len > sizeof(args)) {
 			error = EINVAL;
 			break;
 		}
 
 		bzero(&args, sizeof(args));
 		if (bc->bc_flags & BC_F_COPYIN) {
 			error = copyin(ifd->ifd_data, &args, ifd->ifd_len);
 			if (error)
 				break;
 		}
 
 		BRIDGE_LOCK(sc);
 		error = (*bc->bc_func)(sc, &args);
 		BRIDGE_UNLOCK(sc);
 		if (error)
 			break;
 
 		if (bc->bc_flags & BC_F_COPYOUT)
 			error = copyout(&args, ifd->ifd_data, ifd->ifd_len);
 
 		break;
 
 	case SIOCSIFFLAGS:
 		if (!(ifp->if_flags & IFF_UP) &&
 		    (ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 			/*
 			 * If interface is marked down and it is running,
 			 * then stop and disable it.
 			 */
 			BRIDGE_LOCK(sc);
 			bridge_stop(ifp, 1);
 			BRIDGE_UNLOCK(sc);
 		} else if ((ifp->if_flags & IFF_UP) &&
 		    !(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 			/*
 			 * If interface is marked up and it is stopped, then
 			 * start it.
 			 */
 			(*ifp->if_init)(sc);
 		}
 		break;
 
 	case SIOCSIFMTU:
 		if (ifr->ifr_mtu < 576) {
 			error = EINVAL;
 			break;
 		}
 		if (LIST_EMPTY(&sc->sc_iflist)) {
 			sc->sc_ifp->if_mtu = ifr->ifr_mtu;
 			break;
 		}
 		BRIDGE_LOCK(sc);
 		LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
 			if (bif->bif_ifp->if_mtu != ifr->ifr_mtu) {
 				log(LOG_NOTICE, "%s: invalid MTU: %u(%s)"
 				    " != %d\n", sc->sc_ifp->if_xname,
 				    bif->bif_ifp->if_mtu,
 				    bif->bif_ifp->if_xname, ifr->ifr_mtu);
 				error = EINVAL;
 				break;
 			}
 		}
 		if (!error)
 			sc->sc_ifp->if_mtu = ifr->ifr_mtu;
 		BRIDGE_UNLOCK(sc);
 		break;
 	default:
 		/*
 		 * drop the lock as ether_ioctl() will call bridge_start() and
 		 * cause the lock to be recursed.
 		 */
 		error = ether_ioctl(ifp, cmd, data);
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * bridge_mutecaps:
  *
  *	Clear or restore unwanted capabilities on the member interface
  */
 static void
 bridge_mutecaps(struct bridge_softc *sc)
 {
 	struct bridge_iflist *bif;
 	int enabled, mask;
 
 	/* Initial bitmask of capabilities to test */
 	mask = BRIDGE_IFCAPS_MASK;
 
 	LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
 		/* Every member must support it or its disabled */
 		mask &= bif->bif_savedcaps;
 	}
 
 	LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
 		enabled = bif->bif_ifp->if_capenable;
 		enabled &= ~BRIDGE_IFCAPS_STRIP;
 		/* strip off mask bits and enable them again if allowed */
 		enabled &= ~BRIDGE_IFCAPS_MASK;
 		enabled |= mask;
 		bridge_set_ifcap(sc, bif, enabled);
 	}
 
 }
 
 static void
 bridge_set_ifcap(struct bridge_softc *sc, struct bridge_iflist *bif, int set)
 {
 	struct ifnet *ifp = bif->bif_ifp;
 	struct ifreq ifr;
 	int error;
 
 	bzero(&ifr, sizeof(ifr));
 	ifr.ifr_reqcap = set;
 
 	if (ifp->if_capenable != set) {
 		error = (*ifp->if_ioctl)(ifp, SIOCSIFCAP, (caddr_t)&ifr);
 		if (error)
 			if_printf(sc->sc_ifp,
 			    "error setting interface capabilities on %s\n",
 			    ifp->if_xname);
 	}
 }
 
 /*
  * bridge_lookup_member:
  *
  *	Lookup a bridge member interface.
  */
 static struct bridge_iflist *
 bridge_lookup_member(struct bridge_softc *sc, const char *name)
 {
 	struct bridge_iflist *bif;
 	struct ifnet *ifp;
 
 	BRIDGE_LOCK_ASSERT(sc);
 
 	LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
 		ifp = bif->bif_ifp;
 		if (strcmp(ifp->if_xname, name) == 0)
 			return (bif);
 	}
 
 	return (NULL);
 }
 
 /*
  * bridge_lookup_member_if:
  *
  *	Lookup a bridge member interface by ifnet*.
  */
 static struct bridge_iflist *
 bridge_lookup_member_if(struct bridge_softc *sc, struct ifnet *member_ifp)
 {
 	struct bridge_iflist *bif;
 
 	BRIDGE_LOCK_ASSERT(sc);
 
 	LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
 		if (bif->bif_ifp == member_ifp)
 			return (bif);
 	}
 
 	return (NULL);
 }
 
 /*
  * bridge_delete_member:
  *
  *	Delete the specified member interface.
  */
 static void
 bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif,
     int gone)
 {
 	struct ifnet *ifs = bif->bif_ifp;
 	struct ifnet *fif = NULL;
 
 	BRIDGE_LOCK_ASSERT(sc);
 
 	if (bif->bif_flags & IFBIF_STP)
 		bstp_disable(&bif->bif_stp);
 
 	ifs->if_bridge = NULL;
 	BRIDGE_XLOCK(sc);
 	LIST_REMOVE(bif, bif_next);
 	BRIDGE_XDROP(sc);
 
 	/*
 	 * If removing the interface that gave the bridge its mac address, set
 	 * the mac address of the bridge to the address of the next member, or
 	 * to its default address if no members are left.
 	 */
 	if (V_bridge_inherit_mac && sc->sc_ifaddr == ifs) {
 		if (LIST_EMPTY(&sc->sc_iflist)) {
 			bcopy(sc->sc_defaddr,
 			    IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
 			sc->sc_ifaddr = NULL;
 		} else {
 			fif = LIST_FIRST(&sc->sc_iflist)->bif_ifp;
 			bcopy(IF_LLADDR(fif),
 			    IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
 			sc->sc_ifaddr = fif;
 		}
 		EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
 	}
 
 	bridge_linkcheck(sc);
 	bridge_mutecaps(sc);	/* recalcuate now this interface is removed */
 	bridge_rtdelete(sc, ifs, IFBF_FLUSHALL);
 	KASSERT(bif->bif_addrcnt == 0,
 	    ("%s: %d bridge routes referenced", __func__, bif->bif_addrcnt));
 
 	BRIDGE_UNLOCK(sc);
 	if (!gone) {
 		switch (ifs->if_type) {
 		case IFT_ETHER:
 		case IFT_L2VLAN:
 			/*
 			 * Take the interface out of promiscuous mode.
 			 */
 			(void) ifpromisc(ifs, 0);
 			break;
 
 		case IFT_GIF:
 			break;
 
 		default:
 #ifdef DIAGNOSTIC
 			panic("bridge_delete_member: impossible");
 #endif
 			break;
 		}
 		/* reneable any interface capabilities */
 		bridge_set_ifcap(sc, bif, bif->bif_savedcaps);
 	}
 	bstp_destroy(&bif->bif_stp);	/* prepare to free */
 	BRIDGE_LOCK(sc);
 	free(bif, M_DEVBUF);
 }
 
 /*
  * bridge_delete_span:
  *
  *	Delete the specified span interface.
  */
 static void
 bridge_delete_span(struct bridge_softc *sc, struct bridge_iflist *bif)
 {
 	BRIDGE_LOCK_ASSERT(sc);
 
 	KASSERT(bif->bif_ifp->if_bridge == NULL,
 	    ("%s: not a span interface", __func__));
 
 	LIST_REMOVE(bif, bif_next);
 	free(bif, M_DEVBUF);
 }
 
 static int
 bridge_ioctl_add(struct bridge_softc *sc, void *arg)
 {
 	struct ifbreq *req = arg;
 	struct bridge_iflist *bif = NULL;
 	struct ifnet *ifs;
 	int error = 0;
 
 	ifs = ifunit(req->ifbr_ifsname);
 	if (ifs == NULL)
 		return (ENOENT);
 	if (ifs->if_ioctl == NULL)	/* must be supported */
 		return (EINVAL);
 
 	/* If it's in the span list, it can't be a member. */
 	LIST_FOREACH(bif, &sc->sc_spanlist, bif_next)
 		if (ifs == bif->bif_ifp)
 			return (EBUSY);
 
 	if (ifs->if_bridge == sc)
 		return (EEXIST);
 
 	if (ifs->if_bridge != NULL)
 		return (EBUSY);
 
 	switch (ifs->if_type) {
 	case IFT_ETHER:
 	case IFT_L2VLAN:
 	case IFT_GIF:
 		/* permitted interface types */
 		break;
 	default:
 		return (EINVAL);
 	}
 
 #ifdef INET6
 	/*
 	 * Two valid inet6 addresses with link-local scope must not be
 	 * on the parent interface and the member interfaces at the
 	 * same time.  This restriction is needed to prevent violation
 	 * of link-local scope zone.  Attempts to add a member
 	 * interface which has inet6 addresses when the parent has
 	 * inet6 triggers removal of all inet6 addresses on the member
 	 * interface.
 	 */
 
 	/* Check if the parent interface has a link-local scope addr. */
 	if (V_allow_llz_overlap == 0 &&
 	    in6ifa_llaonifp(sc->sc_ifp) != NULL) {
 		/*
 		 * If any, remove all inet6 addresses from the member
 		 * interfaces.
 		 */
 		BRIDGE_XLOCK(sc);
 		LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
  			if (in6ifa_llaonifp(bif->bif_ifp)) {
 				BRIDGE_UNLOCK(sc);
 				in6_ifdetach(bif->bif_ifp);
 				BRIDGE_LOCK(sc);
 				if_printf(sc->sc_ifp,
 				    "IPv6 addresses on %s have been removed "
 				    "before adding it as a member to prevent "
 				    "IPv6 address scope violation.\n",
 				    bif->bif_ifp->if_xname);
 			}
 		}
 		BRIDGE_XDROP(sc);
 		if (in6ifa_llaonifp(ifs)) {
 			BRIDGE_UNLOCK(sc);
 			in6_ifdetach(ifs);
 			BRIDGE_LOCK(sc);
 			if_printf(sc->sc_ifp,
 			    "IPv6 addresses on %s have been removed "
 			    "before adding it as a member to prevent "
 			    "IPv6 address scope violation.\n",
 			    ifs->if_xname);
 		}
 	}
 #endif
 	/* Allow the first Ethernet member to define the MTU */
 	if (LIST_EMPTY(&sc->sc_iflist))
 		sc->sc_ifp->if_mtu = ifs->if_mtu;
 	else if (sc->sc_ifp->if_mtu != ifs->if_mtu) {
 		if_printf(sc->sc_ifp, "invalid MTU: %u(%s) != %u\n",
 		    ifs->if_mtu, ifs->if_xname, sc->sc_ifp->if_mtu);
 		return (EINVAL);
 	}
 
 	bif = malloc(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO);
 	if (bif == NULL)
 		return (ENOMEM);
 
 	bif->bif_ifp = ifs;
 	bif->bif_flags = IFBIF_LEARNING | IFBIF_DISCOVER;
 	bif->bif_savedcaps = ifs->if_capenable;
 
 	/*
 	 * Assign the interface's MAC address to the bridge if it's the first
 	 * member and the MAC address of the bridge has not been changed from
 	 * the default randomly generated one.
 	 */
 	if (V_bridge_inherit_mac && LIST_EMPTY(&sc->sc_iflist) &&
 	    !memcmp(IF_LLADDR(sc->sc_ifp), sc->sc_defaddr, ETHER_ADDR_LEN)) {
 		bcopy(IF_LLADDR(ifs), IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
 		sc->sc_ifaddr = ifs;
 		EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
 	}
 
 	ifs->if_bridge = sc;
 	bstp_create(&sc->sc_stp, &bif->bif_stp, bif->bif_ifp);
 	/*
 	 * XXX: XLOCK HERE!?!
 	 *
 	 * NOTE: insert_***HEAD*** should be safe for the traversals.
 	 */
 	LIST_INSERT_HEAD(&sc->sc_iflist, bif, bif_next);
 
 	/* Set interface capabilities to the intersection set of all members */
 	bridge_mutecaps(sc);
 	bridge_linkcheck(sc);
 
 	/* Place the interface into promiscuous mode */
 	switch (ifs->if_type) {
 		case IFT_ETHER:
 		case IFT_L2VLAN:
 			BRIDGE_UNLOCK(sc);
 			error = ifpromisc(ifs, 1);
 			BRIDGE_LOCK(sc);
 			break;
 	}
 
 	if (error) {
 		bridge_delete_member(sc, bif, 0);
 		free(bif, M_DEVBUF);
 	}
 	return (error);
 }
 
 static int
 bridge_ioctl_del(struct bridge_softc *sc, void *arg)
 {
 	struct ifbreq *req = arg;
 	struct bridge_iflist *bif;
 
 	bif = bridge_lookup_member(sc, req->ifbr_ifsname);
 	if (bif == NULL)
 		return (ENOENT);
 
 	bridge_delete_member(sc, bif, 0);
 
 	return (0);
 }
 
 static int
 bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg)
 {
 	struct ifbreq *req = arg;
 	struct bridge_iflist *bif;
 	struct bstp_port *bp;
 
 	bif = bridge_lookup_member(sc, req->ifbr_ifsname);
 	if (bif == NULL)
 		return (ENOENT);
 
 	bp = &bif->bif_stp;
 	req->ifbr_ifsflags = bif->bif_flags;
 	req->ifbr_state = bp->bp_state;
 	req->ifbr_priority = bp->bp_priority;
 	req->ifbr_path_cost = bp->bp_path_cost;
 	req->ifbr_portno = bif->bif_ifp->if_index & 0xfff;
 	req->ifbr_proto = bp->bp_protover;
 	req->ifbr_role = bp->bp_role;
 	req->ifbr_stpflags = bp->bp_flags;
 	req->ifbr_addrcnt = bif->bif_addrcnt;
 	req->ifbr_addrmax = bif->bif_addrmax;
 	req->ifbr_addrexceeded = bif->bif_addrexceeded;
 
 	/* Copy STP state options as flags */
 	if (bp->bp_operedge)
 		req->ifbr_ifsflags |= IFBIF_BSTP_EDGE;
 	if (bp->bp_flags & BSTP_PORT_AUTOEDGE)
 		req->ifbr_ifsflags |= IFBIF_BSTP_AUTOEDGE;
 	if (bp->bp_ptp_link)
 		req->ifbr_ifsflags |= IFBIF_BSTP_PTP;
 	if (bp->bp_flags & BSTP_PORT_AUTOPTP)
 		req->ifbr_ifsflags |= IFBIF_BSTP_AUTOPTP;
 	if (bp->bp_flags & BSTP_PORT_ADMEDGE)
 		req->ifbr_ifsflags |= IFBIF_BSTP_ADMEDGE;
 	if (bp->bp_flags & BSTP_PORT_ADMCOST)
 		req->ifbr_ifsflags |= IFBIF_BSTP_ADMCOST;
 	return (0);
 }
 
 static int
 bridge_ioctl_sifflags(struct bridge_softc *sc, void *arg)
 {
 	struct ifbreq *req = arg;
 	struct bridge_iflist *bif;
 	struct bstp_port *bp;
 	int error;
 
 	bif = bridge_lookup_member(sc, req->ifbr_ifsname);
 	if (bif == NULL)
 		return (ENOENT);
 	bp = &bif->bif_stp;
 
 	if (req->ifbr_ifsflags & IFBIF_SPAN)
 		/* SPAN is readonly */
 		return (EINVAL);
 
 	if (req->ifbr_ifsflags & IFBIF_STP) {
 		if ((bif->bif_flags & IFBIF_STP) == 0) {
 			error = bstp_enable(&bif->bif_stp);
 			if (error)
 				return (error);
 		}
 	} else {
 		if ((bif->bif_flags & IFBIF_STP) != 0)
 			bstp_disable(&bif->bif_stp);
 	}
 
 	/* Pass on STP flags */
 	bstp_set_edge(bp, req->ifbr_ifsflags & IFBIF_BSTP_EDGE ? 1 : 0);
 	bstp_set_autoedge(bp, req->ifbr_ifsflags & IFBIF_BSTP_AUTOEDGE ? 1 : 0);
 	bstp_set_ptp(bp, req->ifbr_ifsflags & IFBIF_BSTP_PTP ? 1 : 0);
 	bstp_set_autoptp(bp, req->ifbr_ifsflags & IFBIF_BSTP_AUTOPTP ? 1 : 0);
 
 	/* Save the bits relating to the bridge */
 	bif->bif_flags = req->ifbr_ifsflags & IFBIFMASK;
 
 	return (0);
 }
 
 static int
 bridge_ioctl_scache(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *param = arg;
 
 	sc->sc_brtmax = param->ifbrp_csize;
 	bridge_rttrim(sc);
 
 	return (0);
 }
 
 static int
 bridge_ioctl_gcache(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *param = arg;
 
 	param->ifbrp_csize = sc->sc_brtmax;
 
 	return (0);
 }
 
 static int
 bridge_ioctl_gifs(struct bridge_softc *sc, void *arg)
 {
 	struct ifbifconf *bifc = arg;
 	struct bridge_iflist *bif;
 	struct ifbreq breq;
 	char *buf, *outbuf;
 	int count, buflen, len, error = 0;
 
 	count = 0;
 	LIST_FOREACH(bif, &sc->sc_iflist, bif_next)
 		count++;
 	LIST_FOREACH(bif, &sc->sc_spanlist, bif_next)
 		count++;
 
 	buflen = sizeof(breq) * count;
 	if (bifc->ifbic_len == 0) {
 		bifc->ifbic_len = buflen;
 		return (0);
 	}
 	BRIDGE_UNLOCK(sc);
 	outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);
 	BRIDGE_LOCK(sc);
 
 	count = 0;
 	buf = outbuf;
 	len = min(bifc->ifbic_len, buflen);
 	bzero(&breq, sizeof(breq));
 	LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
 		if (len < sizeof(breq))
 			break;
 
 		strlcpy(breq.ifbr_ifsname, bif->bif_ifp->if_xname,
 		    sizeof(breq.ifbr_ifsname));
 		/* Fill in the ifbreq structure */
 		error = bridge_ioctl_gifflags(sc, &breq);
 		if (error)
 			break;
 		memcpy(buf, &breq, sizeof(breq));
 		count++;
 		buf += sizeof(breq);
 		len -= sizeof(breq);
 	}
 	LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) {
 		if (len < sizeof(breq))
 			break;
 
 		strlcpy(breq.ifbr_ifsname, bif->bif_ifp->if_xname,
 		    sizeof(breq.ifbr_ifsname));
 		breq.ifbr_ifsflags = bif->bif_flags;
 		breq.ifbr_portno = bif->bif_ifp->if_index & 0xfff;
 		memcpy(buf, &breq, sizeof(breq));
 		count++;
 		buf += sizeof(breq);
 		len -= sizeof(breq);
 	}
 
 	BRIDGE_UNLOCK(sc);
 	bifc->ifbic_len = sizeof(breq) * count;
 	error = copyout(outbuf, bifc->ifbic_req, bifc->ifbic_len);
 	BRIDGE_LOCK(sc);
 	free(outbuf, M_TEMP);
 	return (error);
 }
 
 static int
 bridge_ioctl_rts(struct bridge_softc *sc, void *arg)
 {
 	struct ifbaconf *bac = arg;
 	struct bridge_rtnode *brt;
 	struct ifbareq bareq;
 	char *buf, *outbuf;
 	int count, buflen, len, error = 0;
 
 	if (bac->ifbac_len == 0)
 		return (0);
 
 	count = 0;
 	LIST_FOREACH(brt, &sc->sc_rtlist, brt_list)
 		count++;
 	buflen = sizeof(bareq) * count;
 
 	BRIDGE_UNLOCK(sc);
 	outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);
 	BRIDGE_LOCK(sc);
 
 	count = 0;
 	buf = outbuf;
 	len = min(bac->ifbac_len, buflen);
 	bzero(&bareq, sizeof(bareq));
 	LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) {
 		if (len < sizeof(bareq))
 			goto out;
 		strlcpy(bareq.ifba_ifsname, brt->brt_ifp->if_xname,
 		    sizeof(bareq.ifba_ifsname));
 		memcpy(bareq.ifba_dst, brt->brt_addr, sizeof(brt->brt_addr));
 		bareq.ifba_vlan = brt->brt_vlan;
 		if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC &&
 				time_uptime < brt->brt_expire)
 			bareq.ifba_expire = brt->brt_expire - time_uptime;
 		else
 			bareq.ifba_expire = 0;
 		bareq.ifba_flags = brt->brt_flags;
 
 		memcpy(buf, &bareq, sizeof(bareq));
 		count++;
 		buf += sizeof(bareq);
 		len -= sizeof(bareq);
 	}
 out:
 	BRIDGE_UNLOCK(sc);
 	bac->ifbac_len = sizeof(bareq) * count;
 	error = copyout(outbuf, bac->ifbac_req, bac->ifbac_len);
 	BRIDGE_LOCK(sc);
 	free(outbuf, M_TEMP);
 	return (error);
 }
 
 static int
 bridge_ioctl_saddr(struct bridge_softc *sc, void *arg)
 {
 	struct ifbareq *req = arg;
 	struct bridge_iflist *bif;
 	int error;
 
 	bif = bridge_lookup_member(sc, req->ifba_ifsname);
 	if (bif == NULL)
 		return (ENOENT);
 
 	error = bridge_rtupdate(sc, req->ifba_dst, req->ifba_vlan, bif, 1,
 	    req->ifba_flags);
 
 	return (error);
 }
 
 static int
 bridge_ioctl_sto(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *param = arg;
 
 	sc->sc_brttimeout = param->ifbrp_ctime;
 	return (0);
 }
 
 static int
 bridge_ioctl_gto(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *param = arg;
 
 	param->ifbrp_ctime = sc->sc_brttimeout;
 	return (0);
 }
 
 static int
 bridge_ioctl_daddr(struct bridge_softc *sc, void *arg)
 {
 	struct ifbareq *req = arg;
 
 	return (bridge_rtdaddr(sc, req->ifba_dst, req->ifba_vlan));
 }
 
 static int
 bridge_ioctl_flush(struct bridge_softc *sc, void *arg)
 {
 	struct ifbreq *req = arg;
 
 	bridge_rtflush(sc, req->ifbr_ifsflags);
 	return (0);
 }
 
 static int
 bridge_ioctl_gpri(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *param = arg;
 	struct bstp_state *bs = &sc->sc_stp;
 
 	param->ifbrp_prio = bs->bs_bridge_priority;
 	return (0);
 }
 
 static int
 bridge_ioctl_spri(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *param = arg;
 
 	return (bstp_set_priority(&sc->sc_stp, param->ifbrp_prio));
 }
 
 static int
 bridge_ioctl_ght(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *param = arg;
 	struct bstp_state *bs = &sc->sc_stp;
 
 	param->ifbrp_hellotime = bs->bs_bridge_htime >> 8;
 	return (0);
 }
 
 static int
 bridge_ioctl_sht(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *param = arg;
 
 	return (bstp_set_htime(&sc->sc_stp, param->ifbrp_hellotime));
 }
 
 static int
 bridge_ioctl_gfd(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *param = arg;
 	struct bstp_state *bs = &sc->sc_stp;
 
 	param->ifbrp_fwddelay = bs->bs_bridge_fdelay >> 8;
 	return (0);
 }
 
 static int
 bridge_ioctl_sfd(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *param = arg;
 
 	return (bstp_set_fdelay(&sc->sc_stp, param->ifbrp_fwddelay));
 }
 
 static int
 bridge_ioctl_gma(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *param = arg;
 	struct bstp_state *bs = &sc->sc_stp;
 
 	param->ifbrp_maxage = bs->bs_bridge_max_age >> 8;
 	return (0);
 }
 
 static int
 bridge_ioctl_sma(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *param = arg;
 
 	return (bstp_set_maxage(&sc->sc_stp, param->ifbrp_maxage));
 }
 
 static int
 bridge_ioctl_sifprio(struct bridge_softc *sc, void *arg)
 {
 	struct ifbreq *req = arg;
 	struct bridge_iflist *bif;
 
 	bif = bridge_lookup_member(sc, req->ifbr_ifsname);
 	if (bif == NULL)
 		return (ENOENT);
 
 	return (bstp_set_port_priority(&bif->bif_stp, req->ifbr_priority));
 }
 
 static int
 bridge_ioctl_sifcost(struct bridge_softc *sc, void *arg)
 {
 	struct ifbreq *req = arg;
 	struct bridge_iflist *bif;
 
 	bif = bridge_lookup_member(sc, req->ifbr_ifsname);
 	if (bif == NULL)
 		return (ENOENT);
 
 	return (bstp_set_path_cost(&bif->bif_stp, req->ifbr_path_cost));
 }
 
 static int
 bridge_ioctl_sifmaxaddr(struct bridge_softc *sc, void *arg)
 {
 	struct ifbreq *req = arg;
 	struct bridge_iflist *bif;
 
 	bif = bridge_lookup_member(sc, req->ifbr_ifsname);
 	if (bif == NULL)
 		return (ENOENT);
 
 	bif->bif_addrmax = req->ifbr_addrmax;
 	return (0);
 }
 
 static int
 bridge_ioctl_addspan(struct bridge_softc *sc, void *arg)
 {
 	struct ifbreq *req = arg;
 	struct bridge_iflist *bif = NULL;
 	struct ifnet *ifs;
 
 	ifs = ifunit(req->ifbr_ifsname);
 	if (ifs == NULL)
 		return (ENOENT);
 
 	LIST_FOREACH(bif, &sc->sc_spanlist, bif_next)
 		if (ifs == bif->bif_ifp)
 			return (EBUSY);
 
 	if (ifs->if_bridge != NULL)
 		return (EBUSY);
 
 	switch (ifs->if_type) {
 		case IFT_ETHER:
 		case IFT_GIF:
 		case IFT_L2VLAN:
 			break;
 		default:
 			return (EINVAL);
 	}
 
 	bif = malloc(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO);
 	if (bif == NULL)
 		return (ENOMEM);
 
 	bif->bif_ifp = ifs;
 	bif->bif_flags = IFBIF_SPAN;
 
 	LIST_INSERT_HEAD(&sc->sc_spanlist, bif, bif_next);
 
 	return (0);
 }
 
 static int
 bridge_ioctl_delspan(struct bridge_softc *sc, void *arg)
 {
 	struct ifbreq *req = arg;
 	struct bridge_iflist *bif;
 	struct ifnet *ifs;
 
 	ifs = ifunit(req->ifbr_ifsname);
 	if (ifs == NULL)
 		return (ENOENT);
 
 	LIST_FOREACH(bif, &sc->sc_spanlist, bif_next)
 		if (ifs == bif->bif_ifp)
 			break;
 
 	if (bif == NULL)
 		return (ENOENT);
 
 	bridge_delete_span(sc, bif);
 
 	return (0);
 }
 
 static int
 bridge_ioctl_gbparam(struct bridge_softc *sc, void *arg)
 {
 	struct ifbropreq *req = arg;
 	struct bstp_state *bs = &sc->sc_stp;
 	struct bstp_port *root_port;
 
 	req->ifbop_maxage = bs->bs_bridge_max_age >> 8;
 	req->ifbop_hellotime = bs->bs_bridge_htime >> 8;
 	req->ifbop_fwddelay = bs->bs_bridge_fdelay >> 8;
 
 	root_port = bs->bs_root_port;
 	if (root_port == NULL)
 		req->ifbop_root_port = 0;
 	else
 		req->ifbop_root_port = root_port->bp_ifp->if_index;
 
 	req->ifbop_holdcount = bs->bs_txholdcount;
 	req->ifbop_priority = bs->bs_bridge_priority;
 	req->ifbop_protocol = bs->bs_protover;
 	req->ifbop_root_path_cost = bs->bs_root_pv.pv_cost;
 	req->ifbop_bridgeid = bs->bs_bridge_pv.pv_dbridge_id;
 	req->ifbop_designated_root = bs->bs_root_pv.pv_root_id;
 	req->ifbop_designated_bridge = bs->bs_root_pv.pv_dbridge_id;
 	req->ifbop_last_tc_time.tv_sec = bs->bs_last_tc_time.tv_sec;
 	req->ifbop_last_tc_time.tv_usec = bs->bs_last_tc_time.tv_usec;
 
 	return (0);
 }
 
 static int
 bridge_ioctl_grte(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *param = arg;
 
 	param->ifbrp_cexceeded = sc->sc_brtexceeded;
 	return (0);
 }
 
 static int
 bridge_ioctl_gifsstp(struct bridge_softc *sc, void *arg)
 {
 	struct ifbpstpconf *bifstp = arg;
 	struct bridge_iflist *bif;
 	struct bstp_port *bp;
 	struct ifbpstpreq bpreq;
 	char *buf, *outbuf;
 	int count, buflen, len, error = 0;
 
 	count = 0;
 	LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
 		if ((bif->bif_flags & IFBIF_STP) != 0)
 			count++;
 	}
 
 	buflen = sizeof(bpreq) * count;
 	if (bifstp->ifbpstp_len == 0) {
 		bifstp->ifbpstp_len = buflen;
 		return (0);
 	}
 
 	BRIDGE_UNLOCK(sc);
 	outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);
 	BRIDGE_LOCK(sc);
 
 	count = 0;
 	buf = outbuf;
 	len = min(bifstp->ifbpstp_len, buflen);
 	bzero(&bpreq, sizeof(bpreq));
 	LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
 		if (len < sizeof(bpreq))
 			break;
 
 		if ((bif->bif_flags & IFBIF_STP) == 0)
 			continue;
 
 		bp = &bif->bif_stp;
 		bpreq.ifbp_portno = bif->bif_ifp->if_index & 0xfff;
 		bpreq.ifbp_fwd_trans = bp->bp_forward_transitions;
 		bpreq.ifbp_design_cost = bp->bp_desg_pv.pv_cost;
 		bpreq.ifbp_design_port = bp->bp_desg_pv.pv_port_id;
 		bpreq.ifbp_design_bridge = bp->bp_desg_pv.pv_dbridge_id;
 		bpreq.ifbp_design_root = bp->bp_desg_pv.pv_root_id;
 
 		memcpy(buf, &bpreq, sizeof(bpreq));
 		count++;
 		buf += sizeof(bpreq);
 		len -= sizeof(bpreq);
 	}
 
 	BRIDGE_UNLOCK(sc);
 	bifstp->ifbpstp_len = sizeof(bpreq) * count;
 	error = copyout(outbuf, bifstp->ifbpstp_req, bifstp->ifbpstp_len);
 	BRIDGE_LOCK(sc);
 	free(outbuf, M_TEMP);
 	return (error);
 }
 
 static int
 bridge_ioctl_sproto(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *param = arg;
 
 	return (bstp_set_protocol(&sc->sc_stp, param->ifbrp_proto));
 }
 
 static int
 bridge_ioctl_stxhc(struct bridge_softc *sc, void *arg)
 {
 	struct ifbrparam *param = arg;
 
 	return (bstp_set_holdcount(&sc->sc_stp, param->ifbrp_txhc));
 }
 
 /*
  * bridge_ifdetach:
  *
  *	Detach an interface from a bridge.  Called when a member
  *	interface is detaching.
  */
 static void
 bridge_ifdetach(void *arg __unused, struct ifnet *ifp)
 {
 	struct bridge_softc *sc = ifp->if_bridge;
 	struct bridge_iflist *bif;
 
 	if (ifp->if_flags & IFF_RENAMING)
 		return;
 
 	/* Check if the interface is a bridge member */
 	if (sc != NULL) {
 		BRIDGE_LOCK(sc);
 
 		bif = bridge_lookup_member_if(sc, ifp);
 		if (bif != NULL)
 			bridge_delete_member(sc, bif, 1);
 
 		BRIDGE_UNLOCK(sc);
 		return;
 	}
 
 	/* Check if the interface is a span port */
 	BRIDGE_LIST_LOCK();
 	LIST_FOREACH(sc, &V_bridge_list, sc_list) {
 		BRIDGE_LOCK(sc);
 		LIST_FOREACH(bif, &sc->sc_spanlist, bif_next)
 			if (ifp == bif->bif_ifp) {
 				bridge_delete_span(sc, bif);
 				break;
 			}
 
 		BRIDGE_UNLOCK(sc);
 	}
 	BRIDGE_LIST_UNLOCK();
 }
 
 /*
  * bridge_init:
  *
  *	Initialize a bridge interface.
  */
 static void
 bridge_init(void *xsc)
 {
 	struct bridge_softc *sc = (struct bridge_softc *)xsc;
 	struct ifnet *ifp = sc->sc_ifp;
 
 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 		return;
 
 	BRIDGE_LOCK(sc);
 	callout_reset(&sc->sc_brcallout, bridge_rtable_prune_period * hz,
 	    bridge_timer, sc);
 
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	bstp_init(&sc->sc_stp);		/* Initialize Spanning Tree */
 
 	BRIDGE_UNLOCK(sc);
 }
 
 /*
  * bridge_stop:
  *
  *	Stop the bridge interface.
  */
 static void
 bridge_stop(struct ifnet *ifp, int disable)
 {
 	struct bridge_softc *sc = ifp->if_softc;
 
 	BRIDGE_LOCK_ASSERT(sc);
 
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 		return;
 
 	callout_stop(&sc->sc_brcallout);
 	bstp_stop(&sc->sc_stp);
 
 	bridge_rtflush(sc, IFBF_FLUSHDYN);
 
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 }
 
 /*
  * bridge_enqueue:
  *
  *	Enqueue a packet on a bridge member interface.
  *
  */
 static int
 bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m)
 {
 	int len, err = 0;
 	short mflags;
 	struct mbuf *m0;
 
 	/* We may be sending a fragment so traverse the mbuf */
 	for (; m; m = m0) {
 		m0 = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		len = m->m_pkthdr.len;
 		mflags = m->m_flags;
 
 		/*
 		 * If underlying interface can not do VLAN tag insertion itself
 		 * then attach a packet tag that holds it.
 		 */
 		if ((m->m_flags & M_VLANTAG) &&
 		    (dst_ifp->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) {
 			m = ether_vlanencap(m, m->m_pkthdr.ether_vtag);
 			if (m == NULL) {
 				if_printf(dst_ifp,
 				    "unable to prepend VLAN header\n");
 				if_inc_counter(dst_ifp, IFCOUNTER_OERRORS, 1);
 				continue;
 			}
 			m->m_flags &= ~M_VLANTAG;
 		}
 
 		if ((err = dst_ifp->if_transmit(dst_ifp, m))) {
 			m_freem(m0);
 			if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
 			break;
 		}
 
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1);
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, len);
 		if (mflags & M_MCAST)
 			if_inc_counter(sc->sc_ifp, IFCOUNTER_OMCASTS, 1);
 	}
 
 	return (err);
 }
 
 /*
  * bridge_dummynet:
  *
  * 	Receive a queued packet from dummynet and pass it on to the output
  * 	interface.
  *
  *	The mbuf has the Ethernet header already attached.
  */
 static void
 bridge_dummynet(struct mbuf *m, struct ifnet *ifp)
 {
 	struct bridge_softc *sc;
 
 	sc = ifp->if_bridge;
 
 	/*
 	 * The packet didnt originate from a member interface. This should only
 	 * ever happen if a member interface is removed while packets are
 	 * queued for it.
 	 */
 	if (sc == NULL) {
 		m_freem(m);
 		return;
 	}
 
 	if (PFIL_HOOKED(&V_inet_pfil_hook)
 #ifdef INET6
 	    || PFIL_HOOKED(&V_inet6_pfil_hook)
 #endif
 	    ) {
 		if (bridge_pfil(&m, sc->sc_ifp, ifp, PFIL_OUT) != 0)
 			return;
 		if (m == NULL)
 			return;
 	}
 
 	bridge_enqueue(sc, ifp, m);
 }
 
 /*
  * bridge_output:
  *
  *	Send output from a bridge member interface.  This
  *	performs the bridging function for locally originated
  *	packets.
  *
  *	The mbuf has the Ethernet header already attached.  We must
  *	enqueue or free the mbuf before returning.
  */
 static int
 bridge_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa,
     struct rtentry *rt)
 {
 	struct ether_header *eh;
 	struct ifnet *dst_if;
 	struct bridge_softc *sc;
 	uint16_t vlan;
 
 	if (m->m_len < ETHER_HDR_LEN) {
 		m = m_pullup(m, ETHER_HDR_LEN);
 		if (m == NULL)
 			return (0);
 	}
 
 	eh = mtod(m, struct ether_header *);
 	sc = ifp->if_bridge;
 	vlan = VLANTAGOF(m);
 
 	BRIDGE_LOCK(sc);
 
 	/*
 	 * If bridge is down, but the original output interface is up,
 	 * go ahead and send out that interface.  Otherwise, the packet
 	 * is dropped below.
 	 */
 	if ((sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		dst_if = ifp;
 		goto sendunicast;
 	}
 
 	/*
 	 * If the packet is a multicast, or we don't know a better way to
 	 * get there, send to all interfaces.
 	 */
 	if (ETHER_IS_MULTICAST(eh->ether_dhost))
 		dst_if = NULL;
 	else
 		dst_if = bridge_rtlookup(sc, eh->ether_dhost, vlan);
 	if (dst_if == NULL) {
 		struct bridge_iflist *bif;
 		struct mbuf *mc;
 		int error = 0, used = 0;
 
 		bridge_span(sc, m);
 
 		BRIDGE_LOCK2REF(sc, error);
 		if (error) {
 			m_freem(m);
 			return (0);
 		}
 
 		LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
 			dst_if = bif->bif_ifp;
 
 			if (dst_if->if_type == IFT_GIF)
 				continue;
 			if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0)
 				continue;
 
 			/*
 			 * If this is not the original output interface,
 			 * and the interface is participating in spanning
 			 * tree, make sure the port is in a state that
 			 * allows forwarding.
 			 */
 			if (dst_if != ifp && (bif->bif_flags & IFBIF_STP) &&
 			    bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
 				continue;
 
 			if (LIST_NEXT(bif, bif_next) == NULL) {
 				used = 1;
 				mc = m;
 			} else {
 				mc = m_copypacket(m, M_NOWAIT);
 				if (mc == NULL) {
 					if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
 					continue;
 				}
 			}
 
 			bridge_enqueue(sc, dst_if, mc);
 		}
 		if (used == 0)
 			m_freem(m);
 		BRIDGE_UNREF(sc);
 		return (0);
 	}
 
 sendunicast:
 	/*
 	 * XXX Spanning tree consideration here?
 	 */
 
 	bridge_span(sc, m);
 	if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		m_freem(m);
 		BRIDGE_UNLOCK(sc);
 		return (0);
 	}
 
 	BRIDGE_UNLOCK(sc);
 	bridge_enqueue(sc, dst_if, m);
 	return (0);
 }
 
 /*
  * bridge_transmit:
  *
  *	Do output on a bridge.
  *
  */
 static int
 bridge_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct bridge_softc *sc;
 	struct ether_header *eh;
 	struct ifnet *dst_if;
 	int error = 0;
 
 	sc = ifp->if_softc;
 
 	ETHER_BPF_MTAP(ifp, m);
 
 	eh = mtod(m, struct ether_header *);
 
 	BRIDGE_LOCK(sc);
 	if (((m->m_flags & (M_BCAST|M_MCAST)) == 0) &&
 	    (dst_if = bridge_rtlookup(sc, eh->ether_dhost, 1)) != NULL) {
 		BRIDGE_UNLOCK(sc);
 		error = bridge_enqueue(sc, dst_if, m);
 	} else
 		bridge_broadcast(sc, ifp, m, 0);
 
 	return (error);
 }
 
 /*
  * The ifp->if_qflush entry point for if_bridge(4) is no-op.
  */
 static void
 bridge_qflush(struct ifnet *ifp __unused)
 {
 }
 
 /*
  * bridge_forward:
  *
  *	The forwarding function of the bridge.
  *
  *	NOTE: Releases the lock on return.
  */
 static void
 bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif,
     struct mbuf *m)
 {
 	struct bridge_iflist *dbif;
 	struct ifnet *src_if, *dst_if, *ifp;
 	struct ether_header *eh;
 	uint16_t vlan;
 	uint8_t *dst;
 	int error;
 
 	src_if = m->m_pkthdr.rcvif;
 	ifp = sc->sc_ifp;
 
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 	vlan = VLANTAGOF(m);
 
 	if ((sbif->bif_flags & IFBIF_STP) &&
 	    sbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
 		goto drop;
 
 	eh = mtod(m, struct ether_header *);
 	dst = eh->ether_dhost;
 
 	/* If the interface is learning, record the address. */
 	if (sbif->bif_flags & IFBIF_LEARNING) {
 		error = bridge_rtupdate(sc, eh->ether_shost, vlan,
 		    sbif, 0, IFBAF_DYNAMIC);
 		/*
 		 * If the interface has addresses limits then deny any source
 		 * that is not in the cache.
 		 */
 		if (error && sbif->bif_addrmax)
 			goto drop;
 	}
 
 	if ((sbif->bif_flags & IFBIF_STP) != 0 &&
 	    sbif->bif_stp.bp_state == BSTP_IFSTATE_LEARNING)
 		goto drop;
 
 	/*
 	 * At this point, the port either doesn't participate
 	 * in spanning tree or it is in the forwarding state.
 	 */
 
 	/*
 	 * If the packet is unicast, destined for someone on
 	 * "this" side of the bridge, drop it.
 	 */
 	if ((m->m_flags & (M_BCAST|M_MCAST)) == 0) {
 		dst_if = bridge_rtlookup(sc, dst, vlan);
 		if (src_if == dst_if)
 			goto drop;
 	} else {
 		/*
 		 * Check if its a reserved multicast address, any address
 		 * listed in 802.1D section 7.12.6 may not be forwarded by the
 		 * bridge.
 		 * This is currently 01-80-C2-00-00-00 to 01-80-C2-00-00-0F
 		 */
 		if (dst[0] == 0x01 && dst[1] == 0x80 &&
 		    dst[2] == 0xc2 && dst[3] == 0x00 &&
 		    dst[4] == 0x00 && dst[5] <= 0x0f)
 			goto drop;
 
 		/* ...forward it to all interfaces. */
 		if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
 		dst_if = NULL;
 	}
 
 	/*
 	 * If we have a destination interface which is a member of our bridge,
 	 * OR this is a unicast packet, push it through the bpf(4) machinery.
 	 * For broadcast or multicast packets, don't bother because it will
 	 * be reinjected into ether_input. We do this before we pass the packets
 	 * through the pfil(9) framework, as it is possible that pfil(9) will
 	 * drop the packet, or possibly modify it, making it difficult to debug
 	 * firewall issues on the bridge.
 	 */
 	if (dst_if != NULL || (m->m_flags & (M_BCAST | M_MCAST)) == 0)
 		ETHER_BPF_MTAP(ifp, m);
 
 	/* run the packet filter */
 	if (PFIL_HOOKED(&V_inet_pfil_hook)
 #ifdef INET6
 	    || PFIL_HOOKED(&V_inet6_pfil_hook)
 #endif
 	    ) {
 		BRIDGE_UNLOCK(sc);
 		if (bridge_pfil(&m, ifp, src_if, PFIL_IN) != 0)
 			return;
 		if (m == NULL)
 			return;
 		BRIDGE_LOCK(sc);
 	}
 
 	if (dst_if == NULL) {
 		bridge_broadcast(sc, src_if, m, 1);
 		return;
 	}
 
 	/*
 	 * At this point, we're dealing with a unicast frame
 	 * going to a different interface.
 	 */
 	if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0)
 		goto drop;
 
 	dbif = bridge_lookup_member_if(sc, dst_if);
 	if (dbif == NULL)
 		/* Not a member of the bridge (anymore?) */
 		goto drop;
 
 	/* Private segments can not talk to each other */
 	if (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE)
 		goto drop;
 
 	if ((dbif->bif_flags & IFBIF_STP) &&
 	    dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
 		goto drop;
 
 	BRIDGE_UNLOCK(sc);
 
 	if (PFIL_HOOKED(&V_inet_pfil_hook)
 #ifdef INET6
 	    || PFIL_HOOKED(&V_inet6_pfil_hook)
 #endif
 	    ) {
 		if (bridge_pfil(&m, ifp, dst_if, PFIL_OUT) != 0)
 			return;
 		if (m == NULL)
 			return;
 	}
 
 	bridge_enqueue(sc, dst_if, m);
 	return;
 
 drop:
 	BRIDGE_UNLOCK(sc);
 	m_freem(m);
 }
 
 /*
  * bridge_input:
  *
  *	Receive input from a member interface.  Queue the packet for
  *	bridging if it is not for us.
  */
 static struct mbuf *
 bridge_input(struct ifnet *ifp, struct mbuf *m)
 {
 	struct bridge_softc *sc = ifp->if_bridge;
 	struct bridge_iflist *bif, *bif2;
 	struct ifnet *bifp;
 	struct ether_header *eh;
 	struct mbuf *mc, *mc2;
 	uint16_t vlan;
 	int error;
 
 	if ((sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 		return (m);
 
 	bifp = sc->sc_ifp;
 	vlan = VLANTAGOF(m);
 
 	/*
 	 * Implement support for bridge monitoring. If this flag has been
 	 * set on this interface, discard the packet once we push it through
 	 * the bpf(4) machinery, but before we do, increment the byte and
 	 * packet counters associated with this interface.
 	 */
 	if ((bifp->if_flags & IFF_MONITOR) != 0) {
 		m->m_pkthdr.rcvif  = bifp;
 		ETHER_BPF_MTAP(bifp, m);
 		if_inc_counter(bifp, IFCOUNTER_IPACKETS, 1);
 		if_inc_counter(bifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 		m_freem(m);
 		return (NULL);
 	}
 	BRIDGE_LOCK(sc);
 	bif = bridge_lookup_member_if(sc, ifp);
 	if (bif == NULL) {
 		BRIDGE_UNLOCK(sc);
 		return (m);
 	}
 
 	eh = mtod(m, struct ether_header *);
 
 	bridge_span(sc, m);
 
 	if (m->m_flags & (M_BCAST|M_MCAST)) {
 		/* Tap off 802.1D packets; they do not get forwarded. */
 		if (memcmp(eh->ether_dhost, bstp_etheraddr,
 		    ETHER_ADDR_LEN) == 0) {
 			bstp_input(&bif->bif_stp, ifp, m); /* consumes mbuf */
 			BRIDGE_UNLOCK(sc);
 			return (NULL);
 		}
 
 		if ((bif->bif_flags & IFBIF_STP) &&
 		    bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) {
 			BRIDGE_UNLOCK(sc);
 			return (m);
 		}
 
 		/*
 		 * Make a deep copy of the packet and enqueue the copy
 		 * for bridge processing; return the original packet for
 		 * local processing.
 		 */
 		mc = m_dup(m, M_NOWAIT);
 		if (mc == NULL) {
 			BRIDGE_UNLOCK(sc);
 			return (m);
 		}
 
 		/* Perform the bridge forwarding function with the copy. */
 		bridge_forward(sc, bif, mc);
 
 		/*
 		 * Reinject the mbuf as arriving on the bridge so we have a
 		 * chance at claiming multicast packets. We can not loop back
 		 * here from ether_input as a bridge is never a member of a
 		 * bridge.
 		 */
 		KASSERT(bifp->if_bridge == NULL,
 		    ("loop created in bridge_input"));
 		mc2 = m_dup(m, M_NOWAIT);
 		if (mc2 != NULL) {
 			/* Keep the layer3 header aligned */
 			int i = min(mc2->m_pkthdr.len, max_protohdr);
 			mc2 = m_copyup(mc2, i, ETHER_ALIGN);
 		}
 		if (mc2 != NULL) {
 			mc2->m_pkthdr.rcvif = bifp;
 			(*bifp->if_input)(bifp, mc2);
 		}
 
 		/* Return the original packet for local processing. */
 		return (m);
 	}
 
 	if ((bif->bif_flags & IFBIF_STP) &&
 	    bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) {
 		BRIDGE_UNLOCK(sc);
 		return (m);
 	}
 
 #if (defined(INET) || defined(INET6))
 #   define OR_CARP_CHECK_WE_ARE_DST(iface) \
 	|| ((iface)->if_carp \
 	    && (*carp_forus_p)((iface), eh->ether_dhost))
 #   define OR_CARP_CHECK_WE_ARE_SRC(iface) \
 	|| ((iface)->if_carp \
 	    && (*carp_forus_p)((iface), eh->ether_shost))
 #else
 #   define OR_CARP_CHECK_WE_ARE_DST(iface)
 #   define OR_CARP_CHECK_WE_ARE_SRC(iface)
 #endif
 
 #ifdef INET6
 #   define OR_PFIL_HOOKED_INET6 \
 	|| PFIL_HOOKED(&V_inet6_pfil_hook)
 #else
 #   define OR_PFIL_HOOKED_INET6
 #endif
 
 #define GRAB_OUR_PACKETS(iface) \
 	if ((iface)->if_type == IFT_GIF) \
 		continue; \
 	/* It is destined for us. */ \
 	if (memcmp(IF_LLADDR((iface)), eh->ether_dhost,  ETHER_ADDR_LEN) == 0 \
 	    OR_CARP_CHECK_WE_ARE_DST((iface))				\
 	    ) {								\
 		if ((iface)->if_type == IFT_BRIDGE) {			\
 			ETHER_BPF_MTAP(iface, m);			\
 			if_inc_counter(iface, IFCOUNTER_IPACKETS, 1);				\
 			if_inc_counter(iface, IFCOUNTER_IBYTES, m->m_pkthdr.len);		\
 			/* Filter on the physical interface. */		\
 			if (V_pfil_local_phys &&			\
 			    (PFIL_HOOKED(&V_inet_pfil_hook)		\
 			     OR_PFIL_HOOKED_INET6)) {			\
 				if (bridge_pfil(&m, NULL, ifp,		\
 				    PFIL_IN) != 0 || m == NULL) {	\
 					BRIDGE_UNLOCK(sc);		\
 					return (NULL);			\
 				}					\
 				eh = mtod(m, struct ether_header *);	\
 			}						\
 		}							\
 		if (bif->bif_flags & IFBIF_LEARNING) {			\
 			error = bridge_rtupdate(sc, eh->ether_shost,	\
 			    vlan, bif, 0, IFBAF_DYNAMIC);		\
 			if (error && bif->bif_addrmax) {		\
 				BRIDGE_UNLOCK(sc);			\
 				m_freem(m);				\
 				return (NULL);				\
 			}						\
 		}							\
 		m->m_pkthdr.rcvif = iface;				\
 		BRIDGE_UNLOCK(sc);					\
 		return (m);						\
 	}								\
 									\
 	/* We just received a packet that we sent out. */		\
 	if (memcmp(IF_LLADDR((iface)), eh->ether_shost, ETHER_ADDR_LEN) == 0 \
 	    OR_CARP_CHECK_WE_ARE_SRC((iface))			\
 	    ) {								\
 		BRIDGE_UNLOCK(sc);					\
 		m_freem(m);						\
 		return (NULL);						\
 	}
 
 	/*
 	 * Unicast.  Make sure it's not for the bridge.
 	 */
 	do { GRAB_OUR_PACKETS(bifp) } while (0);
 
 	/*
 	 * Give a chance for ifp at first priority. This will help when	the
 	 * packet comes through the interface like VLAN's with the same MACs
 	 * on several interfaces from the same bridge. This also will save
 	 * some CPU cycles in case the destination interface and the input
 	 * interface (eq ifp) are the same.
 	 */
 	do { GRAB_OUR_PACKETS(ifp) } while (0);
 
 	/* Now check the all bridge members. */
 	LIST_FOREACH(bif2, &sc->sc_iflist, bif_next) {
 		GRAB_OUR_PACKETS(bif2->bif_ifp)
 	}
 
 #undef OR_CARP_CHECK_WE_ARE_DST
 #undef OR_CARP_CHECK_WE_ARE_SRC
 #undef OR_PFIL_HOOKED_INET6
 #undef GRAB_OUR_PACKETS
 
 	/* Perform the bridge forwarding function. */
 	bridge_forward(sc, bif, m);
 
 	return (NULL);
 }
 
 /*
  * bridge_broadcast:
  *
  *	Send a frame to all interfaces that are members of
  *	the bridge, except for the one on which the packet
  *	arrived.
  *
  *	NOTE: Releases the lock on return.
  */
 static void
 bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if,
     struct mbuf *m, int runfilt)
 {
 	struct bridge_iflist *dbif, *sbif;
 	struct mbuf *mc;
 	struct ifnet *dst_if;
 	int error = 0, used = 0, i;
 
 	sbif = bridge_lookup_member_if(sc, src_if);
 
 	BRIDGE_LOCK2REF(sc, error);
 	if (error) {
 		m_freem(m);
 		return;
 	}
 
 	/* Filter on the bridge interface before broadcasting */
 	if (runfilt && (PFIL_HOOKED(&V_inet_pfil_hook)
 #ifdef INET6
 	    || PFIL_HOOKED(&V_inet6_pfil_hook)
 #endif
 	    )) {
 		if (bridge_pfil(&m, sc->sc_ifp, NULL, PFIL_OUT) != 0)
 			goto out;
 		if (m == NULL)
 			goto out;
 	}
 
 	LIST_FOREACH(dbif, &sc->sc_iflist, bif_next) {
 		dst_if = dbif->bif_ifp;
 		if (dst_if == src_if)
 			continue;
 
 		/* Private segments can not talk to each other */
 		if (sbif && (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE))
 			continue;
 
 		if ((dbif->bif_flags & IFBIF_STP) &&
 		    dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
 			continue;
 
 		if ((dbif->bif_flags & IFBIF_DISCOVER) == 0 &&
 		    (m->m_flags & (M_BCAST|M_MCAST)) == 0)
 			continue;
 
 		if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0)
 			continue;
 
 		if (LIST_NEXT(dbif, bif_next) == NULL) {
 			mc = m;
 			used = 1;
 		} else {
 			mc = m_dup(m, M_NOWAIT);
 			if (mc == NULL) {
 				if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
 				continue;
 			}
 		}
 
 		/*
 		 * Filter on the output interface. Pass a NULL bridge interface
 		 * pointer so we do not redundantly filter on the bridge for
 		 * each interface we broadcast on.
 		 */
 		if (runfilt && (PFIL_HOOKED(&V_inet_pfil_hook)
 #ifdef INET6
 		    || PFIL_HOOKED(&V_inet6_pfil_hook)
 #endif
 		    )) {
 			if (used == 0) {
 				/* Keep the layer3 header aligned */
 				i = min(mc->m_pkthdr.len, max_protohdr);
 				mc = m_copyup(mc, i, ETHER_ALIGN);
 				if (mc == NULL) {
 					if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
 					continue;
 				}
 			}
 			if (bridge_pfil(&mc, NULL, dst_if, PFIL_OUT) != 0)
 				continue;
 			if (mc == NULL)
 				continue;
 		}
 
 		bridge_enqueue(sc, dst_if, mc);
 	}
 	if (used == 0)
 		m_freem(m);
 
 out:
 	BRIDGE_UNREF(sc);
 }
 
 /*
  * bridge_span:
  *
  *	Duplicate a packet out one or more interfaces that are in span mode,
  *	the original mbuf is unmodified.
  */
 static void
 bridge_span(struct bridge_softc *sc, struct mbuf *m)
 {
 	struct bridge_iflist *bif;
 	struct ifnet *dst_if;
 	struct mbuf *mc;
 
 	if (LIST_EMPTY(&sc->sc_spanlist))
 		return;
 
 	LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) {
 		dst_if = bif->bif_ifp;
 
 		if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0)
 			continue;
 
 		mc = m_copypacket(m, M_NOWAIT);
 		if (mc == NULL) {
 			if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
 			continue;
 		}
 
 		bridge_enqueue(sc, dst_if, mc);
 	}
 }
 
 /*
  * bridge_rtupdate:
  *
  *	Add a bridge routing entry.
  */
 static int
 bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, uint16_t vlan,
     struct bridge_iflist *bif, int setflags, uint8_t flags)
 {
 	struct bridge_rtnode *brt;
 	int error;
 
 	BRIDGE_LOCK_ASSERT(sc);
 
 	/* Check the source address is valid and not multicast. */
 	if (ETHER_IS_MULTICAST(dst) ||
 	    (dst[0] == 0 && dst[1] == 0 && dst[2] == 0 &&
 	     dst[3] == 0 && dst[4] == 0 && dst[5] == 0) != 0)
 		return (EINVAL);
 
 	/* 802.1p frames map to vlan 1 */
 	if (vlan == 0)
 		vlan = 1;
 
 	/*
 	 * A route for this destination might already exist.  If so,
 	 * update it, otherwise create a new one.
 	 */
 	if ((brt = bridge_rtnode_lookup(sc, dst, vlan)) == NULL) {
 		if (sc->sc_brtcnt >= sc->sc_brtmax) {
 			sc->sc_brtexceeded++;
 			return (ENOSPC);
 		}
 		/* Check per interface address limits (if enabled) */
 		if (bif->bif_addrmax && bif->bif_addrcnt >= bif->bif_addrmax) {
 			bif->bif_addrexceeded++;
 			return (ENOSPC);
 		}
 
 		/*
 		 * Allocate a new bridge forwarding node, and
 		 * initialize the expiration time and Ethernet
 		 * address.
 		 */
 		brt = uma_zalloc(bridge_rtnode_zone, M_NOWAIT | M_ZERO);
 		if (brt == NULL)
 			return (ENOMEM);
 
 		if (bif->bif_flags & IFBIF_STICKY)
 			brt->brt_flags = IFBAF_STICKY;
 		else
 			brt->brt_flags = IFBAF_DYNAMIC;
 
 		memcpy(brt->brt_addr, dst, ETHER_ADDR_LEN);
 		brt->brt_vlan = vlan;
 
 		if ((error = bridge_rtnode_insert(sc, brt)) != 0) {
 			uma_zfree(bridge_rtnode_zone, brt);
 			return (error);
 		}
 		brt->brt_dst = bif;
 		bif->bif_addrcnt++;
 	}
 
 	if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC &&
 	    brt->brt_dst != bif) {
 		brt->brt_dst->bif_addrcnt--;
 		brt->brt_dst = bif;
 		brt->brt_dst->bif_addrcnt++;
 	}
 
 	if ((flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC)
 		brt->brt_expire = time_uptime + sc->sc_brttimeout;
 	if (setflags)
 		brt->brt_flags = flags;
 
 	return (0);
 }
 
 /*
  * bridge_rtlookup:
  *
  *	Lookup the destination interface for an address.
  */
 static struct ifnet *
 bridge_rtlookup(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan)
 {
 	struct bridge_rtnode *brt;
 
 	BRIDGE_LOCK_ASSERT(sc);
 
 	if ((brt = bridge_rtnode_lookup(sc, addr, vlan)) == NULL)
 		return (NULL);
 
 	return (brt->brt_ifp);
 }
 
 /*
  * bridge_rttrim:
  *
  *	Trim the routine table so that we have a number
  *	of routing entries less than or equal to the
  *	maximum number.
  */
 static void
 bridge_rttrim(struct bridge_softc *sc)
 {
 	struct bridge_rtnode *brt, *nbrt;
 
 	BRIDGE_LOCK_ASSERT(sc);
 
 	/* Make sure we actually need to do this. */
 	if (sc->sc_brtcnt <= sc->sc_brtmax)
 		return;
 
 	/* Force an aging cycle; this might trim enough addresses. */
 	bridge_rtage(sc);
 	if (sc->sc_brtcnt <= sc->sc_brtmax)
 		return;
 
 	LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) {
 		if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) {
 			bridge_rtnode_destroy(sc, brt);
 			if (sc->sc_brtcnt <= sc->sc_brtmax)
 				return;
 		}
 	}
 }
 
 /*
  * bridge_timer:
  *
  *	Aging timer for the bridge.
  */
 static void
 bridge_timer(void *arg)
 {
 	struct bridge_softc *sc = arg;
 
 	BRIDGE_LOCK_ASSERT(sc);
 
 	bridge_rtage(sc);
 
 	if (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING)
 		callout_reset(&sc->sc_brcallout,
 		    bridge_rtable_prune_period * hz, bridge_timer, sc);
 }
 
 /*
  * bridge_rtage:
  *
  *	Perform an aging cycle.
  */
 static void
 bridge_rtage(struct bridge_softc *sc)
 {
 	struct bridge_rtnode *brt, *nbrt;
 
 	BRIDGE_LOCK_ASSERT(sc);
 
 	LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) {
 		if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) {
 			if (time_uptime >= brt->brt_expire)
 				bridge_rtnode_destroy(sc, brt);
 		}
 	}
 }
 
 /*
  * bridge_rtflush:
  *
  *	Remove all dynamic addresses from the bridge.
  */
 static void
 bridge_rtflush(struct bridge_softc *sc, int full)
 {
 	struct bridge_rtnode *brt, *nbrt;
 
 	BRIDGE_LOCK_ASSERT(sc);
 
 	LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) {
 		if (full || (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC)
 			bridge_rtnode_destroy(sc, brt);
 	}
 }
 
 /*
  * bridge_rtdaddr:
  *
  *	Remove an address from the table.
  */
 static int
 bridge_rtdaddr(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan)
 {
 	struct bridge_rtnode *brt;
 	int found = 0;
 
 	BRIDGE_LOCK_ASSERT(sc);
 
 	/*
 	 * If vlan is zero then we want to delete for all vlans so the lookup
 	 * may return more than one.
 	 */
 	while ((brt = bridge_rtnode_lookup(sc, addr, vlan)) != NULL) {
 		bridge_rtnode_destroy(sc, brt);
 		found = 1;
 	}
 
 	return (found ? 0 : ENOENT);
 }
 
 /*
  * bridge_rtdelete:
  *
  *	Delete routes to a speicifc member interface.
  */
 static void
 bridge_rtdelete(struct bridge_softc *sc, struct ifnet *ifp, int full)
 {
 	struct bridge_rtnode *brt, *nbrt;
 
 	BRIDGE_LOCK_ASSERT(sc);
 
 	LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) {
 		if (brt->brt_ifp == ifp && (full ||
 			    (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC))
 			bridge_rtnode_destroy(sc, brt);
 	}
 }
 
 /*
  * bridge_rtable_init:
  *
  *	Initialize the route table for this bridge.
  */
 static void
 bridge_rtable_init(struct bridge_softc *sc)
 {
 	int i;
 
 	sc->sc_rthash = malloc(sizeof(*sc->sc_rthash) * BRIDGE_RTHASH_SIZE,
 	    M_DEVBUF, M_WAITOK);
 
 	for (i = 0; i < BRIDGE_RTHASH_SIZE; i++)
 		LIST_INIT(&sc->sc_rthash[i]);
 
 	sc->sc_rthash_key = arc4random();
 	LIST_INIT(&sc->sc_rtlist);
 }
 
 /*
  * bridge_rtable_fini:
  *
  *	Deconstruct the route table for this bridge.
  */
 static void
 bridge_rtable_fini(struct bridge_softc *sc)
 {
 
 	KASSERT(sc->sc_brtcnt == 0,
 	    ("%s: %d bridge routes referenced", __func__, sc->sc_brtcnt));
 	free(sc->sc_rthash, M_DEVBUF);
 }
 
 /*
  * The following hash function is adapted from "Hash Functions" by Bob Jenkins
  * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
  */
 #define	mix(a, b, c)							\
 do {									\
 	a -= b; a -= c; a ^= (c >> 13);					\
 	b -= c; b -= a; b ^= (a << 8);					\
 	c -= a; c -= b; c ^= (b >> 13);					\
 	a -= b; a -= c; a ^= (c >> 12);					\
 	b -= c; b -= a; b ^= (a << 16);					\
 	c -= a; c -= b; c ^= (b >> 5);					\
 	a -= b; a -= c; a ^= (c >> 3);					\
 	b -= c; b -= a; b ^= (a << 10);					\
 	c -= a; c -= b; c ^= (b >> 15);					\
 } while (/*CONSTCOND*/0)
 
 static __inline uint32_t
 bridge_rthash(struct bridge_softc *sc, const uint8_t *addr)
 {
 	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = sc->sc_rthash_key;
 
 	b += addr[5] << 8;
 	b += addr[4];
 	a += addr[3] << 24;
 	a += addr[2] << 16;
 	a += addr[1] << 8;
 	a += addr[0];
 
 	mix(a, b, c);
 
 	return (c & BRIDGE_RTHASH_MASK);
 }
 
 #undef mix
 
 static int
 bridge_rtnode_addr_cmp(const uint8_t *a, const uint8_t *b)
 {
 	int i, d;
 
 	for (i = 0, d = 0; i < ETHER_ADDR_LEN && d == 0; i++) {
 		d = ((int)a[i]) - ((int)b[i]);
 	}
 
 	return (d);
 }
 
 /*
  * bridge_rtnode_lookup:
  *
  *	Look up a bridge route node for the specified destination. Compare the
  *	vlan id or if zero then just return the first match.
  */
 static struct bridge_rtnode *
 bridge_rtnode_lookup(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan)
 {
 	struct bridge_rtnode *brt;
 	uint32_t hash;
 	int dir;
 
 	BRIDGE_LOCK_ASSERT(sc);
 
 	hash = bridge_rthash(sc, addr);
 	LIST_FOREACH(brt, &sc->sc_rthash[hash], brt_hash) {
 		dir = bridge_rtnode_addr_cmp(addr, brt->brt_addr);
 		if (dir == 0 && (brt->brt_vlan == vlan || vlan == 0))
 			return (brt);
 		if (dir > 0)
 			return (NULL);
 	}
 
 	return (NULL);
 }
 
 /*
  * bridge_rtnode_insert:
  *
  *	Insert the specified bridge node into the route table.  We
  *	assume the entry is not already in the table.
  */
 static int
 bridge_rtnode_insert(struct bridge_softc *sc, struct bridge_rtnode *brt)
 {
 	struct bridge_rtnode *lbrt;
 	uint32_t hash;
 	int dir;
 
 	BRIDGE_LOCK_ASSERT(sc);
 
 	hash = bridge_rthash(sc, brt->brt_addr);
 
 	lbrt = LIST_FIRST(&sc->sc_rthash[hash]);
 	if (lbrt == NULL) {
 		LIST_INSERT_HEAD(&sc->sc_rthash[hash], brt, brt_hash);
 		goto out;
 	}
 
 	do {
 		dir = bridge_rtnode_addr_cmp(brt->brt_addr, lbrt->brt_addr);
 		if (dir == 0 && brt->brt_vlan == lbrt->brt_vlan)
 			return (EEXIST);
 		if (dir > 0) {
 			LIST_INSERT_BEFORE(lbrt, brt, brt_hash);
 			goto out;
 		}
 		if (LIST_NEXT(lbrt, brt_hash) == NULL) {
 			LIST_INSERT_AFTER(lbrt, brt, brt_hash);
 			goto out;
 		}
 		lbrt = LIST_NEXT(lbrt, brt_hash);
 	} while (lbrt != NULL);
 
 #ifdef DIAGNOSTIC
 	panic("bridge_rtnode_insert: impossible");
 #endif
 
 out:
 	LIST_INSERT_HEAD(&sc->sc_rtlist, brt, brt_list);
 	sc->sc_brtcnt++;
 
 	return (0);
 }
 
 /*
  * bridge_rtnode_destroy:
  *
  *	Destroy a bridge rtnode.
  */
 static void
 bridge_rtnode_destroy(struct bridge_softc *sc, struct bridge_rtnode *brt)
 {
 	BRIDGE_LOCK_ASSERT(sc);
 
 	LIST_REMOVE(brt, brt_hash);
 
 	LIST_REMOVE(brt, brt_list);
 	sc->sc_brtcnt--;
 	brt->brt_dst->bif_addrcnt--;
 	uma_zfree(bridge_rtnode_zone, brt);
 }
 
 /*
  * bridge_rtable_expire:
  *
  *	Set the expiry time for all routes on an interface.
  */
 static void
 bridge_rtable_expire(struct ifnet *ifp, int age)
 {
 	struct bridge_softc *sc = ifp->if_bridge;
 	struct bridge_rtnode *brt;
 
 	BRIDGE_LOCK(sc);
 
 	/*
 	 * If the age is zero then flush, otherwise set all the expiry times to
 	 * age for the interface
 	 */
 	if (age == 0)
 		bridge_rtdelete(sc, ifp, IFBF_FLUSHDYN);
 	else {
 		LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) {
 			/* Cap the expiry time to 'age' */
 			if (brt->brt_ifp == ifp &&
 			    brt->brt_expire > time_uptime + age &&
 			    (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC)
 				brt->brt_expire = time_uptime + age;
 		}
 	}
 	BRIDGE_UNLOCK(sc);
 }
 
 /*
  * bridge_state_change:
  *
  *	Callback from the bridgestp code when a port changes states.
  */
 static void
 bridge_state_change(struct ifnet *ifp, int state)
 {
 	struct bridge_softc *sc = ifp->if_bridge;
 	static const char *stpstates[] = {
 		"disabled",
 		"listening",
 		"learning",
 		"forwarding",
 		"blocking",
 		"discarding"
 	};
 
 	if (V_log_stp)
 		log(LOG_NOTICE, "%s: state changed to %s on %s\n",
 		    sc->sc_ifp->if_xname, stpstates[state], ifp->if_xname);
 }
 
 /*
  * Send bridge packets through pfil if they are one of the types pfil can deal
  * with, or if they are ARP or REVARP.  (pfil will pass ARP and REVARP without
  * question.) If *bifp or *ifp are NULL then packet filtering is skipped for
  * that interface.
  */
 static int
 bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir)
 {
 	int snap, error, i, hlen;
 	struct ether_header *eh1, eh2;
 	struct ip *ip;
 	struct llc llc1;
 	u_int16_t ether_type;
 
 	snap = 0;
 	error = -1;	/* Default error if not error == 0 */
 
 #if 0
 	/* we may return with the IP fields swapped, ensure its not shared */
 	KASSERT(M_WRITABLE(*mp), ("%s: modifying a shared mbuf", __func__));
 #endif
 
 	if (V_pfil_bridge == 0 && V_pfil_member == 0 && V_pfil_ipfw == 0)
 		return (0); /* filtering is disabled */
 
 	i = min((*mp)->m_pkthdr.len, max_protohdr);
 	if ((*mp)->m_len < i) {
 	    *mp = m_pullup(*mp, i);
 	    if (*mp == NULL) {
 		printf("%s: m_pullup failed\n", __func__);
 		return (-1);
 	    }
 	}
 
 	eh1 = mtod(*mp, struct ether_header *);
 	ether_type = ntohs(eh1->ether_type);
 
 	/*
 	 * Check for SNAP/LLC.
 	 */
 	if (ether_type < ETHERMTU) {
 		struct llc *llc2 = (struct llc *)(eh1 + 1);
 
 		if ((*mp)->m_len >= ETHER_HDR_LEN + 8 &&
 		    llc2->llc_dsap == LLC_SNAP_LSAP &&
 		    llc2->llc_ssap == LLC_SNAP_LSAP &&
 		    llc2->llc_control == LLC_UI) {
 			ether_type = htons(llc2->llc_un.type_snap.ether_type);
 			snap = 1;
 		}
 	}
 
 	/*
 	 * If we're trying to filter bridge traffic, don't look at anything
 	 * other than IP and ARP traffic.  If the filter doesn't understand
 	 * IPv6, don't allow IPv6 through the bridge either.  This is lame
 	 * since if we really wanted, say, an AppleTalk filter, we are hosed,
 	 * but of course we don't have an AppleTalk filter to begin with.
 	 * (Note that since pfil doesn't understand ARP it will pass *ALL*
 	 * ARP traffic.)
 	 */
 	switch (ether_type) {
 		case ETHERTYPE_ARP:
 		case ETHERTYPE_REVARP:
 			if (V_pfil_ipfw_arp == 0)
 				return (0); /* Automatically pass */
 			break;
 
 		case ETHERTYPE_IP:
 #ifdef INET6
 		case ETHERTYPE_IPV6:
 #endif /* INET6 */
 			break;
 		default:
 			/*
 			 * Check to see if the user wants to pass non-ip
 			 * packets, these will not be checked by pfil(9) and
 			 * passed unconditionally so the default is to drop.
 			 */
 			if (V_pfil_onlyip)
 				goto bad;
 	}
 
 	/* Run the packet through pfil before stripping link headers */
 	if (PFIL_HOOKED(&V_link_pfil_hook) && V_pfil_ipfw != 0 &&
 			dir == PFIL_OUT && ifp != NULL) {
 
 		error = pfil_run_hooks(&V_link_pfil_hook, mp, ifp, dir, NULL);
 
 		if (*mp == NULL || error != 0) /* packet consumed by filter */
 			return (error);
 	}
 
 	/* Strip off the Ethernet header and keep a copy. */
 	m_copydata(*mp, 0, ETHER_HDR_LEN, (caddr_t) &eh2);
 	m_adj(*mp, ETHER_HDR_LEN);
 
 	/* Strip off snap header, if present */
 	if (snap) {
 		m_copydata(*mp, 0, sizeof(struct llc), (caddr_t) &llc1);
 		m_adj(*mp, sizeof(struct llc));
 	}
 
 	/*
 	 * Check the IP header for alignment and errors
 	 */
 	if (dir == PFIL_IN) {
 		switch (ether_type) {
 			case ETHERTYPE_IP:
 				error = bridge_ip_checkbasic(mp);
 				break;
 #ifdef INET6
 			case ETHERTYPE_IPV6:
 				error = bridge_ip6_checkbasic(mp);
 				break;
 #endif /* INET6 */
 			default:
 				error = 0;
 		}
 		if (error)
 			goto bad;
 	}
 
 	error = 0;
 
 	/*
 	 * Run the packet through pfil
 	 */
 	switch (ether_type) {
 	case ETHERTYPE_IP:
 		/*
 		 * Run pfil on the member interface and the bridge, both can
 		 * be skipped by clearing pfil_member or pfil_bridge.
 		 *
 		 * Keep the order:
 		 *   in_if -> bridge_if -> out_if
 		 */
 		if (V_pfil_bridge && dir == PFIL_OUT && bifp != NULL)
 			error = pfil_run_hooks(&V_inet_pfil_hook, mp, bifp,
 					dir, NULL);
 
 		if (*mp == NULL || error != 0) /* filter may consume */
 			break;
 
 		if (V_pfil_member && ifp != NULL)
 			error = pfil_run_hooks(&V_inet_pfil_hook, mp, ifp,
 					dir, NULL);
 
 		if (*mp == NULL || error != 0) /* filter may consume */
 			break;
 
 		if (V_pfil_bridge && dir == PFIL_IN && bifp != NULL)
 			error = pfil_run_hooks(&V_inet_pfil_hook, mp, bifp,
 					dir, NULL);
 
 		if (*mp == NULL || error != 0) /* filter may consume */
 			break;
 
 		/* check if we need to fragment the packet */
 		if (V_pfil_member && ifp != NULL && dir == PFIL_OUT) {
 			i = (*mp)->m_pkthdr.len;
 			if (i > ifp->if_mtu) {
 				error = bridge_fragment(ifp, *mp, &eh2, snap,
 					    &llc1);
 				return (error);
 			}
 		}
 
 		/* Recalculate the ip checksum. */
 		ip = mtod(*mp, struct ip *);
 		hlen = ip->ip_hl << 2;
 		if (hlen < sizeof(struct ip))
 			goto bad;
 		if (hlen > (*mp)->m_len) {
 			if ((*mp = m_pullup(*mp, hlen)) == 0)
 				goto bad;
 			ip = mtod(*mp, struct ip *);
 			if (ip == NULL)
 				goto bad;
 		}
 		ip->ip_sum = 0;
 		if (hlen == sizeof(struct ip))
 			ip->ip_sum = in_cksum_hdr(ip);
 		else
 			ip->ip_sum = in_cksum(*mp, hlen);
 
 		break;
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 		if (V_pfil_bridge && dir == PFIL_OUT && bifp != NULL)
 			error = pfil_run_hooks(&V_inet6_pfil_hook, mp, bifp,
 					dir, NULL);
 
 		if (*mp == NULL || error != 0) /* filter may consume */
 			break;
 
 		if (V_pfil_member && ifp != NULL)
 			error = pfil_run_hooks(&V_inet6_pfil_hook, mp, ifp,
 					dir, NULL);
 
 		if (*mp == NULL || error != 0) /* filter may consume */
 			break;
 
 		if (V_pfil_bridge && dir == PFIL_IN && bifp != NULL)
 			error = pfil_run_hooks(&V_inet6_pfil_hook, mp, bifp,
 					dir, NULL);
 		break;
 #endif
 	default:
 		error = 0;
 		break;
 	}
 
 	if (*mp == NULL)
 		return (error);
 	if (error != 0)
 		goto bad;
 
 	error = -1;
 
 	/*
 	 * Finally, put everything back the way it was and return
 	 */
 	if (snap) {
 		M_PREPEND(*mp, sizeof(struct llc), M_NOWAIT);
 		if (*mp == NULL)
 			return (error);
 		bcopy(&llc1, mtod(*mp, caddr_t), sizeof(struct llc));
 	}
 
 	M_PREPEND(*mp, ETHER_HDR_LEN, M_NOWAIT);
 	if (*mp == NULL)
 		return (error);
 	bcopy(&eh2, mtod(*mp, caddr_t), ETHER_HDR_LEN);
 
 	return (0);
 
 bad:
 	m_freem(*mp);
 	*mp = NULL;
 	return (error);
 }
 
 /*
  * Perform basic checks on header size since
  * pfil assumes ip_input has already processed
  * it for it.  Cut-and-pasted from ip_input.c.
  * Given how simple the IPv6 version is,
  * does the IPv4 version really need to be
  * this complicated?
  *
  * XXX Should we update ipstat here, or not?
  * XXX Right now we update ipstat but not
  * XXX csum_counter.
  */
 static int
 bridge_ip_checkbasic(struct mbuf **mp)
 {
 	struct mbuf *m = *mp;
 	struct ip *ip;
 	int len, hlen;
 	u_short sum;
 
 	if (*mp == NULL)
 		return (-1);
 
 	if (IP_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) {
 		if ((m = m_copyup(m, sizeof(struct ip),
 			(max_linkhdr + 3) & ~3)) == NULL) {
 			/* XXXJRT new stat, please */
 			KMOD_IPSTAT_INC(ips_toosmall);
 			goto bad;
 		}
 	} else if (__predict_false(m->m_len < sizeof (struct ip))) {
 		if ((m = m_pullup(m, sizeof (struct ip))) == NULL) {
 			KMOD_IPSTAT_INC(ips_toosmall);
 			goto bad;
 		}
 	}
 	ip = mtod(m, struct ip *);
 	if (ip == NULL) goto bad;
 
 	if (ip->ip_v != IPVERSION) {
 		KMOD_IPSTAT_INC(ips_badvers);
 		goto bad;
 	}
 	hlen = ip->ip_hl << 2;
 	if (hlen < sizeof(struct ip)) { /* minimum header length */
 		KMOD_IPSTAT_INC(ips_badhlen);
 		goto bad;
 	}
 	if (hlen > m->m_len) {
 		if ((m = m_pullup(m, hlen)) == 0) {
 			KMOD_IPSTAT_INC(ips_badhlen);
 			goto bad;
 		}
 		ip = mtod(m, struct ip *);
 		if (ip == NULL) goto bad;
 	}
 
 	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
 		sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
 	} else {
 		if (hlen == sizeof(struct ip)) {
 			sum = in_cksum_hdr(ip);
 		} else {
 			sum = in_cksum(m, hlen);
 		}
 	}
 	if (sum) {
 		KMOD_IPSTAT_INC(ips_badsum);
 		goto bad;
 	}
 
 	/* Retrieve the packet length. */
 	len = ntohs(ip->ip_len);
 
 	/*
 	 * Check for additional length bogosity
 	 */
 	if (len < hlen) {
 		KMOD_IPSTAT_INC(ips_badlen);
 		goto bad;
 	}
 
 	/*
 	 * Check that the amount of data in the buffers
 	 * is as at least much as the IP header would have us expect.
 	 * Drop packet if shorter than we expect.
 	 */
 	if (m->m_pkthdr.len < len) {
 		KMOD_IPSTAT_INC(ips_tooshort);
 		goto bad;
 	}
 
 	/* Checks out, proceed */
 	*mp = m;
 	return (0);
 
 bad:
 	*mp = m;
 	return (-1);
 }
 
 #ifdef INET6
 /*
  * Same as above, but for IPv6.
  * Cut-and-pasted from ip6_input.c.
  * XXX Should we update ip6stat, or not?
  */
 static int
 bridge_ip6_checkbasic(struct mbuf **mp)
 {
 	struct mbuf *m = *mp;
 	struct ip6_hdr *ip6;
 
 	/*
 	 * If the IPv6 header is not aligned, slurp it up into a new
 	 * mbuf with space for link headers, in the event we forward
 	 * it.  Otherwise, if it is aligned, make sure the entire base
 	 * IPv6 header is in the first mbuf of the chain.
 	 */
 	if (IP6_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) {
 		struct ifnet *inifp = m->m_pkthdr.rcvif;
 		if ((m = m_copyup(m, sizeof(struct ip6_hdr),
 			    (max_linkhdr + 3) & ~3)) == NULL) {
 			/* XXXJRT new stat, please */
 			IP6STAT_INC(ip6s_toosmall);
 			in6_ifstat_inc(inifp, ifs6_in_hdrerr);
 			goto bad;
 		}
 	} else if (__predict_false(m->m_len < sizeof(struct ip6_hdr))) {
 		struct ifnet *inifp = m->m_pkthdr.rcvif;
 		if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
 			IP6STAT_INC(ip6s_toosmall);
 			in6_ifstat_inc(inifp, ifs6_in_hdrerr);
 			goto bad;
 		}
 	}
 
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 		IP6STAT_INC(ip6s_badvers);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr);
 		goto bad;
 	}
 
 	/* Checks out, proceed */
 	*mp = m;
 	return (0);
 
 bad:
 	*mp = m;
 	return (-1);
 }
 #endif /* INET6 */
 
 /*
  * bridge_fragment:
  *
  *	Return a fragmented mbuf chain.
  */
 static int
 bridge_fragment(struct ifnet *ifp, struct mbuf *m, struct ether_header *eh,
     int snap, struct llc *llc)
 {
 	struct mbuf *m0;
 	struct ip *ip;
 	int error = -1;
 
 	if (m->m_len < sizeof(struct ip) &&
 	    (m = m_pullup(m, sizeof(struct ip))) == NULL)
 		goto out;
 	ip = mtod(m, struct ip *);
 
 	m->m_pkthdr.csum_flags |= CSUM_IP;
 	error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist);
 	if (error)
 		goto out;
 
 	/* walk the chain and re-add the Ethernet header */
 	for (m0 = m; m0; m0 = m0->m_nextpkt) {
 		if (error == 0) {
 			if (snap) {
 				M_PREPEND(m0, sizeof(struct llc), M_NOWAIT);
 				if (m0 == NULL) {
 					error = ENOBUFS;
 					continue;
 				}
 				bcopy(llc, mtod(m0, caddr_t),
 				    sizeof(struct llc));
 			}
 			M_PREPEND(m0, ETHER_HDR_LEN, M_NOWAIT);
 			if (m0 == NULL) {
 				error = ENOBUFS;
 				continue;
 			}
 			bcopy(eh, mtod(m0, caddr_t), ETHER_HDR_LEN);
 		} else
 			m_freem(m);
 	}
 
 	if (error == 0)
 		KMOD_IPSTAT_INC(ips_fragmented);
 
 	return (error);
 
 out:
 	if (m != NULL)
 		m_freem(m);
 	return (error);
 }
 
 static void
 bridge_linkstate(struct ifnet *ifp)
 {
 	struct bridge_softc *sc = ifp->if_bridge;
 	struct bridge_iflist *bif;
 
 	BRIDGE_LOCK(sc);
 	bif = bridge_lookup_member_if(sc, ifp);
 	if (bif == NULL) {
 		BRIDGE_UNLOCK(sc);
 		return;
 	}
 	bridge_linkcheck(sc);
 	BRIDGE_UNLOCK(sc);
 
 	bstp_linkstate(&bif->bif_stp);
 }
 
 static void
 bridge_linkcheck(struct bridge_softc *sc)
 {
 	struct bridge_iflist *bif;
 	int new_link, hasls;
 
 	BRIDGE_LOCK_ASSERT(sc);
 	new_link = LINK_STATE_DOWN;
 	hasls = 0;
 	/* Our link is considered up if at least one of our ports is active */
 	LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
 		if (bif->bif_ifp->if_capabilities & IFCAP_LINKSTATE)
 			hasls++;
 		if (bif->bif_ifp->if_link_state == LINK_STATE_UP) {
 			new_link = LINK_STATE_UP;
 			break;
 		}
 	}
 	if (!LIST_EMPTY(&sc->sc_iflist) && !hasls) {
 		/* If no interfaces support link-state then we default to up */
 		new_link = LINK_STATE_UP;
 	}
 	if_link_state_change(sc->sc_ifp, new_link);
 }
Index: user/dchagin/lemul/sys/net/if_ethersubr.c
===================================================================
--- user/dchagin/lemul/sys/net/if_ethersubr.c	(revision 274240)
+++ user/dchagin/lemul/sys/net/if_ethersubr.c	(revision 274241)
@@ -1,1206 +1,1165 @@
 /*-
  * Copyright (c) 1982, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if_ethersubr.c	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_netgraph.h"
 #include "opt_mbuf_profiling.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mbuf.h>
 #include <sys/random.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/uuid.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_arp.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/if_llc.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if_bridgevar.h>
 #include <net/if_vlan_var.h>
 #include <net/if_llatbl.h>
 #include <net/pfil.h>
 #include <net/vnet.h>
 
 #include <netpfil/pf/pf_mtag.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/if_ether.h>
 #include <netinet/in_rss.h>
 #include <netinet/ip_carp.h>
 #include <netinet/ip_var.h>
 #endif
 #ifdef INET6
 #include <netinet6/nd6.h>
 #endif
 
 int (*ef_inputp)(struct ifnet*, struct ether_header *eh, struct mbuf *m);
 int (*ef_outputp)(struct ifnet *ifp, struct mbuf **mp,
 		const struct sockaddr *dst, short *tp, int *hlen);
 
 #include <security/mac/mac_framework.h>
 
 #ifdef CTASSERT
 CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2);
 CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN);
 #endif
 
 VNET_DEFINE(struct pfil_head, link_pfil_hook);	/* Packet filter hooks */
 
 /* netgraph node hooks for ng_ether(4) */
 void	(*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp);
 void	(*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m);
 int	(*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp);
 void	(*ng_ether_attach_p)(struct ifnet *ifp);
 void	(*ng_ether_detach_p)(struct ifnet *ifp);
 
 void	(*vlan_input_p)(struct ifnet *, struct mbuf *);
 
 /* if_bridge(4) support */
 struct mbuf *(*bridge_input_p)(struct ifnet *, struct mbuf *); 
 int	(*bridge_output_p)(struct ifnet *, struct mbuf *, 
 		struct sockaddr *, struct rtentry *);
 void	(*bridge_dn_p)(struct mbuf *, struct ifnet *);
 
 /* if_lagg(4) support */
 struct mbuf *(*lagg_input_p)(struct ifnet *, struct mbuf *); 
 
 static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] =
 			{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 
 static	int ether_resolvemulti(struct ifnet *, struct sockaddr **,
 		struct sockaddr *);
 #ifdef VIMAGE
 static	void ether_reassign(struct ifnet *, struct vnet *, char *);
 #endif
 
-/* XXX: should be in an arp support file, not here */
-static MALLOC_DEFINE(M_ARPCOM, "arpcom", "802.* interface internals");
-
 #define	ETHER_IS_BROADCAST(addr) \
 	(bcmp(etherbroadcastaddr, (addr), ETHER_ADDR_LEN) == 0)
 
 #define senderr(e) do { error = (e); goto bad;} while (0)
 
 static void
 update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst)
 {
 	int csum_flags = 0;
 
 	if (src->m_pkthdr.csum_flags & CSUM_IP)
 		csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID);
 	if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
 		csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
 	if (src->m_pkthdr.csum_flags & CSUM_SCTP)
 		csum_flags |= CSUM_SCTP_VALID;
 	dst->m_pkthdr.csum_flags |= csum_flags;
 	if (csum_flags & CSUM_DATA_VALID)
 		dst->m_pkthdr.csum_data = 0xffff;
 }
 
 /*
  * Ethernet output routine.
  * Encapsulate a packet of type family for the local net.
  * Use trailer local net encapsulation if enough data in first
  * packet leaves a multiple of 512 bytes of data in remainder.
  */
 int
 ether_output(struct ifnet *ifp, struct mbuf *m,
 	const struct sockaddr *dst, struct route *ro)
 {
 	short type;
 	int error = 0, hdrcmplt = 0;
 	u_char esrc[ETHER_ADDR_LEN], edst[ETHER_ADDR_LEN];
 	struct llentry *lle = NULL;
 	struct rtentry *rt0 = NULL;
 	struct ether_header *eh;
 	struct pf_mtag *t;
 	int loop_copy = 1;
 	int hlen;	/* link layer header length */
 
 	if (ro != NULL) {
 		if (!(m->m_flags & (M_BCAST | M_MCAST)))
 			lle = ro->ro_lle;
 		rt0 = ro->ro_rt;
 	}
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m);
 	if (error)
 		senderr(error);
 #endif
 
 	M_PROFILE(m);
 	if (ifp->if_flags & IFF_MONITOR)
 		senderr(ENETDOWN);
 	if (!((ifp->if_flags & IFF_UP) &&
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING)))
 		senderr(ENETDOWN);
 
 	hlen = ETHER_HDR_LEN;
 	switch (dst->sa_family) {
 #ifdef INET
 	case AF_INET:
 		if (lle != NULL && (lle->la_flags & LLE_VALID))
 			memcpy(edst, &lle->ll_addr.mac16, sizeof(edst));
 		else
 			error = arpresolve(ifp, rt0, m, dst, edst, &lle);
 		if (error)
 			return (error == EWOULDBLOCK ? 0 : error);
 		type = htons(ETHERTYPE_IP);
 		break;
 	case AF_ARP:
 	{
 		struct arphdr *ah;
 		ah = mtod(m, struct arphdr *);
 		ah->ar_hrd = htons(ARPHRD_ETHER);
 
 		loop_copy = 0; /* if this is for us, don't do it */
 
 		switch(ntohs(ah->ar_op)) {
 		case ARPOP_REVREQUEST:
 		case ARPOP_REVREPLY:
 			type = htons(ETHERTYPE_REVARP);
 			break;
 		case ARPOP_REQUEST:
 		case ARPOP_REPLY:
 		default:
 			type = htons(ETHERTYPE_ARP);
 			break;
 		}
 
 		if (m->m_flags & M_BCAST)
 			bcopy(ifp->if_broadcastaddr, edst, ETHER_ADDR_LEN);
 		else
 			bcopy(ar_tha(ah), edst, ETHER_ADDR_LEN);
 
 	}
 	break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		if (lle != NULL && (lle->la_flags & LLE_VALID))
 			memcpy(edst, &lle->ll_addr.mac16, sizeof(edst));
 		else
 			error = nd6_storelladdr(ifp, m, dst, (u_char *)edst, &lle);
 		if (error)
 			return error;
 		type = htons(ETHERTYPE_IPV6);
 		break;
 #endif
 	case pseudo_AF_HDRCMPLT:
 	    {
 		const struct ether_header *eh;
 		
 		hdrcmplt = 1;
 		eh = (const struct ether_header *)dst->sa_data;
 		(void)memcpy(esrc, eh->ether_shost, sizeof (esrc));
 		/* FALLTHROUGH */
 
 	case AF_UNSPEC:
 		loop_copy = 0; /* if this is for us, don't do it */
 		eh = (const struct ether_header *)dst->sa_data;
 		(void)memcpy(edst, eh->ether_dhost, sizeof (edst));
 		type = eh->ether_type;
 		break;
             }
 	default:
 		if_printf(ifp, "can't handle af%d\n", dst->sa_family);
 		senderr(EAFNOSUPPORT);
 	}
 
 	if (lle != NULL && (lle->la_flags & LLE_IFADDR)) {
 		update_mbuf_csumflags(m, m);
 		return (if_simloop(ifp, m, dst->sa_family, 0));
 	}
 
 	/*
 	 * Add local net header.  If no space in first mbuf,
 	 * allocate another.
 	 */
 	M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT);
 	if (m == NULL)
 		senderr(ENOBUFS);
 	eh = mtod(m, struct ether_header *);
 	(void)memcpy(&eh->ether_type, &type,
 		sizeof(eh->ether_type));
 	(void)memcpy(eh->ether_dhost, edst, sizeof (edst));
 	if (hdrcmplt)
 		(void)memcpy(eh->ether_shost, esrc,
 			sizeof(eh->ether_shost));
 	else
 		(void)memcpy(eh->ether_shost, IF_LLADDR(ifp),
 			sizeof(eh->ether_shost));
 
 	/*
 	 * If a simplex interface, and the packet is being sent to our
 	 * Ethernet address or a broadcast address, loopback a copy.
 	 * XXX To make a simplex device behave exactly like a duplex
 	 * device, we should copy in the case of sending to our own
 	 * ethernet address (thus letting the original actually appear
 	 * on the wire). However, we don't do that here for security
 	 * reasons and compatibility with the original behavior.
 	 */
 	if ((ifp->if_flags & IFF_SIMPLEX) && loop_copy &&
 	    ((t = pf_find_mtag(m)) == NULL || !t->routed)) {
 		if (m->m_flags & M_BCAST) {
 			struct mbuf *n;
 
 			/*
 			 * Because if_simloop() modifies the packet, we need a
 			 * writable copy through m_dup() instead of a readonly
 			 * one as m_copy[m] would give us. The alternative would
 			 * be to modify if_simloop() to handle the readonly mbuf,
 			 * but performancewise it is mostly equivalent (trading
 			 * extra data copying vs. extra locking).
 			 *
 			 * XXX This is a local workaround.  A number of less
 			 * often used kernel parts suffer from the same bug.
 			 * See PR kern/105943 for a proposed general solution.
 			 */
 			if ((n = m_dup(m, M_NOWAIT)) != NULL) {
 				update_mbuf_csumflags(m, n);
 				(void)if_simloop(ifp, n, dst->sa_family, hlen);
 			} else
 				if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
 		} else if (bcmp(eh->ether_dhost, eh->ether_shost,
 				ETHER_ADDR_LEN) == 0) {
 			update_mbuf_csumflags(m, m);
 			(void) if_simloop(ifp, m, dst->sa_family, hlen);
 			return (0);	/* XXX */
 		}
 	}
 
        /*
 	* Bridges require special output handling.
 	*/
 	if (ifp->if_bridge) {
 		BRIDGE_OUTPUT(ifp, m, error);
 		return (error);
 	}
 
 #if defined(INET) || defined(INET6)
 	if (ifp->if_carp &&
 	    (error = (*carp_output_p)(ifp, m, dst)))
 		goto bad;
 #endif
 
 	/* Handle ng_ether(4) processing, if any */
-	if (IFP2AC(ifp)->ac_netgraph != NULL) {
+	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_output_p != NULL,
 		    ("ng_ether_output_p is NULL"));
 		if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) {
 bad:			if (m != NULL)
 				m_freem(m);
 			return (error);
 		}
 		if (m == NULL)
 			return (0);
 	}
 
 	/* Continue with link-layer output */
 	return ether_output_frame(ifp, m);
 }
 
 /*
  * Ethernet link layer output routine to send a raw frame to the device.
  *
  * This assumes that the 14 byte Ethernet header is present and contiguous
  * in the first mbuf (if BRIDGE'ing).
  */
 int
 ether_output_frame(struct ifnet *ifp, struct mbuf *m)
 {
 	int i;
 
 	if (PFIL_HOOKED(&V_link_pfil_hook)) {
 		i = pfil_run_hooks(&V_link_pfil_hook, &m, ifp, PFIL_OUT, NULL);
 
 		if (i != 0)
 			return (EACCES);
 
 		if (m == NULL)
 			return (0);
 	}
 
 	/*
 	 * Queue message on interface, update output statistics if
 	 * successful, and start output if interface not yet active.
 	 */
 	return ((ifp->if_transmit)(ifp, m));
 }
 
 #if defined(INET) || defined(INET6)
 #endif
 
 /*
  * Process a received Ethernet packet; the packet is in the
  * mbuf chain m with the ethernet header at the front.
  */
 static void
 ether_input_internal(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ether_header *eh;
 	u_short etype;
 
 	if ((ifp->if_flags & IFF_UP) == 0) {
 		m_freem(m);
 		return;
 	}
 #ifdef DIAGNOSTIC
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		if_printf(ifp, "discard frame at !IFF_DRV_RUNNING\n");
 		m_freem(m);
 		return;
 	}
 #endif
 	/*
 	 * Do consistency checks to verify assumptions
 	 * made by code past this point.
 	 */
 	if ((m->m_flags & M_PKTHDR) == 0) {
 		if_printf(ifp, "discard frame w/o packet header\n");
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		m_freem(m);
 		return;
 	}
 	if (m->m_len < ETHER_HDR_LEN) {
 		/* XXX maybe should pullup? */
 		if_printf(ifp, "discard frame w/o leading ethernet "
 				"header (len %u pkt len %u)\n",
 				m->m_len, m->m_pkthdr.len);
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		m_freem(m);
 		return;
 	}
 	eh = mtod(m, struct ether_header *);
 	etype = ntohs(eh->ether_type);
 	if (m->m_pkthdr.rcvif == NULL) {
 		if_printf(ifp, "discard frame w/o interface pointer\n");
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		m_freem(m);
 		return;
 	}
 #ifdef DIAGNOSTIC
 	if (m->m_pkthdr.rcvif != ifp) {
 		if_printf(ifp, "Warning, frame marked as received on %s\n",
 			m->m_pkthdr.rcvif->if_xname);
 	}
 #endif
 
 	CURVNET_SET_QUIET(ifp->if_vnet);
 
 	if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
 		if (ETHER_IS_BROADCAST(eh->ether_dhost))
 			m->m_flags |= M_BCAST;
 		else
 			m->m_flags |= M_MCAST;
 		if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
 	}
 
 #ifdef MAC
 	/*
 	 * Tag the mbuf with an appropriate MAC label before any other
 	 * consumers can get to it.
 	 */
 	mac_ifnet_create_mbuf(ifp, m);
 #endif
 
 	/*
 	 * Give bpf a chance at the packet.
 	 */
 	ETHER_BPF_MTAP(ifp, m);
 
 	/*
 	 * If the CRC is still on the packet, trim it off. We do this once
 	 * and once only in case we are re-entered. Nothing else on the
 	 * Ethernet receive path expects to see the FCS.
 	 */
 	if (m->m_flags & M_HASFCS) {
 		m_adj(m, -ETHER_CRC_LEN);
 		m->m_flags &= ~M_HASFCS;
 	}
 
 	if (!(ifp->if_capenable & IFCAP_HWSTATS))
 		if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 
 	/* Allow monitor mode to claim this frame, after stats are updated. */
 	if (ifp->if_flags & IFF_MONITOR) {
 		m_freem(m);
 		CURVNET_RESTORE();
 		return;
 	}
 
 	/* Handle input from a lagg(4) port */
 	if (ifp->if_type == IFT_IEEE8023ADLAG) {
 		KASSERT(lagg_input_p != NULL,
 		    ("%s: if_lagg not loaded!", __func__));
 		m = (*lagg_input_p)(ifp, m);
 		if (m != NULL)
 			ifp = m->m_pkthdr.rcvif;
 		else {
 			CURVNET_RESTORE();
 			return;
 		}
 	}
 
 	/*
 	 * If the hardware did not process an 802.1Q tag, do this now,
 	 * to allow 802.1P priority frames to be passed to the main input
 	 * path correctly.
 	 * TODO: Deal with Q-in-Q frames, but not arbitrary nesting levels.
 	 */
 	if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_VLAN) {
 		struct ether_vlan_header *evl;
 
 		if (m->m_len < sizeof(*evl) &&
 		    (m = m_pullup(m, sizeof(*evl))) == NULL) {
 #ifdef DIAGNOSTIC
 			if_printf(ifp, "cannot pullup VLAN header\n");
 #endif
 			if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 			m_freem(m);
 			CURVNET_RESTORE();
 			return;
 		}
 
 		evl = mtod(m, struct ether_vlan_header *);
 		m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
 		m->m_flags |= M_VLANTAG;
 
 		bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
 		    ETHER_HDR_LEN - ETHER_TYPE_LEN);
 		m_adj(m, ETHER_VLAN_ENCAP_LEN);
 		eh = mtod(m, struct ether_header *);
 	}
 
 	M_SETFIB(m, ifp->if_fib);
 
 	/* Allow ng_ether(4) to claim this frame. */
-	if (IFP2AC(ifp)->ac_netgraph != NULL) {
+	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_input_p != NULL,
 		    ("%s: ng_ether_input_p is NULL", __func__));
 		m->m_flags &= ~M_PROMISC;
 		(*ng_ether_input_p)(ifp, &m);
 		if (m == NULL) {
 			CURVNET_RESTORE();
 			return;
 		}
 		eh = mtod(m, struct ether_header *);
 	}
 
 	/*
 	 * Allow if_bridge(4) to claim this frame.
 	 * The BRIDGE_INPUT() macro will update ifp if the bridge changed it
 	 * and the frame should be delivered locally.
 	 */
 	if (ifp->if_bridge != NULL) {
 		m->m_flags &= ~M_PROMISC;
 		BRIDGE_INPUT(ifp, m);
 		if (m == NULL) {
 			CURVNET_RESTORE();
 			return;
 		}
 		eh = mtod(m, struct ether_header *);
 	}
 
 #if defined(INET) || defined(INET6)
 	/*
 	 * Clear M_PROMISC on frame so that carp(4) will see it when the
 	 * mbuf flows up to Layer 3.
 	 * FreeBSD's implementation of carp(4) uses the inprotosw
 	 * to dispatch IPPROTO_CARP. carp(4) also allocates its own
 	 * Ethernet addresses of the form 00:00:5e:00:01:xx, which
 	 * is outside the scope of the M_PROMISC test below.
 	 * TODO: Maintain a hash table of ethernet addresses other than
 	 * ether_dhost which may be active on this ifp.
 	 */
 	if (ifp->if_carp && (*carp_forus_p)(ifp, eh->ether_dhost)) {
 		m->m_flags &= ~M_PROMISC;
 	} else
 #endif
 	{
 		/*
 		 * If the frame received was not for our MAC address, set the
 		 * M_PROMISC flag on the mbuf chain. The frame may need to
 		 * be seen by the rest of the Ethernet input path in case of
 		 * re-entry (e.g. bridge, vlan, netgraph) but should not be
 		 * seen by upper protocol layers.
 		 */
 		if (!ETHER_IS_MULTICAST(eh->ether_dhost) &&
 		    bcmp(IF_LLADDR(ifp), eh->ether_dhost, ETHER_ADDR_LEN) != 0)
 			m->m_flags |= M_PROMISC;
 	}
 
 	random_harvest(&(m->m_data), 12, 2, RANDOM_NET_ETHER);
 
 	ether_demux(ifp, m);
 	CURVNET_RESTORE();
 }
 
 /*
  * Ethernet input dispatch; by default, direct dispatch here regardless of
  * global configuration.  However, if RSS is enabled, hook up RSS affinity
  * so that when deferred or hybrid dispatch is enabled, we can redistribute
  * load based on RSS.
  *
  * XXXRW: Would be nice if the ifnet passed up a flag indicating whether or
  * not it had already done work distribution via multi-queue.  Then we could
  * direct dispatch in the event load balancing was already complete and
  * handle the case of interfaces with different capabilities better.
  *
  * XXXRW: Sort of want an M_DISTRIBUTED flag to avoid multiple distributions
  * at multiple layers?
  *
  * XXXRW: For now, enable all this only if RSS is compiled in, although it
  * works fine without RSS.  Need to characterise the performance overhead
  * of the detour through the netisr code in the event the result is always
  * direct dispatch.
  */
 static void
 ether_nh_input(struct mbuf *m)
 {
 
 	ether_input_internal(m->m_pkthdr.rcvif, m);
 }
 
 static struct netisr_handler	ether_nh = {
 	.nh_name = "ether",
 	.nh_handler = ether_nh_input,
 	.nh_proto = NETISR_ETHER,
 #ifdef RSS
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_DIRECT,
 	.nh_m2cpuid = rss_m2cpuid,
 #else
 	.nh_policy = NETISR_POLICY_SOURCE,
 	.nh_dispatch = NETISR_DISPATCH_DIRECT,
 #endif
 };
 
 static void
 ether_init(__unused void *arg)
 {
 
 	netisr_register(&ether_nh);
 }
 SYSINIT(ether, SI_SUB_INIT_IF, SI_ORDER_ANY, ether_init, NULL);
 
 static void
 vnet_ether_init(__unused void *arg)
 {
 	int i;
 
 	/* Initialize packet filter hooks. */
 	V_link_pfil_hook.ph_type = PFIL_TYPE_AF;
 	V_link_pfil_hook.ph_af = AF_LINK;
 	if ((i = pfil_head_register(&V_link_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to register pfil link hook, "
 			"error %d\n", __func__, i);
 }
 VNET_SYSINIT(vnet_ether_init, SI_SUB_PROTO_IF, SI_ORDER_ANY,
     vnet_ether_init, NULL);
  
 static void
 vnet_ether_destroy(__unused void *arg)
 {
 	int i;
 
 	if ((i = pfil_head_unregister(&V_link_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to unregister pfil link hook, "
 			"error %d\n", __func__, i);
 }
 VNET_SYSUNINIT(vnet_ether_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY,
     vnet_ether_destroy, NULL);
 
 
 
 static void
 ether_input(struct ifnet *ifp, struct mbuf *m)
 {
 
 	struct mbuf *mn;
 
 	/*
 	 * The drivers are allowed to pass in a chain of packets linked with
 	 * m_nextpkt. We split them up into separate packets here and pass
 	 * them up. This allows the drivers to amortize the receive lock.
 	 */
 	while (m) {
 		mn = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 
 		/*
 		 * We will rely on rcvif being set properly in the deferred context,
 		 * so assert it is correct here.
 		 */
 		KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch", __func__));
 		netisr_dispatch(NETISR_ETHER, m);
 		m = mn;
 	}
 }
 
 /*
  * Upper layer processing for a received Ethernet packet.
  */
 void
 ether_demux(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ether_header *eh;
 	int i, isr;
 	u_short ether_type;
 
 	KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__));
 
 	/* Do not grab PROMISC frames in case we are re-entered. */
 	if (PFIL_HOOKED(&V_link_pfil_hook) && !(m->m_flags & M_PROMISC)) {
 		i = pfil_run_hooks(&V_link_pfil_hook, &m, ifp, PFIL_IN, NULL);
 
 		if (i != 0 || m == NULL)
 			return;
 	}
 
 	eh = mtod(m, struct ether_header *);
 	ether_type = ntohs(eh->ether_type);
 
 	/*
 	 * If this frame has a VLAN tag other than 0, call vlan_input()
 	 * if its module is loaded. Otherwise, drop.
 	 */
 	if ((m->m_flags & M_VLANTAG) &&
 	    EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) != 0) {
 		if (ifp->if_vlantrunk == NULL) {
 			if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 			m_freem(m);
 			return;
 		}
 		KASSERT(vlan_input_p != NULL,("%s: VLAN not loaded!",
 		    __func__));
 		/* Clear before possibly re-entering ether_input(). */
 		m->m_flags &= ~M_PROMISC;
 		(*vlan_input_p)(ifp, m);
 		return;
 	}
 
 	/*
 	 * Pass promiscuously received frames to the upper layer if the user
 	 * requested this by setting IFF_PPROMISC. Otherwise, drop them.
 	 */
 	if ((ifp->if_flags & IFF_PPROMISC) == 0 && (m->m_flags & M_PROMISC)) {
 		m_freem(m);
 		return;
 	}
 
 	/*
 	 * Reset layer specific mbuf flags to avoid confusing upper layers.
 	 * Strip off Ethernet header.
 	 */
 	m->m_flags &= ~M_VLANTAG;
 	m_clrprotoflags(m);
 	m_adj(m, ETHER_HDR_LEN);
 
 	/*
 	 * Dispatch frame to upper layer.
 	 */
 	switch (ether_type) {
 #ifdef INET
 	case ETHERTYPE_IP:
 		if ((m = ip_fastforward(m)) == NULL)
 			return;
 		isr = NETISR_IP;
 		break;
 
 	case ETHERTYPE_ARP:
 		if (ifp->if_flags & IFF_NOARP) {
 			/* Discard packet if ARP is disabled on interface */
 			m_freem(m);
 			return;
 		}
 		isr = NETISR_ARP;
 		break;
 #endif
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 		isr = NETISR_IPV6;
 		break;
 #endif
 	default:
 		goto discard;
 	}
 	netisr_dispatch(isr, m);
 	return;
 
 discard:
 	/*
 	 * Packet is to be discarded.  If netgraph is present,
 	 * hand the packet to it for last chance processing;
 	 * otherwise dispose of it.
 	 */
-	if (IFP2AC(ifp)->ac_netgraph != NULL) {
+	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_input_orphan_p != NULL,
 		    ("ng_ether_input_orphan_p is NULL"));
 		/*
 		 * Put back the ethernet header so netgraph has a
 		 * consistent view of inbound packets.
 		 */
 		M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT);
 		(*ng_ether_input_orphan_p)(ifp, m);
 		return;
 	}
 	m_freem(m);
 }
 
 /*
  * Convert Ethernet address to printable (loggable) representation.
  * This routine is for compatibility; it's better to just use
  *
  *	printf("%6D", <pointer to address>, ":");
  *
  * since there's no static buffer involved.
  */
 char *
 ether_sprintf(const u_char *ap)
 {
 	static char etherbuf[18];
 	snprintf(etherbuf, sizeof (etherbuf), "%6D", ap, ":");
 	return (etherbuf);
 }
 
 /*
  * Perform common duties while attaching to interface list
  */
 void
 ether_ifattach(struct ifnet *ifp, const u_int8_t *lla)
 {
 	int i;
 	struct ifaddr *ifa;
 	struct sockaddr_dl *sdl;
 
 	ifp->if_addrlen = ETHER_ADDR_LEN;
 	ifp->if_hdrlen = ETHER_HDR_LEN;
 	if_attach(ifp);
 	ifp->if_mtu = ETHERMTU;
 	ifp->if_output = ether_output;
 	ifp->if_input = ether_input;
 	ifp->if_resolvemulti = ether_resolvemulti;
 #ifdef VIMAGE
 	ifp->if_reassign = ether_reassign;
 #endif
 	if (ifp->if_baudrate == 0)
 		ifp->if_baudrate = IF_Mbps(10);		/* just a default */
 	ifp->if_broadcastaddr = etherbroadcastaddr;
 
 	ifa = ifp->if_addr;
 	KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__));
 	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 	sdl->sdl_type = IFT_ETHER;
 	sdl->sdl_alen = ifp->if_addrlen;
 	bcopy(lla, LLADDR(sdl), ifp->if_addrlen);
 
 	bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN);
 	if (ng_ether_attach_p != NULL)
 		(*ng_ether_attach_p)(ifp);
 
 	/* Announce Ethernet MAC address if non-zero. */
 	for (i = 0; i < ifp->if_addrlen; i++)
 		if (lla[i] != 0)
 			break; 
 	if (i != ifp->if_addrlen)
 		if_printf(ifp, "Ethernet address: %6D\n", lla, ":");
 
 	uuid_ether_add(LLADDR(sdl));
 }
 
 /*
  * Perform common duties while detaching an Ethernet interface
  */
 void
 ether_ifdetach(struct ifnet *ifp)
 {
 	struct sockaddr_dl *sdl;
 
 	sdl = (struct sockaddr_dl *)(ifp->if_addr->ifa_addr);
 	uuid_ether_del(LLADDR(sdl));
 
-	if (IFP2AC(ifp)->ac_netgraph != NULL) {
+	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_detach_p != NULL,
 		    ("ng_ether_detach_p is NULL"));
 		(*ng_ether_detach_p)(ifp);
 	}
 
 	bpfdetach(ifp);
 	if_detach(ifp);
 }
 
 #ifdef VIMAGE
 void
 ether_reassign(struct ifnet *ifp, struct vnet *new_vnet, char *unused __unused)
 {
 
-	if (IFP2AC(ifp)->ac_netgraph != NULL) {
+	if (ifp->if_l2com != NULL) {
 		KASSERT(ng_ether_detach_p != NULL,
 		    ("ng_ether_detach_p is NULL"));
 		(*ng_ether_detach_p)(ifp);
 	}
 
 	if (ng_ether_attach_p != NULL) {
 		CURVNET_SET_QUIET(new_vnet);
 		(*ng_ether_attach_p)(ifp);
 		CURVNET_RESTORE();
 	}
 }
 #endif
 
 SYSCTL_DECL(_net_link);
 SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet");
 
 #if 0
 /*
  * This is for reference.  We have a table-driven version
  * of the little-endian crc32 generator, which is faster
  * than the double-loop.
  */
 uint32_t
 ether_crc32_le(const uint8_t *buf, size_t len)
 {
 	size_t i;
 	uint32_t crc;
 	int bit;
 	uint8_t data;
 
 	crc = 0xffffffff;	/* initial value */
 
 	for (i = 0; i < len; i++) {
 		for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) {
 			carry = (crc ^ data) & 1;
 			crc >>= 1;
 			if (carry)
 				crc = (crc ^ ETHER_CRC_POLY_LE);
 		}
 	}
 
 	return (crc);
 }
 #else
 uint32_t
 ether_crc32_le(const uint8_t *buf, size_t len)
 {
 	static const uint32_t crctab[] = {
 		0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac,
 		0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c,
 		0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c,
 		0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c
 	};
 	size_t i;
 	uint32_t crc;
 
 	crc = 0xffffffff;	/* initial value */
 
 	for (i = 0; i < len; i++) {
 		crc ^= buf[i];
 		crc = (crc >> 4) ^ crctab[crc & 0xf];
 		crc = (crc >> 4) ^ crctab[crc & 0xf];
 	}
 
 	return (crc);
 }
 #endif
 
 uint32_t
 ether_crc32_be(const uint8_t *buf, size_t len)
 {
 	size_t i;
 	uint32_t crc, carry;
 	int bit;
 	uint8_t data;
 
 	crc = 0xffffffff;	/* initial value */
 
 	for (i = 0; i < len; i++) {
 		for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) {
 			carry = ((crc & 0x80000000) ? 1 : 0) ^ (data & 0x01);
 			crc <<= 1;
 			if (carry)
 				crc = (crc ^ ETHER_CRC_POLY_BE) | carry;
 		}
 	}
 
 	return (crc);
 }
 
 int
 ether_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
 {
 	struct ifaddr *ifa = (struct ifaddr *) data;
 	struct ifreq *ifr = (struct ifreq *) data;
 	int error = 0;
 
 	switch (command) {
 	case SIOCSIFADDR:
 		ifp->if_flags |= IFF_UP;
 
 		switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 		case AF_INET:
 			ifp->if_init(ifp->if_softc);	/* before arpwhohas */
 			arp_ifinit(ifp, ifa);
 			break;
 #endif
 		default:
 			ifp->if_init(ifp->if_softc);
 			break;
 		}
 		break;
 
 	case SIOCGIFADDR:
 		{
 			struct sockaddr *sa;
 
 			sa = (struct sockaddr *) & ifr->ifr_data;
 			bcopy(IF_LLADDR(ifp),
 			      (caddr_t) sa->sa_data, ETHER_ADDR_LEN);
 		}
 		break;
 
 	case SIOCSIFMTU:
 		/*
 		 * Set the interface MTU.
 		 */
 		if (ifr->ifr_mtu > ETHERMTU) {
 			error = EINVAL;
 		} else {
 			ifp->if_mtu = ifr->ifr_mtu;
 		}
 		break;
 	default:
 		error = EINVAL;			/* XXX netbsd has ENOTTY??? */
 		break;
 	}
 	return (error);
 }
 
 static int
 ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
 	struct sockaddr *sa)
 {
 	struct sockaddr_dl *sdl;
 #ifdef INET
 	struct sockaddr_in *sin;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 #endif
 	u_char *e_addr;
 
 	switch(sa->sa_family) {
 	case AF_LINK:
 		/*
 		 * No mapping needed. Just check that it's a valid MC address.
 		 */
 		sdl = (struct sockaddr_dl *)sa;
 		e_addr = LLADDR(sdl);
 		if (!ETHER_IS_MULTICAST(e_addr))
 			return EADDRNOTAVAIL;
 		*llsa = 0;
 		return 0;
 
 #ifdef INET
 	case AF_INET:
 		sin = (struct sockaddr_in *)sa;
 		if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
 			return EADDRNOTAVAIL;
 		sdl = link_init_sdl(ifp, *llsa, IFT_ETHER);
 		sdl->sdl_alen = ETHER_ADDR_LEN;
 		e_addr = LLADDR(sdl);
 		ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr);
 		*llsa = (struct sockaddr *)sdl;
 		return 0;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		sin6 = (struct sockaddr_in6 *)sa;
 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 			/*
 			 * An IP6 address of 0 means listen to all
 			 * of the Ethernet multicast address used for IP6.
 			 * (This is used for multicast routers.)
 			 */
 			ifp->if_flags |= IFF_ALLMULTI;
 			*llsa = 0;
 			return 0;
 		}
 		if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
 			return EADDRNOTAVAIL;
 		sdl = link_init_sdl(ifp, *llsa, IFT_ETHER);
 		sdl->sdl_alen = ETHER_ADDR_LEN;
 		e_addr = LLADDR(sdl);
 		ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr);
 		*llsa = (struct sockaddr *)sdl;
 		return 0;
 #endif
 
 	default:
 		/*
 		 * Well, the text isn't quite right, but it's the name
 		 * that counts...
 		 */
 		return EAFNOSUPPORT;
 	}
 }
 
-static void*
-ether_alloc(u_char type, struct ifnet *ifp)
-{
-	struct arpcom	*ac;
-	
-	ac = malloc(sizeof(struct arpcom), M_ARPCOM, M_WAITOK | M_ZERO);
-	ac->ac_ifp = ifp;
-
-	return (ac);
-}
-
-static void
-ether_free(void *com, u_char type)
-{
-
-	free(com, M_ARPCOM);
-}
-
-static int
-ether_modevent(module_t mod, int type, void *data)
-{
-
-	switch (type) {
-	case MOD_LOAD:
-		if_register_com_alloc(IFT_ETHER, ether_alloc, ether_free);
-		break;
-	case MOD_UNLOAD:
-		if_deregister_com_alloc(IFT_ETHER);
-		break;
-	default:
-		return EOPNOTSUPP;
-	}
-
-	return (0);
-}
-
 static moduledata_t ether_mod = {
-	"ether",
-	ether_modevent,
-	0
+	.name = "ether",
 };
 
 void
 ether_vlan_mtap(struct bpf_if *bp, struct mbuf *m, void *data, u_int dlen)
 {
 	struct ether_vlan_header vlan;
 	struct mbuf mv, mb;
 
 	KASSERT((m->m_flags & M_VLANTAG) != 0,
 	    ("%s: vlan information not present", __func__));
 	KASSERT(m->m_len >= sizeof(struct ether_header),
 	    ("%s: mbuf not large enough for header", __func__));
 	bcopy(mtod(m, char *), &vlan, sizeof(struct ether_header));
 	vlan.evl_proto = vlan.evl_encap_proto;
 	vlan.evl_encap_proto = htons(ETHERTYPE_VLAN);
 	vlan.evl_tag = htons(m->m_pkthdr.ether_vtag);
 	m->m_len -= sizeof(struct ether_header);
 	m->m_data += sizeof(struct ether_header);
 	/*
 	 * If a data link has been supplied by the caller, then we will need to
 	 * re-create a stack allocated mbuf chain with the following structure:
 	 *
 	 * (1) mbuf #1 will contain the supplied data link
 	 * (2) mbuf #2 will contain the vlan header
 	 * (3) mbuf #3 will contain the original mbuf's packet data
 	 *
 	 * Otherwise, submit the packet and vlan header via bpf_mtap2().
 	 */
 	if (data != NULL) {
 		mv.m_next = m;
 		mv.m_data = (caddr_t)&vlan;
 		mv.m_len = sizeof(vlan);
 		mb.m_next = &mv;
 		mb.m_data = data;
 		mb.m_len = dlen;
 		bpf_mtap(bp, &mb);
 	} else
 		bpf_mtap2(bp, &vlan, sizeof(vlan), m);
 	m->m_len += sizeof(struct ether_header);
 	m->m_data -= sizeof(struct ether_header);
 }
 
 struct mbuf *
 ether_vlanencap(struct mbuf *m, uint16_t tag)
 {
 	struct ether_vlan_header *evl;
 
 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
 	if (m == NULL)
 		return (NULL);
 	/* M_PREPEND takes care of m_len, m_pkthdr.len for us */
 
 	if (m->m_len < sizeof(*evl)) {
 		m = m_pullup(m, sizeof(*evl));
 		if (m == NULL)
 			return (NULL);
 	}
 
 	/*
 	 * Transform the Ethernet header into an Ethernet header
 	 * with 802.1Q encapsulation.
 	 */
 	evl = mtod(m, struct ether_vlan_header *);
 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
 	    (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
 	evl->evl_tag = htons(tag);
 	return (m);
 }
 
 DECLARE_MODULE(ether, ether_mod, SI_SUB_INIT_IF, SI_ORDER_ANY);
 MODULE_VERSION(ether, 1);
Index: user/dchagin/lemul/sys/net/if_fddisubr.c
===================================================================
--- user/dchagin/lemul/sys/net/if_fddisubr.c	(revision 274240)
+++ user/dchagin/lemul/sys/net/if_fddisubr.c	(revision 274241)
@@ -1,672 +1,671 @@
 /*-
  * Copyright (c) 1995, 1996
  *	Matt Thomas <matt@3am-software.com>.  All rights reserved.
  * Copyright (c) 1982, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: if_ethersubr.c,v 1.5 1994/12/13 22:31:45 wollman Exp
  * $FreeBSD$
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_llc.h>
 #include <net/if_types.h>
 #include <net/if_llatbl.h>
 
 #include <net/ethernet.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/bpf.h>
 #include <net/fddi.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/if_ether.h>
 #endif
 #ifdef INET6
 #include <netinet6/nd6.h>
 #endif
 
 #ifdef DECNET
 #include <netdnet/dn.h>
 #endif
 
 #include <security/mac/mac_framework.h>
 
 static const u_char fddibroadcastaddr[FDDI_ADDR_LEN] =
 			{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 
 static int fddi_resolvemulti(struct ifnet *, struct sockaddr **,
 			      struct sockaddr *);
 static int fddi_output(struct ifnet *, struct mbuf *, const struct sockaddr *,
 		       struct route *); 
 static void fddi_input(struct ifnet *ifp, struct mbuf *m);
 
 #define	senderr(e)	do { error = (e); goto bad; } while (0)
 
 /*
  * FDDI output routine.
  * Encapsulate a packet of type family for the local net.
  * Use trailer local net encapsulation if enough data in first
  * packet leaves a multiple of 512 bytes of data in remainder.
- * Assumes that ifp is actually pointer to arpcom structure.
  */
 static int
 fddi_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
 	struct route *ro)
 {
 	u_int16_t type;
 	int loop_copy = 0, error = 0, hdrcmplt = 0;
  	u_char esrc[FDDI_ADDR_LEN], edst[FDDI_ADDR_LEN];
 	struct fddi_header *fh;
 #if defined(INET) || defined(INET6)
 	struct llentry *lle;
 #endif
 
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m);
 	if (error)
 		senderr(error);
 #endif
 
 	if (ifp->if_flags & IFF_MONITOR)
 		senderr(ENETDOWN);
 	if (!((ifp->if_flags & IFF_UP) &&
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING)))
 		senderr(ENETDOWN);
 	getmicrotime(&ifp->if_lastchange);
 
 	switch (dst->sa_family) {
 #ifdef INET
 	case AF_INET: {
 		struct rtentry *rt0 = NULL;
 
 		if (ro != NULL)
 			rt0 = ro->ro_rt;
 		error = arpresolve(ifp, rt0, m, dst, edst, &lle);
 		if (error)
 			return (error == EWOULDBLOCK ? 0 : error);
 		type = htons(ETHERTYPE_IP);
 		break;
 	}
 	case AF_ARP:
 	{
 		struct arphdr *ah;
 		ah = mtod(m, struct arphdr *);
 		ah->ar_hrd = htons(ARPHRD_ETHER);
 
 		loop_copy = -1; /* if this is for us, don't do it */
 
 		switch (ntohs(ah->ar_op)) {
 		case ARPOP_REVREQUEST:
 		case ARPOP_REVREPLY:
 			type = htons(ETHERTYPE_REVARP);
 			break;
 		case ARPOP_REQUEST:
 		case ARPOP_REPLY:
 		default:
 			type = htons(ETHERTYPE_ARP);
 			break;
 		}
 
 		if (m->m_flags & M_BCAST)
 			bcopy(ifp->if_broadcastaddr, edst, FDDI_ADDR_LEN);
                 else
 			bcopy(ar_tha(ah), edst, FDDI_ADDR_LEN);
 
 	}
 	break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		error = nd6_storelladdr(ifp, m, dst, (u_char *)edst, &lle);
 		if (error)
 			return (error); /* Something bad happened */
 		type = htons(ETHERTYPE_IPV6);
 		break;
 #endif /* INET6 */
 	case pseudo_AF_HDRCMPLT:
 	{
 		const struct ether_header *eh;
 
 		hdrcmplt = 1;
 		eh = (const struct ether_header *)dst->sa_data;
 		bcopy(eh->ether_shost, esrc, FDDI_ADDR_LEN);
 		/* FALLTHROUGH */
 	}
 
 	case AF_UNSPEC:
 	{
 		const struct ether_header *eh;
 
 		loop_copy = -1;
 		eh = (const struct ether_header *)dst->sa_data;
 		bcopy(eh->ether_dhost, edst, FDDI_ADDR_LEN);
 		if (*edst & 1)
 			m->m_flags |= (M_BCAST|M_MCAST);
 		type = eh->ether_type;
 		break;
 	}
 
 	case AF_IMPLINK:
 	{
 		fh = mtod(m, struct fddi_header *);
 		error = EPROTONOSUPPORT;
 		switch (fh->fddi_fc & (FDDIFC_C|FDDIFC_L|FDDIFC_F)) {
 			case FDDIFC_LLC_ASYNC: {
 				/* legal priorities are 0 through 7 */
 				if ((fh->fddi_fc & FDDIFC_Z) > 7)
 			        	goto bad;
 				break;
 			}
 			case FDDIFC_LLC_SYNC: {
 				/* FDDIFC_Z bits reserved, must be zero */
 				if (fh->fddi_fc & FDDIFC_Z)
 					goto bad;
 				break;
 			}
 			case FDDIFC_SMT: {
 				/* FDDIFC_Z bits must be non zero */
 				if ((fh->fddi_fc & FDDIFC_Z) == 0)
 					goto bad;
 				break;
 			}
 			default: {
 				/* anything else is too dangerous */
                	 		goto bad;
 			}
 		}
 		error = 0;
 		if (fh->fddi_dhost[0] & 1)
 			m->m_flags |= (M_BCAST|M_MCAST);
 		goto queue_it;
 	}
 	default:
 		if_printf(ifp, "can't handle af%d\n", dst->sa_family);
 		senderr(EAFNOSUPPORT);
 	}
 
 	/*
 	 * Add LLC header.
 	 */
 	if (type != 0) {
 		struct llc *l;
 		M_PREPEND(m, LLC_SNAPFRAMELEN, M_NOWAIT);
 		if (m == 0)
 			senderr(ENOBUFS);
 		l = mtod(m, struct llc *);
 		l->llc_control = LLC_UI;
 		l->llc_dsap = l->llc_ssap = LLC_SNAP_LSAP;
 		l->llc_snap.org_code[0] =
 			l->llc_snap.org_code[1] =
 			l->llc_snap.org_code[2] = 0;
 		l->llc_snap.ether_type = htons(type);
 	}
 
 	/*
 	 * Add local net header.  If no space in first mbuf,
 	 * allocate another.
 	 */
 	M_PREPEND(m, FDDI_HDR_LEN, M_NOWAIT);
 	if (m == 0)
 		senderr(ENOBUFS);
 	fh = mtod(m, struct fddi_header *);
 	fh->fddi_fc = FDDIFC_LLC_ASYNC|FDDIFC_LLC_PRIO4;
 	bcopy((caddr_t)edst, (caddr_t)fh->fddi_dhost, FDDI_ADDR_LEN);
   queue_it:
 	if (hdrcmplt)
 		bcopy((caddr_t)esrc, (caddr_t)fh->fddi_shost, FDDI_ADDR_LEN);
 	else
 		bcopy(IF_LLADDR(ifp), (caddr_t)fh->fddi_shost,
 			FDDI_ADDR_LEN);
 
 	/*
 	 * If a simplex interface, and the packet is being sent to our
 	 * Ethernet address or a broadcast address, loopback a copy.
 	 * XXX To make a simplex device behave exactly like a duplex
 	 * device, we should copy in the case of sending to our own
 	 * ethernet address (thus letting the original actually appear
 	 * on the wire). However, we don't do that here for security
 	 * reasons and compatibility with the original behavior.
 	 */
 	if ((ifp->if_flags & IFF_SIMPLEX) && (loop_copy != -1)) {
 		if ((m->m_flags & M_BCAST) || (loop_copy > 0)) {
 			struct mbuf *n;
 			n = m_copy(m, 0, (int)M_COPYALL);
 			(void) if_simloop(ifp, n, dst->sa_family,
 					  FDDI_HDR_LEN);
 	     	} else if (bcmp(fh->fddi_dhost, fh->fddi_shost,
 				FDDI_ADDR_LEN) == 0) {
 			(void) if_simloop(ifp, m, dst->sa_family,
 					  FDDI_HDR_LEN);
 			return (0);	/* XXX */
 		}
 	}
 
 	error = (ifp->if_transmit)(ifp, m);
 	if (error)
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 
 	return (error);
 
 bad:
 	if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 	if (m)
 		m_freem(m);
 	return (error);
 }
 
 /*
  * Process a received FDDI packet.
  */
 static void
 fddi_input(ifp, m)
 	struct ifnet *ifp;
 	struct mbuf *m;
 {
 	int isr;
 	struct llc *l;
 	struct fddi_header *fh;
 
 	/*
 	 * Do consistency checks to verify assumptions
 	 * made by code past this point.
 	 */
 	if ((m->m_flags & M_PKTHDR) == 0) {
 		if_printf(ifp, "discard frame w/o packet header\n");
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		m_freem(m);
 		return;
 	}
 	if (m->m_pkthdr.rcvif == NULL) {
 		if_printf(ifp, "discard frame w/o interface pointer\n");
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		m_freem(m);
 		return;
         }
 
 	m = m_pullup(m, FDDI_HDR_LEN);
 	if (m == NULL) {
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		goto dropanyway;
 	}
 	fh = mtod(m, struct fddi_header *);
 
 	/*
 	 * Discard packet if interface is not up.
 	 */
 	if (!((ifp->if_flags & IFF_UP) &&
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING)))
 		goto dropanyway;
 
 	/*
 	 * Give bpf a chance at the packet.
 	 */
 	BPF_MTAP(ifp, m);
 
 	/*
 	 * Interface marked for monitoring; discard packet.
 	 */
 	if (ifp->if_flags & IFF_MONITOR) {
 		m_freem(m);
 		return;
 	}
 
 #ifdef MAC
 	mac_ifnet_create_mbuf(ifp, m);
 #endif
 
 	/*
 	 * Update interface statistics.
 	 */
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 	getmicrotime(&ifp->if_lastchange);
 
 	/*
 	 * Discard non local unicast packets when interface
 	 * is in promiscuous mode.
 	 */
 	if ((ifp->if_flags & IFF_PROMISC) && ((fh->fddi_dhost[0] & 1) == 0) &&
 	    (bcmp(IF_LLADDR(ifp), (caddr_t)fh->fddi_dhost,
 	     FDDI_ADDR_LEN) != 0))
 		goto dropanyway;
 
 	/*
 	 * Set mbuf flags for bcast/mcast.
 	 */
 	if (fh->fddi_dhost[0] & 1) {
 		if (bcmp(ifp->if_broadcastaddr, fh->fddi_dhost,
 		    FDDI_ADDR_LEN) == 0)
 			m->m_flags |= M_BCAST;
 		else
 			m->m_flags |= M_MCAST;
 		if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
 	}
 
 #ifdef M_LINK0
 	/*
 	 * If this has a LLC priority of 0, then mark it so upper
 	 * layers have a hint that it really came via a FDDI/Ethernet
 	 * bridge.
 	 */
 	if ((fh->fddi_fc & FDDIFC_LLC_PRIO7) == FDDIFC_LLC_PRIO0)
 		m->m_flags |= M_LINK0;
 #endif
 
 	/* Strip off FDDI header. */
 	m_adj(m, FDDI_HDR_LEN);
 
 	m = m_pullup(m, LLC_SNAPFRAMELEN);
 	if (m == 0) {
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		goto dropanyway;
 	}
 	l = mtod(m, struct llc *);
 
 	switch (l->llc_dsap) {
 	case LLC_SNAP_LSAP:
 	{
 		u_int16_t type;
 		if ((l->llc_control != LLC_UI) ||
 		    (l->llc_ssap != LLC_SNAP_LSAP)) {
 			if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 			goto dropanyway;
 		}
 		if (l->llc_snap.org_code[0] != 0 ||
 		    l->llc_snap.org_code[1] != 0 ||
 		    l->llc_snap.org_code[2] != 0) {
 			if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 			goto dropanyway;
 		}
 
 		type = ntohs(l->llc_snap.ether_type);
 		m_adj(m, LLC_SNAPFRAMELEN);
 
 		switch (type) {
 #ifdef INET
 		case ETHERTYPE_IP:
 			if ((m = ip_fastforward(m)) == NULL)
 				return;
 			isr = NETISR_IP;
 			break;
 
 		case ETHERTYPE_ARP:
 			if (ifp->if_flags & IFF_NOARP)
 				goto dropanyway;
 			isr = NETISR_ARP;
 			break;
 #endif
 #ifdef INET6
 		case ETHERTYPE_IPV6:
 			isr = NETISR_IPV6;
 			break;
 #endif
 #ifdef DECNET
 		case ETHERTYPE_DECNET:
 			isr = NETISR_DECNET;
 			break;
 #endif
 		default:
 			/* printf("fddi_input: unknown protocol 0x%x\n", type); */
 			if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 			goto dropanyway;
 		}
 		break;
 	}
 		
 	default:
 		/* printf("fddi_input: unknown dsap 0x%x\n", l->llc_dsap); */
 		if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 		goto dropanyway;
 	}
 	M_SETFIB(m, ifp->if_fib);
 	netisr_dispatch(isr, m);
 	return;
 
 dropanyway:
 	if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
 	if (m)
 		m_freem(m);
 	return;
 }
 
 /*
  * Perform common duties while attaching to interface list
  */
 void
 fddi_ifattach(ifp, lla, bpf)
 	struct ifnet *ifp;
 	const u_int8_t *lla;
 	int bpf;
 {
 	struct ifaddr *ifa;
 	struct sockaddr_dl *sdl;
 
 	ifp->if_type = IFT_FDDI;
 	ifp->if_addrlen = FDDI_ADDR_LEN;
 	ifp->if_hdrlen = 21;
 
 	if_attach(ifp);         /* Must be called before additional assignments */
 
 	ifp->if_mtu = FDDIMTU;
 	ifp->if_output = fddi_output;
 	ifp->if_input = fddi_input;
 	ifp->if_resolvemulti = fddi_resolvemulti;
 	ifp->if_broadcastaddr = fddibroadcastaddr;
 	ifp->if_baudrate = 100000000;
 #ifdef IFF_NOTRAILERS
 	ifp->if_flags |= IFF_NOTRAILERS;
 #endif
 	ifa = ifp->if_addr;
 	KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__));
 
 	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 	sdl->sdl_type = IFT_FDDI;
 	sdl->sdl_alen = ifp->if_addrlen;
 	bcopy(lla, LLADDR(sdl), ifp->if_addrlen);
 
 	if (bpf)
 		bpfattach(ifp, DLT_FDDI, FDDI_HDR_LEN);
 
 	return;
 }
 
 void
 fddi_ifdetach(ifp, bpf)
 	struct ifnet *ifp;
 	int bpf;
 {
      
 	if (bpf)
 		bpfdetach(ifp);
 
 	if_detach(ifp);
 
 	return;
 }
 
 int
 fddi_ioctl (ifp, command, data)
 	struct ifnet *ifp;
 	u_long command;
 	caddr_t data;
 {
 	struct ifaddr *ifa;
 	struct ifreq *ifr;
 	int error;
 
 	ifa = (struct ifaddr *) data;
 	ifr = (struct ifreq *) data;
 	error = 0;
 
 	switch (command) {
 	case SIOCSIFADDR:
 		ifp->if_flags |= IFF_UP;
 
 		switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 		case AF_INET:	/* before arpwhohas */
 			ifp->if_init(ifp->if_softc);
 			arp_ifinit(ifp, ifa);
 			break;
 #endif
 		default:
 			ifp->if_init(ifp->if_softc);
 			break;
 		}
 		break;
 	case SIOCGIFADDR: {
 			struct sockaddr *sa;
 
 			sa = (struct sockaddr *) & ifr->ifr_data;
 			bcopy(IF_LLADDR(ifp),
 			      (caddr_t) sa->sa_data, FDDI_ADDR_LEN);
 
 		}
 		break;
 	case SIOCSIFMTU:
 		/*
 		 * Set the interface MTU.
 		 */
 		if (ifr->ifr_mtu > FDDIMTU) {
 			error = EINVAL;
 		} else {
 			ifp->if_mtu = ifr->ifr_mtu;
 		}
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	return (error);
 }
 
 static int
 fddi_resolvemulti(ifp, llsa, sa)
 	struct ifnet *ifp;
 	struct sockaddr **llsa;
 	struct sockaddr *sa;
 {
 	struct sockaddr_dl *sdl;
 #ifdef INET
 	struct sockaddr_in *sin;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 #endif
 	u_char *e_addr;
 
 	switch(sa->sa_family) {
 	case AF_LINK:
 		/*
 		 * No mapping needed. Just check that it's a valid MC address.
 		 */
 		sdl = (struct sockaddr_dl *)sa;
 		e_addr = LLADDR(sdl);
 		if ((e_addr[0] & 1) != 1)
 			return (EADDRNOTAVAIL);
 		*llsa = 0;
 		return (0);
 
 #ifdef INET
 	case AF_INET:
 		sin = (struct sockaddr_in *)sa;
 		if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
 			return (EADDRNOTAVAIL);
 		sdl = link_init_sdl(ifp, *llsa, IFT_FDDI);
 		sdl->sdl_nlen = 0;
 		sdl->sdl_alen = FDDI_ADDR_LEN;
 		sdl->sdl_slen = 0;
 		e_addr = LLADDR(sdl);
 		ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr);
 		*llsa = (struct sockaddr *)sdl;
 		return (0);
 #endif
 #ifdef INET6
 	case AF_INET6:
 		sin6 = (struct sockaddr_in6 *)sa;
 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 			/*
 			 * An IP6 address of 0 means listen to all
 			 * of the Ethernet multicast address used for IP6.
 			 * (This is used for multicast routers.)
 			 */
 			ifp->if_flags |= IFF_ALLMULTI;
 			*llsa = 0;
 			return (0);
 		}
 		if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
 			return (EADDRNOTAVAIL);
 		sdl = link_init_sdl(ifp, *llsa, IFT_FDDI);
 		sdl->sdl_nlen = 0;
 		sdl->sdl_alen = FDDI_ADDR_LEN;
 		sdl->sdl_slen = 0;
 		e_addr = LLADDR(sdl);
 		ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr);
 		*llsa = (struct sockaddr *)sdl;
 		return (0);
 #endif
 
 	default:
 		/*
 		 * Well, the text isn't quite right, but it's the name
 		 * that counts...
 		 */
 		return (EAFNOSUPPORT);
 	}
 
 	return (0);
 }
 
 static moduledata_t fddi_mod = {
 	"fddi",	/* module name */
 	NULL,	/* event handler */
 	0	/* extra data */
 };
 
 DECLARE_MODULE(fddi, fddi_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(fddi, 1);
Index: user/dchagin/lemul/sys/net/if_gif.c
===================================================================
--- user/dchagin/lemul/sys/net/if_gif.c	(revision 274240)
+++ user/dchagin/lemul/sys/net/if_gif.c	(revision 274241)
@@ -1,1090 +1,1091 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: if_gif.c,v 1.87 2001/10/19 08:50:27 itojun Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sx.h>
 #include <sys/errno.h>
 #include <sys/time.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/conf.h>
 #include <machine/cpu.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_clone.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/bpf.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_ecn.h>
 #ifdef	INET
 #include <netinet/in_var.h>
 #include <netinet/in_gif.h>
 #include <netinet/ip_var.h>
 #endif	/* INET */
 
 #ifdef INET6
 #ifndef INET
 #include <netinet/in.h>
 #endif
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_ecn.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/in6_gif.h>
 #include <netinet6/ip6protosw.h>
 #endif /* INET6 */
 
 #include <netinet/ip_encap.h>
 #include <net/ethernet.h>
 #include <net/if_bridgevar.h>
 #include <net/if_gif.h>
 
 #include <security/mac/mac_framework.h>
 
 static const char gifname[] = "gif";
 
 /*
  * gif_mtx protects a per-vnet gif_softc_list.
  */
 static VNET_DEFINE(struct mtx, gif_mtx);
 #define	V_gif_mtx		VNET(gif_mtx)
 static MALLOC_DEFINE(M_GIF, "gif", "Generic Tunnel Interface");
 static VNET_DEFINE(LIST_HEAD(, gif_softc), gif_softc_list);
 #define	V_gif_softc_list	VNET(gif_softc_list)
 static struct sx gif_ioctl_sx;
 SX_SYSINIT(gif_ioctl_sx, &gif_ioctl_sx, "gif_ioctl");
 
 #define	GIF_LIST_LOCK_INIT(x)		mtx_init(&V_gif_mtx, "gif_mtx", \
 					    NULL, MTX_DEF)
 #define	GIF_LIST_LOCK_DESTROY(x)	mtx_destroy(&V_gif_mtx)
 #define	GIF_LIST_LOCK(x)		mtx_lock(&V_gif_mtx)
 #define	GIF_LIST_UNLOCK(x)		mtx_unlock(&V_gif_mtx)
 
 void	(*ng_gif_input_p)(struct ifnet *ifp, struct mbuf **mp, int af);
 void	(*ng_gif_input_orphan_p)(struct ifnet *ifp, struct mbuf *m, int af);
 void	(*ng_gif_attach_p)(struct ifnet *ifp);
 void	(*ng_gif_detach_p)(struct ifnet *ifp);
 
 static int	gif_set_tunnel(struct ifnet *, struct sockaddr *,
     struct sockaddr *);
 static void	gif_delete_tunnel(struct ifnet *);
 static int	gif_ioctl(struct ifnet *, u_long, caddr_t);
 static int	gif_transmit(struct ifnet *, struct mbuf *);
 static void	gif_qflush(struct ifnet *);
 static int	gif_clone_create(struct if_clone *, int, caddr_t);
 static void	gif_clone_destroy(struct ifnet *);
 static VNET_DEFINE(struct if_clone *, gif_cloner);
 #define	V_gif_cloner	VNET(gif_cloner)
 
 static int gifmodevent(module_t, int, void *);
 
 SYSCTL_DECL(_net_link);
 static SYSCTL_NODE(_net_link, IFT_GIF, gif, CTLFLAG_RW, 0,
     "Generic Tunnel Interface");
 #ifndef MAX_GIF_NEST
 /*
  * This macro controls the default upper limitation on nesting of gif tunnels.
  * Since, setting a large value to this macro with a careless configuration
  * may introduce system crash, we don't allow any nestings by default.
  * If you need to configure nested gif tunnels, you can define this macro
  * in your kernel configuration file.  However, if you do so, please be
  * careful to configure the tunnels so that it won't make a loop.
  */
 #define MAX_GIF_NEST 1
 #endif
 static VNET_DEFINE(int, max_gif_nesting) = MAX_GIF_NEST;
 #define	V_max_gif_nesting	VNET(max_gif_nesting)
-SYSCTL_VNET_INT(_net_link_gif, OID_AUTO, max_nesting, CTLFLAG_RW,
+SYSCTL_INT(_net_link_gif, OID_AUTO, max_nesting, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(max_gif_nesting), 0, "Max nested tunnels");
 
 /*
  * By default, we disallow creation of multiple tunnels between the same
  * pair of addresses.  Some applications require this functionality so
  * we allow control over this check here.
  */
 #ifdef XBONEHACK
 static VNET_DEFINE(int, parallel_tunnels) = 1;
 #else
 static VNET_DEFINE(int, parallel_tunnels) = 0;
 #endif
 #define	V_parallel_tunnels	VNET(parallel_tunnels)
-SYSCTL_VNET_INT(_net_link_gif, OID_AUTO, parallel_tunnels, CTLFLAG_RW,
-    &VNET_NAME(parallel_tunnels), 0, "Allow parallel tunnels?");
+SYSCTL_INT(_net_link_gif, OID_AUTO, parallel_tunnels,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(parallel_tunnels), 0,
+    "Allow parallel tunnels?");
 
 /* copy from src/sys/net/if_ethersubr.c */
 static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] =
 			{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 #ifndef ETHER_IS_BROADCAST
 #define ETHER_IS_BROADCAST(addr) \
 	(bcmp(etherbroadcastaddr, (addr), ETHER_ADDR_LEN) == 0)
 #endif
 
 static int
 gif_clone_create(struct if_clone *ifc, int unit, caddr_t params)
 {
 	struct gif_softc *sc;
 
 	sc = malloc(sizeof(struct gif_softc), M_GIF, M_WAITOK | M_ZERO);
 	sc->gif_fibnum = curthread->td_proc->p_fibnum;
 	GIF2IFP(sc) = if_alloc(IFT_GIF);
 	GIF_LOCK_INIT(sc);
 	GIF2IFP(sc)->if_softc = sc;
 	if_initname(GIF2IFP(sc), gifname, unit);
 
 	GIF2IFP(sc)->if_addrlen = 0;
 	GIF2IFP(sc)->if_mtu    = GIF_MTU;
 	GIF2IFP(sc)->if_flags  = IFF_POINTOPOINT | IFF_MULTICAST;
 #if 0
 	/* turn off ingress filter */
 	GIF2IFP(sc)->if_flags  |= IFF_LINK2;
 #endif
 	GIF2IFP(sc)->if_ioctl  = gif_ioctl;
 	GIF2IFP(sc)->if_transmit  = gif_transmit;
 	GIF2IFP(sc)->if_qflush  = gif_qflush;
 	GIF2IFP(sc)->if_output = gif_output;
 	if_attach(GIF2IFP(sc));
 	bpfattach(GIF2IFP(sc), DLT_NULL, sizeof(u_int32_t));
 	if (ng_gif_attach_p != NULL)
 		(*ng_gif_attach_p)(GIF2IFP(sc));
 
 	GIF_LIST_LOCK();
 	LIST_INSERT_HEAD(&V_gif_softc_list, sc, gif_list);
 	GIF_LIST_UNLOCK();
 	return (0);
 }
 
 static void
 gif_clone_destroy(struct ifnet *ifp)
 {
 	struct gif_softc *sc;
 
 	sx_xlock(&gif_ioctl_sx);
 	sc = ifp->if_softc;
 	gif_delete_tunnel(ifp);
 	GIF_LIST_LOCK();
 	LIST_REMOVE(sc, gif_list);
 	GIF_LIST_UNLOCK();
 	if (ng_gif_detach_p != NULL)
 		(*ng_gif_detach_p)(ifp);
 	bpfdetach(ifp);
 	if_detach(ifp);
 	ifp->if_softc = NULL;
 	sx_xunlock(&gif_ioctl_sx);
 
 	if_free(ifp);
 	GIF_LOCK_DESTROY(sc);
 	free(sc, M_GIF);
 }
 
 static void
 vnet_gif_init(const void *unused __unused)
 {
 
 	LIST_INIT(&V_gif_softc_list);
 	GIF_LIST_LOCK_INIT();
 	V_gif_cloner = if_clone_simple(gifname, gif_clone_create,
 	    gif_clone_destroy, 0);
 }
 VNET_SYSINIT(vnet_gif_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     vnet_gif_init, NULL);
 
 static void
 vnet_gif_uninit(const void *unused __unused)
 {
 
 	if_clone_detach(V_gif_cloner);
 	GIF_LIST_LOCK_DESTROY();
 }
 VNET_SYSUNINIT(vnet_gif_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
     vnet_gif_uninit, NULL);
 
 static int
 gifmodevent(module_t mod, int type, void *data)
 {
 
 	switch (type) {
 	case MOD_LOAD:
 	case MOD_UNLOAD:
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 	return (0);
 }
 
 static moduledata_t gif_mod = {
 	"if_gif",
 	gifmodevent,
 	0
 };
 
 DECLARE_MODULE(if_gif, gif_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_gif, 1);
 
 int
 gif_encapcheck(const struct mbuf *m, int off, int proto, void *arg)
 {
 	GIF_RLOCK_TRACKER;
 	struct gif_softc *sc;
 	int ret;
 	uint8_t ver;
 
 	sc = (struct gif_softc *)arg;
 	if (sc == NULL || (GIF2IFP(sc)->if_flags & IFF_UP) == 0)
 		return (0);
 
 	ret = 0;
 	GIF_RLOCK(sc);
 
 	/* no physical address */
 	if (sc->gif_family == 0)
 		goto done;
 
 	switch (proto) {
 #ifdef INET
 	case IPPROTO_IPV4:
 #endif
 #ifdef INET6
 	case IPPROTO_IPV6:
 #endif
 	case IPPROTO_ETHERIP:
 		break;
 	default:
 		goto done;
 	}
 
 	/* Bail on short packets */
 	if (m->m_pkthdr.len < sizeof(struct ip))
 		goto done;
 
 	m_copydata(m, 0, 1, &ver);
 	switch (ver >> 4) {
 #ifdef INET
 	case 4:
 		if (sc->gif_family != AF_INET)
 			goto done;
 		ret = in_gif_encapcheck(m, off, proto, arg);
 		break;
 #endif
 #ifdef INET6
 	case 6:
 		if (m->m_pkthdr.len < sizeof(struct ip6_hdr))
 			goto done;
 		if (sc->gif_family != AF_INET6)
 			goto done;
 		ret = in6_gif_encapcheck(m, off, proto, arg);
 		break;
 #endif
 	}
 done:
 	GIF_RUNLOCK(sc);
 	return (ret);
 }
 
 static int
 gif_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct gif_softc *sc;
 	struct etherip_header *eth;
 #ifdef INET
 	struct ip *ip;
 #endif
 #ifdef INET6
 	struct ip6_hdr *ip6;
 	uint32_t t;
 #endif
 	uint32_t af;
 	uint8_t proto, ecn;
 	int error;
 
 	error = ENETDOWN;
 	sc = ifp->if_softc;
 	if (sc->gif_family == 0) {
 		m_freem(m);
 		goto err;
 	}
 	/* Now pull back the af that we stashed in the csum_data. */
 	af = m->m_pkthdr.csum_data;
 	BPF_MTAP2(ifp, &af, sizeof(af), m);
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 	if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len);
 	M_SETFIB(m, sc->gif_fibnum);
 	/* inner AF-specific encapsulation */
 	ecn = 0;
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		proto = IPPROTO_IPV4;
 		if (m->m_len < sizeof(struct ip))
 			m = m_pullup(m, sizeof(struct ip));
 		if (m == NULL) {
 			error = ENOBUFS;
 			goto err;
 		}
 		ip = mtod(m, struct ip *);
 		ip_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED:
 		    ECN_NOCARE, &ecn, &ip->ip_tos);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		proto = IPPROTO_IPV6;
 		if (m->m_len < sizeof(struct ip6_hdr))
 			m = m_pullup(m, sizeof(struct ip6_hdr));
 		if (m == NULL) {
 			error = ENOBUFS;
 			goto err;
 		}
 		t = 0;
 		ip6 = mtod(m, struct ip6_hdr *);
 		ip6_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED:
 		    ECN_NOCARE, &t, &ip6->ip6_flow);
 		ecn = (ntohl(t) >> 20) & 0xff;
 		break;
 #endif
 	case AF_LINK:
 		proto = IPPROTO_ETHERIP;
 		M_PREPEND(m, sizeof(struct etherip_header), M_NOWAIT);
 		if (m == NULL) {
 			error = ENOBUFS;
 			goto err;
 		}
 		eth = mtod(m, struct etherip_header *);
 		eth->eip_resvh = 0;
 		if ((sc->gif_options & GIF_SEND_REVETHIP) != 0) {
 			eth->eip_ver = 0;
 			eth->eip_resvl = ETHERIP_VERSION;
 		} else {
 			eth->eip_ver = ETHERIP_VERSION;
 			eth->eip_resvl = 0;
 		}
 		break;
 	default:
 		error = EAFNOSUPPORT;
 		m_freem(m);
 		goto err;
 	}
 	/* XXX should we check if our outer source is legal? */
 	/* dispatch to output logic based on outer AF */
 	switch (sc->gif_family) {
 #ifdef INET
 	case AF_INET:
 		error = in_gif_output(ifp, m, proto, ecn);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		error = in6_gif_output(ifp, m, proto, ecn);
 		break;
 #endif
 	default:
 		m_freem(m);
 	}
 err:
 	if (error)
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 	return (error);
 }
 
 static void
 gif_qflush(struct ifnet *ifp __unused)
 {
 
 }
 
 int
 gif_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
 	struct route *ro)
 {
 	struct m_tag *mtag;
 	uint32_t af;
 	int gif_called;
 	int error = 0;
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m);
 	if (error)
 		goto err;
 #endif
 	if ((ifp->if_flags & IFF_MONITOR) != 0 ||
 	    (ifp->if_flags & IFF_UP) == 0) {
 		error = ENETDOWN;
 		goto err;
 	}
 
 	/*
 	 * gif may cause infinite recursion calls when misconfigured.
 	 * We'll prevent this by detecting loops.
 	 *
 	 * High nesting level may cause stack exhaustion.
 	 * We'll prevent this by introducing upper limit.
 	 */
 	gif_called = 1;
 	mtag = m_tag_locate(m, MTAG_GIF, MTAG_GIF_CALLED, NULL);
 	while (mtag != NULL) {
 		if (*(struct ifnet **)(mtag + 1) == ifp) {
 			log(LOG_NOTICE,
 			    "gif_output: loop detected on %s\n",
 			    (*(struct ifnet **)(mtag + 1))->if_xname);
 			error = EIO;	/* is there better errno? */
 			goto err;
 		}
 		mtag = m_tag_locate(m, MTAG_GIF, MTAG_GIF_CALLED, mtag);
 		gif_called++;
 	}
 	if (gif_called > V_max_gif_nesting) {
 		log(LOG_NOTICE,
 		    "gif_output: recursively called too many times(%d)\n",
 		    gif_called);
 		error = EIO;	/* is there better errno? */
 		goto err;
 	}
 	mtag = m_tag_alloc(MTAG_GIF, MTAG_GIF_CALLED, sizeof(struct ifnet *),
 	    M_NOWAIT);
 	if (mtag == NULL) {
 		error = ENOMEM;
 		goto err;
 	}
 	*(struct ifnet **)(mtag + 1) = ifp;
 	m_tag_prepend(m, mtag);
 
 	m->m_flags &= ~(M_BCAST|M_MCAST);
 	if (dst->sa_family == AF_UNSPEC)
 		bcopy(dst->sa_data, &af, sizeof(af));
 	else
 		af = dst->sa_family;
 	if (ifp->if_bridge)
 		af = AF_LINK;
 	/*
 	 * Now save the af in the inbound pkt csum data, this is a cheat since
 	 * we are using the inbound csum_data field to carry the af over to
 	 * the gif_transmit() routine, avoiding using yet another mtag.
 	 */
 	m->m_pkthdr.csum_data = af;
 	return (ifp->if_transmit(ifp, m));
 err:
 	if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 	m_freem(m);
 	return (error);
 }
 
 void
 gif_input(struct mbuf *m, struct ifnet *ifp, int proto, uint8_t ecn)
 {
 	struct etherip_header *eip;
 #ifdef INET
 	struct ip *ip;
 #endif
 #ifdef INET6
 	struct ip6_hdr *ip6;
 	uint32_t t;
 #endif
 	struct gif_softc *sc;
 	struct ether_header *eh;
 	struct ifnet *oldifp;
 	uint32_t gif_options;
 	int isr, n, af;
 
 	if (ifp == NULL) {
 		/* just in case */
 		m_freem(m);
 		return;
 	}
 	sc = ifp->if_softc;
 	gif_options = sc->gif_options;
 	m->m_pkthdr.rcvif = ifp;
 	m_clrprotoflags(m);
 	switch (proto) {
 #ifdef INET
 	case IPPROTO_IPV4:
 		af = AF_INET;
 		if (m->m_len < sizeof(struct ip))
 			m = m_pullup(m, sizeof(struct ip));
 		if (m == NULL)
 			goto drop;
 		ip = mtod(m, struct ip *);
 		if (ip_ecn_egress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED:
 		    ECN_NOCARE, &ecn, &ip->ip_tos) == 0) {
 			m_freem(m);
 			goto drop;
 		}
 		break;
 #endif
 #ifdef INET6
 	case IPPROTO_IPV6:
 		af = AF_INET6;
 		if (m->m_len < sizeof(struct ip6_hdr))
 			m = m_pullup(m, sizeof(struct ip6_hdr));
 		if (m == NULL)
 			goto drop;
 		t = htonl((uint32_t)ecn << 20);
 		ip6 = mtod(m, struct ip6_hdr *);
 		if (ip6_ecn_egress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED:
 		    ECN_NOCARE, &t, &ip6->ip6_flow) == 0) {
 			m_freem(m);
 			goto drop;
 		}
 		break;
 #endif
 	case IPPROTO_ETHERIP:
 		af = AF_LINK;
 		break;
 	default:
 		m_freem(m);
 		goto drop;
 	}
 
 #ifdef MAC
 	mac_ifnet_create_mbuf(ifp, m);
 #endif
 
 	if (bpf_peers_present(ifp->if_bpf)) {
 		uint32_t af1 = af;
 		bpf_mtap2(ifp->if_bpf, &af1, sizeof(af1), m);
 	}
 
 	if ((ifp->if_flags & IFF_MONITOR) != 0) {
 		if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 		if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 		m_freem(m);
 		return;
 	}
 
 	if (ng_gif_input_p != NULL) {
 		(*ng_gif_input_p)(ifp, &m, af);
 		if (m == NULL)
 			goto drop;
 	}
 
 	/*
 	 * Put the packet to the network layer input queue according to the
 	 * specified address family.
 	 * Note: older versions of gif_input directly called network layer
 	 * input functions, e.g. ip6_input, here.  We changed the policy to
 	 * prevent too many recursive calls of such input functions, which
 	 * might cause kernel panic.  But the change may introduce another
 	 * problem; if the input queue is full, packets are discarded.
 	 * The kernel stack overflow really happened, and we believed
 	 * queue-full rarely occurs, so we changed the policy.
 	 */
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		isr = NETISR_IP;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		isr = NETISR_IPV6;
 		break;
 #endif
 	case AF_LINK:
 		n = sizeof(struct etherip_header) + sizeof(struct ether_header);
 		if (n > m->m_len)
 			m = m_pullup(m, n);
 		if (m == NULL)
 			goto drop;
 		eip = mtod(m, struct etherip_header *);
 		/*
 		 * GIF_ACCEPT_REVETHIP (enabled by default) intentionally
 		 * accepts an EtherIP packet with revered version field in
 		 * the header.  This is a knob for backward compatibility
 		 * with FreeBSD 7.2R or prior.
 		 */
 		if (eip->eip_ver != ETHERIP_VERSION) {
 			if ((gif_options & GIF_ACCEPT_REVETHIP) == 0 ||
 			    eip->eip_resvl != ETHERIP_VERSION) {
 				/* discard unknown versions */
 				m_freem(m);
 				goto drop;
 			}
 		}
 		m_adj(m, sizeof(struct etherip_header));
 
 		m->m_flags &= ~(M_BCAST|M_MCAST);
 		m->m_pkthdr.rcvif = ifp;
 
 		if (ifp->if_bridge) {
 			oldifp = ifp;
 			eh = mtod(m, struct ether_header *);
 			if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
 				if (ETHER_IS_BROADCAST(eh->ether_dhost))
 					m->m_flags |= M_BCAST;
 				else
 					m->m_flags |= M_MCAST;
 				if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
 			}
 			BRIDGE_INPUT(ifp, m);
 
 			if (m != NULL && ifp != oldifp) {
 				/*
 				 * The bridge gave us back itself or one of the
 				 * members for which the frame is addressed.
 				 */
 				ether_demux(ifp, m);
 				return;
 			}
 		}
 		if (m != NULL)
 			m_freem(m);
 		return;
 
 	default:
 		if (ng_gif_input_orphan_p != NULL)
 			(*ng_gif_input_orphan_p)(ifp, m, af);
 		else
 			m_freem(m);
 		return;
 	}
 
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 	M_SETFIB(m, ifp->if_fib);
 	netisr_dispatch(isr, m);
 	return;
 drop:
 	if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 }
 
 /* XXX how should we handle IPv6 scope on SIOC[GS]IFPHYADDR? */
 int
 gif_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	GIF_RLOCK_TRACKER;
 	struct ifreq *ifr = (struct ifreq*)data;
 	struct sockaddr *dst, *src;
 	struct gif_softc *sc;
 #ifdef INET
 	struct sockaddr_in *sin = NULL;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *sin6 = NULL;
 #endif
 	u_int options;
 	int error;
 
 	switch (cmd) {
 	case SIOCSIFADDR:
 		ifp->if_flags |= IFF_UP;
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 	case SIOCGIFMTU:
 	case SIOCSIFFLAGS:
 		return (0);
 	case SIOCSIFMTU:
 		if (ifr->ifr_mtu < GIF_MTU_MIN ||
 		    ifr->ifr_mtu > GIF_MTU_MAX)
 			return (EINVAL);
 		else
 			ifp->if_mtu = ifr->ifr_mtu;
 		return (0);
 	}
 	sx_xlock(&gif_ioctl_sx);
 	sc = ifp->if_softc;
 	if (sc == NULL) {
 		error = ENXIO;
 		goto bad;
 	}
 	error = 0;
 	switch (cmd) {
 	case SIOCSIFPHYADDR:
 #ifdef INET6
 	case SIOCSIFPHYADDR_IN6:
 #endif
 		error = EINVAL;
 		switch (cmd) {
 #ifdef INET
 		case SIOCSIFPHYADDR:
 			src = (struct sockaddr *)
 				&(((struct in_aliasreq *)data)->ifra_addr);
 			dst = (struct sockaddr *)
 				&(((struct in_aliasreq *)data)->ifra_dstaddr);
 			break;
 #endif
 #ifdef INET6
 		case SIOCSIFPHYADDR_IN6:
 			src = (struct sockaddr *)
 				&(((struct in6_aliasreq *)data)->ifra_addr);
 			dst = (struct sockaddr *)
 				&(((struct in6_aliasreq *)data)->ifra_dstaddr);
 			break;
 #endif
 		default:
 			goto bad;
 		}
 		/* sa_family must be equal */
 		if (src->sa_family != dst->sa_family ||
 		    src->sa_len != dst->sa_len)
 			goto bad;
 
 		/* validate sa_len */
 		switch (src->sa_family) {
 #ifdef INET
 		case AF_INET:
 			if (src->sa_len != sizeof(struct sockaddr_in))
 				goto bad;
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			if (src->sa_len != sizeof(struct sockaddr_in6))
 				goto bad;
 			break;
 #endif
 		default:
 			error = EAFNOSUPPORT;
 			goto bad;
 		}
 		/* check sa_family looks sane for the cmd */
 		error = EAFNOSUPPORT;
 		switch (cmd) {
 #ifdef INET
 		case SIOCSIFPHYADDR:
 			if (src->sa_family == AF_INET)
 				break;
 			goto bad;
 #endif
 #ifdef INET6
 		case SIOCSIFPHYADDR_IN6:
 			if (src->sa_family == AF_INET6)
 				break;
 			goto bad;
 #endif
 		}
 		error = EADDRNOTAVAIL;
 		switch (src->sa_family) {
 #ifdef INET
 		case AF_INET:
 			if (satosin(src)->sin_addr.s_addr == INADDR_ANY ||
 			    satosin(dst)->sin_addr.s_addr == INADDR_ANY)
 				goto bad;
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(src)->sin6_addr)
 			    ||
 			    IN6_IS_ADDR_UNSPECIFIED(&satosin6(dst)->sin6_addr))
 				goto bad;
 			/*
 			 * Check validity of the scope zone ID of the
 			 * addresses, and convert it into the kernel
 			 * internal form if necessary.
 			 */
 			error = sa6_embedscope(satosin6(src), 0);
 			if (error != 0)
 				goto bad;
 			error = sa6_embedscope(satosin6(dst), 0);
 			if (error != 0)
 				goto bad;
 #endif
 		};
 		error = gif_set_tunnel(ifp, src, dst);
 		break;
 	case SIOCDIFPHYADDR:
 		gif_delete_tunnel(ifp);
 		break;
 	case SIOCGIFPSRCADDR:
 	case SIOCGIFPDSTADDR:
 #ifdef INET6
 	case SIOCGIFPSRCADDR_IN6:
 	case SIOCGIFPDSTADDR_IN6:
 #endif
 		if (sc->gif_family == 0) {
 			error = EADDRNOTAVAIL;
 			break;
 		}
 		GIF_RLOCK(sc);
 		switch (cmd) {
 #ifdef INET
 		case SIOCGIFPSRCADDR:
 		case SIOCGIFPDSTADDR:
 			if (sc->gif_family != AF_INET) {
 				error = EADDRNOTAVAIL;
 				break;
 			}
 			sin = (struct sockaddr_in *)&ifr->ifr_addr;
 			memset(sin, 0, sizeof(*sin));
 			sin->sin_family = AF_INET;
 			sin->sin_len = sizeof(*sin);
 			break;
 #endif
 #ifdef INET6
 		case SIOCGIFPSRCADDR_IN6:
 		case SIOCGIFPDSTADDR_IN6:
 			if (sc->gif_family != AF_INET6) {
 				error = EADDRNOTAVAIL;
 				break;
 			}
 			sin6 = (struct sockaddr_in6 *)
 				&(((struct in6_ifreq *)data)->ifr_addr);
 			memset(sin6, 0, sizeof(*sin6));
 			sin6->sin6_family = AF_INET6;
 			sin6->sin6_len = sizeof(*sin6);
 			break;
 #endif
 		default:
 			error = EAFNOSUPPORT;
 		}
 		if (error == 0) {
 			switch (cmd) {
 #ifdef INET
 			case SIOCGIFPSRCADDR:
 				sin->sin_addr = sc->gif_iphdr->ip_src;
 				break;
 			case SIOCGIFPDSTADDR:
 				sin->sin_addr = sc->gif_iphdr->ip_dst;
 				break;
 #endif
 #ifdef INET6
 			case SIOCGIFPSRCADDR_IN6:
 				sin6->sin6_addr = sc->gif_ip6hdr->ip6_src;
 				break;
 			case SIOCGIFPDSTADDR_IN6:
 				sin6->sin6_addr = sc->gif_ip6hdr->ip6_dst;
 				break;
 #endif
 			}
 		}
 		GIF_RUNLOCK(sc);
 		if (error != 0)
 			break;
 		switch (cmd) {
 #ifdef INET
 		case SIOCGIFPSRCADDR:
 		case SIOCGIFPDSTADDR:
 			error = prison_if(curthread->td_ucred,
 			    (struct sockaddr *)sin);
 			if (error != 0)
 				memset(sin, 0, sizeof(*sin));
 			break;
 #endif
 #ifdef INET6
 		case SIOCGIFPSRCADDR_IN6:
 		case SIOCGIFPDSTADDR_IN6:
 			error = prison_if(curthread->td_ucred,
 			    (struct sockaddr *)sin6);
 			if (error == 0)
 				error = sa6_recoverscope(sin6);
 			if (error != 0)
 				memset(sin6, 0, sizeof(*sin6));
 #endif
 		}
 		break;
 	case GIFGOPTS:
 		options = sc->gif_options;
 		error = copyout(&options, ifr->ifr_data, sizeof(options));
 		break;
 	case GIFSOPTS:
 		if ((error = priv_check(curthread, PRIV_NET_GIF)) != 0)
 			break;
 		error = copyin(ifr->ifr_data, &options, sizeof(options));
 		if (error)
 			break;
 		if (options & ~GIF_OPTMASK)
 			error = EINVAL;
 		else
 			sc->gif_options = options;
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 bad:
 	sx_xunlock(&gif_ioctl_sx);
 	return (error);
 }
 
 static void
 gif_detach(struct gif_softc *sc)
 {
 
 	sx_assert(&gif_ioctl_sx, SA_XLOCKED);
 	if (sc->gif_ecookie != NULL)
 		encap_detach(sc->gif_ecookie);
 	sc->gif_ecookie = NULL;
 }
 
 static int
 gif_attach(struct gif_softc *sc, int af)
 {
 
 	sx_assert(&gif_ioctl_sx, SA_XLOCKED);
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		return (in_gif_attach(sc));
 #endif
 #ifdef INET6
 	case AF_INET6:
 		return (in6_gif_attach(sc));
 #endif
 	}
 	return (EAFNOSUPPORT);
 }
 
 static int
 gif_set_tunnel(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst)
 {
 	struct gif_softc *sc = ifp->if_softc;
 	struct gif_softc *tsc;
 #ifdef INET
 	struct ip *ip;
 #endif
 #ifdef INET6
 	struct ip6_hdr *ip6;
 #endif
 	void *hdr;
 	int error = 0;
 
 	if (sc == NULL)
 		return (ENXIO);
 	/* Disallow parallel tunnels unless instructed otherwise. */
 	if (V_parallel_tunnels == 0) {
 		GIF_LIST_LOCK();
 		LIST_FOREACH(tsc, &V_gif_softc_list, gif_list) {
 			if (tsc == sc || tsc->gif_family != src->sa_family)
 				continue;
 #ifdef INET
 			if (tsc->gif_family == AF_INET &&
 			    tsc->gif_iphdr->ip_src.s_addr ==
 			    satosin(src)->sin_addr.s_addr &&
 			    tsc->gif_iphdr->ip_dst.s_addr ==
 			    satosin(dst)->sin_addr.s_addr) {
 				error = EADDRNOTAVAIL;
 				GIF_LIST_UNLOCK();
 				goto bad;
 			}
 #endif
 #ifdef INET6
 			if (tsc->gif_family == AF_INET6 &&
 			    IN6_ARE_ADDR_EQUAL(&tsc->gif_ip6hdr->ip6_src,
 			    &satosin6(src)->sin6_addr) &&
 			    IN6_ARE_ADDR_EQUAL(&tsc->gif_ip6hdr->ip6_dst,
 			    &satosin6(dst)->sin6_addr)) {
 				error = EADDRNOTAVAIL;
 				GIF_LIST_UNLOCK();
 				goto bad;
 			}
 #endif
 		}
 		GIF_LIST_UNLOCK();
 	}
 	switch (src->sa_family) {
 #ifdef INET
 	case AF_INET:
 		hdr = ip = malloc(sizeof(struct ip), M_GIF,
 		    M_WAITOK | M_ZERO);
 		ip->ip_src.s_addr = satosin(src)->sin_addr.s_addr;
 		ip->ip_dst.s_addr = satosin(dst)->sin_addr.s_addr;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		hdr = ip6 = malloc(sizeof(struct ip6_hdr), M_GIF,
 		    M_WAITOK | M_ZERO);
 		ip6->ip6_src = satosin6(src)->sin6_addr;
 		ip6->ip6_dst = satosin6(dst)->sin6_addr;
 		ip6->ip6_vfc = IPV6_VERSION;
 		break;
 #endif
 	default:
 		return (EAFNOSUPPORT);
 	};
 
 	if (sc->gif_family != src->sa_family)
 		gif_detach(sc);
 	if (sc->gif_family == 0 ||
 	    sc->gif_family != src->sa_family)
 		error = gif_attach(sc, src->sa_family);
 
 	GIF_WLOCK(sc);
 	if (sc->gif_family != 0)
 		free(sc->gif_hdr, M_GIF);
 	sc->gif_family = src->sa_family;
 	sc->gif_hdr = hdr;
 	GIF_WUNLOCK(sc);
 #if defined(INET) || defined(INET6)
 bad:
 #endif
 	if (error == 0 && sc->gif_family != 0)
 		ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	else
 		ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 	return (error);
 }
 
 static void
 gif_delete_tunnel(struct ifnet *ifp)
 {
 	struct gif_softc *sc = ifp->if_softc;
 	int family;
 
 	if (sc == NULL)
 		return;
 
 	GIF_WLOCK(sc);
 	family = sc->gif_family;
 	sc->gif_family = 0;
 	GIF_WUNLOCK(sc);
 	if (family != 0) {
 		gif_detach(sc);
 		free(sc->gif_hdr, M_GIF);
 	}
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 }
Index: user/dchagin/lemul/sys/net/if_iso88025subr.c
===================================================================
--- user/dchagin/lemul/sys/net/if_iso88025subr.c	(revision 274240)
+++ user/dchagin/lemul/sys/net/if_iso88025subr.c	(revision 274241)
@@ -1,744 +1,702 @@
 /*-
  * Copyright (c) 1998, Larry Lile
  * All rights reserved.
  *
  * For latest sources and information on this driver, please
  * go to http://anarchy.stdio.com.
  *
  * Questions, comments or suggestions should be directed to
  * Larry Lile <lile@stdio.com>.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  *
  */
 
 /*
  *
  * General ISO 802.5 (Token Ring) support routines
  * 
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/socket.h>
 #include <sys/sockio.h> 
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_arp.h>
 #include <net/if_dl.h>
 #include <net/if_llc.h>
 #include <net/if_types.h>
 #include <net/if_llatbl.h>
 
 #include <net/ethernet.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/bpf.h>
 #include <net/iso88025.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/if_ether.h>
 #endif
 #ifdef INET6
 #include <netinet6/nd6.h>
 #endif
 
 #include <security/mac/mac_framework.h>
 
 static const u_char iso88025_broadcastaddr[ISO88025_ADDR_LEN] =
 			{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 
 static int iso88025_resolvemulti (struct ifnet *, struct sockaddr **,
 				  struct sockaddr *);
 
 #define	senderr(e)	do { error = (e); goto bad; } while (0)
 
 /*
  * Perform common duties while attaching to interface list
  */
 void
 iso88025_ifattach(struct ifnet *ifp, const u_int8_t *lla, int bpf)
 {
     struct ifaddr *ifa;
     struct sockaddr_dl *sdl;
 
     ifa = NULL;
 
     ifp->if_type = IFT_ISO88025;
     ifp->if_addrlen = ISO88025_ADDR_LEN;
     ifp->if_hdrlen = ISO88025_HDR_LEN;
 
     if_attach(ifp);	/* Must be called before additional assignments */
 
     ifp->if_output = iso88025_output;
     ifp->if_input = iso88025_input;
     ifp->if_resolvemulti = iso88025_resolvemulti;
     ifp->if_broadcastaddr = iso88025_broadcastaddr;
 
     if (ifp->if_baudrate == 0)
         ifp->if_baudrate = TR_16MBPS; /* 16Mbit should be a safe default */
     if (ifp->if_mtu == 0)
         ifp->if_mtu = ISO88025_DEFAULT_MTU;
 
     ifa = ifp->if_addr;
     KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__));
 
     sdl = (struct sockaddr_dl *)ifa->ifa_addr;
     sdl->sdl_type = IFT_ISO88025;
     sdl->sdl_alen = ifp->if_addrlen;
     bcopy(lla, LLADDR(sdl), ifp->if_addrlen);
 
     if (bpf)
         bpfattach(ifp, DLT_IEEE802, ISO88025_HDR_LEN);
 
     return;
 }
 
 /*
  * Perform common duties while detaching a Token Ring interface
  */
 void
 iso88025_ifdetach(ifp, bpf)
         struct ifnet *ifp;
         int bpf;
 {
 
 	if (bpf)
                 bpfdetach(ifp);
 
 	if_detach(ifp);
 
 	return;
 }
 
 int
 iso88025_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
 {
         struct ifaddr *ifa;
         struct ifreq *ifr;
         int error;
 
 	ifa = (struct ifaddr *) data;
 	ifr = (struct ifreq *) data;
 	error = 0;
 
         switch (command) {
         case SIOCSIFADDR:
                 ifp->if_flags |= IFF_UP;
 
                 switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
                 case AF_INET:
                         ifp->if_init(ifp->if_softc);    /* before arpwhohas */
                         arp_ifinit(ifp, ifa);
                         break;
 #endif	/* INET */
                 default:
                         ifp->if_init(ifp->if_softc);
                         break;
                 }
                 break;
 
         case SIOCGIFADDR: {
                         struct sockaddr *sa;
 
                         sa = (struct sockaddr *) & ifr->ifr_data;
                         bcopy(IF_LLADDR(ifp),
                               (caddr_t) sa->sa_data, ISO88025_ADDR_LEN);
                 }
                 break;
 
         case SIOCSIFMTU:
                 /*
                  * Set the interface MTU.
                  */
                 if (ifr->ifr_mtu > ISO88025_MAX_MTU) {
                         error = EINVAL;
                 } else {
                         ifp->if_mtu = ifr->ifr_mtu;
                 }
                 break;
 	default:
 		error = EINVAL;			/* XXX netbsd has ENOTTY??? */
 		break;
         }
 
         return (error);
 }
 
 /*
  * ISO88025 encapsulation
  */
 int
 iso88025_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
 	struct route *ro)
 {
 	u_int16_t snap_type = 0;
 	int loop_copy = 0, error = 0, rif_len = 0;
 	u_char edst[ISO88025_ADDR_LEN];
 	struct iso88025_header *th;
 	struct iso88025_header gen_th;
 	struct sockaddr_dl *sdl = NULL;
 	struct rtentry *rt0 = NULL;
 #if defined(INET) || defined(INET6)
 	struct llentry *lle;
 #endif
 
 	if (ro != NULL)
 		rt0 = ro->ro_rt;
 
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m);
 	if (error)
 		senderr(error);
 #endif
 
 	if (ifp->if_flags & IFF_MONITOR)
 		senderr(ENETDOWN);
 	if (!((ifp->if_flags & IFF_UP) &&
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING)))
 		senderr(ENETDOWN);
 	getmicrotime(&ifp->if_lastchange);
 
 	/* Calculate routing info length based on arp table entry */
 	/* XXX any better way to do this ? */
 
 	if (rt0 && (sdl = (struct sockaddr_dl *)rt0->rt_gateway))
 		if (SDL_ISO88025(sdl)->trld_rcf != 0)
 			rif_len = TR_RCF_RIFLEN(SDL_ISO88025(sdl)->trld_rcf);
 
 	/* Generate a generic 802.5 header for the packet */
 	gen_th.ac = TR_AC;
 	gen_th.fc = TR_LLC_FRAME;
 	(void)memcpy((caddr_t)gen_th.iso88025_shost, IF_LLADDR(ifp),
 		     ISO88025_ADDR_LEN);
 	if (rif_len) {
 		gen_th.iso88025_shost[0] |= TR_RII;
 		if (rif_len > 2) {
 			gen_th.rcf = SDL_ISO88025(sdl)->trld_rcf;
 			(void)memcpy((caddr_t)gen_th.rd,
 				(caddr_t)SDL_ISO88025(sdl)->trld_route,
 				rif_len - 2);
 		}
 	}
 	
 	switch (dst->sa_family) {
 #ifdef INET
 	case AF_INET:
 		error = arpresolve(ifp, rt0, m, dst, edst, &lle);
 		if (error)
 			return (error == EWOULDBLOCK ? 0 : error);
 		snap_type = ETHERTYPE_IP;
 		break;
 	case AF_ARP:
 	{
 		struct arphdr *ah;
 		ah = mtod(m, struct arphdr *);
 		ah->ar_hrd = htons(ARPHRD_IEEE802);
 
 		loop_copy = -1; /* if this is for us, don't do it */
 
 		switch(ntohs(ah->ar_op)) {
 		case ARPOP_REVREQUEST:
 		case ARPOP_REVREPLY:
 			snap_type = ETHERTYPE_REVARP;
 			break;
 		case ARPOP_REQUEST:
 		case ARPOP_REPLY:
 		default:
 			snap_type = ETHERTYPE_ARP;
 			break;
 		}
 
 		if (m->m_flags & M_BCAST)
 			bcopy(ifp->if_broadcastaddr, edst, ISO88025_ADDR_LEN);
 		else
 			bcopy(ar_tha(ah), edst, ISO88025_ADDR_LEN);
 
 	}
 	break;
 #endif	/* INET */
 #ifdef INET6
 	case AF_INET6:
 		error = nd6_storelladdr(ifp, m, dst, (u_char *)edst, &lle);
 		if (error)
 			return (error);
 		snap_type = ETHERTYPE_IPV6;
 		break;
 #endif	/* INET6 */
 	case AF_UNSPEC:
 	{
 		const struct iso88025_sockaddr_data *sd;
 		/*
 		 * For AF_UNSPEC sockaddr.sa_data must contain all of the
 		 * mac information needed to send the packet.  This allows
 		 * full mac, llc, and source routing function to be controlled.
 		 * llc and source routing information must already be in the
 		 * mbuf provided, ac/fc are set in sa_data.  sockaddr.sa_data
 		 * should be an iso88025_sockaddr_data structure see iso88025.h
 		 */
                 loop_copy = -1;
 		sd = (const struct iso88025_sockaddr_data *)dst->sa_data;
 		gen_th.ac = sd->ac;
 		gen_th.fc = sd->fc;
 		(void)memcpy(edst, sd->ether_dhost, ISO88025_ADDR_LEN);
 		(void)memcpy(gen_th.iso88025_shost, sd->ether_shost,
 		    ISO88025_ADDR_LEN);
 		rif_len = 0;
 		break;
 	}
 	default:
 		if_printf(ifp, "can't handle af%d\n", dst->sa_family);
 		senderr(EAFNOSUPPORT);
 		break;
 	}
 
 	/*
 	 * Add LLC header.
 	 */
 	if (snap_type != 0) {
         	struct llc *l;
 		M_PREPEND(m, LLC_SNAPFRAMELEN, M_NOWAIT);
 		if (m == 0)
 			senderr(ENOBUFS);
 		l = mtod(m, struct llc *);
 		l->llc_control = LLC_UI;
 		l->llc_dsap = l->llc_ssap = LLC_SNAP_LSAP;
 		l->llc_snap.org_code[0] =
 			l->llc_snap.org_code[1] =
 			l->llc_snap.org_code[2] = 0;
 		l->llc_snap.ether_type = htons(snap_type);
 	}
 
 	/*
 	 * Add local net header.  If no space in first mbuf,
 	 * allocate another.
 	 */
 	M_PREPEND(m, ISO88025_HDR_LEN + rif_len, M_NOWAIT);
 	if (m == 0)
 		senderr(ENOBUFS);
 	th = mtod(m, struct iso88025_header *);
 	bcopy((caddr_t)edst, (caddr_t)&gen_th.iso88025_dhost, ISO88025_ADDR_LEN);
 
 	/* Copy as much of the generic header as is needed into the mbuf */
 	memcpy(th, &gen_th, ISO88025_HDR_LEN + rif_len);
 
         /*
          * If a simplex interface, and the packet is being sent to our
          * Ethernet address or a broadcast address, loopback a copy.
          * XXX To make a simplex device behave exactly like a duplex
          * device, we should copy in the case of sending to our own
          * ethernet address (thus letting the original actually appear
          * on the wire). However, we don't do that here for security
          * reasons and compatibility with the original behavior.
          */     
         if ((ifp->if_flags & IFF_SIMPLEX) && (loop_copy != -1)) {
                 if ((m->m_flags & M_BCAST) || (loop_copy > 0)) { 
                         struct mbuf *n;
 			n = m_copy(m, 0, (int)M_COPYALL);
                         (void) if_simloop(ifp, n, dst->sa_family,
 					  ISO88025_HDR_LEN);
                 } else if (bcmp(th->iso88025_dhost, th->iso88025_shost,
 				 ETHER_ADDR_LEN) == 0) {
 			(void) if_simloop(ifp, m, dst->sa_family,
 					  ISO88025_HDR_LEN);
                        	return(0);      /* XXX */
 		}       
         }      
 
 	IFQ_HANDOFF_ADJ(ifp, m, ISO88025_HDR_LEN + LLC_SNAPFRAMELEN, error);
 	if (error) {
 		printf("iso88025_output: packet dropped QFULL.\n");
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 	}
 	return (error);
 
 bad:
 	if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 	if (m)
 		m_freem(m);
 	return (error);
 }
 
 /*
  * ISO 88025 de-encapsulation
  */
 void
 iso88025_input(ifp, m)
 	struct ifnet *ifp;
 	struct mbuf *m;
 {
 	struct iso88025_header *th;
 	struct llc *l;
 	int isr;
 	int mac_hdr_len;
 
 	/*
 	 * Do consistency checks to verify assumptions
 	 * made by code past this point.
 	 */
 	if ((m->m_flags & M_PKTHDR) == 0) {
 		if_printf(ifp, "discard frame w/o packet header\n");
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		m_freem(m);
 		return;
 	}
 	if (m->m_pkthdr.rcvif == NULL) {
 		if_printf(ifp, "discard frame w/o interface pointer\n");
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
  		m_freem(m);
 		return;
 	}
 
 	m = m_pullup(m, ISO88025_HDR_LEN);
 	if (m == NULL) {
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		goto dropanyway;
 	}
 	th = mtod(m, struct iso88025_header *);
 
 	/*
 	 * Discard packet if interface is not up.
 	 */
 	if (!((ifp->if_flags & IFF_UP) &&
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING)))
 		goto dropanyway;
 
 	/*
 	 * Give bpf a chance at the packet.
 	 */
 	BPF_MTAP(ifp, m);
 
 	/*
 	 * Interface marked for monitoring; discard packet.
 	 */
 	if (ifp->if_flags & IFF_MONITOR) {
 		m_freem(m);
 		return;
 	}
 
 #ifdef MAC
 	mac_ifnet_create_mbuf(ifp, m);
 #endif
 
 	/*
 	 * Update interface statistics.
 	 */
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 	getmicrotime(&ifp->if_lastchange);
 
 	/*
 	 * Discard non local unicast packets when interface
 	 * is in promiscuous mode.
 	 */
 	if ((ifp->if_flags & IFF_PROMISC) &&
 	    ((th->iso88025_dhost[0] & 1) == 0) &&
 	     (bcmp(IF_LLADDR(ifp), (caddr_t) th->iso88025_dhost,
 	     ISO88025_ADDR_LEN) != 0))
 		goto dropanyway;
 
 	/*
 	 * Set mbuf flags for bcast/mcast.
 	 */
 	if (th->iso88025_dhost[0] & 1) {
 		if (bcmp(iso88025_broadcastaddr, th->iso88025_dhost,
 		    ISO88025_ADDR_LEN) == 0)
 			m->m_flags |= M_BCAST;
 		else
 			m->m_flags |= M_MCAST;
 		if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
 	}
 
 	mac_hdr_len = ISO88025_HDR_LEN;
 	/* Check for source routing info */
 	if (th->iso88025_shost[0] & TR_RII)
 		mac_hdr_len += TR_RCF_RIFLEN(th->rcf);
 
 	/* Strip off ISO88025 header. */
 	m_adj(m, mac_hdr_len);
 
 	m = m_pullup(m, LLC_SNAPFRAMELEN);
 	if (m == 0) {
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		goto dropanyway;
 	}
 	l = mtod(m, struct llc *);
 
 	switch (l->llc_dsap) {
 	case LLC_SNAP_LSAP: {
 		u_int16_t type;
 		if ((l->llc_control != LLC_UI) ||
 		    (l->llc_ssap != LLC_SNAP_LSAP)) {
 			if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 			goto dropanyway;
 		}
 
 		if (l->llc_snap.org_code[0] != 0 ||
 		    l->llc_snap.org_code[1] != 0 ||
 		    l->llc_snap.org_code[2] != 0) {
 			if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 			goto dropanyway;
 		}
 
 		type = ntohs(l->llc_snap.ether_type);
 		m_adj(m, LLC_SNAPFRAMELEN);
 		switch (type) {
 #ifdef INET
 		case ETHERTYPE_IP:
 			th->iso88025_shost[0] &= ~(TR_RII); 
 			if ((m = ip_fastforward(m)) == NULL)
 				return;
 			isr = NETISR_IP;
 			break;
 
 		case ETHERTYPE_ARP:
 			if (ifp->if_flags & IFF_NOARP)
 				goto dropanyway;
 			isr = NETISR_ARP;
 			break;
 #endif	/* INET */
 #ifdef INET6
 		case ETHERTYPE_IPV6:
 			th->iso88025_shost[0] &= ~(TR_RII); 
 			isr = NETISR_IPV6;
 			break;
 #endif	/* INET6 */
 		default:
 			printf("iso88025_input: unexpected llc_snap ether_type  0x%02x\n", type);
 			if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 			goto dropanyway;
 		}
 		break;
 	}
 #ifdef ISO
 	case LLC_ISO_LSAP:
 		switch (l->llc_control) {
 		case LLC_UI:
 			if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 			goto dropanyway;
 			break;
                 case LLC_XID:
                 case LLC_XID_P:
 			if(m->m_len < ISO88025_ADDR_LEN)
 				goto dropanyway;
 			l->llc_window = 0;
 			l->llc_fid = 9;  
 			l->llc_class = 1;
 			l->llc_dsap = l->llc_ssap = 0;
 			/* Fall through to */  
 		case LLC_TEST:
 		case LLC_TEST_P:
 		{
 			struct sockaddr sa;
-			struct arpcom *ac;
 			struct iso88025_sockaddr_data *th2;
 			int i;
 			u_char c;
 
 			c = l->llc_dsap;
 
 			if (th->iso88025_shost[0] & TR_RII) { /* XXX */
 				printf("iso88025_input: dropping source routed LLC_TEST\n");
 				goto dropanyway;
 			}
 			l->llc_dsap = l->llc_ssap;
 			l->llc_ssap = c;
 			if (m->m_flags & (M_BCAST | M_MCAST))
 				bcopy((caddr_t)IF_LLADDR(ifp),
 				      (caddr_t)th->iso88025_dhost,
 					ISO88025_ADDR_LEN);
 			sa.sa_family = AF_UNSPEC;
 			sa.sa_len = sizeof(sa);
 			th2 = (struct iso88025_sockaddr_data *)sa.sa_data;
 			for (i = 0; i < ISO88025_ADDR_LEN; i++) {
 				th2->ether_shost[i] = c = th->iso88025_dhost[i];
 				th2->ether_dhost[i] = th->iso88025_dhost[i] =
 					th->iso88025_shost[i];
 				th->iso88025_shost[i] = c;
 			}
 			th2->ac = TR_AC;
 			th2->fc = TR_LLC_FRAME;
 			ifp->if_output(ifp, m, &sa, NULL);
 			return;
 		}
 		default:
 			printf("iso88025_input: unexpected llc control 0x%02x\n", l->llc_control);
 			if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 			goto dropanyway;
 			break;
 		}
 		break;
 #endif	/* ISO */
 	default:
 		printf("iso88025_input: unknown dsap 0x%x\n", l->llc_dsap);
 		if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1);
 		goto dropanyway;
 		break;
 	}
 
 	M_SETFIB(m, ifp->if_fib);
 	netisr_dispatch(isr, m);
 	return;
 
 dropanyway:
 	if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
 	if (m)
 		m_freem(m);
 	return;
 }
 
 static int
 iso88025_resolvemulti (ifp, llsa, sa)
 	struct ifnet *ifp;
 	struct sockaddr **llsa;
 	struct sockaddr *sa;
 {
 	struct sockaddr_dl *sdl;
 #ifdef INET
 	struct sockaddr_in *sin;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 #endif
 	u_char *e_addr;
 
 	switch(sa->sa_family) {
 	case AF_LINK:
 		/*
 		 * No mapping needed. Just check that it's a valid MC address.
 		 */
 		sdl = (struct sockaddr_dl *)sa;
 		e_addr = LLADDR(sdl);
 		if ((e_addr[0] & 1) != 1) {
 			return (EADDRNOTAVAIL);
 		}
 		*llsa = 0;
 		return (0);
 
 #ifdef INET
 	case AF_INET:
 		sin = (struct sockaddr_in *)sa;
 		if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
 			return (EADDRNOTAVAIL);
 		}
 		sdl = link_init_sdl(ifp, *llsa, IFT_ISO88025);
 		sdl->sdl_alen = ISO88025_ADDR_LEN;
 		e_addr = LLADDR(sdl);
 		ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr);
 		*llsa = (struct sockaddr *)sdl;
 		return (0);
 #endif
 #ifdef INET6
 	case AF_INET6:
 		sin6 = (struct sockaddr_in6 *)sa;
 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 			/*
 			 * An IP6 address of 0 means listen to all
 			 * of the Ethernet multicast address used for IP6.
 			 * (This is used for multicast routers.)
 			 */
 			ifp->if_flags |= IFF_ALLMULTI;
 			*llsa = 0;
 			return (0);
 		}
 		if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
 			return (EADDRNOTAVAIL);
 		}
 		sdl = link_init_sdl(ifp, *llsa, IFT_ISO88025);
 		sdl->sdl_alen = ISO88025_ADDR_LEN;
 		e_addr = LLADDR(sdl);
 		ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr);
 		*llsa = (struct sockaddr *)sdl;
 		return (0);
 #endif
 
 	default:
 		/*
 		 * Well, the text isn't quite right, but it's the name
 		 * that counts...
 		 */
 		return (EAFNOSUPPORT);
 	}
 
 	return (0);
 }
 
-static MALLOC_DEFINE(M_ISO88025, "arpcom", "802.5 interface internals");
-
-static void*
-iso88025_alloc(u_char type, struct ifnet *ifp)
-{
-	struct arpcom	*ac;
- 
-        ac = malloc(sizeof(struct arpcom), M_ISO88025, M_WAITOK | M_ZERO);
-	ac->ac_ifp = ifp;
-
-	return (ac);
-} 
-
-static void
-iso88025_free(void *com, u_char type)
-{
- 
-        free(com, M_ISO88025);
-}
- 
-static int
-iso88025_modevent(module_t mod, int type, void *data)
-{
-  
-        switch (type) {
-        case MOD_LOAD:
-                if_register_com_alloc(IFT_ISO88025, iso88025_alloc,
-                    iso88025_free);
-                break;
-        case MOD_UNLOAD:
-                if_deregister_com_alloc(IFT_ISO88025);
-                break;
-        default:
-                return EOPNOTSUPP;
-        }
-
-        return (0);
-}
-
 static moduledata_t iso88025_mod = {
-	"iso88025",
-	iso88025_modevent,
-	0
+	.name = "iso88025",
 };
 
 DECLARE_MODULE(iso88025, iso88025_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(iso88025, 1);
Index: user/dchagin/lemul/sys/net/if_mib.c
===================================================================
--- user/dchagin/lemul/sys/net/if_mib.c	(revision 274240)
+++ user/dchagin/lemul/sys/net/if_mib.c	(revision 274241)
@@ -1,151 +1,151 @@
 /*-
  * Copyright 1996 Massachusetts Institute of Technology
  *
  * Permission to use, copy, modify, and distribute this software and
  * its documentation for any purpose and without fee is hereby
  * granted, provided that both the above copyright notice and this
  * permission notice appear in all copies, that both the above
  * copyright notice and this permission notice appear in all
  * supporting documentation, and that the name of M.I.T. not be used
  * in advertising or publicity pertaining to distribution of the
  * software without specific, written prior permission.  M.I.T. makes
  * no representations about the suitability of this software for any
  * purpose.  It is provided "as is" without express or implied
  * warranty.
  * 
  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
  * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_mib.h>
 #include <net/vnet.h>
 
 /*
  * A sysctl(3) MIB for generic interface information.  This information
  * is exported in the net.link.generic branch, which has the following
  * structure:
  *
  * net.link.generic	.system			- system-wide control variables
  *						  and statistics (node)
  *			.ifdata.<ifindex>.general
  *						- what's in `struct ifdata'
  *						  plus some other info
  *			.ifdata.<ifindex>.linkspecific
  *						- a link-type-specific data
  *						  structure (as might be used
  *						  by an SNMP agent
  *
  * Perhaps someday we will make addresses accessible via this interface
  * as well (then there will be four such...).  The reason that the
  * index comes before the last element in the name is because it
  * seems more orthogonal that way, particularly with the possibility
  * of other per-interface data living down here as well (e.g., integrated
  * services stuff).
  */
 
 SYSCTL_DECL(_net_link_generic);
 static SYSCTL_NODE(_net_link_generic, IFMIB_SYSTEM, system, CTLFLAG_RW, 0,
 	    "Variables global to all interfaces");
 
-SYSCTL_VNET_INT(_net_link_generic_system, IFMIB_IFCOUNT, ifcount, CTLFLAG_RD,
-	    &VNET_NAME(if_index), 0,
-	     "Number of configured interfaces");
+SYSCTL_INT(_net_link_generic_system, IFMIB_IFCOUNT, ifcount,
+	CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(if_index), 0,
+	"Number of configured interfaces");
 
 static int
 sysctl_ifdata(SYSCTL_HANDLER_ARGS) /* XXX bad syntax! */
 {
 	int *name = (int *)arg1;
 	int error;
 	u_int namelen = arg2;
 	struct ifnet *ifp;
 	struct ifmibdata ifmd;
 	size_t dlen;
 	char *dbuf;
 
 	if (namelen != 2)
 		return EINVAL;
 	if (name[0] <= 0)
 		return (ENOENT);
 	ifp = ifnet_byindex_ref(name[0]);
 	if (ifp == NULL)
 		return (ENOENT);
 
 	switch(name[1]) {
 	default:
 		error = ENOENT;
 		goto out;
 
 	case IFDATA_GENERAL:
 		bzero(&ifmd, sizeof(ifmd));
 		strlcpy(ifmd.ifmd_name, ifp->if_xname, sizeof(ifmd.ifmd_name));
 
 		ifmd.ifmd_pcount = ifp->if_pcount;
 		if_data_copy(ifp, &ifmd.ifmd_data);
 
 		ifmd.ifmd_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifmd.ifmd_snd_len = ifp->if_snd.ifq_len;
 		ifmd.ifmd_snd_maxlen = ifp->if_snd.ifq_maxlen;
 		ifmd.ifmd_snd_drops =
 		    ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS);
 
 		error = SYSCTL_OUT(req, &ifmd, sizeof ifmd);
 		if (error)
 			goto out;
 		break;
 
 	case IFDATA_LINKSPECIFIC:
 		error = SYSCTL_OUT(req, ifp->if_linkmib, ifp->if_linkmiblen);
 		if (error || !req->newptr)
 			goto out;
 
 		error = SYSCTL_IN(req, ifp->if_linkmib, ifp->if_linkmiblen);
 		if (error)
 			goto out;
 		break;
 
 	case IFDATA_DRIVERNAME:
 		/* 20 is enough for 64bit ints */
 		dlen = strlen(ifp->if_dname) + 20 + 1;
 		if ((dbuf = malloc(dlen, M_TEMP, M_NOWAIT)) == NULL) {
 			error = ENOMEM;
 			goto out;
 		}
 		if (ifp->if_dunit == IF_DUNIT_NONE)
 			strcpy(dbuf, ifp->if_dname);
 		else
 			sprintf(dbuf, "%s%d", ifp->if_dname, ifp->if_dunit);
 
 		error = SYSCTL_OUT(req, dbuf, strlen(dbuf) + 1);
 		if (error == 0 && req->newptr != NULL)
 			error = EPERM;
 		free(dbuf, M_TEMP);
 		goto out;
 	}
 out:
 	if_rele(ifp);
 	return error;
 }
 
 static SYSCTL_NODE(_net_link_generic, IFMIB_IFDATA, ifdata, CTLFLAG_RW,
 	    sysctl_ifdata, "Interface table");
 
Index: user/dchagin/lemul/sys/net/vnet.h
===================================================================
--- user/dchagin/lemul/sys/net/vnet.h	(revision 274240)
+++ user/dchagin/lemul/sys/net/vnet.h	(revision 274241)
@@ -1,491 +1,428 @@
 /*-
  * Copyright (c) 2006-2009 University of Zagreb
  * Copyright (c) 2006-2009 FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by the University of Zagreb and the
  * FreeBSD Foundation under sponsorship by the Stichting NLnet and the
  * FreeBSD Foundation.
  *
  * Copyright (c) 2009 Jeffrey Roberson <jeff@freebsd.org>
  * Copyright (c) 2009 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 /*-
  * This header file defines several sets of interfaces supporting virtualized
  * network stacks:
  *
  * - Definition of 'struct vnet' and functions and macros to allocate/free/
  *   manipulate it.
  *
  * - A virtual network stack memory allocator, which provides support for
  *   virtualized global variables via a special linker set, set_vnet.
  *
  * - Virtualized sysinits/sysuninits, which allow constructors and
  *   destructors to be run for each network stack subsystem as virtual
  *   instances are created and destroyed.
  *
  * If VIMAGE isn't compiled into the kernel, virtualized global variables
  * compile to normal global variables, and virtualized sysinits to regular
  * sysinits.
  */
 
 #ifndef _NET_VNET_H_
 #define	_NET_VNET_H_
 
 /*
  * struct vnet describes a virtualized network stack, and is primarily a
  * pointer to storage for virtualized global variables.  Expose to userspace
  * as required for libkvm.
  */
 #if defined(_KERNEL) || defined(_WANT_VNET)
 #include <sys/queue.h>
 
 struct vnet {
 	LIST_ENTRY(vnet)	 vnet_le;	/* all vnets list */
 	u_int			 vnet_magic_n;
 	u_int			 vnet_ifcnt;
 	u_int			 vnet_sockcnt;
 	void			*vnet_data_mem;
 	uintptr_t		 vnet_data_base;
 };
 #define	VNET_MAGIC_N	0x3e0d8f29
 
 /*
  * These two virtual network stack allocator definitions are also required
  * for libkvm so that it can evaluate virtualized global variables.
  */
 #define	VNET_SETNAME		"set_vnet"
 #define	VNET_SYMPREFIX		"vnet_entry_"
 #endif
 
 #ifdef _KERNEL
 
 #define	VNET_PCPUSTAT_DECLARE(type, name)	\
     VNET_DECLARE(counter_u64_t, name[sizeof(type) / sizeof(uint64_t)])
 
 #define	VNET_PCPUSTAT_DEFINE(type, name)	\
     VNET_DEFINE(counter_u64_t, name[sizeof(type) / sizeof(uint64_t)])
 
 #define	VNET_PCPUSTAT_ALLOC(name, wait)	\
     COUNTER_ARRAY_ALLOC(VNET(name), \
 	sizeof(VNET(name)) / sizeof(counter_u64_t), (wait))
 
 #define	VNET_PCPUSTAT_FREE(name)	\
     COUNTER_ARRAY_FREE(VNET(name), sizeof(VNET(name)) / sizeof(counter_u64_t))
 
 #define	VNET_PCPUSTAT_ADD(type, name, f, v)	\
     counter_u64_add(VNET(name)[offsetof(type, f) / sizeof(uint64_t)], (v))
 
 #define	VNET_PCPUSTAT_SYSINIT(name)	\
 static void				\
 vnet_##name##_init(const void *unused)	\
 {					\
 	VNET_PCPUSTAT_ALLOC(name, M_WAITOK);	\
 }					\
 VNET_SYSINIT(vnet_ ## name ## _init, SI_SUB_PROTO_IFATTACHDOMAIN,	\
     SI_ORDER_ANY, vnet_ ## name ## _init, NULL)
 
 #define	VNET_PCPUSTAT_SYSUNINIT(name)					\
 static void								\
 vnet_##name##_uninit(const void *unused)				\
 {									\
 	VNET_PCPUSTAT_FREE(name);					\
 }									\
 VNET_SYSUNINIT(vnet_ ## name ## _uninit, SI_SUB_PROTO_IFATTACHDOMAIN,	\
     SI_ORDER_ANY, vnet_ ## name ## _uninit, NULL)
 
+#ifdef SYSCTL_OID
 #define	SYSCTL_VNET_PCPUSTAT(parent, nbr, name, type, array, desc)	\
 static int								\
 array##_sysctl(SYSCTL_HANDLER_ARGS)					\
 {									\
 	type s;								\
 	CTASSERT((sizeof(type) / sizeof(uint64_t)) ==			\
 	    (sizeof(VNET(array)) / sizeof(counter_u64_t)));		\
 	COUNTER_ARRAY_COPY(VNET(array), &s, sizeof(type) / sizeof(uint64_t));\
 	if (req->newptr)						\
 		COUNTER_ARRAY_ZERO(VNET(array),				\
 		    sizeof(type) / sizeof(uint64_t));			\
 	return (SYSCTL_OUT(req, &s, sizeof(type)));			\
 }									\
-SYSCTL_VNET_PROC(parent, nbr, name, CTLTYPE_OPAQUE | CTLFLAG_RW, NULL,	\
-    0, array ## _sysctl, "I", desc)
+SYSCTL_PROC(parent, nbr, name, CTLFLAG_VNET | CTLTYPE_OPAQUE | CTLFLAG_RW, \
+    NULL, 0, array ## _sysctl, "I", desc)
+#endif /* SYSCTL_OID */
 
 #ifdef VIMAGE
 #include <sys/lock.h>
 #include <sys/proc.h>			/* for struct thread */
 #include <sys/rwlock.h>
 #include <sys/sx.h>
 
 /*
  * Location of the kernel's 'set_vnet' linker set.
  */
 extern uintptr_t	*__start_set_vnet;
 __GLOBL(__start_set_vnet);
 extern uintptr_t	*__stop_set_vnet;
 __GLOBL(__stop_set_vnet);
 
 #define	VNET_START	(uintptr_t)&__start_set_vnet
 #define	VNET_STOP	(uintptr_t)&__stop_set_vnet
 
 /*
  * Functions to allocate and destroy virtual network stacks.
  */
 struct vnet *vnet_alloc(void);
 void	vnet_destroy(struct vnet *vnet);
 
 /*
  * The current virtual network stack -- we may wish to move this to struct
  * pcpu in the future.
  */
 #define	curvnet	curthread->td_vnet
 
 /*
  * Various macros -- get and set the current network stack, but also
  * assertions.
  */
 #if defined(INVARIANTS) || defined(VNET_DEBUG)
 #define	VNET_ASSERT(exp, msg)	do {					\
 	if (!(exp))							\
 		panic msg;						\
 } while (0)
 #else
 #define	VNET_ASSERT(exp, msg)	do {					\
 } while (0)
 #endif
 
 #ifdef VNET_DEBUG
 void vnet_log_recursion(struct vnet *, const char *, int);
 
 #define	CURVNET_SET_QUIET(arg)						\
 	VNET_ASSERT((arg) != NULL && (arg)->vnet_magic_n == VNET_MAGIC_N, \
 	    ("CURVNET_SET at %s:%d %s() curvnet=%p vnet=%p",		\
 	    __FILE__, __LINE__, __func__, curvnet, (arg)));		\
 	struct vnet *saved_vnet = curvnet;				\
 	const char *saved_vnet_lpush = curthread->td_vnet_lpush;	\
 	curvnet = arg;							\
 	curthread->td_vnet_lpush = __func__;
  
 #define	CURVNET_SET_VERBOSE(arg)					\
 	CURVNET_SET_QUIET(arg)						\
 	if (saved_vnet)							\
 		vnet_log_recursion(saved_vnet, saved_vnet_lpush, __LINE__);
 
 #define	CURVNET_SET(arg)	CURVNET_SET_VERBOSE(arg)
  
 #define	CURVNET_RESTORE()						\
 	VNET_ASSERT(curvnet != NULL && (saved_vnet == NULL ||		\
 	    saved_vnet->vnet_magic_n == VNET_MAGIC_N),			\
 	    ("CURVNET_RESTORE at %s:%d %s() curvnet=%p saved_vnet=%p",	\
 	    __FILE__, __LINE__, __func__, curvnet, saved_vnet));	\
 	curvnet = saved_vnet;						\
 	curthread->td_vnet_lpush = saved_vnet_lpush;
 #else /* !VNET_DEBUG */
 
 #define	CURVNET_SET_QUIET(arg)						\
 	VNET_ASSERT((arg) != NULL && (arg)->vnet_magic_n == VNET_MAGIC_N, \
 	    ("CURVNET_SET at %s:%d %s() curvnet=%p vnet=%p",		\
 	    __FILE__, __LINE__, __func__, curvnet, (arg)));		\
 	struct vnet *saved_vnet = curvnet;				\
 	curvnet = arg;	
  
 #define	CURVNET_SET_VERBOSE(arg)					\
 	CURVNET_SET_QUIET(arg)
 
 #define	CURVNET_SET(arg)	CURVNET_SET_VERBOSE(arg)
  
 #define	CURVNET_RESTORE()						\
 	VNET_ASSERT(curvnet != NULL && (saved_vnet == NULL ||		\
 	    saved_vnet->vnet_magic_n == VNET_MAGIC_N),			\
 	    ("CURVNET_RESTORE at %s:%d %s() curvnet=%p saved_vnet=%p",	\
 	    __FILE__, __LINE__, __func__, curvnet, saved_vnet));	\
 	curvnet = saved_vnet;
 #endif /* VNET_DEBUG */
 
 extern struct vnet *vnet0;
 #define	IS_DEFAULT_VNET(arg)	((arg) == vnet0)
 
 #define	CRED_TO_VNET(cr)	(cr)->cr_prison->pr_vnet
 #define	TD_TO_VNET(td)		CRED_TO_VNET((td)->td_ucred)
 #define	P_TO_VNET(p)		CRED_TO_VNET((p)->p_ucred)
 
 /*
  * Global linked list of all virtual network stacks, along with read locks to
  * access it.  If a caller may sleep while accessing the list, it must use
  * the sleepable lock macros.
  */
 LIST_HEAD(vnet_list_head, vnet);
 extern struct vnet_list_head vnet_head;
 extern struct rwlock vnet_rwlock;
 extern struct sx vnet_sxlock;
 
 #define	VNET_LIST_RLOCK()		sx_slock(&vnet_sxlock)
 #define	VNET_LIST_RLOCK_NOSLEEP()	rw_rlock(&vnet_rwlock)
 #define	VNET_LIST_RUNLOCK()		sx_sunlock(&vnet_sxlock)
 #define	VNET_LIST_RUNLOCK_NOSLEEP()	rw_runlock(&vnet_rwlock)
 
 /*
  * Iteration macros to walk the global list of virtual network stacks.
  */
 #define	VNET_ITERATOR_DECL(arg)	struct vnet *arg
 #define	VNET_FOREACH(arg)	LIST_FOREACH((arg), &vnet_head, vnet_le)
 
 /*
  * Virtual network stack memory allocator, which allows global variables to
  * be automatically instantiated for each network stack instance.
  */
 #define	VNET_NAME(n)		vnet_entry_##n
 #define	VNET_DECLARE(t, n)	extern t VNET_NAME(n)
 #define	VNET_DEFINE(t, n)	t VNET_NAME(n) __section(VNET_SETNAME) __used
 #define	_VNET_PTR(b, n)		(__typeof(VNET_NAME(n))*)		\
 				    ((b) + (uintptr_t)&VNET_NAME(n))
 
 #define	_VNET(b, n)		(*_VNET_PTR(b, n))
 
 /*
  * Virtualized global variable accessor macros.
  */
 #define	VNET_VNET_PTR(vnet, n)		_VNET_PTR((vnet)->vnet_data_base, n)
 #define	VNET_VNET(vnet, n)		(*VNET_VNET_PTR((vnet), n))
 
 #define	VNET_PTR(n)		VNET_VNET_PTR(curvnet, n)
 #define	VNET(n)			VNET_VNET(curvnet, n)
 
 /*
  * Virtual network stack allocator interfaces from the kernel linker.
  */
 void	*vnet_data_alloc(int size);
 void	 vnet_data_copy(void *start, int size);
 void	 vnet_data_free(void *start_arg, int size);
 
 /*
- * Sysctl variants for vnet-virtualized global variables.  Include
- * <sys/sysctl.h> to expose these definitions.
- *
- * Note: SYSCTL_PROC() handler functions will need to resolve pointer
- * arguments themselves, if required.
- */
-#ifdef SYSCTL_OID
-#define	SYSCTL_VNET_INT(parent, nbr, name, access, ptr, val, descr)	\
-	SYSCTL_OID(parent, nbr, name,					\
-	    CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_VNET|(access),		\
-	    ptr, val, sysctl_handle_int, "I", descr)
-#define	SYSCTL_VNET_PROC(parent, nbr, name, access, ptr, arg, handler,	\
-	    fmt, descr)							\
-	CTASSERT(((access) & CTLTYPE) != 0);				\
-	SYSCTL_OID(parent, nbr, name, CTLFLAG_VNET|(access), ptr, arg, 	\
-	    handler, fmt, descr)
-#define	SYSCTL_VNET_OPAQUE(parent, nbr, name, access, ptr, len, fmt,    \
-	    descr)							\
-	SYSCTL_OID(parent, nbr, name,					\
-	    CTLTYPE_OPAQUE|CTLFLAG_VNET|(access), ptr, len, 		\
-	    sysctl_handle_opaque, fmt, descr)
-#define	SYSCTL_VNET_STRING(parent, nbr, name, access, arg, len, descr)	\
-	SYSCTL_OID(parent, nbr, name,					\
-	    CTLTYPE_STRING|CTLFLAG_VNET|(access),			\
-	    arg, len, sysctl_handle_string, "A", descr)
-#define	SYSCTL_VNET_STRUCT(parent, nbr, name, access, ptr, type, descr)	\
-	SYSCTL_OID(parent, nbr, name,					\
-	    CTLTYPE_OPAQUE|CTLFLAG_VNET|(access), ptr,			\
-	    sizeof(struct type), sysctl_handle_opaque, "S," #type,	\
-	    descr)
-#define	SYSCTL_VNET_UINT(parent, nbr, name, access, ptr, val, descr)	\
-	SYSCTL_OID(parent, nbr, name,					\
-	    CTLTYPE_UINT|CTLFLAG_MPSAFE|CTLFLAG_VNET|(access),		\
-	    ptr, val, sysctl_handle_int, "IU", descr)
-#define	VNET_SYSCTL_ARG(req, arg1) do {					\
-	if (arg1 != NULL)						\
-		arg1 = (void *)(TD_TO_VNET((req)->td)->vnet_data_base +	\
-		    (uintptr_t)(arg1));					\
-} while (0)
-#endif /* SYSCTL_OID */
-
-/*
  * Virtual sysinit mechanism, allowing network stack components to declare
  * startup and shutdown methods to be run when virtual network stack
  * instances are created and destroyed.
  */
 #include <sys/kernel.h>
 
 /*
  * SYSINIT/SYSUNINIT variants that provide per-vnet constructors and
  * destructors.
  */
 struct vnet_sysinit {
 	enum sysinit_sub_id	subsystem;
 	enum sysinit_elem_order	order;
 	sysinit_cfunc_t		func;
 	const void		*arg;
 	TAILQ_ENTRY(vnet_sysinit) link;
 };
 
 #define	VNET_SYSINIT(ident, subsystem, order, func, arg)		\
 	static struct vnet_sysinit ident ## _vnet_init = {		\
 		subsystem,						\
 		order,							\
 		(sysinit_cfunc_t)(sysinit_nfunc_t)func,			\
 		(arg)							\
 	};								\
 	SYSINIT(vnet_init_ ## ident, subsystem, order,			\
 	    vnet_register_sysinit, &ident ## _vnet_init);		\
 	SYSUNINIT(vnet_init_ ## ident, subsystem, order,		\
 	    vnet_deregister_sysinit, &ident ## _vnet_init)
 
 #define	VNET_SYSUNINIT(ident, subsystem, order, func, arg)		\
 	static struct vnet_sysinit ident ## _vnet_uninit = {		\
 		subsystem,						\
 		order,							\
 		(sysinit_cfunc_t)(sysinit_nfunc_t)func,			\
 		(arg)							\
 	};								\
 	SYSINIT(vnet_uninit_ ## ident, subsystem, order,		\
 	    vnet_register_sysuninit, &ident ## _vnet_uninit);		\
 	SYSUNINIT(vnet_uninit_ ## ident, subsystem, order,		\
 	    vnet_deregister_sysuninit, &ident ## _vnet_uninit)
 
 /*
  * Run per-vnet sysinits or sysuninits during vnet creation/destruction.
  */
 void	 vnet_sysinit(void);
 void	 vnet_sysuninit(void);
 
 /*
  * Interfaces for managing per-vnet constructors and destructors.
  */
 void	vnet_register_sysinit(void *arg);
 void	vnet_register_sysuninit(void *arg);
 void	vnet_deregister_sysinit(void *arg);
 void	vnet_deregister_sysuninit(void *arg);
 
 /*
  * EVENTHANDLER(9) extensions.
  */
 #include <sys/eventhandler.h>
 
 void	vnet_global_eventhandler_iterator_func(void *, ...);
 #define VNET_GLOBAL_EVENTHANDLER_REGISTER_TAG(tag, name, func, arg, priority) \
 do {									\
 	if (IS_DEFAULT_VNET(curvnet)) {					\
 		(tag) = vimage_eventhandler_register(NULL, #name, func,	\
 		    arg, priority,					\
 		    vnet_global_eventhandler_iterator_func);		\
 	}								\
 } while(0)
 #define VNET_GLOBAL_EVENTHANDLER_REGISTER(name, func, arg, priority)	\
 do {									\
 	if (IS_DEFAULT_VNET(curvnet)) {					\
 		vimage_eventhandler_register(NULL, #name, func,		\
 		    arg, priority,					\
 		    vnet_global_eventhandler_iterator_func);		\
 	}								\
 } while(0)
 
 #else /* !VIMAGE */
 
 /*
  * Various virtual network stack macros compile to no-ops without VIMAGE.
  */
 #define	curvnet			NULL
 
 #define	VNET_ASSERT(exp, msg)
 #define	CURVNET_SET(arg)
 #define	CURVNET_SET_QUIET(arg)
 #define	CURVNET_RESTORE()
 
 #define	VNET_LIST_RLOCK()
 #define	VNET_LIST_RLOCK_NOSLEEP()
 #define	VNET_LIST_RUNLOCK()
 #define	VNET_LIST_RUNLOCK_NOSLEEP()
 #define	VNET_ITERATOR_DECL(arg)
 #define	VNET_FOREACH(arg)
 
 #define	IS_DEFAULT_VNET(arg)	1
 #define	CRED_TO_VNET(cr)	NULL
 #define	TD_TO_VNET(td)		NULL
 #define	P_TO_VNET(p)		NULL
 
 /*
  * Versions of the VNET macros that compile to normal global variables and
  * standard sysctl definitions.
  */
 #define	VNET_NAME(n)		n
 #define	VNET_DECLARE(t, n)	extern t n
 #define	VNET_DEFINE(t, n)	t n
 #define	_VNET_PTR(b, n)		&VNET_NAME(n)
 
 /*
  * Virtualized global variable accessor macros.
  */
 #define	VNET_VNET_PTR(vnet, n)		(&(n))
 #define	VNET_VNET(vnet, n)		(n)
 
 #define	VNET_PTR(n)		(&(n))
 #define	VNET(n)			(n)
-
-/*
- * When VIMAGE isn't compiled into the kernel, virtaulized SYSCTLs simply
- * become normal SYSCTLs.
- */
-#ifdef SYSCTL_OID
-#define	SYSCTL_VNET_INT(parent, nbr, name, access, ptr, val, descr)	\
-	SYSCTL_INT(parent, nbr, name, access, ptr, val, descr)
-#define	SYSCTL_VNET_PROC(parent, nbr, name, access, ptr, arg, handler,	\
-	    fmt, descr)							\
-	SYSCTL_PROC(parent, nbr, name, access, ptr, arg, handler, fmt,	\
-	    descr)
-#define	SYSCTL_VNET_OPAQUE(parent, nbr, name, access, ptr, len, fmt,    \
-	    descr)							\
-	SYSCTL_OPAQUE(parent, nbr, name, access, ptr, len, fmt, descr)
-#define	SYSCTL_VNET_STRING(parent, nbr, name, access, arg, len, descr)	\
-	SYSCTL_STRING(parent, nbr, name, access, arg, len, descr)
-#define	SYSCTL_VNET_STRUCT(parent, nbr, name, access, ptr, type, descr)	\
-	SYSCTL_STRUCT(parent, nbr, name, access, ptr, type, descr)
-#define	SYSCTL_VNET_UINT(parent, nbr, name, access, ptr, val, descr)	\
-	SYSCTL_UINT(parent, nbr, name, access, ptr, val, descr)
-#define	VNET_SYSCTL_ARG(req, arg1)
-#endif /* SYSCTL_OID */
 
 /*
  * When VIMAGE isn't compiled into the kernel, VNET_SYSINIT/VNET_SYSUNINIT
  * map into normal sysinits, which have the same ordering properties.
  */
 #define	VNET_SYSINIT(ident, subsystem, order, func, arg)		\
 	SYSINIT(ident, subsystem, order, func, arg)
 #define	VNET_SYSUNINIT(ident, subsystem, order, func, arg)		\
 	SYSUNINIT(ident, subsystem, order, func, arg)
 
 /*
  * Without VIMAGE revert to the default implementation.
  */
 #define VNET_GLOBAL_EVENTHANDLER_REGISTER_TAG(tag, name, func, arg, priority) \
 	(tag) = eventhandler_register(NULL, #name, func, arg, priority)
 #define VNET_GLOBAL_EVENTHANDLER_REGISTER(name, func, arg, priority)	\
 	eventhandler_register(NULL, #name, func, arg, priority)
 #endif /* VIMAGE */
 #endif /* _KERNEL */
 
 #endif /* !_NET_VNET_H_ */
Index: user/dchagin/lemul/sys/netgraph/ng_ether.c
===================================================================
--- user/dchagin/lemul/sys/netgraph/ng_ether.c	(revision 274240)
+++ user/dchagin/lemul/sys/netgraph/ng_ether.c	(revision 274241)
@@ -1,880 +1,880 @@
 
 /*
  * ng_ether.c
  */
 
 /*-
  * Copyright (c) 1996-2000 Whistle Communications, Inc.
  * All rights reserved.
  * 
  * Subject to the following obligations and disclaimer of warranty, use and
  * redistribution of this software, in source or object code forms, with or
  * without modifications are expressly permitted by Whistle Communications;
  * provided, however, that:
  * 1. Any and all reproductions of the source or object code must include the
  *    copyright notice above and the following disclaimer of warranties; and
  * 2. No rights are granted, in any manner or form, to use Whistle
  *    Communications, Inc. trademarks, including the mark "WHISTLE
  *    COMMUNICATIONS" on advertising, endorsements, or otherwise except as
  *    such appears in the above copyright notice or in the software.
  * 
  * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND
  * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO
  * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE,
  * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
  * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY
  * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS
  * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE.
  * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES
  * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING
  * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
  * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY
  * OF SUCH DAMAGE.
  *
  * Authors: Archie Cobbs <archie@freebsd.org>
  *	    Julian Elischer <julian@freebsd.org>
  *
  * $FreeBSD$
  */
 
 /*
  * ng_ether(4) netgraph node type
  */
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/errno.h>
 #include <sys/proc.h>
 #include <sys/syslog.h>
 #include <sys/socket.h>
 #include <sys/taskqueue.h>
 
 #include <net/if.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/if_arp.h>
 #include <net/if_var.h>
 #include <net/ethernet.h>
 #include <net/if_bridgevar.h>
 #include <net/vnet.h>
 
 #include <netgraph/ng_message.h>
 #include <netgraph/netgraph.h>
 #include <netgraph/ng_parse.h>
 #include <netgraph/ng_ether.h>
 
 MODULE_VERSION(ng_ether, 1);
 
-#define IFP2NG(ifp)  (IFP2AC((ifp))->ac_netgraph)
+#define IFP2NG(ifp)  ((ifp)->if_l2com)
 
 /* Per-node private data */
 struct private {
 	struct ifnet	*ifp;		/* associated interface */
 	hook_p		upper;		/* upper hook connection */
 	hook_p		lower;		/* lower hook connection */
 	hook_p		orphan;		/* orphan hook connection */
 	u_char		autoSrcAddr;	/* always overwrite source address */
 	u_char		promisc;	/* promiscuous mode enabled */
 	u_long		hwassist;	/* hardware checksum capabilities */
 	u_int		flags;		/* flags e.g. really die */
 };
 typedef struct private *priv_p;
 
 /* Hook pointers used by if_ethersubr.c to callback to netgraph */
 extern	void	(*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp);
 extern	void	(*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m);
 extern	int	(*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp);
 extern	void	(*ng_ether_attach_p)(struct ifnet *ifp);
 extern	void	(*ng_ether_detach_p)(struct ifnet *ifp);
 extern	void	(*ng_ether_link_state_p)(struct ifnet *ifp, int state);
 
 /* Functional hooks called from if_ethersubr.c */
 static void	ng_ether_input(struct ifnet *ifp, struct mbuf **mp);
 static void	ng_ether_input_orphan(struct ifnet *ifp, struct mbuf *m);
 static int	ng_ether_output(struct ifnet *ifp, struct mbuf **mp);
 static void	ng_ether_attach(struct ifnet *ifp);
 static void	ng_ether_detach(struct ifnet *ifp); 
 static void	ng_ether_link_state(struct ifnet *ifp, int state); 
 
 /* Other functions */
 static int	ng_ether_rcv_lower(hook_p node, item_p item);
 static int	ng_ether_rcv_upper(hook_p node, item_p item);
 
 /* Netgraph node methods */
 static ng_constructor_t	ng_ether_constructor;
 static ng_rcvmsg_t	ng_ether_rcvmsg;
 static ng_shutdown_t	ng_ether_shutdown;
 static ng_newhook_t	ng_ether_newhook;
 static ng_rcvdata_t	ng_ether_rcvdata;
 static ng_disconnect_t	ng_ether_disconnect;
 static int		ng_ether_mod_event(module_t mod, int event, void *data);
 
 static eventhandler_tag	ng_ether_ifnet_arrival_cookie;
 
 /* List of commands and how to convert arguments to/from ASCII */
 static const struct ng_cmdlist ng_ether_cmdlist[] = {
 	{
 	  NGM_ETHER_COOKIE,
 	  NGM_ETHER_GET_IFNAME,
 	  "getifname",
 	  NULL,
 	  &ng_parse_string_type
 	},
 	{
 	  NGM_ETHER_COOKIE,
 	  NGM_ETHER_GET_IFINDEX,
 	  "getifindex",
 	  NULL,
 	  &ng_parse_int32_type
 	},
 	{
 	  NGM_ETHER_COOKIE,
 	  NGM_ETHER_GET_ENADDR,
 	  "getenaddr",
 	  NULL,
 	  &ng_parse_enaddr_type
 	},
 	{
 	  NGM_ETHER_COOKIE,
 	  NGM_ETHER_SET_ENADDR,
 	  "setenaddr",
 	  &ng_parse_enaddr_type,
 	  NULL
 	},
 	{
 	  NGM_ETHER_COOKIE,
 	  NGM_ETHER_GET_PROMISC,
 	  "getpromisc",
 	  NULL,
 	  &ng_parse_int32_type
 	},
 	{
 	  NGM_ETHER_COOKIE,
 	  NGM_ETHER_SET_PROMISC,
 	  "setpromisc",
 	  &ng_parse_int32_type,
 	  NULL
 	},
 	{
 	  NGM_ETHER_COOKIE,
 	  NGM_ETHER_GET_AUTOSRC,
 	  "getautosrc",
 	  NULL,
 	  &ng_parse_int32_type
 	},
 	{
 	  NGM_ETHER_COOKIE,
 	  NGM_ETHER_SET_AUTOSRC,
 	  "setautosrc",
 	  &ng_parse_int32_type,
 	  NULL
 	},
 	{
 	  NGM_ETHER_COOKIE,
 	  NGM_ETHER_ADD_MULTI,
 	  "addmulti",
 	  &ng_parse_enaddr_type,
 	  NULL
 	},
 	{
 	  NGM_ETHER_COOKIE,
 	  NGM_ETHER_DEL_MULTI,
 	  "delmulti",
 	  &ng_parse_enaddr_type,
 	  NULL
 	},
 	{
 	  NGM_ETHER_COOKIE,
 	  NGM_ETHER_DETACH,
 	  "detach",
 	  NULL,
 	  NULL
 	},
 	{ 0 }
 };
 
 static struct ng_type ng_ether_typestruct = {
 	.version =	NG_ABI_VERSION,
 	.name =		NG_ETHER_NODE_TYPE,
 	.mod_event =	ng_ether_mod_event,
 	.constructor =	ng_ether_constructor,
 	.rcvmsg =	ng_ether_rcvmsg,
 	.shutdown =	ng_ether_shutdown,
 	.newhook =	ng_ether_newhook,
 	.rcvdata =	ng_ether_rcvdata,
 	.disconnect =	ng_ether_disconnect,
 	.cmdlist =	ng_ether_cmdlist,
 };
 NETGRAPH_INIT(ether, &ng_ether_typestruct);
 
 /******************************************************************
 		    UTILITY FUNCTIONS
 ******************************************************************/
 static void
 ng_ether_sanitize_ifname(const char *ifname, char *name)
 {
 	int i;
 
 	for (i = 0; i < IFNAMSIZ; i++) {
 		if (ifname[i] == '.' || ifname[i] == ':')
 			name[i] = '_';
 		else
 			name[i] = ifname[i];
 		if (name[i] == '\0')
 			break;
 	}
 }
 
 /******************************************************************
 		    ETHERNET FUNCTION HOOKS
 ******************************************************************/
 
 /*
  * Handle a packet that has come in on an interface. We get to
  * look at it here before any upper layer protocols do.
  */
 static void
 ng_ether_input(struct ifnet *ifp, struct mbuf **mp)
 {
 	const node_p node = IFP2NG(ifp);
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	int error;
 
 	/* If "lower" hook not connected, let packet continue */
 	if (priv->lower == NULL)
 		return;
 	NG_SEND_DATA_ONLY(error, priv->lower, *mp);	/* sets *mp = NULL */
 }
 
 /*
  * Handle a packet that has come in on an interface, and which
  * does not match any of our known protocols (an ``orphan'').
  */
 static void
 ng_ether_input_orphan(struct ifnet *ifp, struct mbuf *m)
 {
 	const node_p node = IFP2NG(ifp);
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	int error;
 
 	/* If "orphan" hook not connected, discard packet */
 	if (priv->orphan == NULL) {
 		m_freem(m);
 		return;
 	}
 	NG_SEND_DATA_ONLY(error, priv->orphan, m);
 }
 
 /*
  * Handle a packet that is going out on an interface.
  * The Ethernet header is already attached to the mbuf.
  */
 static int
 ng_ether_output(struct ifnet *ifp, struct mbuf **mp)
 {
 	const node_p node = IFP2NG(ifp);
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	int error = 0;
 
 	/* If "upper" hook not connected, let packet continue */
 	if (priv->upper == NULL)
 		return (0);
 
 	/* Send it out "upper" hook */
 	NG_OUTBOUND_THREAD_REF();
 	NG_SEND_DATA_ONLY(error, priv->upper, *mp);
 	NG_OUTBOUND_THREAD_UNREF();
 	return (error);
 }
 
 /*
  * A new Ethernet interface has been attached.
  * Create a new node for it, etc.
  */
 static void
 ng_ether_attach(struct ifnet *ifp)
 {
 	char name[IFNAMSIZ];
 	priv_p priv;
 	node_p node;
 
 	/*
 	 * Do not create / attach an ether node to this ifnet if
 	 * a netgraph node with the same name already exists.
 	 * This should prevent ether nodes to become attached to
 	 * eiface nodes, which may be problematic due to naming
 	 * clashes.
 	 */
 	if ((node = ng_name2noderef(NULL, ifp->if_xname)) != NULL) {
 		NG_NODE_UNREF(node);
 		return;
 	}
 
 	/* Create node */
 	KASSERT(!IFP2NG(ifp), ("%s: node already exists?", __func__));
 	if (ng_make_node_common(&ng_ether_typestruct, &node) != 0) {
 		log(LOG_ERR, "%s: can't %s for %s\n",
 		    __func__, "create node", ifp->if_xname);
 		return;
 	}
 
 	/* Allocate private data */
 	priv = malloc(sizeof(*priv), M_NETGRAPH, M_NOWAIT | M_ZERO);
 	if (priv == NULL) {
 		log(LOG_ERR, "%s: can't %s for %s\n",
 		    __func__, "allocate memory", ifp->if_xname);
 		NG_NODE_UNREF(node);
 		return;
 	}
 	NG_NODE_SET_PRIVATE(node, priv);
 	priv->ifp = ifp;
 	IFP2NG(ifp) = node;
 	priv->hwassist = ifp->if_hwassist;
 
 	/* Try to give the node the same name as the interface */
 	ng_ether_sanitize_ifname(ifp->if_xname, name);
 	if (ng_name_node(node, name) != 0)
 		log(LOG_WARNING, "%s: can't name node %s\n", __func__, name);
 }
 
 /*
  * An Ethernet interface is being detached.
  * REALLY Destroy its node.
  */
 static void
 ng_ether_detach(struct ifnet *ifp)
 {
 	const node_p node = IFP2NG(ifp);
 	const priv_p priv = NG_NODE_PRIVATE(node);
 
 	taskqueue_drain(taskqueue_swi, &ifp->if_linktask);
 	NG_NODE_REALLY_DIE(node);	/* Force real removal of node */
 	/*
 	 * We can't assume the ifnet is still around when we run shutdown
 	 * So zap it now. XXX We HOPE that anything running at this time
 	 * handles it (as it should in the non netgraph case).
 	 */
 	IFP2NG(ifp) = NULL;
 	priv->ifp = NULL;	/* XXX race if interrupted an output packet */
 	ng_rmnode_self(node);		/* remove all netgraph parts */
 }
 
 /*
  * Notify graph about link event.
  * if_link_state_change() has already checked that the state has changed.
  */
 static void
 ng_ether_link_state(struct ifnet *ifp, int state)
 {
 	const node_p node = IFP2NG(ifp);
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	struct ng_mesg *msg;
 	int cmd, dummy_error = 0;
 
 	if (state == LINK_STATE_UP)
 		cmd = NGM_LINK_IS_UP;
 	else if (state == LINK_STATE_DOWN)
 		cmd = NGM_LINK_IS_DOWN;
 	else
 		return;
 
 	if (priv->lower != NULL) {
 		NG_MKMESSAGE(msg, NGM_FLOW_COOKIE, cmd, 0, M_NOWAIT);
 		if (msg != NULL)
 			NG_SEND_MSG_HOOK(dummy_error, node, msg, priv->lower, 0);
 	}
 	if (priv->orphan != NULL) {
 		NG_MKMESSAGE(msg, NGM_FLOW_COOKIE, cmd, 0, M_NOWAIT);
 		if (msg != NULL)
 			NG_SEND_MSG_HOOK(dummy_error, node, msg, priv->orphan, 0);
 	}
 }
 
 /*
  * Interface arrival notification handler.
  * The notification is produced in two cases:
  *  o a new interface arrives
  *  o an existing interface got renamed
  * Currently the first case is handled by ng_ether_attach via special
  * hook ng_ether_attach_p.
  */
 static void
 ng_ether_ifnet_arrival_event(void *arg __unused, struct ifnet *ifp)
 {
 	char name[IFNAMSIZ];
 	node_p node;
 
 	/* Only ethernet interfaces are of interest. */
 	if (ifp->if_type != IFT_ETHER
 	    && ifp->if_type != IFT_L2VLAN)
 		return;
 
 	/*
 	 * Just return if it's a new interface without an ng_ether companion.
 	 */
 	node = IFP2NG(ifp);
 	if (node == NULL)
 		return;
 
 	/* Try to give the node the same name as the new interface name */
 	ng_ether_sanitize_ifname(ifp->if_xname, name);
 	if (ng_name_node(node, name) != 0)
 		log(LOG_WARNING, "%s: can't re-name node %s\n", __func__, name);
 }
 
 /******************************************************************
 		    NETGRAPH NODE METHODS
 ******************************************************************/
 
 /*
  * It is not possible or allowable to create a node of this type.
  * Nodes get created when the interface is attached (or, when
  * this node type's KLD is loaded).
  */
 static int
 ng_ether_constructor(node_p node)
 {
 	return (EINVAL);
 }
 
 /*
  * Check for attaching a new hook.
  */
 static	int
 ng_ether_newhook(node_p node, hook_p hook, const char *name)
 {
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	hook_p *hookptr;
 
 	/* Divert hook is an alias for lower */
 	if (strcmp(name, NG_ETHER_HOOK_DIVERT) == 0)
 		name = NG_ETHER_HOOK_LOWER;
 
 	/* Which hook? */
 	if (strcmp(name, NG_ETHER_HOOK_UPPER) == 0) {
 		hookptr = &priv->upper;
 		NG_HOOK_SET_RCVDATA(hook, ng_ether_rcv_upper);
 		NG_HOOK_SET_TO_INBOUND(hook);
 	} else if (strcmp(name, NG_ETHER_HOOK_LOWER) == 0) {
 		hookptr = &priv->lower;
 		NG_HOOK_SET_RCVDATA(hook, ng_ether_rcv_lower);
 	} else if (strcmp(name, NG_ETHER_HOOK_ORPHAN) == 0) {
 		hookptr = &priv->orphan;
 		NG_HOOK_SET_RCVDATA(hook, ng_ether_rcv_lower);
 	} else
 		return (EINVAL);
 
 	/* Check if already connected (shouldn't be, but doesn't hurt) */
 	if (*hookptr != NULL)
 		return (EISCONN);
 
 	/* Disable hardware checksums while 'upper' hook is connected */
 	if (hookptr == &priv->upper)
 		priv->ifp->if_hwassist = 0;
 	NG_HOOK_HI_STACK(hook);
 	/* OK */
 	*hookptr = hook;
 	return (0);
 }
 
 /*
  * Receive an incoming control message.
  */
 static int
 ng_ether_rcvmsg(node_p node, item_p item, hook_p lasthook)
 {
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	struct ng_mesg *resp = NULL;
 	int error = 0;
 	struct ng_mesg *msg;
 
 	NGI_GET_MSG(item, msg);
 	switch (msg->header.typecookie) {
 	case NGM_ETHER_COOKIE:
 		switch (msg->header.cmd) {
 		case NGM_ETHER_GET_IFNAME:
 			NG_MKRESPONSE(resp, msg, IFNAMSIZ, M_NOWAIT);
 			if (resp == NULL) {
 				error = ENOMEM;
 				break;
 			}
 			strlcpy(resp->data, priv->ifp->if_xname, IFNAMSIZ);
 			break;
 		case NGM_ETHER_GET_IFINDEX:
 			NG_MKRESPONSE(resp, msg, sizeof(u_int32_t), M_NOWAIT);
 			if (resp == NULL) {
 				error = ENOMEM;
 				break;
 			}
 			*((u_int32_t *)resp->data) = priv->ifp->if_index;
 			break;
 		case NGM_ETHER_GET_ENADDR:
 			NG_MKRESPONSE(resp, msg, ETHER_ADDR_LEN, M_NOWAIT);
 			if (resp == NULL) {
 				error = ENOMEM;
 				break;
 			}
 			bcopy(IF_LLADDR(priv->ifp),
 			    resp->data, ETHER_ADDR_LEN);
 			break;
 		case NGM_ETHER_SET_ENADDR:
 		    {
 			if (msg->header.arglen != ETHER_ADDR_LEN) {
 				error = EINVAL;
 				break;
 			}
 			error = if_setlladdr(priv->ifp,
 			    (u_char *)msg->data, ETHER_ADDR_LEN);
 			EVENTHANDLER_INVOKE(iflladdr_event, priv->ifp);
 			break;
 		    }
 		case NGM_ETHER_GET_PROMISC:
 			NG_MKRESPONSE(resp, msg, sizeof(u_int32_t), M_NOWAIT);
 			if (resp == NULL) {
 				error = ENOMEM;
 				break;
 			}
 			*((u_int32_t *)resp->data) = priv->promisc;
 			break;
 		case NGM_ETHER_SET_PROMISC:
 		    {
 			u_char want;
 
 			if (msg->header.arglen != sizeof(u_int32_t)) {
 				error = EINVAL;
 				break;
 			}
 			want = !!*((u_int32_t *)msg->data);
 			if (want ^ priv->promisc) {
 				if ((error = ifpromisc(priv->ifp, want)) != 0)
 					break;
 				priv->promisc = want;
 			}
 			break;
 		    }
 		case NGM_ETHER_GET_AUTOSRC:
 			NG_MKRESPONSE(resp, msg, sizeof(u_int32_t), M_NOWAIT);
 			if (resp == NULL) {
 				error = ENOMEM;
 				break;
 			}
 			*((u_int32_t *)resp->data) = priv->autoSrcAddr;
 			break;
 		case NGM_ETHER_SET_AUTOSRC:
 			if (msg->header.arglen != sizeof(u_int32_t)) {
 				error = EINVAL;
 				break;
 			}
 			priv->autoSrcAddr = !!*((u_int32_t *)msg->data);
 			break;
 		case NGM_ETHER_ADD_MULTI:
 		    {
 			struct sockaddr_dl sa_dl;
 			struct ifmultiaddr *ifma;
 
 			if (msg->header.arglen != ETHER_ADDR_LEN) {
 				error = EINVAL;
 				break;
 			}
 			bzero(&sa_dl, sizeof(struct sockaddr_dl));
 			sa_dl.sdl_len = sizeof(struct sockaddr_dl);
 			sa_dl.sdl_family = AF_LINK;
 			sa_dl.sdl_alen = ETHER_ADDR_LEN;
 			bcopy((void *)msg->data, LLADDR(&sa_dl),
 			    ETHER_ADDR_LEN);
 			/*
 			 * Netgraph is only permitted to join groups once
 			 * via the if_addmulti() KPI, because it cannot hold
 			 * struct ifmultiaddr * between calls. It may also
 			 * lose a race while we check if the membership
 			 * already exists.
 			 */
 			if_maddr_rlock(priv->ifp);
 			ifma = if_findmulti(priv->ifp,
 			    (struct sockaddr *)&sa_dl);
 			if_maddr_runlock(priv->ifp);
 			if (ifma != NULL) {
 				error = EADDRINUSE;
 			} else {
 				error = if_addmulti(priv->ifp,
 				    (struct sockaddr *)&sa_dl, &ifma);
 			}
 			break;
 		    }
 		case NGM_ETHER_DEL_MULTI:
 		    {
 			struct sockaddr_dl sa_dl;
 
 			if (msg->header.arglen != ETHER_ADDR_LEN) {
 				error = EINVAL;
 				break;
 			}
 			bzero(&sa_dl, sizeof(struct sockaddr_dl));
 			sa_dl.sdl_len = sizeof(struct sockaddr_dl);
 			sa_dl.sdl_family = AF_LINK;
 			sa_dl.sdl_alen = ETHER_ADDR_LEN;
 			bcopy((void *)msg->data, LLADDR(&sa_dl),
 			    ETHER_ADDR_LEN);
 			error = if_delmulti(priv->ifp,
 			    (struct sockaddr *)&sa_dl);
 			break;
 		    }
 		case NGM_ETHER_DETACH:
 			ng_ether_detach(priv->ifp);
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	NG_RESPOND_MSG(error, node, item, resp);
 	NG_FREE_MSG(msg);
 	return (error);
 }
 
 /*
  * Receive data on a hook.
  * Since we use per-hook recveive methods this should never be called.
  */
 static int
 ng_ether_rcvdata(hook_p hook, item_p item)
 {
 	NG_FREE_ITEM(item);
 
 	panic("%s: weird hook", __func__);
 }
 
 /*
  * Handle an mbuf received on the "lower" or "orphan" hook.
  */
 static int
 ng_ether_rcv_lower(hook_p hook, item_p item)
 {
 	struct mbuf *m;
 	const node_p node = NG_HOOK_NODE(hook);
 	const priv_p priv = NG_NODE_PRIVATE(node);
  	struct ifnet *const ifp = priv->ifp;
 
 	NGI_GET_M(item, m);
 	NG_FREE_ITEM(item);
 
 	/* Check whether interface is ready for packets */
 
 	if (!((ifp->if_flags & IFF_UP) &&
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING))) {
 		NG_FREE_M(m);
 		return (ENETDOWN);
 	}
 
 	/* Make sure header is fully pulled up */
 	if (m->m_pkthdr.len < sizeof(struct ether_header)) {
 		NG_FREE_M(m);
 		return (EINVAL);
 	}
 	if (m->m_len < sizeof(struct ether_header)
 	    && (m = m_pullup(m, sizeof(struct ether_header))) == NULL)
 		return (ENOBUFS);
 
 	/* Drop in the MAC address if desired */
 	if (priv->autoSrcAddr) {
 
 		/* Make the mbuf writable if it's not already */
 		if (!M_WRITABLE(m)
 		    && (m = m_pullup(m, sizeof(struct ether_header))) == NULL)
 			return (ENOBUFS);
 
 		/* Overwrite source MAC address */
 		bcopy(IF_LLADDR(ifp),
 		    mtod(m, struct ether_header *)->ether_shost,
 		    ETHER_ADDR_LEN);
 	}
 
 	/* Send it on its way */
 	return ether_output_frame(ifp, m);
 }
 
 /*
  * Handle an mbuf received on the "upper" hook.
  */
 static int
 ng_ether_rcv_upper(hook_p hook, item_p item)
 {
 	struct mbuf *m;
 	const node_p node = NG_HOOK_NODE(hook);
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	struct ifnet *ifp = priv->ifp;
 
 	NGI_GET_M(item, m);
 	NG_FREE_ITEM(item);
 
 	/* Check length and pull off header */
 	if (m->m_pkthdr.len < sizeof(struct ether_header)) {
 		NG_FREE_M(m);
 		return (EINVAL);
 	}
 	if (m->m_len < sizeof(struct ether_header) &&
 	    (m = m_pullup(m, sizeof(struct ether_header))) == NULL)
 		return (ENOBUFS);
 
 	m->m_pkthdr.rcvif = ifp;
 
 	/* Pass the packet to the bridge, it may come back to us */
 	if (ifp->if_bridge) {
 		BRIDGE_INPUT(ifp, m);
 		if (m == NULL)
 			return (0);
 	}
 
 	/* Route packet back in */
 	ether_demux(ifp, m);
 	return (0);
 }
 
 /*
  * Shutdown node. This resets the node but does not remove it
  * unless the REALLY_DIE flag is set.
  */
 static int
 ng_ether_shutdown(node_p node)
 {
 	const priv_p priv = NG_NODE_PRIVATE(node);
 
 	if (node->nd_flags & NGF_REALLY_DIE) {
 		/*
 		 * WE came here because the ethernet card is being unloaded,
 		 * so stop being persistant.
 		 * Actually undo all the things we did on creation.
 		 * Assume the ifp has already been freed.
 		 */
 		NG_NODE_SET_PRIVATE(node, NULL);
 		free(priv, M_NETGRAPH);		
 		NG_NODE_UNREF(node);	/* free node itself */
 		return (0);
 	}
 	if (priv->promisc) {		/* disable promiscuous mode */
 		(void)ifpromisc(priv->ifp, 0);
 		priv->promisc = 0;
 	}
 	NG_NODE_REVIVE(node);		/* Signal ng_rmnode we are persisant */
 
 	return (0);
 }
 
 /*
  * Hook disconnection.
  */
 static int
 ng_ether_disconnect(hook_p hook)
 {
 	const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
 
 	if (hook == priv->upper) {
 		priv->upper = NULL;
 		if (priv->ifp != NULL)		/* restore h/w csum */
 			priv->ifp->if_hwassist = priv->hwassist;
 	} else if (hook == priv->lower)
 		priv->lower = NULL;
 	else if (hook == priv->orphan)
 		priv->orphan = NULL;
 	else
 		panic("%s: weird hook", __func__);
 	if ((NG_NODE_NUMHOOKS(NG_HOOK_NODE(hook)) == 0)
 	&& (NG_NODE_IS_VALID(NG_HOOK_NODE(hook))))
 		ng_rmnode_self(NG_HOOK_NODE(hook));	/* reset node */
 	return (0);
 }
 
 /******************************************************************
 		    	INITIALIZATION
 ******************************************************************/
 
 /*
  * Handle loading and unloading for this node type.
  */
 static int
 ng_ether_mod_event(module_t mod, int event, void *data)
 {
 	int error = 0;
 
 	switch (event) {
 	case MOD_LOAD:
 
 		/* Register function hooks */
 		if (ng_ether_attach_p != NULL) {
 			error = EEXIST;
 			break;
 		}
 		ng_ether_attach_p = ng_ether_attach;
 		ng_ether_detach_p = ng_ether_detach;
 		ng_ether_output_p = ng_ether_output;
 		ng_ether_input_p = ng_ether_input;
 		ng_ether_input_orphan_p = ng_ether_input_orphan;
 		ng_ether_link_state_p = ng_ether_link_state;
 
 		ng_ether_ifnet_arrival_cookie =
 		    EVENTHANDLER_REGISTER(ifnet_arrival_event,
 		    ng_ether_ifnet_arrival_event, NULL, EVENTHANDLER_PRI_ANY);
 		break;
 
 	case MOD_UNLOAD:
 
 		/*
 		 * Note that the base code won't try to unload us until
 		 * all nodes have been removed, and that can't happen
 		 * until all Ethernet interfaces are removed. In any
 		 * case, we know there are no nodes left if the action
 		 * is MOD_UNLOAD, so there's no need to detach any nodes.
 		 */
 
 		EVENTHANDLER_DEREGISTER(ifnet_arrival_event,
 		    ng_ether_ifnet_arrival_cookie);
 
 		/* Unregister function hooks */
 		ng_ether_attach_p = NULL;
 		ng_ether_detach_p = NULL;
 		ng_ether_output_p = NULL;
 		ng_ether_input_p = NULL;
 		ng_ether_input_orphan_p = NULL;
 		ng_ether_link_state_p = NULL;
 		break;
 
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 	return (error);
 }
 
 static void
 vnet_ng_ether_init(const void *unused)
 {
 	struct ifnet *ifp;
 
 	/* If module load was rejected, don't attach to vnets. */
 	if (ng_ether_attach_p != ng_ether_attach)
 		return;
 
 	/* Create nodes for any already-existing Ethernet interfaces. */
 	IFNET_RLOCK();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (ifp->if_type == IFT_ETHER
 		    || ifp->if_type == IFT_L2VLAN)
 			ng_ether_attach(ifp);
 	}
 	IFNET_RUNLOCK();
 }
 VNET_SYSINIT(vnet_ng_ether_init, SI_SUB_PSEUDO, SI_ORDER_ANY,
     vnet_ng_ether_init, NULL);
Index: user/dchagin/lemul/sys/netinet/cc/cc.c
===================================================================
--- user/dchagin/lemul/sys/netinet/cc/cc.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/cc/cc.c	(revision 274241)
@@ -1,326 +1,327 @@
 /*-
  * Copyright (c) 2007-2008
  *	Swinburne University of Technology, Melbourne, Australia.
  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
  * Copyright (c) 2010 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed at the Centre for Advanced Internet
  * Architectures, Swinburne University of Technology, by Lawrence Stewart and
  * James Healy, made possible in part by a grant from the Cisco University
  * Research Program Fund at Community Foundation Silicon Valley.
  *
  * Portions of this software were developed at the Centre for Advanced
  * Internet Architectures, Swinburne University of Technology, Melbourne,
  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * This software was first released in 2007 by James Healy and Lawrence Stewart
  * whilst working on the NewTCP research project at Swinburne University of
  * Technology's Centre for Advanced Internet Architectures, Melbourne,
  * Australia, which was made possible in part by a grant from the Cisco
  * University Research Program Fund at Community Foundation Silicon Valley.
  * More details are available at:
  *   http://caia.swin.edu.au/urp/newtcp/
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/libkern.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/queue.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 
 #include <netinet/cc.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_var.h>
 
 #include <netinet/cc/cc_module.h>
 
 /*
  * List of available cc algorithms on the current system. First element
  * is used as the system default CC algorithm.
  */
 struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list);
 
 /* Protects the cc_list TAILQ. */
 struct rwlock cc_list_lock;
 
 VNET_DEFINE(struct cc_algo *, default_cc_ptr) = &newreno_cc_algo;
 
 /*
  * Sysctl handler to show and change the default CC algorithm.
  */
 static int
 cc_default_algo(SYSCTL_HANDLER_ARGS)
 {
 	char default_cc[TCP_CA_NAME_MAX];
 	struct cc_algo *funcs;
 	int error;
 
 	/* Get the current default: */
 	CC_LIST_RLOCK();
 	strlcpy(default_cc, CC_DEFAULT()->name, sizeof(default_cc));
 	CC_LIST_RUNLOCK();
 
 	error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req);
 
 	/* Check for error or no change */
 	if (error != 0 || req->newptr == NULL)
 		goto done;
 
 	error = ESRCH;
 
 	/* Find algo with specified name and set it to default. */
 	CC_LIST_RLOCK();
 	STAILQ_FOREACH(funcs, &cc_list, entries) {
 		if (strncmp(default_cc, funcs->name, sizeof(default_cc)))
 			continue;
 		V_default_cc_ptr = funcs;
 		error = 0;
 		break;
 	}
 	CC_LIST_RUNLOCK();
 done:
 	return (error);
 }
 
 /*
  * Sysctl handler to display the list of available CC algorithms.
  */
 static int
 cc_list_available(SYSCTL_HANDLER_ARGS)
 {
 	struct cc_algo *algo;
 	struct sbuf *s;
 	int err, first, nalgos;
 
 	err = nalgos = 0;
 	first = 1;
 
 	CC_LIST_RLOCK();
 	STAILQ_FOREACH(algo, &cc_list, entries) {
 		nalgos++;
 	}
 	CC_LIST_RUNLOCK();
 
 	s = sbuf_new(NULL, NULL, nalgos * TCP_CA_NAME_MAX, SBUF_FIXEDLEN);
 
 	if (s == NULL)
 		return (ENOMEM);
 
 	/*
 	 * It is theoretically possible for the CC list to have grown in size
 	 * since the call to sbuf_new() and therefore for the sbuf to be too
 	 * small. If this were to happen (incredibly unlikely), the sbuf will
 	 * reach an overflow condition, sbuf_printf() will return an error and
 	 * the sysctl will fail gracefully.
 	 */
 	CC_LIST_RLOCK();
 	STAILQ_FOREACH(algo, &cc_list, entries) {
 		err = sbuf_printf(s, first ? "%s" : ", %s", algo->name);
 		if (err) {
 			/* Sbuf overflow condition. */
 			err = EOVERFLOW;
 			break;
 		}
 		first = 0;
 	}
 	CC_LIST_RUNLOCK();
 
 	if (!err) {
 		sbuf_finish(s);
 		err = sysctl_handle_string(oidp, sbuf_data(s), 0, req);
 	}
 
 	sbuf_delete(s);
 	return (err);
 }
 
 /*
  * Reset the default CC algo to NewReno for any netstack which is using the algo
  * that is about to go away as its default.
  */
 static void
 cc_checkreset_default(struct cc_algo *remove_cc)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	CC_LIST_LOCK_ASSERT();
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		if (strncmp(CC_DEFAULT()->name, remove_cc->name,
 		    TCP_CA_NAME_MAX) == 0)
 			V_default_cc_ptr = &newreno_cc_algo;
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Initialise CC subsystem on system boot.
  */
 static void
 cc_init(void)
 {
 	CC_LIST_LOCK_INIT();
 	STAILQ_INIT(&cc_list);
 }
 
 /*
  * Returns non-zero on success, 0 on failure.
  */
 int
 cc_deregister_algo(struct cc_algo *remove_cc)
 {
 	struct cc_algo *funcs, *tmpfuncs;
 	int err;
 
 	err = ENOENT;
 
 	/* Never allow newreno to be deregistered. */
 	if (&newreno_cc_algo == remove_cc)
 		return (EPERM);
 
 	/* Remove algo from cc_list so that new connections can't use it. */
 	CC_LIST_WLOCK();
 	STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
 		if (funcs == remove_cc) {
 			cc_checkreset_default(remove_cc);
 			STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
 			err = 0;
 			break;
 		}
 	}
 	CC_LIST_WUNLOCK();
 
 	if (!err)
 		/*
 		 * XXXLAS:
 		 * - We may need to handle non-zero return values in future.
 		 * - If we add CC framework support for protocols other than
 		 *   TCP, we may want a more generic way to handle this step.
 		 */
 		tcp_ccalgounload(remove_cc);
 
 	return (err);
 }
 
 /*
  * Returns 0 on success, non-zero on failure.
  */
 int
 cc_register_algo(struct cc_algo *add_cc)
 {
 	struct cc_algo *funcs;
 	int err;
 
 	err = 0;
 
 	/*
 	 * Iterate over list of registered CC algorithms and make sure
 	 * we're not trying to add a duplicate.
 	 */
 	CC_LIST_WLOCK();
 	STAILQ_FOREACH(funcs, &cc_list, entries) {
 		if (funcs == add_cc || strncmp(funcs->name, add_cc->name,
 		    TCP_CA_NAME_MAX) == 0)
 			err = EEXIST;
 	}
 
 	if (!err)
 		STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
 
 	CC_LIST_WUNLOCK();
 
 	return (err);
 }
 
 /*
  * Handles kld related events. Returns 0 on success, non-zero on failure.
  */
 int
 cc_modevent(module_t mod, int event_type, void *data)
 {
 	struct cc_algo *algo;
 	int err;
 
 	err = 0;
 	algo = (struct cc_algo *)data;
 
 	switch(event_type) {
 	case MOD_LOAD:
 		if (algo->mod_init != NULL)
 			err = algo->mod_init();
 		if (!err)
 			err = cc_register_algo(algo);
 		break;
 
 	case MOD_QUIESCE:
 	case MOD_SHUTDOWN:
 	case MOD_UNLOAD:
 		err = cc_deregister_algo(algo);
 		if (!err && algo->mod_destroy != NULL)
 			algo->mod_destroy();
 		if (err == ENOENT)
 			err = 0;
 		break;
 
 	default:
 		err = EINVAL;
 		break;
 	}
 
 	return (err);
 }
 
 SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL);
 
 /* Declare sysctl tree and populate it. */
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL,
     "congestion control related settings");
 
-SYSCTL_VNET_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLTYPE_STRING|CTLFLAG_RW,
+SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm,
+    CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW,
     NULL, 0, cc_default_algo, "A", "default congestion control algorithm");
 
 SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD,
     NULL, 0, cc_list_available, "A",
     "list available congestion control algorithms");
Index: user/dchagin/lemul/sys/netinet/cc/cc_cdg.c
===================================================================
--- user/dchagin/lemul/sys/netinet/cc/cc_cdg.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/cc/cc_cdg.c	(revision 274241)
@@ -1,700 +1,700 @@
 /*-
  * Copyright (c) 2009-2013
  * 	Swinburne University of Technology, Melbourne, Australia
  * All rights reserved.
  *
  * This software was developed at the Centre for Advanced Internet
  * Architectures, Swinburne University of Technology, by David Hayes, made
  * possible in part by a gift from The Cisco University Research Program Fund,
  * a corporate advised fund of Silicon Valley Community Foundation. Development
  * and testing were further assisted by a grant from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * CAIA Delay-Gradient (CDG) congestion control algorithm
  *
  * An implemention of the delay-gradient congestion control algorithm proposed
  * in the following paper:
  *
  * D. A. Hayes and G. Armitage, "Revisiting TCP Congestion Control using Delay
  * Gradients", in IFIP Networking, Valencia, Spain, 9-13 May 2011.
  *
  * Developed as part of the NewTCP research project at Swinburne University of
  * Technology's Centre for Advanced Internet Architectures, Melbourne,
  * Australia. More details are available at:
  *   http://caia.swin.edu.au/urp/newtcp/
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/hhook.h>
 #include <sys/kernel.h>
 #include <sys/khelp.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/if.h>
 #include <net/vnet.h>
 
 #include <netinet/cc.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 
 #include <netinet/cc/cc_module.h>
 
 #include <netinet/khelp/h_ertt.h>
 
 #include <vm/uma.h>
 
 #define	CDG_VERSION "0.1"
 
 #define	CAST_PTR_INT(X) (*((int*)(X)))
 
 /* Private delay-gradient induced congestion control signal. */
 #define	CC_CDG_DELAY 0x01000000
 
 /* NewReno window deflation factor on loss (as a percentage). */
 #define	RENO_BETA 50
 
 /* Queue states. */
 #define	CDG_Q_EMPTY	1
 #define	CDG_Q_RISING	2
 #define	CDG_Q_FALLING	3
 #define	CDG_Q_FULL	4
 #define	CDG_Q_UNKNOWN	9999
 
 /* Number of bit shifts used in probexp lookup table. */
 #define	EXP_PREC 15
 
 /* Largest gradient represented in probexp lookup table. */
 #define	MAXGRAD 5
 
 /*
  * Delay Precision Enhance - number of bit shifts used for qtrend related
  * integer arithmetic precision.
  */
 #define	D_P_E 7
 
 struct qdiff_sample {
 	long qdiff;
 	STAILQ_ENTRY(qdiff_sample) qdiff_lnk;
 };
 
 struct cdg {
 	long max_qtrend;
 	long min_qtrend;
 	STAILQ_HEAD(minrtts_head, qdiff_sample) qdiffmin_q;
 	STAILQ_HEAD(maxrtts_head, qdiff_sample) qdiffmax_q;
 	long window_incr;
 	/* rttcount for window increase when in congestion avoidance */
 	long rtt_count;
 	/* maximum measured rtt within an rtt period */
 	int maxrtt_in_rtt;
 	/* maximum measured rtt within prev rtt period */
 	int maxrtt_in_prevrtt;
 	/* minimum measured rtt within an rtt period */
 	int minrtt_in_rtt;
 	/* minimum measured rtt within prev rtt period */
 	int minrtt_in_prevrtt;
 	/* consecutive congestion episode counter */
 	uint32_t consec_cong_cnt;
 	/* when tracking a new reno type loss window */
 	uint32_t shadow_w;
 	/* maximum number of samples in the moving average queue */
 	int sample_q_size;
 	/* number of samples in the moving average queue */
 	int num_samples;
 	/* estimate of the queue state of the path */
 	int queue_state;
 };
 
 /*
  * Lookup table for:
  *   (1 - exp(-x)) << EXP_PREC, where x = [0,MAXGRAD] in 2^-7 increments
  *
  * Note: probexp[0] is set to 10 (not 0) as a safety for very low increase
  * gradients.
  */
 static const int probexp[641] = {
    10,255,508,759,1008,1255,1501,1744,1985,2225,2463,2698,2932,3165,3395,3624,
    3850,4075,4299,4520,4740,4958,5175,5389,5602,5814,6024,6232,6438,6643,6846,
    7048,7248,7447,7644,7839,8033,8226,8417,8606,8794,8981,9166,9350,9532,9713,
    9892,10070,10247,10422,10596,10769,10940,11110,11278,11445,11611,11776,11939,
    12101,12262,12422,12580,12737,12893,13048,13201,13354,13505,13655,13803,13951,
    14097,14243,14387,14530,14672,14813,14952,15091,15229,15365,15500,15635,15768,
    15900,16032,16162,16291,16419,16547,16673,16798,16922,17046,17168,17289,17410,
    17529,17648,17766,17882,17998,18113,18227,18340,18453,18564,18675,18784,18893,
    19001,19108,19215,19320,19425,19529,19632,19734,19835,19936,20036,20135,20233,
    20331,20427,20523,20619,20713,20807,20900,20993,21084,21175,21265,21355,21444,
    21532,21619,21706,21792,21878,21962,22046,22130,22213,22295,22376,22457,22537,
    22617,22696,22774,22852,22929,23006,23082,23157,23232,23306,23380,23453,23525,
    23597,23669,23739,23810,23879,23949,24017,24085,24153,24220,24286,24352,24418,
    24483,24547,24611,24675,24738,24800,24862,24924,24985,25045,25106,25165,25224,
    25283,25341,25399,25456,25513,25570,25626,25681,25737,25791,25846,25899,25953,
    26006,26059,26111,26163,26214,26265,26316,26366,26416,26465,26514,26563,26611,
    26659,26707,26754,26801,26847,26893,26939,26984,27029,27074,27118,27162,27206,
    27249,27292,27335,27377,27419,27460,27502,27543,27583,27624,27664,27703,27743,
    27782,27821,27859,27897,27935,27973,28010,28047,28084,28121,28157,28193,28228,
    28263,28299,28333,28368,28402,28436,28470,28503,28536,28569,28602,28634,28667,
    28699,28730,28762,28793,28824,28854,28885,28915,28945,28975,29004,29034,29063,
    29092,29120,29149,29177,29205,29232,29260,29287,29314,29341,29368,29394,29421,
    29447,29472,29498,29524,29549,29574,29599,29623,29648,29672,29696,29720,29744,
    29767,29791,29814,29837,29860,29882,29905,29927,29949,29971,29993,30014,30036,
    30057,30078,30099,30120,30141,30161,30181,30201,30221,30241,30261,30280,30300,
    30319,30338,30357,30376,30394,30413,30431,30449,30467,30485,30503,30521,30538,
    30555,30573,30590,30607,30624,30640,30657,30673,30690,30706,30722,30738,30753,
    30769,30785,30800,30815,30831,30846,30861,30876,30890,30905,30919,30934,30948,
    30962,30976,30990,31004,31018,31031,31045,31058,31072,31085,31098,31111,31124,
    31137,31149,31162,31174,31187,31199,31211,31223,31235,31247,31259,31271,31283,
    31294,31306,31317,31328,31339,31351,31362,31373,31383,31394,31405,31416,31426,
    31436,31447,31457,31467,31477,31487,31497,31507,31517,31527,31537,31546,31556,
    31565,31574,31584,31593,31602,31611,31620,31629,31638,31647,31655,31664,31673,
    31681,31690,31698,31706,31715,31723,31731,31739,31747,31755,31763,31771,31778,
    31786,31794,31801,31809,31816,31824,31831,31838,31846,31853,31860,31867,31874,
    31881,31888,31895,31902,31908,31915,31922,31928,31935,31941,31948,31954,31960,
    31967,31973,31979,31985,31991,31997,32003,32009,32015,32021,32027,32033,32038,
    32044,32050,32055,32061,32066,32072,32077,32083,32088,32093,32098,32104,32109,
    32114,32119,32124,32129,32134,32139,32144,32149,32154,32158,32163,32168,32173,
    32177,32182,32186,32191,32195,32200,32204,32209,32213,32217,32222,32226,32230,
    32234,32238,32242,32247,32251,32255,32259,32263,32267,32270,32274,32278,32282,
    32286,32290,32293,32297,32301,32304,32308,32311,32315,32318,32322,32325,32329,
    32332,32336,32339,32342,32346,32349,32352,32356,32359,32362,32365,32368,32371,
    32374,32377,32381,32384,32387,32389,32392,32395,32398,32401,32404,32407,32410,
    32412,32415,32418,32421,32423,32426,32429,32431,32434,32437,32439,32442,32444,
    32447,32449,32452,32454,32457,32459,32461,32464,32466,32469,32471,32473,32476,
    32478,32480,32482,32485,32487,32489,32491,32493,32495,32497,32500,32502,32504,
    32506,32508,32510,32512,32514,32516,32518,32520,32522,32524,32526,32527,32529,
    32531,32533,32535,32537,32538,32540,32542,32544,32545,32547};
 
 static uma_zone_t qdiffsample_zone;
 
 static MALLOC_DEFINE(M_CDG, "cdg data",
   "Per connection data required for the CDG congestion control algorithm");
 
 static int ertt_id;
 
 static VNET_DEFINE(uint32_t, cdg_alpha_inc);
 static VNET_DEFINE(uint32_t, cdg_beta_delay);
 static VNET_DEFINE(uint32_t, cdg_beta_loss);
 static VNET_DEFINE(uint32_t, cdg_smoothing_factor);
 static VNET_DEFINE(uint32_t, cdg_exp_backoff_scale);
 static VNET_DEFINE(uint32_t, cdg_consec_cong);
 static VNET_DEFINE(uint32_t, cdg_hold_backoff);
 #define	V_cdg_alpha_inc		VNET(cdg_alpha_inc)
 #define	V_cdg_beta_delay	VNET(cdg_beta_delay)
 #define	V_cdg_beta_loss		VNET(cdg_beta_loss)
 #define	V_cdg_smoothing_factor	VNET(cdg_smoothing_factor)
 #define	V_cdg_exp_backoff_scale	VNET(cdg_exp_backoff_scale)
 #define	V_cdg_consec_cong	VNET(cdg_consec_cong)
 #define	V_cdg_hold_backoff	VNET(cdg_hold_backoff)
 
 /* Function prototypes. */
 static int cdg_mod_init(void);
 static int cdg_mod_destroy(void);
 static void cdg_conn_init(struct cc_var *ccv);
 static int cdg_cb_init(struct cc_var *ccv);
 static void cdg_cb_destroy(struct cc_var *ccv);
 static void cdg_cong_signal(struct cc_var *ccv, uint32_t signal_type);
 static void cdg_ack_received(struct cc_var *ccv, uint16_t ack_type);
 
 struct cc_algo cdg_cc_algo = {
 	.name = "cdg",
 	.mod_init = cdg_mod_init,
 	.ack_received = cdg_ack_received,
 	.cb_destroy = cdg_cb_destroy,
 	.cb_init = cdg_cb_init,
 	.conn_init = cdg_conn_init,
 	.cong_signal = cdg_cong_signal,
 	.mod_destroy = cdg_mod_destroy
 };
 
 /* Vnet created and being initialised. */
 static void
 cdg_init_vnet(const void *unused __unused)
 {
 
 	V_cdg_alpha_inc = 0;
 	V_cdg_beta_delay = 70;
 	V_cdg_beta_loss = 50;
 	V_cdg_smoothing_factor = 8;
 	V_cdg_exp_backoff_scale = 3;
 	V_cdg_consec_cong = 5;
 	V_cdg_hold_backoff = 5;
 }
 
 static int
 cdg_mod_init(void)
 {
 	VNET_ITERATOR_DECL(v);
 
 	ertt_id = khelp_get_id("ertt");
 	if (ertt_id <= 0)
 		return (EINVAL);
 
 	qdiffsample_zone = uma_zcreate("cdg_qdiffsample",
 	    sizeof(struct qdiff_sample), NULL, NULL, NULL, NULL, 0, 0);
 
 	VNET_LIST_RLOCK();
 	VNET_FOREACH(v) {
 		CURVNET_SET(v);
 		cdg_init_vnet(NULL);
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK();
 
 	cdg_cc_algo.post_recovery = newreno_cc_algo.post_recovery;
 	cdg_cc_algo.after_idle = newreno_cc_algo.after_idle;
 
 	return (0);
 }
 
 static int
 cdg_mod_destroy(void)
 {
 
 	uma_zdestroy(qdiffsample_zone);
 	return (0);
 }
 
 static int
 cdg_cb_init(struct cc_var *ccv)
 {
 	struct cdg *cdg_data;
 
 	cdg_data = malloc(sizeof(struct cdg), M_CDG, M_NOWAIT);
 	if (cdg_data == NULL)
 		return (ENOMEM);
 
 	cdg_data->shadow_w = 0;
 	cdg_data->max_qtrend = 0;
 	cdg_data->min_qtrend = 0;
 	cdg_data->queue_state = CDG_Q_UNKNOWN;
 	cdg_data->maxrtt_in_rtt = 0;
 	cdg_data->maxrtt_in_prevrtt = 0;
 	cdg_data->minrtt_in_rtt = INT_MAX;
 	cdg_data->minrtt_in_prevrtt = 0;
 	cdg_data->window_incr = 0;
 	cdg_data->rtt_count = 0;
 	cdg_data->consec_cong_cnt = 0;
 	cdg_data->sample_q_size = V_cdg_smoothing_factor;
 	cdg_data->num_samples = 0;
 	STAILQ_INIT(&cdg_data->qdiffmin_q);
 	STAILQ_INIT(&cdg_data->qdiffmax_q);
 
 	ccv->cc_data = cdg_data;
 
 	return (0);
 }
 
 static void
 cdg_conn_init(struct cc_var *ccv)
 {
 	struct cdg *cdg_data = ccv->cc_data;
 
 	/*
 	 * Initialise the shadow_cwnd in case we are competing with loss based
 	 * flows from the start
 	 */
 	cdg_data->shadow_w = CCV(ccv, snd_cwnd);
 }
 
 static void
 cdg_cb_destroy(struct cc_var *ccv)
 {
 	struct cdg *cdg_data;
 	struct qdiff_sample *qds, *qds_n;
 
 	cdg_data = ccv->cc_data;
 
 	qds = STAILQ_FIRST(&cdg_data->qdiffmin_q);
 	while (qds != NULL) {
 		qds_n = STAILQ_NEXT(qds, qdiff_lnk);
 		uma_zfree(qdiffsample_zone,qds);
 		qds = qds_n;
 	}
 
 	qds = STAILQ_FIRST(&cdg_data->qdiffmax_q);
 	while (qds != NULL) {
 		qds_n = STAILQ_NEXT(qds, qdiff_lnk);
 		uma_zfree(qdiffsample_zone,qds);
 		qds = qds_n;
 	}
 
 	free(ccv->cc_data, M_CDG);
 }
 
 static int
 cdg_beta_handler(SYSCTL_HANDLER_ARGS)
 {
 
 	if (req->newptr != NULL &&
 	    (CAST_PTR_INT(req->newptr) == 0 || CAST_PTR_INT(req->newptr) > 100))
 		return (EINVAL);
 
 	return (sysctl_handle_int(oidp, arg1, arg2, req));
 }
 
 static int
 cdg_exp_backoff_scale_handler(SYSCTL_HANDLER_ARGS)
 {
 
 	if (req->newptr != NULL && CAST_PTR_INT(req->newptr) < 1)
 		return (EINVAL);
 
 	return (sysctl_handle_int(oidp, arg1, arg2, req));
 }
 
 static inline unsigned long
 cdg_window_decrease(struct cc_var *ccv, unsigned long owin, unsigned int beta)
 {
 
 	return ((ulmin(CCV(ccv, snd_wnd), owin) * beta) / 100);
 }
 
 /*
  * Window increase function
  * This window increase function is independent of the initial window size
  * to ensure small window flows are not discriminated against (i.e. fairness).
  * It increases at 1pkt/rtt like Reno for alpha_inc rtts, and then 2pkts/rtt for
  * the next alpha_inc rtts, etc.
  */
 static void
 cdg_window_increase(struct cc_var *ccv, int new_measurement)
 {
 	struct cdg *cdg_data;
 	int incr, s_w_incr;
 
 	cdg_data = ccv->cc_data;
 	incr = s_w_incr = 0;
 
 	if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)) {
 		/* Slow start. */
 		incr = CCV(ccv, t_maxseg);
 		s_w_incr = incr;
 		cdg_data->window_incr = cdg_data->rtt_count = 0;
 	} else {
 		/* Congestion avoidance. */
 		if (new_measurement) {
 			s_w_incr = CCV(ccv, t_maxseg);
 			if (V_cdg_alpha_inc == 0) {
 				incr = CCV(ccv, t_maxseg);
 			} else {
 				if (++cdg_data->rtt_count >= V_cdg_alpha_inc) {
 					cdg_data->window_incr++;
 					cdg_data->rtt_count = 0;
 				}
 				incr = CCV(ccv, t_maxseg) *
 				    cdg_data->window_incr;
 			}
 		}
 	}
 
 	if (cdg_data->shadow_w > 0)
 		cdg_data->shadow_w = ulmin(cdg_data->shadow_w + s_w_incr,
 		    TCP_MAXWIN << CCV(ccv, snd_scale));
 
 	CCV(ccv, snd_cwnd) = ulmin(CCV(ccv, snd_cwnd) + incr,
 	    TCP_MAXWIN << CCV(ccv, snd_scale));
 }
 
 static void
 cdg_cong_signal(struct cc_var *ccv, uint32_t signal_type)
 {
 	struct cdg *cdg_data = ccv->cc_data;
 
 	switch(signal_type) {
 	case CC_CDG_DELAY:
 		CCV(ccv, snd_ssthresh) = cdg_window_decrease(ccv,
 		    CCV(ccv, snd_cwnd), V_cdg_beta_delay);
 		CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
 		CCV(ccv, snd_recover) = CCV(ccv, snd_max);
 		cdg_data->window_incr = cdg_data->rtt_count = 0;
 		ENTER_CONGRECOVERY(CCV(ccv, t_flags));
 		break;
 	case CC_NDUPACK:
 		/*
 		 * If already responding to congestion OR we have guessed no
 		 * queue in the path is full.
 		 */
 		if (IN_CONGRECOVERY(CCV(ccv, t_flags)) ||
 		    cdg_data->queue_state < CDG_Q_FULL) {
 			CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd);
 			CCV(ccv, snd_recover) = CCV(ccv, snd_max);
 		} else {
 			/*
 			 * Loss is likely to be congestion related. We have
 			 * inferred a queue full state, so have shadow window
 			 * react to loss as NewReno would.
 			 */
 			if (cdg_data->shadow_w > 0)
 				cdg_data->shadow_w = cdg_window_decrease(ccv,
 				    cdg_data->shadow_w, RENO_BETA);
 
 			CCV(ccv, snd_ssthresh) = ulmax(cdg_data->shadow_w,
 			    cdg_window_decrease(ccv, CCV(ccv, snd_cwnd),
 			    V_cdg_beta_loss));
 
 			cdg_data->window_incr = cdg_data->rtt_count = 0;
 		}
 		ENTER_RECOVERY(CCV(ccv, t_flags));
 		break;
 	default:
 		newreno_cc_algo.cong_signal(ccv, signal_type);
 		break;
 	}
 }
 
 /*
  * Using a negative exponential probabilistic backoff so that sources with
  * varying RTTs which share the same link will, on average, have the same
  * probability of backoff over time.
  *
  * Prob_backoff = 1 - exp(-qtrend / V_cdg_exp_backoff_scale), where
  * V_cdg_exp_backoff_scale is the average qtrend for the exponential backoff.
  */
 static inline int
 prob_backoff(long qtrend)
 {
 	int backoff, idx, p;
 
 	backoff = (qtrend > ((MAXGRAD * V_cdg_exp_backoff_scale) << D_P_E));
 
 	if (!backoff) {
 		if (V_cdg_exp_backoff_scale > 1)
 			idx = (qtrend + V_cdg_exp_backoff_scale / 2) /
 			    V_cdg_exp_backoff_scale;
 		else
 			idx = qtrend;
 
 		/* Backoff probability proportional to rate of queue growth. */
 		p = (INT_MAX / (1 << EXP_PREC)) * probexp[idx];
 		backoff = (random() < p);
 	}
 
 	return (backoff);
 }
 
 static inline void
 calc_moving_average(struct cdg *cdg_data, long qdiff_max, long qdiff_min)
 {
 	struct qdiff_sample *qds;
 
 	++cdg_data->num_samples;
 	if (cdg_data->num_samples > cdg_data->sample_q_size) {
 		/* Minimum RTT. */
 		qds = STAILQ_FIRST(&cdg_data->qdiffmin_q);
 		cdg_data->min_qtrend =  cdg_data->min_qtrend +
 		    (qdiff_min - qds->qdiff) / cdg_data->sample_q_size;
 		STAILQ_REMOVE_HEAD(&cdg_data->qdiffmin_q, qdiff_lnk);
 		qds->qdiff = qdiff_min;
 		STAILQ_INSERT_TAIL(&cdg_data->qdiffmin_q, qds, qdiff_lnk);
 
 		/* Maximum RTT. */
 		qds = STAILQ_FIRST(&cdg_data->qdiffmax_q);
 		cdg_data->max_qtrend =  cdg_data->max_qtrend +
 		    (qdiff_max - qds->qdiff) / cdg_data->sample_q_size;
 		STAILQ_REMOVE_HEAD(&cdg_data->qdiffmax_q, qdiff_lnk);
 		qds->qdiff = qdiff_max;
 		STAILQ_INSERT_TAIL(&cdg_data->qdiffmax_q, qds, qdiff_lnk);
 		--cdg_data->num_samples;
 	} else {
 		qds = uma_zalloc(qdiffsample_zone, M_NOWAIT);
 		if (qds != NULL) {
 			cdg_data->min_qtrend = cdg_data->min_qtrend +
 			    qdiff_min / cdg_data->sample_q_size;
 			qds->qdiff = qdiff_min;
 			STAILQ_INSERT_TAIL(&cdg_data->qdiffmin_q, qds,
 			    qdiff_lnk);
 		}
 
 		qds = uma_zalloc(qdiffsample_zone, M_NOWAIT);
 		if (qds) {
 			cdg_data->max_qtrend = cdg_data->max_qtrend +
 			    qdiff_max / cdg_data->sample_q_size;
 			qds->qdiff = qdiff_max;
 			STAILQ_INSERT_TAIL(&cdg_data->qdiffmax_q, qds,
 			    qdiff_lnk);
 		}
 	}
 }
 
 static void
 cdg_ack_received(struct cc_var *ccv, uint16_t ack_type)
 {
 	struct cdg *cdg_data;
 	struct ertt *e_t;
 	long qdiff_max, qdiff_min;
 	int congestion, new_measurement, slowstart;
 
 	cdg_data = ccv->cc_data;
 	e_t = (struct ertt *)khelp_get_osd(CCV(ccv, osd), ertt_id);
 	new_measurement = e_t->flags & ERTT_NEW_MEASUREMENT;
 	congestion = 0;
 	cdg_data->maxrtt_in_rtt = imax(e_t->rtt, cdg_data->maxrtt_in_rtt);
 	cdg_data->minrtt_in_rtt = imin(e_t->rtt, cdg_data->minrtt_in_rtt);
 
 	if (new_measurement) {
 		slowstart = (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh));
 		/*
 		 * Update smoothed gradient measurements. Since we are only
 		 * using one measurement per RTT, use max or min rtt_in_rtt.
 		 * This is also less noisy than a sample RTT measurement. Max
 		 * RTT measurements can have trouble due to OS issues.
 		 */
 		if (cdg_data->maxrtt_in_prevrtt) {
 			qdiff_max = ((long)(cdg_data->maxrtt_in_rtt -
 			    cdg_data->maxrtt_in_prevrtt) << D_P_E );
 			qdiff_min = ((long)(cdg_data->minrtt_in_rtt -
 			    cdg_data->minrtt_in_prevrtt) << D_P_E );
 
 			calc_moving_average(cdg_data, qdiff_max, qdiff_min);
 
 			/* Probabilistic backoff with respect to gradient. */
 			if (slowstart && qdiff_min > 0)
 				congestion = prob_backoff(qdiff_min);
 			else if (cdg_data->min_qtrend > 0)
 				congestion = prob_backoff(cdg_data->min_qtrend);
 			else if (slowstart && qdiff_max > 0)
 				congestion = prob_backoff(qdiff_max);
 			else if (cdg_data->max_qtrend > 0)
 				congestion = prob_backoff(cdg_data->max_qtrend);
 			
 			/* Update estimate of queue state. */
 			if (cdg_data->min_qtrend > 0 &&
 			    cdg_data->max_qtrend <= 0) {
 				cdg_data->queue_state = CDG_Q_FULL;
 			} else if (cdg_data->min_qtrend >= 0 &&
 			    cdg_data->max_qtrend < 0) {
 				cdg_data->queue_state = CDG_Q_EMPTY;
 				cdg_data->shadow_w = 0;
 			} else if (cdg_data->min_qtrend > 0 &&
 			    cdg_data->max_qtrend > 0) {
 				cdg_data->queue_state = CDG_Q_RISING;
 			} else if (cdg_data->min_qtrend < 0 &&
 			    cdg_data->max_qtrend < 0) {
 				cdg_data->queue_state = CDG_Q_FALLING;
 			}
 
 			if (cdg_data->min_qtrend < 0 ||
 			    cdg_data->max_qtrend < 0)
 				cdg_data->consec_cong_cnt = 0;
 		}
 
 		cdg_data->minrtt_in_prevrtt = cdg_data->minrtt_in_rtt;
 		cdg_data->minrtt_in_rtt = INT_MAX;
 		cdg_data->maxrtt_in_prevrtt = cdg_data->maxrtt_in_rtt;
 		cdg_data->maxrtt_in_rtt = 0;
 		e_t->flags &= ~ERTT_NEW_MEASUREMENT;
 	}
 
 	if (congestion) {
 		cdg_data->consec_cong_cnt++;
 		if (!IN_RECOVERY(CCV(ccv, t_flags))) {
 			if (cdg_data->consec_cong_cnt <= V_cdg_consec_cong)
 				cdg_cong_signal(ccv, CC_CDG_DELAY);
 			else
 				/*
 				 * We have been backing off but the queue is not
 				 * falling. Assume we are competing with
 				 * loss-based flows and don't back off for the
 				 * next V_cdg_hold_backoff RTT periods.
 				 */
 				if (cdg_data->consec_cong_cnt >=
 				    V_cdg_consec_cong + V_cdg_hold_backoff)
 					cdg_data->consec_cong_cnt = 0;
 
 			/* Won't see effect until 2nd RTT. */
 			cdg_data->maxrtt_in_prevrtt = 0;
 			/*
 			 * Resync shadow window in case we are competing with a
 			 * loss based flow
 			 */
 			cdg_data->shadow_w = ulmax(CCV(ccv, snd_cwnd),
 			    cdg_data->shadow_w);
 		}
 	} else if (ack_type == CC_ACK)
 		cdg_window_increase(ccv, new_measurement);
 }
 
 /* When a vnet is created and being initialised, init the per-stack CDG vars. */
 VNET_SYSINIT(cdg_init_vnet, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
     cdg_init_vnet, NULL);
 
 SYSCTL_DECL(_net_inet_tcp_cc_cdg);
 SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, cdg, CTLFLAG_RW, NULL,
     "CAIA delay-gradient congestion control related settings");
 
 SYSCTL_STRING(_net_inet_tcp_cc_cdg, OID_AUTO, version,
     CTLFLAG_RD, CDG_VERSION, sizeof(CDG_VERSION) - 1,
     "Current algorithm/implementation version number");
 
-SYSCTL_VNET_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, alpha_inc,
-    CTLFLAG_RW, &VNET_NAME(cdg_alpha_inc), 0,
+SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, alpha_inc,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_alpha_inc), 0,
     "Increment the window increase factor alpha by 1 MSS segment every "
     "alpha_inc RTTs during congestion avoidance mode.");
 
-SYSCTL_VNET_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_delay,
-    CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(cdg_beta_delay), 70,
+SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_delay,
+    CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, &VNET_NAME(cdg_beta_delay), 70,
     &cdg_beta_handler, "IU",
     "Delay-based window decrease factor as a percentage "
     "(on delay-based backoff, w = w * beta_delay / 100)");
 
-SYSCTL_VNET_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_loss,
-    CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(cdg_beta_loss), 50,
+SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_loss,
+    CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, &VNET_NAME(cdg_beta_loss), 50,
     &cdg_beta_handler, "IU",
     "Loss-based window decrease factor as a percentage "
     "(on loss-based backoff, w = w * beta_loss / 100)");
 
-SYSCTL_VNET_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, exp_backoff_scale,
-    CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(cdg_exp_backoff_scale), 2,
-    &cdg_exp_backoff_scale_handler, "IU",
+SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, exp_backoff_scale,
+    CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
+    &VNET_NAME(cdg_exp_backoff_scale), 2, &cdg_exp_backoff_scale_handler, "IU",
     "Scaling parameter for the probabilistic exponential backoff");
 
-SYSCTL_VNET_UINT(_net_inet_tcp_cc_cdg,  OID_AUTO, smoothing_factor,
-    CTLFLAG_RW, &VNET_NAME(cdg_smoothing_factor), 8,
+SYSCTL_UINT(_net_inet_tcp_cc_cdg,  OID_AUTO, smoothing_factor,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_smoothing_factor), 8,
     "Number of samples used for moving average smoothing (0 = no smoothing)");
 
-SYSCTL_VNET_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_consec_cong,
-    CTLFLAG_RW, &VNET_NAME(cdg_consec_cong), 5,
+SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_consec_cong,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_consec_cong), 5,
     "Number of consecutive delay-gradient based congestion episodes which will "
     "trigger loss based CC compatibility");
 
-SYSCTL_VNET_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_hold_backoff,
-    CTLFLAG_RW, &VNET_NAME(cdg_hold_backoff), 5,
+SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_hold_backoff,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_hold_backoff), 5,
     "Number of consecutive delay-gradient based congestion episodes to hold "
     "the window backoff for loss based CC compatibility");
 
 DECLARE_CC_MODULE(cdg, &cdg_cc_algo);
 
 MODULE_DEPEND(cdg, ertt, 1, 1, 1);
Index: user/dchagin/lemul/sys/netinet/cc/cc_chd.c
===================================================================
--- user/dchagin/lemul/sys/netinet/cc/cc_chd.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/cc/cc_chd.c	(revision 274241)
@@ -1,496 +1,499 @@
 /*-
  * Copyright (c) 2009-2010
  *	Swinburne University of Technology, Melbourne, Australia
  * Copyright (c) 2010-2011 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed at the Centre for Advanced Internet
  * Architectures, Swinburne University of Technology, by David Hayes and
  * Lawrence Stewart, made possible in part by a grant from the Cisco University
  * Research Program Fund at Community Foundation Silicon Valley.
  *
  * Portions of this software were developed at the Centre for Advanced Internet
  * Architectures, Swinburne University of Technology, Melbourne, Australia by
  * David Hayes under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * An implementation of the CAIA-Hamilton delay based congestion control
  * algorithm, based on "Improved coexistence and loss tolerance for delay based
  * TCP congestion control" by D. A. Hayes and G. Armitage., in 35th Annual IEEE
  * Conference on Local Computer Networks (LCN 2010), Denver, Colorado, USA,
  * 11-14 October 2010.
  *
  * Originally released as part of the NewTCP research project at Swinburne
  * University of Technology's Centre for Advanced Internet Architectures,
  * Melbourne, Australia, which was made possible in part by a grant from the
  * Cisco University Research Program Fund at Community Foundation Silicon
  * Valley. More details are available at:
  *   http://caia.swin.edu.au/urp/newtcp/
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/khelp.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/if.h>
 #include <net/vnet.h>
 
 #include <netinet/cc.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 
 #include <netinet/cc/cc_module.h>
 
 #include <netinet/khelp/h_ertt.h>
 
 #define	CAST_PTR_INT(X)	(*((int*)(X)))
 
 /*
  * Private signal type for rate based congestion signal.
  * See <netinet/cc.h> for appropriate bit-range to use for private signals.
  */
 #define	CC_CHD_DELAY	0x02000000
 
 /* Largest possible number returned by random(). */
 #define	RANDOM_MAX	INT_MAX
 
 static void	chd_ack_received(struct cc_var *ccv, uint16_t ack_type);
 static void	chd_cb_destroy(struct cc_var *ccv);
 static int	chd_cb_init(struct cc_var *ccv);
 static void	chd_cong_signal(struct cc_var *ccv, uint32_t signal_type);
 static void	chd_conn_init(struct cc_var *ccv);
 static int	chd_mod_init(void);
 
 struct chd {
 	/*
 	 * Shadow window - keeps track of what the NewReno congestion window
 	 * would have been if delay-based cwnd backoffs had not been made. This
 	 * functionality aids coexistence with loss-based TCP flows which may be
 	 * sharing links along the path.
 	 */
 	unsigned long shadow_w;
 	/*
 	 * Loss-based TCP compatibility flag - When set, it turns on the shadow
 	 * window functionality.
 	 */
 	int loss_compete;
 	 /* The maximum round trip time seen within a measured rtt period. */
 	int maxrtt_in_rtt;
 	/* The previous qdly that caused cwnd to backoff. */
 	int prev_backoff_qdly;
 };
 
 static int ertt_id;
 
 static VNET_DEFINE(uint32_t, chd_qmin) = 5;
 static VNET_DEFINE(uint32_t, chd_pmax) = 50;
 static VNET_DEFINE(uint32_t, chd_loss_fair) = 1;
 static VNET_DEFINE(uint32_t, chd_use_max) = 1;
 static VNET_DEFINE(uint32_t, chd_qthresh) = 20;
 #define	V_chd_qthresh	VNET(chd_qthresh)
 #define	V_chd_qmin	VNET(chd_qmin)
 #define	V_chd_pmax	VNET(chd_pmax)
 #define	V_chd_loss_fair	VNET(chd_loss_fair)
 #define	V_chd_use_max	VNET(chd_use_max)
 
 static MALLOC_DEFINE(M_CHD, "chd data",
     "Per connection data required for the CHD congestion control algorithm");
 
 struct cc_algo chd_cc_algo = {
 	.name = "chd",
 	.ack_received = chd_ack_received,
 	.cb_destroy = chd_cb_destroy,
 	.cb_init = chd_cb_init,
 	.cong_signal = chd_cong_signal,
 	.conn_init = chd_conn_init,
 	.mod_init = chd_mod_init
 };
 
 static __inline void
 chd_window_decrease(struct cc_var *ccv)
 {
 	unsigned long win;
 
 	win = min(CCV(ccv, snd_wnd), CCV(ccv, snd_cwnd)) / CCV(ccv, t_maxseg);
 	win -= max((win / 2), 1);
 	CCV(ccv, snd_ssthresh) = max(win, 2) * CCV(ccv, t_maxseg);
 }
 
 /*
  * Probabilistic backoff function. Returns 1 if we should backoff or 0
  * otherwise. The calculation of p is similar to the calculation of p in cc_hd.
  */
 static __inline int
 should_backoff(int qdly, int maxqdly, struct chd *chd_data)
 {
 	unsigned long p, rand;
 
 	rand = random();
 
 	if (qdly < V_chd_qthresh) {
 		chd_data->loss_compete = 0;
 		p = (((RANDOM_MAX / 100) * V_chd_pmax) /
 		    (V_chd_qthresh - V_chd_qmin)) *
 		    (qdly - V_chd_qmin);
 	} else {
 		if (qdly > V_chd_qthresh) {
 			p = (((RANDOM_MAX / 100) * V_chd_pmax) /
 			    (maxqdly - V_chd_qthresh)) *
 			    (maxqdly - qdly);
 			if (V_chd_loss_fair && rand < p)
 				chd_data->loss_compete = 1;
 		} else {
 			p = (RANDOM_MAX / 100) * V_chd_pmax;
 			chd_data->loss_compete = 0;
 		}
 	}
 
 	return (rand < p);
 }
 
 static __inline void
 chd_window_increase(struct cc_var *ccv, int new_measurement)
 {
 	struct chd *chd_data;
 	int incr;
 
 	chd_data = ccv->cc_data;
 	incr = 0;
 
 	if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)) {
 		/* Adapted from NewReno slow start. */
 		if (V_tcp_do_rfc3465) {
 			/* In slow-start with ABC enabled. */
 			if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) {
 				/* Not due to RTO. */
 				incr = min(ccv->bytes_this_ack,
 				    V_tcp_abc_l_var * CCV(ccv, t_maxseg));
 			} else {
 				/* Due to RTO. */
 				incr = min(ccv->bytes_this_ack,
 				    CCV(ccv, t_maxseg));
 			}
 		} else
 			incr = CCV(ccv, t_maxseg);
 
 	} else { /* Congestion avoidance. */
 		if (V_tcp_do_rfc3465) {
 			if (ccv->flags & CCF_ABC_SENTAWND) {
 				ccv->flags &= ~CCF_ABC_SENTAWND;
 				incr = CCV(ccv, t_maxseg);
 			}
 		} else if (new_measurement)
 			incr = CCV(ccv, t_maxseg);
 	}
 
 	if (chd_data->shadow_w > 0) {
 		/* Track NewReno window. */
 		chd_data->shadow_w = min(chd_data->shadow_w + incr,
 		    TCP_MAXWIN << CCV(ccv, snd_scale));
 	}
 
 	CCV(ccv,snd_cwnd) = min(CCV(ccv, snd_cwnd) + incr,
 	    TCP_MAXWIN << CCV(ccv, snd_scale));
 }
 
 /*
  * All ACK signals are used for timing measurements to determine delay-based
  * congestion. However, window increases are only performed when
  * ack_type == CC_ACK.
  */
 static void
 chd_ack_received(struct cc_var *ccv, uint16_t ack_type)
 {
 	struct chd *chd_data;
 	struct ertt *e_t;
 	int backoff, new_measurement, qdly, rtt;
 
 	e_t = khelp_get_osd(CCV(ccv, osd), ertt_id);
 	chd_data = ccv->cc_data;
 	new_measurement = e_t->flags & ERTT_NEW_MEASUREMENT;
 	backoff = qdly = 0;
 
 	chd_data->maxrtt_in_rtt = imax(e_t->rtt, chd_data->maxrtt_in_rtt);
 
 	if (new_measurement) {
 		/*
 		 * There is a new per RTT measurement, so check to see if there
 		 * is delay based congestion.
 		 */
 		rtt = V_chd_use_max ? chd_data->maxrtt_in_rtt : e_t->rtt;
 		chd_data->maxrtt_in_rtt = 0;
 
 		if (rtt && e_t->minrtt && !IN_RECOVERY(CCV(ccv, t_flags))) {
 			qdly = rtt - e_t->minrtt;
 			if (qdly > V_chd_qmin) {
 				/*
 				 * Probabilistic delay based congestion
 				 * indication.
 				 */
 				backoff = should_backoff(qdly,
 				    e_t->maxrtt - e_t->minrtt, chd_data);
 			} else
 				chd_data->loss_compete = 0;
 		}
 		/* Reset per RTT measurement flag to start a new measurement. */
 		e_t->flags &= ~ERTT_NEW_MEASUREMENT;
 	}
 
 	if (backoff) {
 		/*
 		 * Update shadow_w before delay based backoff.
 		 */
 		if (chd_data->loss_compete ||
 		    qdly > chd_data->prev_backoff_qdly) {
 			/*
 			 * Delay is higher than when we backed off previously,
 			 * so it is possible that this flow is competing with
 			 * loss based flows.
 			 */
 			chd_data->shadow_w = max(CCV(ccv, snd_cwnd),
 			    chd_data->shadow_w);
 		} else {
 			/*
 			 * Reset shadow_w, as it is probable that this flow is
 			 * not competing with loss based flows at the moment.
 			 */
 			chd_data->shadow_w = 0;
 		}
 
 		chd_data->prev_backoff_qdly = qdly;
 		/*
 		 * Send delay-based congestion signal to the congestion signal
 		 * handler.
 		 */
 		chd_cong_signal(ccv, CC_CHD_DELAY);
 
 	} else if (ack_type == CC_ACK)
 		chd_window_increase(ccv, new_measurement);
 }
 
 static void
 chd_cb_destroy(struct cc_var *ccv)
 {
 
 	if (ccv->cc_data != NULL)
 		free(ccv->cc_data, M_CHD);
 }
 
 static int
 chd_cb_init(struct cc_var *ccv)
 {
 	struct chd *chd_data;
 
 	chd_data = malloc(sizeof(struct chd), M_CHD, M_NOWAIT);
 	if (chd_data == NULL)
 		return (ENOMEM);
 
 	chd_data->shadow_w = 0;
 	ccv->cc_data = chd_data;
 
 	return (0);
 }
 
 static void
 chd_cong_signal(struct cc_var *ccv, uint32_t signal_type)
 {
 	struct ertt *e_t;
 	struct chd *chd_data;
 	int qdly;
 
 	e_t = khelp_get_osd(CCV(ccv, osd), ertt_id);
 	chd_data = ccv->cc_data;
 	qdly = imax(e_t->rtt, chd_data->maxrtt_in_rtt) - e_t->minrtt;
 
 	switch(signal_type) {
 	case CC_CHD_DELAY:
 		chd_window_decrease(ccv); /* Set new ssthresh. */
 		CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
 		CCV(ccv, snd_recover) = CCV(ccv, snd_max);
 		ENTER_CONGRECOVERY(CCV(ccv, t_flags));
 		break;
 
 	case CC_NDUPACK: /* Packet loss. */
 		/*
 		 * Only react to loss as a congestion signal if qdly >
 		 * V_chd_qthresh.  If qdly is less than qthresh, presume that
 		 * this is a non congestion related loss. If qdly is greater
 		 * than qthresh, assume that we are competing with loss based
 		 * tcp flows and restore window from any unnecessary backoffs,
 		 * before the decrease.
 		 */
 		if (!IN_RECOVERY(CCV(ccv, t_flags)) && qdly > V_chd_qthresh) {
 			if (chd_data->loss_compete) {
 				CCV(ccv, snd_cwnd) = max(CCV(ccv, snd_cwnd),
 				    chd_data->shadow_w);
 			}
 			chd_window_decrease(ccv);
 		} else {
 			 /*
 			  * This loss isn't congestion related, or already
 			  * recovering from congestion.
 			  */
 			CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd);
 			CCV(ccv, snd_recover) = CCV(ccv, snd_max);
 		}
 
 		if (chd_data->shadow_w > 0) {
 			chd_data->shadow_w = max(chd_data->shadow_w /
 			    CCV(ccv, t_maxseg) / 2, 2) * CCV(ccv, t_maxseg);
 		}
 		ENTER_FASTRECOVERY(CCV(ccv, t_flags));
 		break;
 
 	default:
 		newreno_cc_algo.cong_signal(ccv, signal_type);
 	}
 }
 
 static void
 chd_conn_init(struct cc_var *ccv)
 {
 	struct chd *chd_data;
 
 	chd_data = ccv->cc_data;
 	chd_data->prev_backoff_qdly = 0;
 	chd_data->maxrtt_in_rtt = 0;
 	chd_data->loss_compete = 0;
 	/*
 	 * Initialise the shadow_cwnd to be equal to snd_cwnd in case we are
 	 * competing with loss based flows from the start.
 	 */
 	chd_data->shadow_w = CCV(ccv, snd_cwnd);
 }
 
 static int
 chd_mod_init(void)
 {
 
 	ertt_id = khelp_get_id("ertt");
 	if (ertt_id <= 0) {
 		printf("%s: h_ertt module not found\n", __func__);
 		return (ENOENT);
 	}
 
 	chd_cc_algo.after_idle = newreno_cc_algo.after_idle;
 	chd_cc_algo.post_recovery = newreno_cc_algo.post_recovery;
 
 	return (0);
 }
 
 static int
 chd_loss_fair_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	uint32_t new;
 
 	new = V_chd_loss_fair;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr != NULL) {
 		if (CAST_PTR_INT(req->newptr) > 1)
 			error = EINVAL;
 		else
 			V_chd_loss_fair = new;
 	}
 
 	return (error);
 }
 
 static int
 chd_pmax_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	uint32_t new;
 
 	new = V_chd_pmax;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr != NULL) {
 		if (CAST_PTR_INT(req->newptr) == 0 ||
 		    CAST_PTR_INT(req->newptr) > 100)
 			error = EINVAL;
 		else
 			V_chd_pmax = new;
 	}
 
 	return (error);
 }
 
 static int
 chd_qthresh_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	uint32_t new;
 
 	new = V_chd_qthresh;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr != NULL) {
 		if (CAST_PTR_INT(req->newptr) <= V_chd_qmin)
 			error = EINVAL;
 		else
 			V_chd_qthresh = new;
 	}
 
 	return (error);
 }
 
 SYSCTL_DECL(_net_inet_tcp_cc_chd);
 SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, chd, CTLFLAG_RW, NULL,
     "CAIA Hamilton delay-based congestion control related settings");
 
-SYSCTL_VNET_PROC(_net_inet_tcp_cc_chd, OID_AUTO, loss_fair,
-    CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(chd_loss_fair), 1, &chd_loss_fair_handler,
+SYSCTL_PROC(_net_inet_tcp_cc_chd, OID_AUTO, loss_fair,
+    CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
+    &VNET_NAME(chd_loss_fair), 1, &chd_loss_fair_handler,
     "IU", "Flag to enable shadow window functionality.");
 
-SYSCTL_VNET_PROC(_net_inet_tcp_cc_chd, OID_AUTO, pmax,
-    CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(chd_pmax), 5, &chd_pmax_handler,
+SYSCTL_PROC(_net_inet_tcp_cc_chd, OID_AUTO, pmax,
+    CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
+    &VNET_NAME(chd_pmax), 5, &chd_pmax_handler,
     "IU", "Per RTT maximum backoff probability as a percentage");
 
-SYSCTL_VNET_PROC(_net_inet_tcp_cc_chd, OID_AUTO, queue_threshold,
-    CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(chd_qthresh), 20, &chd_qthresh_handler,
+SYSCTL_PROC(_net_inet_tcp_cc_chd, OID_AUTO, queue_threshold,
+    CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
+    &VNET_NAME(chd_qthresh), 20, &chd_qthresh_handler,
     "IU", "Queueing congestion threshold in ticks");
 
-SYSCTL_VNET_UINT(_net_inet_tcp_cc_chd, OID_AUTO, queue_min,
-    CTLFLAG_RW, &VNET_NAME(chd_qmin), 5,
+SYSCTL_UINT(_net_inet_tcp_cc_chd, OID_AUTO, queue_min,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(chd_qmin), 5,
     "Minimum queueing delay threshold in ticks");
 
-SYSCTL_VNET_UINT(_net_inet_tcp_cc_chd,  OID_AUTO, use_max,
-    CTLFLAG_RW, &VNET_NAME(chd_use_max), 1,
+SYSCTL_UINT(_net_inet_tcp_cc_chd,  OID_AUTO, use_max,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(chd_use_max), 1,
     "Use the maximum RTT seen within the measurement period (RTT) "
     "as the basic delay measurement for the algorithm.");
 
 DECLARE_CC_MODULE(chd, &chd_cc_algo);
 MODULE_DEPEND(chd, ertt, 1, 1, 1);
Index: user/dchagin/lemul/sys/netinet/cc/cc_hd.c
===================================================================
--- user/dchagin/lemul/sys/netinet/cc/cc_hd.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/cc/cc_hd.c	(revision 274241)
@@ -1,254 +1,255 @@
 /*-
  * Copyright (c) 2009-2010
  *	Swinburne University of Technology, Melbourne, Australia
  * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
  * Copyright (c) 2010-2011 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed at the Centre for Advanced Internet
  * Architectures, Swinburne University of Technology, by David Hayes and
  * Lawrence Stewart, made possible in part by a grant from the Cisco University
  * Research Program Fund at Community Foundation Silicon Valley.
  *
  * Portions of this software were developed at the Centre for Advanced Internet
  * Architectures, Swinburne University of Technology, Melbourne, Australia by
  * David Hayes under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * An implementation of the Hamilton Institute's delay-based congestion control
  * algorithm for FreeBSD, based on "A strategy for fair coexistence of loss and
  * delay-based congestion control algorithms," by L. Budzisz, R. Stanojevic, R.
  * Shorten, and F. Baker, IEEE Commun. Lett., vol. 13, no. 7, pp. 555--557, Jul.
  * 2009.
  *
  * Originally released as part of the NewTCP research project at Swinburne
  * University of Technology's Centre for Advanced Internet Architectures,
  * Melbourne, Australia, which was made possible in part by a grant from the
  * Cisco University Research Program Fund at Community Foundation Silicon
  * Valley. More details are available at:
  *   http://caia.swin.edu.au/urp/newtcp/
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/khelp.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/if.h>
 #include <net/vnet.h>
 
 #include <netinet/cc.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 
 #include <netinet/cc/cc_module.h>
 
 #include <netinet/khelp/h_ertt.h>
 
 #define	CAST_PTR_INT(X)	(*((int*)(X)))
 
 /* Largest possible number returned by random(). */
 #define	RANDOM_MAX	INT_MAX
 
 static void	hd_ack_received(struct cc_var *ccv, uint16_t ack_type);
 static int	hd_mod_init(void);
 
 static int ertt_id;
 
 static VNET_DEFINE(uint32_t, hd_qthresh) = 20;
 static VNET_DEFINE(uint32_t, hd_qmin) = 5;
 static VNET_DEFINE(uint32_t, hd_pmax) = 5;
 #define	V_hd_qthresh	VNET(hd_qthresh)
 #define	V_hd_qmin	VNET(hd_qmin)
 #define	V_hd_pmax	VNET(hd_pmax)
 
 struct cc_algo hd_cc_algo = {
 	.name = "hd",
 	.ack_received = hd_ack_received,
 	.mod_init = hd_mod_init
 };
 
 /*
  * Hamilton backoff function. Returns 1 if we should backoff or 0 otherwise.
  */
 static __inline int
 should_backoff(int qdly, int maxqdly)
 {
 	unsigned long p;
 
 	if (qdly < V_hd_qthresh) {
 		p = (((RANDOM_MAX / 100) * V_hd_pmax) /
 		    (V_hd_qthresh - V_hd_qmin)) * (qdly - V_hd_qmin);
 	} else {
 		if (qdly > V_hd_qthresh)
 			p = (((RANDOM_MAX / 100) * V_hd_pmax) /
 			    (maxqdly - V_hd_qthresh)) * (maxqdly - qdly);
 		else
 			p = (RANDOM_MAX / 100) * V_hd_pmax;
 	}
 
 	return (random() < p);
 }
 
 /*
  * If the ack type is CC_ACK, and the inferred queueing delay is greater than
  * the Qmin threshold, cwnd is reduced probabilistically. When backing off due
  * to delay, HD behaves like NewReno when an ECN signal is received. HD behaves
  * as NewReno in all other circumstances.
  */
 static void
 hd_ack_received(struct cc_var *ccv, uint16_t ack_type)
 {
 	struct ertt *e_t;
 	int qdly;
 
 	if (ack_type == CC_ACK) {
 		e_t = khelp_get_osd(CCV(ccv, osd), ertt_id);
 
 		if (e_t->rtt && e_t->minrtt && V_hd_qthresh > 0) {
 			qdly = e_t->rtt - e_t->minrtt;
 
 			if (qdly > V_hd_qmin &&
 			    !IN_RECOVERY(CCV(ccv, t_flags))) {
 				/* Probabilistic backoff of cwnd. */
 				if (should_backoff(qdly,
 				    e_t->maxrtt - e_t->minrtt)) {
 					/*
 					 * Update cwnd and ssthresh update to
 					 * half cwnd and behave like an ECN (ie
 					 * not a packet loss).
 					 */
 					newreno_cc_algo.cong_signal(ccv,
 					    CC_ECN);
 					return;
 				}
 			}
 		}
 	}
 	newreno_cc_algo.ack_received(ccv, ack_type); /* As for NewReno. */
 }
 
 static int
 hd_mod_init(void)
 {
 
 	ertt_id = khelp_get_id("ertt");
 	if (ertt_id <= 0) {
 		printf("%s: h_ertt module not found\n", __func__);
 		return (ENOENT);
 	}
 
 	hd_cc_algo.after_idle = newreno_cc_algo.after_idle;
 	hd_cc_algo.cong_signal = newreno_cc_algo.cong_signal;
 	hd_cc_algo.post_recovery = newreno_cc_algo.post_recovery;
 
 	return (0);
 }
 
 static int
 hd_pmax_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	uint32_t new;
 
 	new = V_hd_pmax;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr != NULL) {
 		if (CAST_PTR_INT(req->newptr) == 0 ||
 		    CAST_PTR_INT(req->newptr) > 100)
 			error = EINVAL;
 		else
 			V_hd_pmax = new;
 	}
 
 	return (error);
 }
 
 static int
 hd_qmin_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	uint32_t new;
 
 	new = V_hd_qmin;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr != NULL) {
 		if (CAST_PTR_INT(req->newptr) > V_hd_qthresh)
 			error = EINVAL;
 		else
 			V_hd_qmin = new;
 	}
 
 	return (error);
 }
 
 static int
 hd_qthresh_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	uint32_t new;
 
 	new = V_hd_qthresh;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr != NULL) {
 		if (CAST_PTR_INT(req->newptr) < 1 ||
 		    CAST_PTR_INT(req->newptr) < V_hd_qmin)
 			error = EINVAL;
 		else
 			V_hd_qthresh = new;
 	}
 
 	return (error);
 }
 
 SYSCTL_DECL(_net_inet_tcp_cc_hd);
 SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, hd, CTLFLAG_RW, NULL,
     "Hamilton delay-based congestion control related settings");
 
-SYSCTL_VNET_PROC(_net_inet_tcp_cc_hd, OID_AUTO, queue_threshold,
-    CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(hd_qthresh), 20, &hd_qthresh_handler,
-    "IU", "queueing congestion threshold (qth) in ticks");
+SYSCTL_PROC(_net_inet_tcp_cc_hd, OID_AUTO, queue_threshold,
+    CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, &VNET_NAME(hd_qthresh), 20,
+    &hd_qthresh_handler, "IU", "queueing congestion threshold (qth) in ticks");
 
-SYSCTL_VNET_PROC(_net_inet_tcp_cc_hd, OID_AUTO, pmax,
-    CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(hd_pmax), 5, &hd_pmax_handler,
-    "IU", "per packet maximum backoff probability as a percentage");
+SYSCTL_PROC(_net_inet_tcp_cc_hd, OID_AUTO, pmax,
+    CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, &VNET_NAME(hd_pmax), 5,
+    &hd_pmax_handler, "IU",
+    "per packet maximum backoff probability as a percentage");
 
-SYSCTL_VNET_PROC(_net_inet_tcp_cc_hd, OID_AUTO, queue_min,
-    CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(hd_qmin), 5, &hd_qmin_handler,
-    "IU", "minimum queueing delay threshold (qmin) in ticks");
+SYSCTL_PROC(_net_inet_tcp_cc_hd, OID_AUTO, queue_min,
+    CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, &VNET_NAME(hd_qmin), 5,
+    &hd_qmin_handler, "IU", "minimum queueing delay threshold (qmin) in ticks");
 
 DECLARE_CC_MODULE(hd, &hd_cc_algo);
 MODULE_DEPEND(hd, ertt, 1, 1, 1);
Index: user/dchagin/lemul/sys/netinet/cc/cc_htcp.c
===================================================================
--- user/dchagin/lemul/sys/netinet/cc/cc_htcp.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/cc/cc_htcp.c	(revision 274241)
@@ -1,520 +1,522 @@
 /*-
  * Copyright (c) 2007-2008
  * 	Swinburne University of Technology, Melbourne, Australia
  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
  * Copyright (c) 2010 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed at the Centre for Advanced Internet
  * Architectures, Swinburne University of Technology, by Lawrence Stewart and
  * James Healy, made possible in part by a grant from the Cisco University
  * Research Program Fund at Community Foundation Silicon Valley.
  *
  * Portions of this software were developed at the Centre for Advanced
  * Internet Architectures, Swinburne University of Technology, Melbourne,
  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * An implementation of the H-TCP congestion control algorithm for FreeBSD,
  * based on the Internet Draft "draft-leith-tcp-htcp-06.txt" by Leith and
  * Shorten. Originally released as part of the NewTCP research project at
  * Swinburne University of Technology's Centre for Advanced Internet
  * Architectures, Melbourne, Australia, which was made possible in part by a
  * grant from the Cisco University Research Program Fund at Community Foundation
  * Silicon Valley. More details are available at:
  *   http://caia.swin.edu.au/urp/newtcp/
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/vnet.h>
 
 #include <netinet/cc.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 
 #include <netinet/cc/cc_module.h>
 
 /* Fixed point math shifts. */
 #define HTCP_SHIFT 8
 #define HTCP_ALPHA_INC_SHIFT 4
 
 #define HTCP_INIT_ALPHA 1
 #define HTCP_DELTA_L hz		/* 1 sec in ticks. */
 #define HTCP_MINBETA 128	/* 0.5 << HTCP_SHIFT. */
 #define HTCP_MAXBETA 204	/* ~0.8 << HTCP_SHIFT. */
 #define HTCP_MINROWE 26		/* ~0.1 << HTCP_SHIFT. */
 #define HTCP_MAXROWE 512	/* 2 << HTCP_SHIFT. */
 
 /* RTT_ref (ms) used in the calculation of alpha if RTT scaling is enabled. */
 #define HTCP_RTT_REF 100
 
 /* Don't trust SRTT until this many samples have been taken. */
 #define HTCP_MIN_RTT_SAMPLES 8
 
 /*
  * HTCP_CALC_ALPHA performs a fixed point math calculation to determine the
  * value of alpha, based on the function defined in the HTCP spec.
  *
  * i.e. 1 + 10(delta - delta_l) + ((delta - delta_l) / 2) ^ 2
  *
  * "diff" is passed in to the macro as "delta - delta_l" and is expected to be
  * in units of ticks.
  *
  * The joyousnous of fixed point maths means our function implementation looks a
  * little funky...
  *
  * In order to maintain some precision in the calculations, a fixed point shift
  * HTCP_ALPHA_INC_SHIFT is used to ensure the integer divisions don't
  * truncate the results too badly.
  *
  * The "16" value is the "1" term in the alpha function shifted up by
  * HTCP_ALPHA_INC_SHIFT
  *
  * The "160" value is the "10" multiplier in the alpha function multiplied by
  * 2^HTCP_ALPHA_INC_SHIFT
  *
  * Specifying these as constants reduces the computations required. After
  * up-shifting all the terms in the function and performing the required
  * calculations, we down-shift the final result by HTCP_ALPHA_INC_SHIFT to
  * ensure it is back in the correct range.
  *
  * The "hz" terms are required as kernels can be configured to run with
  * different tick timers, which we have to adjust for in the alpha calculation
  * (which originally was defined in terms of seconds).
  *
  * We also have to be careful to constrain the value of diff such that it won't
  * overflow whilst performing the calculation. The middle term i.e. (160 * diff)
  * / hz is the limiting factor in the calculation. We must constrain diff to be
  * less than the max size of an int divided by the constant 160 figure
  * i.e. diff < INT_MAX / 160
  *
  * NB: Changing HTCP_ALPHA_INC_SHIFT will require you to MANUALLY update the
  * constants used in this function!
  */
 #define HTCP_CALC_ALPHA(diff) \
 ((\
 	(16) + \
 	((160 * (diff)) / hz) + \
 	(((diff) / hz) * (((diff) << HTCP_ALPHA_INC_SHIFT) / (4 * hz))) \
 ) >> HTCP_ALPHA_INC_SHIFT)
 
 static void	htcp_ack_received(struct cc_var *ccv, uint16_t type);
 static void	htcp_cb_destroy(struct cc_var *ccv);
 static int	htcp_cb_init(struct cc_var *ccv);
 static void	htcp_cong_signal(struct cc_var *ccv, uint32_t type);
 static int	htcp_mod_init(void);
 static void	htcp_post_recovery(struct cc_var *ccv);
 static void	htcp_recalc_alpha(struct cc_var *ccv);
 static void	htcp_recalc_beta(struct cc_var *ccv);
 static void	htcp_record_rtt(struct cc_var *ccv);
 static void	htcp_ssthresh_update(struct cc_var *ccv);
 
 struct htcp {
 	/* cwnd before entering cong recovery. */
 	unsigned long	prev_cwnd;
 	/* cwnd additive increase parameter. */
 	int		alpha;
 	/* cwnd multiplicative decrease parameter. */
 	int		beta;
 	/* Largest rtt seen for the flow. */
 	int		maxrtt;
 	/* Shortest rtt seen for the flow. */
 	int		minrtt;
 	/* Time of last congestion event in ticks. */
 	int		t_last_cong;
 };
 
 static int htcp_rtt_ref;
 /*
  * The maximum number of ticks the value of diff can reach in
  * htcp_recalc_alpha() before alpha will stop increasing due to overflow.
  * See comment above HTCP_CALC_ALPHA for more info.
  */
 static int htcp_max_diff = INT_MAX / ((1 << HTCP_ALPHA_INC_SHIFT) * 10);
 
 /* Per-netstack vars. */
 static VNET_DEFINE(u_int, htcp_adaptive_backoff) = 0;
 static VNET_DEFINE(u_int, htcp_rtt_scaling) = 0;
 #define	V_htcp_adaptive_backoff    VNET(htcp_adaptive_backoff)
 #define	V_htcp_rtt_scaling    VNET(htcp_rtt_scaling)
 
 static MALLOC_DEFINE(M_HTCP, "htcp data",
     "Per connection data required for the HTCP congestion control algorithm");
 
 struct cc_algo htcp_cc_algo = {
 	.name = "htcp",
 	.ack_received = htcp_ack_received,
 	.cb_destroy = htcp_cb_destroy,
 	.cb_init = htcp_cb_init,
 	.cong_signal = htcp_cong_signal,
 	.mod_init = htcp_mod_init,
 	.post_recovery = htcp_post_recovery,
 };
 
 static void
 htcp_ack_received(struct cc_var *ccv, uint16_t type)
 {
 	struct htcp *htcp_data;
 
 	htcp_data = ccv->cc_data;
 	htcp_record_rtt(ccv);
 
 	/*
 	 * Regular ACK and we're not in cong/fast recovery and we're cwnd
 	 * limited and we're either not doing ABC or are slow starting or are
 	 * doing ABC and we've sent a cwnd's worth of bytes.
 	 */
 	if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
 	    (ccv->flags & CCF_CWND_LIMITED) && (!V_tcp_do_rfc3465 ||
 	    CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) ||
 	    (V_tcp_do_rfc3465 && ccv->flags & CCF_ABC_SENTAWND))) {
 		htcp_recalc_beta(ccv);
 		htcp_recalc_alpha(ccv);
 		/*
 		 * Use the logic in NewReno ack_received() for slow start and
 		 * for the first HTCP_DELTA_L ticks after either the flow starts
 		 * or a congestion event (when alpha equals 1).
 		 */
 		if (htcp_data->alpha == 1 ||
 		    CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh))
 			newreno_cc_algo.ack_received(ccv, type);
 		else {
 			if (V_tcp_do_rfc3465) {
 				/* Increment cwnd by alpha segments. */
 				CCV(ccv, snd_cwnd) += htcp_data->alpha *
 				    CCV(ccv, t_maxseg);
 				ccv->flags &= ~CCF_ABC_SENTAWND;
 			} else
 				/*
 				 * Increment cwnd by alpha/cwnd segments to
 				 * approximate an increase of alpha segments
 				 * per RTT.
 				 */
 				CCV(ccv, snd_cwnd) += (((htcp_data->alpha <<
 				    HTCP_SHIFT) / (CCV(ccv, snd_cwnd) /
 				    CCV(ccv, t_maxseg))) * CCV(ccv, t_maxseg))
 				    >> HTCP_SHIFT;
 		}
 	}
 }
 
 static void
 htcp_cb_destroy(struct cc_var *ccv)
 {
 
 	if (ccv->cc_data != NULL)
 		free(ccv->cc_data, M_HTCP);
 }
 
 static int
 htcp_cb_init(struct cc_var *ccv)
 {
 	struct htcp *htcp_data;
 
 	htcp_data = malloc(sizeof(struct htcp), M_HTCP, M_NOWAIT);
 
 	if (htcp_data == NULL)
 		return (ENOMEM);
 
 	/* Init some key variables with sensible defaults. */
 	htcp_data->alpha = HTCP_INIT_ALPHA;
 	htcp_data->beta = HTCP_MINBETA;
 	htcp_data->maxrtt = TCPTV_SRTTBASE;
 	htcp_data->minrtt = TCPTV_SRTTBASE;
 	htcp_data->prev_cwnd = 0;
 	htcp_data->t_last_cong = ticks;
 
 	ccv->cc_data = htcp_data;
 
 	return (0);
 }
 
 /*
  * Perform any necessary tasks before we enter congestion recovery.
  */
 static void
 htcp_cong_signal(struct cc_var *ccv, uint32_t type)
 {
 	struct htcp *htcp_data;
 
 	htcp_data = ccv->cc_data;
 
 	switch (type) {
 	case CC_NDUPACK:
 		if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
 			if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
 				/*
 				 * Apply hysteresis to maxrtt to ensure
 				 * reductions in the RTT are reflected in our
 				 * measurements.
 				 */
 				htcp_data->maxrtt = (htcp_data->minrtt +
 				    (htcp_data->maxrtt - htcp_data->minrtt) *
 				    95) / 100;
 				htcp_ssthresh_update(ccv);
 				htcp_data->t_last_cong = ticks;
 				htcp_data->prev_cwnd = CCV(ccv, snd_cwnd);
 			}
 			ENTER_RECOVERY(CCV(ccv, t_flags));
 		}
 		break;
 
 	case CC_ECN:
 		if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
 			/*
 			 * Apply hysteresis to maxrtt to ensure reductions in
 			 * the RTT are reflected in our measurements.
 			 */
 			htcp_data->maxrtt = (htcp_data->minrtt + (htcp_data->maxrtt -
 			    htcp_data->minrtt) * 95) / 100;
 			htcp_ssthresh_update(ccv);
 			CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
 			htcp_data->t_last_cong = ticks;
 			htcp_data->prev_cwnd = CCV(ccv, snd_cwnd);
 			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
 		}
 		break;
 
 	case CC_RTO:
 		/*
 		 * Grab the current time and record it so we know when the
 		 * most recent congestion event was. Only record it when the
 		 * timeout has fired more than once, as there is a reasonable
 		 * chance the first one is a false alarm and may not indicate
 		 * congestion.
 		 */
 		if (CCV(ccv, t_rxtshift) >= 2)
 			htcp_data->t_last_cong = ticks;
 		break;
 	}
 }
 
 static int
 htcp_mod_init(void)
 {
 
 	htcp_cc_algo.after_idle = newreno_cc_algo.after_idle;
 
 	/*
 	 * HTCP_RTT_REF is defined in ms, and t_srtt in the tcpcb is stored in
 	 * units of TCP_RTT_SCALE*hz. Scale HTCP_RTT_REF to be in the same units
 	 * as t_srtt.
 	 */
 	htcp_rtt_ref = (HTCP_RTT_REF * TCP_RTT_SCALE * hz) / 1000;
 
 	return (0);
 }
 
 /*
  * Perform any necessary tasks before we exit congestion recovery.
  */
 static void
 htcp_post_recovery(struct cc_var *ccv)
 {
 	struct htcp *htcp_data;
 
 	htcp_data = ccv->cc_data;
 
 	if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
 		/*
 		 * If inflight data is less than ssthresh, set cwnd
 		 * conservatively to avoid a burst of data, as suggested in the
 		 * NewReno RFC. Otherwise, use the HTCP method.
 		 *
 		 * XXXLAS: Find a way to do this without needing curack
 		 */
 		if (SEQ_GT(ccv->curack + CCV(ccv, snd_ssthresh),
 		    CCV(ccv, snd_max)))
 			CCV(ccv, snd_cwnd) = CCV(ccv, snd_max) - ccv->curack +
 			    CCV(ccv, t_maxseg);
 		else
 			CCV(ccv, snd_cwnd) = max(1, ((htcp_data->beta *
 			    htcp_data->prev_cwnd / CCV(ccv, t_maxseg))
 			    >> HTCP_SHIFT)) * CCV(ccv, t_maxseg);
 	}
 }
 
 static void
 htcp_recalc_alpha(struct cc_var *ccv)
 {
 	struct htcp *htcp_data;
 	int alpha, diff, now;
 
 	htcp_data = ccv->cc_data;
 	now = ticks;
 
 	/*
 	 * If ticks has wrapped around (will happen approximately once every 49
 	 * days on a machine with the default kern.hz=1000) and a flow straddles
 	 * the wrap point, our alpha calcs will be completely wrong. We cut our
 	 * losses and restart alpha from scratch by setting t_last_cong = now -
 	 * HTCP_DELTA_L.
 	 *
 	 * This does not deflate our cwnd at all. It simply slows the rate cwnd
 	 * is growing by until alpha regains the value it held prior to taking
 	 * this drastic measure.
 	 */
 	if (now < htcp_data->t_last_cong)
 		htcp_data->t_last_cong = now - HTCP_DELTA_L;
 
 	diff = now - htcp_data->t_last_cong - HTCP_DELTA_L;
 
 	/* Cap alpha if the value of diff would overflow HTCP_CALC_ALPHA(). */
 	if (diff < htcp_max_diff) {
 		/*
 		 * If it has been more than HTCP_DELTA_L ticks since congestion,
 		 * increase alpha according to the function defined in the spec.
 		 */
 		if (diff > 0) {
 			alpha = HTCP_CALC_ALPHA(diff);
 
 			/*
 			 * Adaptive backoff fairness adjustment:
 			 * 2 * (1 - beta) * alpha_raw
 			 */
 			if (V_htcp_adaptive_backoff)
 				alpha = max(1, (2 * ((1 << HTCP_SHIFT) -
 				    htcp_data->beta) * alpha) >> HTCP_SHIFT);
 
 			/*
 			 * RTT scaling: (RTT / RTT_ref) * alpha
 			 * alpha will be the raw value from HTCP_CALC_ALPHA() if
 			 * adaptive backoff is off, or the adjusted value if
 			 * adaptive backoff is on.
 			 */
 			if (V_htcp_rtt_scaling)
 				alpha = max(1, (min(max(HTCP_MINROWE,
 				    (CCV(ccv, t_srtt) << HTCP_SHIFT) /
 				    htcp_rtt_ref), HTCP_MAXROWE) * alpha)
 				    >> HTCP_SHIFT);
 
 		} else
 			alpha = 1;
 
 		htcp_data->alpha = alpha;
 	}
 }
 
 static void
 htcp_recalc_beta(struct cc_var *ccv)
 {
 	struct htcp *htcp_data;
 
 	htcp_data = ccv->cc_data;
 
 	/*
 	 * TCPTV_SRTTBASE is the initialised value of each connection's SRTT, so
 	 * we only calc beta if the connection's SRTT has been changed from its
 	 * inital value. beta is bounded to ensure it is always between
 	 * HTCP_MINBETA and HTCP_MAXBETA.
 	 */
 	if (V_htcp_adaptive_backoff && htcp_data->minrtt != TCPTV_SRTTBASE &&
 	    htcp_data->maxrtt != TCPTV_SRTTBASE)
 		htcp_data->beta = min(max(HTCP_MINBETA,
 		    (htcp_data->minrtt << HTCP_SHIFT) / htcp_data->maxrtt),
 		    HTCP_MAXBETA);
 	else
 		htcp_data->beta = HTCP_MINBETA;
 }
 
 /*
  * Record the minimum and maximum RTT seen for the connection. These are used in
  * the calculation of beta if adaptive backoff is enabled.
  */
 static void
 htcp_record_rtt(struct cc_var *ccv)
 {
 	struct htcp *htcp_data;
 
 	htcp_data = ccv->cc_data;
 
 	/* XXXLAS: Should there be some hysteresis for minrtt? */
 
 	/*
 	 * Record the current SRTT as our minrtt if it's the smallest we've seen
 	 * or minrtt is currently equal to its initialised value. Ignore SRTT
 	 * until a min number of samples have been taken.
 	 */
 	if ((CCV(ccv, t_srtt) < htcp_data->minrtt ||
 	    htcp_data->minrtt == TCPTV_SRTTBASE) &&
 	    (CCV(ccv, t_rttupdated) >= HTCP_MIN_RTT_SAMPLES))
 		htcp_data->minrtt = CCV(ccv, t_srtt);
 
 	/*
 	 * Record the current SRTT as our maxrtt if it's the largest we've
 	 * seen. Ignore SRTT until a min number of samples have been taken.
 	 */
 	if (CCV(ccv, t_srtt) > htcp_data->maxrtt
 	    && CCV(ccv, t_rttupdated) >= HTCP_MIN_RTT_SAMPLES)
 		htcp_data->maxrtt = CCV(ccv, t_srtt);
 }
 
 /*
  * Update the ssthresh in the event of congestion.
  */
 static void
 htcp_ssthresh_update(struct cc_var *ccv)
 {
 	struct htcp *htcp_data;
 
 	htcp_data = ccv->cc_data;
 
 	/*
 	 * On the first congestion event, set ssthresh to cwnd * 0.5, on
 	 * subsequent congestion events, set it to cwnd * beta.
 	 */
 	if (CCV(ccv, snd_ssthresh) == TCP_MAXWIN << TCP_MAX_WINSHIFT)
 		CCV(ccv, snd_ssthresh) = (CCV(ccv, snd_cwnd) * HTCP_MINBETA)
 		    >> HTCP_SHIFT;
 	else {
 		htcp_recalc_beta(ccv);
 		CCV(ccv, snd_ssthresh) = (CCV(ccv, snd_cwnd) * htcp_data->beta)
 		    >> HTCP_SHIFT;
 	}
 }
 
 
 SYSCTL_DECL(_net_inet_tcp_cc_htcp);
 SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, htcp, CTLFLAG_RW,
     NULL, "H-TCP related settings");
-SYSCTL_VNET_UINT(_net_inet_tcp_cc_htcp, OID_AUTO, adaptive_backoff, CTLFLAG_RW,
-    &VNET_NAME(htcp_adaptive_backoff), 0, "enable H-TCP adaptive backoff");
-SYSCTL_VNET_UINT(_net_inet_tcp_cc_htcp, OID_AUTO, rtt_scaling, CTLFLAG_RW,
-    &VNET_NAME(htcp_rtt_scaling), 0, "enable H-TCP RTT scaling");
+SYSCTL_UINT(_net_inet_tcp_cc_htcp, OID_AUTO, adaptive_backoff,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(htcp_adaptive_backoff), 0,
+    "enable H-TCP adaptive backoff");
+SYSCTL_UINT(_net_inet_tcp_cc_htcp, OID_AUTO, rtt_scaling,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(htcp_rtt_scaling), 0,
+    "enable H-TCP RTT scaling");
 
 DECLARE_CC_MODULE(htcp, &htcp_cc_algo);
Index: user/dchagin/lemul/sys/netinet/cc/cc_vegas.c
===================================================================
--- user/dchagin/lemul/sys/netinet/cc/cc_vegas.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/cc/cc_vegas.c	(revision 274241)
@@ -1,307 +1,309 @@
 /*-
  * Copyright (c) 2009-2010
  *	Swinburne University of Technology, Melbourne, Australia
  * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
  * Copyright (c) 2010-2011 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed at the Centre for Advanced Internet
  * Architectures, Swinburne University of Technology, by David Hayes and
  * Lawrence Stewart, made possible in part by a grant from the Cisco University
  * Research Program Fund at Community Foundation Silicon Valley.
  *
  * Portions of this software were developed at the Centre for Advanced Internet
  * Architectures, Swinburne University of Technology, Melbourne, Australia by
  * David Hayes under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * An implementation of the Vegas congestion control algorithm for FreeBSD,
  * based on L. S. Brakmo and L. L. Peterson, "TCP Vegas: end to end congestion
  * avoidance on a global internet", IEEE J. Sel. Areas Commun., vol. 13, no. 8,
  * pp. 1465-1480, Oct. 1995. The original Vegas duplicate ack policy has not
  * been implemented, since clock ticks are not as coarse as they were (i.e.
  * 500ms) when Vegas was designed. Also, packets are timed once per RTT as in
  * the original paper.
  *
  * Originally released as part of the NewTCP research project at Swinburne
  * University of Technology's Centre for Advanced Internet Architectures,
  * Melbourne, Australia, which was made possible in part by a grant from the
  * Cisco University Research Program Fund at Community Foundation Silicon
  * Valley. More details are available at:
  *   http://caia.swin.edu.au/urp/newtcp/
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/khelp.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/if.h>
 #include <net/vnet.h>
 
 #include <netinet/cc.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 
 #include <netinet/cc/cc_module.h>
 
 #include <netinet/khelp/h_ertt.h>
 
 #define	CAST_PTR_INT(X)	(*((int*)(X)))
 
 /*
  * Private signal type for rate based congestion signal.
  * See <netinet/cc.h> for appropriate bit-range to use for private signals.
  */
 #define	CC_VEGAS_RATE	0x01000000
 
 static void	vegas_ack_received(struct cc_var *ccv, uint16_t ack_type);
 static void	vegas_cb_destroy(struct cc_var *ccv);
 static int	vegas_cb_init(struct cc_var *ccv);
 static void	vegas_cong_signal(struct cc_var *ccv, uint32_t signal_type);
 static void	vegas_conn_init(struct cc_var *ccv);
 static int	vegas_mod_init(void);
 
 struct vegas {
 	int slow_start_toggle;
 };
 
 static int32_t ertt_id;
 
 static VNET_DEFINE(uint32_t, vegas_alpha) = 1;
 static VNET_DEFINE(uint32_t, vegas_beta) = 3;
 #define	V_vegas_alpha	VNET(vegas_alpha)
 #define	V_vegas_beta	VNET(vegas_beta)
 
 static MALLOC_DEFINE(M_VEGAS, "vegas data",
     "Per connection data required for the Vegas congestion control algorithm");
 
 struct cc_algo vegas_cc_algo = {
 	.name = "vegas",
 	.ack_received = vegas_ack_received,
 	.cb_destroy = vegas_cb_destroy,
 	.cb_init = vegas_cb_init,
 	.cong_signal = vegas_cong_signal,
 	.conn_init = vegas_conn_init,
 	.mod_init = vegas_mod_init
 };
 
 /*
  * The vegas window adjustment is done once every RTT, as indicated by the
  * ERTT_NEW_MEASUREMENT flag. This flag is reset once the new measurment data
  * has been used.
  */
 static void
 vegas_ack_received(struct cc_var *ccv, uint16_t ack_type)
 {
 	struct ertt *e_t;
 	struct vegas *vegas_data;
 	long actual_tx_rate, expected_tx_rate, ndiff;
 
 	e_t = khelp_get_osd(CCV(ccv, osd), ertt_id);
 	vegas_data = ccv->cc_data;
 
 	if (e_t->flags & ERTT_NEW_MEASUREMENT) { /* Once per RTT. */
 		if (e_t->minrtt && e_t->markedpkt_rtt) {
 			expected_tx_rate = e_t->marked_snd_cwnd / e_t->minrtt;
 			actual_tx_rate = e_t->bytes_tx_in_marked_rtt /
 			    e_t->markedpkt_rtt;
 			ndiff = (expected_tx_rate - actual_tx_rate) *
 			    e_t->minrtt / CCV(ccv, t_maxseg);
 
 			if (ndiff < V_vegas_alpha) {
 				if (CCV(ccv, snd_cwnd) <=
 				    CCV(ccv, snd_ssthresh)) {
 					vegas_data->slow_start_toggle =
 					    vegas_data->slow_start_toggle ?
 					    0 : 1;
 				} else {
 					vegas_data->slow_start_toggle = 0;
 					CCV(ccv, snd_cwnd) =
 					    min(CCV(ccv, snd_cwnd) +
 					    CCV(ccv, t_maxseg),
 					    TCP_MAXWIN << CCV(ccv, snd_scale));
 				}
 			} else if (ndiff > V_vegas_beta) {
 				/* Rate-based congestion. */
 				vegas_cong_signal(ccv, CC_VEGAS_RATE);
 				vegas_data->slow_start_toggle = 0;
 			}
 		}
 		e_t->flags &= ~ERTT_NEW_MEASUREMENT;
 	}
 
 	if (vegas_data->slow_start_toggle)
 		newreno_cc_algo.ack_received(ccv, ack_type);
 }
 
 static void
 vegas_cb_destroy(struct cc_var *ccv)
 {
 
 	if (ccv->cc_data != NULL)
 		free(ccv->cc_data, M_VEGAS);
 }
 
 static int
 vegas_cb_init(struct cc_var *ccv)
 {
 	struct vegas *vegas_data;
 
 	vegas_data = malloc(sizeof(struct vegas), M_VEGAS, M_NOWAIT);
 
 	if (vegas_data == NULL)
 		return (ENOMEM);
 
 	vegas_data->slow_start_toggle = 1;
 	ccv->cc_data = vegas_data;
 
 	return (0);
 }
 
 /*
  * If congestion has been triggered triggered by the Vegas measured rates, it is
  * handled here, otherwise it falls back to newreno's congestion handling.
  */
 static void
 vegas_cong_signal(struct cc_var *ccv, uint32_t signal_type)
 {
 	struct vegas *vegas_data;
 	int presignalrecov;
 
 	vegas_data = ccv->cc_data;
 
 	if (IN_RECOVERY(CCV(ccv, t_flags)))
 		presignalrecov = 1;
 	else
 		presignalrecov = 0;
 
 	switch(signal_type) {
 	case CC_VEGAS_RATE:
 		if (!IN_RECOVERY(CCV(ccv, t_flags))) {
 			CCV(ccv, snd_cwnd) = max(2 * CCV(ccv, t_maxseg),
 			    CCV(ccv, snd_cwnd) - CCV(ccv, t_maxseg));
 			if (CCV(ccv, snd_cwnd) < CCV(ccv, snd_ssthresh))
 				/* Exit slow start. */
 				CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd);
 		}
 		break;
 
 	default:
 		newreno_cc_algo.cong_signal(ccv, signal_type);
 	}
 
 	if (IN_RECOVERY(CCV(ccv, t_flags)) && !presignalrecov)
 		vegas_data->slow_start_toggle =
 		    (CCV(ccv, snd_cwnd) < CCV(ccv, snd_ssthresh)) ? 1 : 0;
 }
 
 static void
 vegas_conn_init(struct cc_var *ccv)
 {
 	struct vegas *vegas_data;
 
 	vegas_data = ccv->cc_data;
 	vegas_data->slow_start_toggle = 1;
 }
 
 static int
 vegas_mod_init(void)
 {
 
 	ertt_id = khelp_get_id("ertt");
 	if (ertt_id <= 0) {
 		printf("%s: h_ertt module not found\n", __func__);
 		return (ENOENT);
 	}
 
 	vegas_cc_algo.after_idle = newreno_cc_algo.after_idle;
 	vegas_cc_algo.post_recovery = newreno_cc_algo.post_recovery;
 
 	return (0);
 }
 
 static int
 vegas_alpha_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	uint32_t new;
 
 	new = V_vegas_alpha;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr != NULL) {
 		if (CAST_PTR_INT(req->newptr) < 1 ||
 		    CAST_PTR_INT(req->newptr) > V_vegas_beta)
 			error = EINVAL;
 		else
 			V_vegas_alpha = new;
 	}
 
 	return (error);
 }
 
 static int
 vegas_beta_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	uint32_t new;
 
 	new = V_vegas_beta;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr != NULL) {
 		if (CAST_PTR_INT(req->newptr) < 1 ||
 		    CAST_PTR_INT(req->newptr) < V_vegas_alpha)
 			 error = EINVAL;
 		else
 			V_vegas_beta = new;
 	}
 
 	return (error);
 }
 
 SYSCTL_DECL(_net_inet_tcp_cc_vegas);
 SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, vegas, CTLFLAG_RW, NULL,
     "Vegas related settings");
 
-SYSCTL_VNET_PROC(_net_inet_tcp_cc_vegas, OID_AUTO, alpha,
-    CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(vegas_alpha), 1, &vegas_alpha_handler,
-    "IU", "vegas alpha, specified as number of \"buffers\" (0 < alpha < beta)");
+SYSCTL_PROC(_net_inet_tcp_cc_vegas, OID_AUTO, alpha,
+    CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
+    &VNET_NAME(vegas_alpha), 1, &vegas_alpha_handler, "IU",
+    "vegas alpha, specified as number of \"buffers\" (0 < alpha < beta)");
 
-SYSCTL_VNET_PROC(_net_inet_tcp_cc_vegas, OID_AUTO, beta,
-    CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(vegas_beta), 3, &vegas_beta_handler,
-    "IU", "vegas beta, specified as number of \"buffers\" (0 < alpha < beta)");
+SYSCTL_PROC(_net_inet_tcp_cc_vegas, OID_AUTO, beta,
+    CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
+    &VNET_NAME(vegas_beta), 3, &vegas_beta_handler, "IU",
+    "vegas beta, specified as number of \"buffers\" (0 < alpha < beta)");
 
 DECLARE_CC_MODULE(vegas, &vegas_cc_algo);
 MODULE_DEPEND(vegas, ertt, 1, 1, 1);
Index: user/dchagin/lemul/sys/netinet/if_ether.c
===================================================================
--- user/dchagin/lemul/sys/netinet/if_ether.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/if_ether.c	(revision 274241)
@@ -1,944 +1,944 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if_ether.c	8.1 (Berkeley) 6/10/93
  */
 
 /*
  * Ethernet address resolution protocol.
  * TODO:
  *	add "inuse/lock" bit (or ref. count) along with valid bit
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/queue.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/if_llc.h>
 #include <net/ethernet.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <net/if_llatbl.h>
 #include <netinet/if_ether.h>
 #ifdef INET
 #include <netinet/ip_carp.h>
 #endif
 
 #include <net/if_arc.h>
 #include <net/iso88025.h>
 
 #include <security/mac/mac_framework.h>
 
 #define SIN(s) ((const struct sockaddr_in *)(s))
 #define SDL(s) ((struct sockaddr_dl *)s)
 
 SYSCTL_DECL(_net_link_ether);
 static SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, "");
 static SYSCTL_NODE(_net_link_ether, PF_ARP, arp, CTLFLAG_RW, 0, "");
 
 /* timer values */
 static VNET_DEFINE(int, arpt_keep) = (20*60);	/* once resolved, good for 20
 						 * minutes */
 static VNET_DEFINE(int, arp_maxtries) = 5;
 static VNET_DEFINE(int, arp_proxyall) = 0;
 static VNET_DEFINE(int, arpt_down) = 20;	/* keep incomplete entries for
 						 * 20 seconds */
 VNET_PCPUSTAT_DEFINE(struct arpstat, arpstat);  /* ARP statistics, see if_arp.h */
 VNET_PCPUSTAT_SYSINIT(arpstat);
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(arpstat);
 #endif /* VIMAGE */
 
 static VNET_DEFINE(int, arp_maxhold) = 1;
 
 #define	V_arpt_keep		VNET(arpt_keep)
 #define	V_arpt_down		VNET(arpt_down)
 #define	V_arp_maxtries		VNET(arp_maxtries)
 #define	V_arp_proxyall		VNET(arp_proxyall)
 #define	V_arp_maxhold		VNET(arp_maxhold)
 
-SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW,
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arpt_keep), 0,
 	"ARP entry lifetime in seconds");
-SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW,
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arp_maxtries), 0,
 	"ARP resolution attempts before returning error");
-SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW,
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arp_proxyall), 0,
 	"Enable proxy ARP for all suitable requests");
-SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_RW,
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arpt_down), 0,
 	"Incomplete ARP entry lifetime in seconds");
 SYSCTL_VNET_PCPUSTAT(_net_link_ether_arp, OID_AUTO, stats, struct arpstat,
     arpstat, "ARP statistics (struct arpstat, net/if_arp.h)");
-SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_RW,
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arp_maxhold), 0,
 	"Number of packets to hold per ARP entry");
 
 static void	arp_init(void);
 static void	arpintr(struct mbuf *);
 static void	arptimer(void *);
 #ifdef INET
 static void	in_arpinput(struct mbuf *);
 #endif
 
 static const struct netisr_handler arp_nh = {
 	.nh_name = "arp",
 	.nh_handler = arpintr,
 	.nh_proto = NETISR_ARP,
 	.nh_policy = NETISR_POLICY_SOURCE,
 };
 
 #ifdef AF_INET
 /*
  * called by in_scrubprefix() to remove entry from the table when
  * the interface goes away
  */
 void
 arp_ifscrub(struct ifnet *ifp, uint32_t addr)
 {
 	struct sockaddr_in addr4;
 
 	bzero((void *)&addr4, sizeof(addr4));
 	addr4.sin_len    = sizeof(addr4);
 	addr4.sin_family = AF_INET;
 	addr4.sin_addr.s_addr = addr;
 	IF_AFDATA_RLOCK(ifp);
 	lla_lookup(LLTABLE(ifp), (LLE_DELETE | LLE_IFADDR),
 	    (struct sockaddr *)&addr4);
 	IF_AFDATA_RUNLOCK(ifp);
 }
 #endif
 
 /*
  * Timeout routine.  Age arp_tab entries periodically.
  */
 static void
 arptimer(void *arg)
 {
 	struct llentry *lle = (struct llentry *)arg;
 	struct ifnet *ifp;
 
 	if (lle->la_flags & LLE_STATIC) {
 		LLE_WUNLOCK(lle);
 		return;
 	}
 
 	ifp = lle->lle_tbl->llt_ifp;
 	CURVNET_SET(ifp->if_vnet);
 
 	if ((lle->la_flags & LLE_DELETED) == 0) {
 		int evt;
 
 		if (lle->la_flags & LLE_VALID)
 			evt = LLENTRY_EXPIRED;
 		else
 			evt = LLENTRY_TIMEDOUT;
 		EVENTHANDLER_INVOKE(lle_event, lle, evt);
 	}
 
 	callout_stop(&lle->la_timer);
 
 	/* XXX: LOR avoidance. We still have ref on lle. */
 	LLE_WUNLOCK(lle);
 	IF_AFDATA_LOCK(ifp);
 	LLE_WLOCK(lle);
 
 	/* Guard against race with other llentry_free(). */
 	if (lle->la_flags & LLE_LINKED) {
 		size_t pkts_dropped;
 
 		LLE_REMREF(lle);
 		pkts_dropped = llentry_free(lle);
 		ARPSTAT_ADD(dropped, pkts_dropped);
 	} else
 		LLE_FREE_LOCKED(lle);
 
 	IF_AFDATA_UNLOCK(ifp);
 
 	ARPSTAT_INC(timeouts);
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Broadcast an ARP request. Caller specifies:
  *	- arp header source ip address
  *	- arp header target ip address
  *	- arp header source ethernet address
  */
 void
 arprequest(struct ifnet *ifp, const struct in_addr *sip,
     const struct in_addr *tip, u_char *enaddr)
 {
 	struct mbuf *m;
 	struct arphdr *ah;
 	struct sockaddr sa;
 	u_char *carpaddr = NULL;
 
 	if (sip == NULL) {
 		/*
 		 * The caller did not supply a source address, try to find
 		 * a compatible one among those assigned to this interface.
 		 */
 		struct ifaddr *ifa;
 
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 
 			if (ifa->ifa_carp) {
 				if ((*carp_iamatch_p)(ifa, &carpaddr) == 0)
 					continue;
 				sip = &IA_SIN(ifa)->sin_addr;
 			} else {
 				carpaddr = NULL;
 				sip = &IA_SIN(ifa)->sin_addr;
 			}
 
 			if (0 == ((sip->s_addr ^ tip->s_addr) &
 			    IA_MASKSIN(ifa)->sin_addr.s_addr))
 				break;  /* found it. */
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		if (sip == NULL) {
 			printf("%s: cannot find matching address\n", __func__);
 			return;
 		}
 	}
 	if (enaddr == NULL)
 		enaddr = carpaddr ? carpaddr : (u_char *)IF_LLADDR(ifp);
 
 	if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
 		return;
 	m->m_len = sizeof(*ah) + 2 * sizeof(struct in_addr) +
 		2 * ifp->if_addrlen;
 	m->m_pkthdr.len = m->m_len;
 	MH_ALIGN(m, m->m_len);
 	ah = mtod(m, struct arphdr *);
 	bzero((caddr_t)ah, m->m_len);
 #ifdef MAC
 	mac_netinet_arp_send(ifp, m);
 #endif
 	ah->ar_pro = htons(ETHERTYPE_IP);
 	ah->ar_hln = ifp->if_addrlen;		/* hardware address length */
 	ah->ar_pln = sizeof(struct in_addr);	/* protocol address length */
 	ah->ar_op = htons(ARPOP_REQUEST);
 	bcopy(enaddr, ar_sha(ah), ah->ar_hln);
 	bcopy(sip, ar_spa(ah), ah->ar_pln);
 	bcopy(tip, ar_tpa(ah), ah->ar_pln);
 	sa.sa_family = AF_ARP;
 	sa.sa_len = 2;
 	m->m_flags |= M_BCAST;
 	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 	(*ifp->if_output)(ifp, m, &sa, NULL);
 	ARPSTAT_INC(txrequests);
 }
 
 /*
  * Resolve an IP address into an ethernet address.
  * On input:
  *    ifp is the interface we use
  *    rt0 is the route to the final destination (possibly useless)
  *    m is the mbuf. May be NULL if we don't have a packet.
  *    dst is the next hop,
  *    desten is where we want the address.
  *
  * On success, desten is filled in and the function returns 0;
  * If the packet must be held pending resolution, we return EWOULDBLOCK
  * On other errors, we return the corresponding error code.
  * Note that m_freem() handles NULL.
  */
 int
 arpresolve(struct ifnet *ifp, struct rtentry *rt0, struct mbuf *m,
 	const struct sockaddr *dst, u_char *desten, struct llentry **lle)
 {
 	struct llentry *la = 0;
 	u_int flags = 0;
 	struct mbuf *curr = NULL;
 	struct mbuf *next = NULL;
 	int error, renew;
 
 	*lle = NULL;
 	if (m != NULL) {
 		if (m->m_flags & M_BCAST) {
 			/* broadcast */
 			(void)memcpy(desten,
 			    ifp->if_broadcastaddr, ifp->if_addrlen);
 			return (0);
 		}
 		if (m->m_flags & M_MCAST && ifp->if_type != IFT_ARCNET) {
 			/* multicast */
 			ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten);
 			return (0);
 		}
 	}
 retry:
 	IF_AFDATA_RLOCK(ifp);
 	la = lla_lookup(LLTABLE(ifp), flags, dst);
 	IF_AFDATA_RUNLOCK(ifp);
 	if ((la == NULL) && ((flags & LLE_EXCLUSIVE) == 0)
 	    && ((ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0)) {
 		flags |= (LLE_CREATE | LLE_EXCLUSIVE);
 		IF_AFDATA_WLOCK(ifp);
 		la = lla_lookup(LLTABLE(ifp), flags, dst);
 		IF_AFDATA_WUNLOCK(ifp);
 	}
 	if (la == NULL) {
 		if (flags & LLE_CREATE)
 			log(LOG_DEBUG,
 			    "arpresolve: can't allocate llinfo for %s on %s\n",
 			    inet_ntoa(SIN(dst)->sin_addr), ifp->if_xname);
 		m_freem(m);
 		return (EINVAL);
 	}
 
 	if ((la->la_flags & LLE_VALID) &&
 	    ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) {
 		bcopy(&la->ll_addr, desten, ifp->if_addrlen);
 		/*
 		 * If entry has an expiry time and it is approaching,
 		 * see if we need to send an ARP request within this
 		 * arpt_down interval.
 		 */
 		if (!(la->la_flags & LLE_STATIC) &&
 		    time_uptime + la->la_preempt > la->la_expire) {
 			arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL);
 			la->la_preempt--;
 		}
 
 		*lle = la;
 		error = 0;
 		goto done;
 	}
 
 	if (la->la_flags & LLE_STATIC) {   /* should not happen! */
 		log(LOG_DEBUG, "arpresolve: ouch, empty static llinfo for %s\n",
 		    inet_ntoa(SIN(dst)->sin_addr));
 		m_freem(m);
 		error = EINVAL;
 		goto done;
 	}
 
 	renew = (la->la_asked == 0 || la->la_expire != time_uptime);
 	if ((renew || m != NULL) && (flags & LLE_EXCLUSIVE) == 0) {
 		flags |= LLE_EXCLUSIVE;
 		LLE_RUNLOCK(la);
 		goto retry;
 	}
 	/*
 	 * There is an arptab entry, but no ethernet address
 	 * response yet.  Add the mbuf to the list, dropping
 	 * the oldest packet if we have exceeded the system
 	 * setting.
 	 */
 	if (m != NULL) {
 		if (la->la_numheld >= V_arp_maxhold) {
 			if (la->la_hold != NULL) {
 				next = la->la_hold->m_nextpkt;
 				m_freem(la->la_hold);
 				la->la_hold = next;
 				la->la_numheld--;
 				ARPSTAT_INC(dropped);
 			}
 		}
 		if (la->la_hold != NULL) {
 			curr = la->la_hold;
 			while (curr->m_nextpkt != NULL)
 				curr = curr->m_nextpkt;
 			curr->m_nextpkt = m;
 		} else
 			la->la_hold = m;
 		la->la_numheld++;
 		if (renew == 0 && (flags & LLE_EXCLUSIVE)) {
 			flags &= ~LLE_EXCLUSIVE;
 			LLE_DOWNGRADE(la);
 		}
 
 	}
 	/*
 	 * Return EWOULDBLOCK if we have tried less than arp_maxtries. It
 	 * will be masked by ether_output(). Return EHOSTDOWN/EHOSTUNREACH
 	 * if we have already sent arp_maxtries ARP requests. Retransmit the
 	 * ARP request, but not faster than one request per second.
 	 */
 	if (la->la_asked < V_arp_maxtries)
 		error = EWOULDBLOCK;	/* First request. */
 	else
 		error = rt0 != NULL && (rt0->rt_flags & RTF_GATEWAY) ?
 		    EHOSTUNREACH : EHOSTDOWN;
 
 	if (renew) {
 		int canceled;
 
 		LLE_ADDREF(la);
 		la->la_expire = time_uptime;
 		canceled = callout_reset(&la->la_timer, hz * V_arpt_down,
 		    arptimer, la);
 		if (canceled)
 			LLE_REMREF(la);
 		la->la_asked++;
 		LLE_WUNLOCK(la);
 		arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL);
 		return (error);
 	}
 done:
 	if (flags & LLE_EXCLUSIVE)
 		LLE_WUNLOCK(la);
 	else
 		LLE_RUNLOCK(la);
 	return (error);
 }
 
 /*
  * Common length and type checks are done here,
  * then the protocol-specific routine is called.
  */
 static void
 arpintr(struct mbuf *m)
 {
 	struct arphdr *ar;
 
 	if (m->m_len < sizeof(struct arphdr) &&
 	    ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) {
 		log(LOG_NOTICE, "arp: runt packet -- m_pullup failed\n");
 		return;
 	}
 	ar = mtod(m, struct arphdr *);
 
 	if (ntohs(ar->ar_hrd) != ARPHRD_ETHER &&
 	    ntohs(ar->ar_hrd) != ARPHRD_IEEE802 &&
 	    ntohs(ar->ar_hrd) != ARPHRD_ARCNET &&
 	    ntohs(ar->ar_hrd) != ARPHRD_IEEE1394 &&
 	    ntohs(ar->ar_hrd) != ARPHRD_INFINIBAND) {
 		log(LOG_NOTICE, "arp: unknown hardware address format (0x%2D)"
 		    " (from %*D to %*D)\n", (unsigned char *)&ar->ar_hrd, "",
 		    ETHER_ADDR_LEN, (u_char *)ar_sha(ar), ":",
 		    ETHER_ADDR_LEN, (u_char *)ar_tha(ar), ":");
 		m_freem(m);
 		return;
 	}
 
 	if (m->m_len < arphdr_len(ar)) {
 		if ((m = m_pullup(m, arphdr_len(ar))) == NULL) {
 			log(LOG_NOTICE, "arp: runt packet\n");
 			m_freem(m);
 			return;
 		}
 		ar = mtod(m, struct arphdr *);
 	}
 
 	ARPSTAT_INC(received);
 	switch (ntohs(ar->ar_pro)) {
 #ifdef INET
 	case ETHERTYPE_IP:
 		in_arpinput(m);
 		return;
 #endif
 	}
 	m_freem(m);
 }
 
 #ifdef INET
 /*
  * ARP for Internet protocols on 10 Mb/s Ethernet.
  * Algorithm is that given in RFC 826.
  * In addition, a sanity check is performed on the sender
  * protocol address, to catch impersonators.
  * We no longer handle negotiations for use of trailer protocol:
  * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent
  * along with IP replies if we wanted trailers sent to us,
  * and also sent them in response to IP replies.
  * This allowed either end to announce the desire to receive
  * trailer packets.
  * We no longer reply to requests for ETHERTYPE_TRAIL protocol either,
  * but formerly didn't normally send requests.
  */
 static int log_arp_wrong_iface = 1;
 static int log_arp_movements = 1;
 static int log_arp_permanent_modify = 1;
 static int allow_multicast = 0;
 static struct timeval arp_lastlog;
 static int arp_curpps;
 static int arp_maxpps = 1;
 
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW,
 	&log_arp_wrong_iface, 0,
 	"log arp packets arriving on the wrong interface");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW,
 	&log_arp_movements, 0,
 	"log arp replies from MACs different than the one in the cache");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW,
 	&log_arp_permanent_modify, 0,
 	"log arp replies from MACs different than the one in the permanent arp entry");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, allow_multicast, CTLFLAG_RW,
 	&allow_multicast, 0, "accept multicast addresses");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_log_per_second,
 	CTLFLAG_RW, &arp_maxpps, 0,
 	"Maximum number of remotely triggered ARP messages that can be "
 	"logged per second");
 
 #define	ARP_LOG(pri, ...)	do {					\
 	if (ppsratecheck(&arp_lastlog, &arp_curpps, arp_maxpps))	\
 		log((pri), "arp: " __VA_ARGS__);			\
 } while (0)
 
 static void
 in_arpinput(struct mbuf *m)
 {
 	struct arphdr *ah;
 	struct ifnet *ifp = m->m_pkthdr.rcvif;
 	struct llentry *la = NULL;
 	struct rtentry *rt;
 	struct ifaddr *ifa;
 	struct in_ifaddr *ia;
 	struct sockaddr sa;
 	struct in_addr isaddr, itaddr, myaddr;
 	u_int8_t *enaddr = NULL;
 	int op, flags;
 	int req_len;
 	int bridged = 0, is_bridge = 0;
 	int carped;
 	struct sockaddr_in sin;
 	sin.sin_len = sizeof(struct sockaddr_in);
 	sin.sin_family = AF_INET;
 	sin.sin_addr.s_addr = 0;
 
 	if (ifp->if_bridge)
 		bridged = 1;
 	if (ifp->if_type == IFT_BRIDGE)
 		is_bridge = 1;
 
 	req_len = arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr));
 	if (m->m_len < req_len && (m = m_pullup(m, req_len)) == NULL) {
 		ARP_LOG(LOG_NOTICE, "runt packet -- m_pullup failed\n");
 		return;
 	}
 
 	ah = mtod(m, struct arphdr *);
 	/*
 	 * ARP is only for IPv4 so we can reject packets with
 	 * a protocol length not equal to an IPv4 address.
 	 */
 	if (ah->ar_pln != sizeof(struct in_addr)) {
 		ARP_LOG(LOG_NOTICE, "requested protocol length != %zu\n",
 		    sizeof(struct in_addr));
 		goto drop;
 	}
 
 	if (allow_multicast == 0 && ETHER_IS_MULTICAST(ar_sha(ah))) {
 		ARP_LOG(LOG_NOTICE, "%*D is multicast\n",
 		    ifp->if_addrlen, (u_char *)ar_sha(ah), ":");
 		goto drop;
 	}
 
 	op = ntohs(ah->ar_op);
 	(void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr));
 	(void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr));
 
 	if (op == ARPOP_REPLY)
 		ARPSTAT_INC(rxreplies);
 
 	/*
 	 * For a bridge, we want to check the address irrespective
 	 * of the receive interface. (This will change slightly
 	 * when we have clusters of interfaces).
 	 */
 	IN_IFADDR_RLOCK();
 	LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
 		if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
 		    ia->ia_ifp == ifp) &&
 		    itaddr.s_addr == ia->ia_addr.sin_addr.s_addr &&
 		    (ia->ia_ifa.ifa_carp == NULL ||
 		    (*carp_iamatch_p)(&ia->ia_ifa, &enaddr))) {
 			ifa_ref(&ia->ia_ifa);
 			IN_IFADDR_RUNLOCK();
 			goto match;
 		}
 	}
 	LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash)
 		if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
 		    ia->ia_ifp == ifp) &&
 		    isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
 			ifa_ref(&ia->ia_ifa);
 			IN_IFADDR_RUNLOCK();
 			goto match;
 		}
 
 #define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia)				\
   (ia->ia_ifp->if_bridge == ifp->if_softc &&				\
   !bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) &&	\
   addr == ia->ia_addr.sin_addr.s_addr)
 	/*
 	 * Check the case when bridge shares its MAC address with
 	 * some of its children, so packets are claimed by bridge
 	 * itself (bridge_input() does it first), but they are really
 	 * meant to be destined to the bridge member.
 	 */
 	if (is_bridge) {
 		LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
 			if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) {
 				ifa_ref(&ia->ia_ifa);
 				ifp = ia->ia_ifp;
 				IN_IFADDR_RUNLOCK();
 				goto match;
 			}
 		}
 	}
 #undef BDG_MEMBER_MATCHES_ARP
 	IN_IFADDR_RUNLOCK();
 
 	/*
 	 * No match, use the first inet address on the receive interface
 	 * as a dummy address for the rest of the function.
 	 */
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 		if (ifa->ifa_addr->sa_family == AF_INET &&
 		    (ifa->ifa_carp == NULL ||
 		    (*carp_iamatch_p)(ifa, &enaddr))) {
 			ia = ifatoia(ifa);
 			ifa_ref(ifa);
 			IF_ADDR_RUNLOCK(ifp);
 			goto match;
 		}
 	IF_ADDR_RUNLOCK(ifp);
 
 	/*
 	 * If bridging, fall back to using any inet address.
 	 */
 	IN_IFADDR_RLOCK();
 	if (!bridged || (ia = TAILQ_FIRST(&V_in_ifaddrhead)) == NULL) {
 		IN_IFADDR_RUNLOCK();
 		goto drop;
 	}
 	ifa_ref(&ia->ia_ifa);
 	IN_IFADDR_RUNLOCK();
 match:
 	if (!enaddr)
 		enaddr = (u_int8_t *)IF_LLADDR(ifp);
 	carped = (ia->ia_ifa.ifa_carp != NULL);
 	myaddr = ia->ia_addr.sin_addr;
 	ifa_free(&ia->ia_ifa);
 	if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen))
 		goto drop;	/* it's from me, ignore it. */
 	if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) {
 		ARP_LOG(LOG_NOTICE, "link address is broadcast for IP address "
 		    "%s!\n", inet_ntoa(isaddr));
 		goto drop;
 	}
 	/*
 	 * Warn if another host is using the same IP address, but only if the
 	 * IP address isn't 0.0.0.0, which is used for DHCP only, in which
 	 * case we suppress the warning to avoid false positive complaints of
 	 * potential misconfiguration.
 	 */
 	if (!bridged && !carped && isaddr.s_addr == myaddr.s_addr &&
 	    myaddr.s_addr != 0) {
 		ARP_LOG(LOG_ERR, "%*D is using my IP address %s on %s!\n",
 		   ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
 		   inet_ntoa(isaddr), ifp->if_xname);
 		itaddr = myaddr;
 		ARPSTAT_INC(dupips);
 		goto reply;
 	}
 	if (ifp->if_flags & IFF_STATICARP)
 		goto reply;
 
 	bzero(&sin, sizeof(sin));
 	sin.sin_len = sizeof(struct sockaddr_in);
 	sin.sin_family = AF_INET;
 	sin.sin_addr = isaddr;
 	flags = (itaddr.s_addr == myaddr.s_addr) ? LLE_CREATE : 0;
 	flags |= LLE_EXCLUSIVE;
 	IF_AFDATA_LOCK(ifp);
 	la = lla_lookup(LLTABLE(ifp), flags, (struct sockaddr *)&sin);
 	IF_AFDATA_UNLOCK(ifp);
 	if (la != NULL) {
 		/* the following is not an error when doing bridging */
 		if (!bridged && la->lle_tbl->llt_ifp != ifp) {
 			if (log_arp_wrong_iface)
 				ARP_LOG(LOG_WARNING, "%s is on %s "
 				    "but got reply from %*D on %s\n",
 				    inet_ntoa(isaddr),
 				    la->lle_tbl->llt_ifp->if_xname,
 				    ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
 				    ifp->if_xname);
 			LLE_WUNLOCK(la);
 			goto reply;
 		}
 		if ((la->la_flags & LLE_VALID) &&
 		    bcmp(ar_sha(ah), &la->ll_addr, ifp->if_addrlen)) {
 			if (la->la_flags & LLE_STATIC) {
 				LLE_WUNLOCK(la);
 				if (log_arp_permanent_modify)
 					ARP_LOG(LOG_ERR,
 					    "%*D attempts to modify "
 					    "permanent entry for %s on %s\n",
 					    ifp->if_addrlen,
 					    (u_char *)ar_sha(ah), ":",
 					    inet_ntoa(isaddr), ifp->if_xname);
 				goto reply;
 			}
 			if (log_arp_movements) {
 				ARP_LOG(LOG_INFO, "%s moved from %*D "
 				    "to %*D on %s\n",
 				    inet_ntoa(isaddr),
 				    ifp->if_addrlen,
 				    (u_char *)&la->ll_addr, ":",
 				    ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
 				    ifp->if_xname);
 			}
 		}
 
 		if (ifp->if_addrlen != ah->ar_hln) {
 			LLE_WUNLOCK(la);
 			ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, "
 			    "i/f %d (ignored)\n", ifp->if_addrlen,
 			    (u_char *) ar_sha(ah), ":", ah->ar_hln,
 			    ifp->if_addrlen);
 			goto drop;
 		}
 		(void)memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen);
 		la->la_flags |= LLE_VALID;
 
 		EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED);
 
 		if (!(la->la_flags & LLE_STATIC)) {
 			int canceled;
 
 			LLE_ADDREF(la);
 			la->la_expire = time_uptime + V_arpt_keep;
 			canceled = callout_reset(&la->la_timer,
 			    hz * V_arpt_keep, arptimer, la);
 			if (canceled)
 				LLE_REMREF(la);
 		}
 		la->la_asked = 0;
 		la->la_preempt = V_arp_maxtries;
 		/*
 		 * The packets are all freed within the call to the output
 		 * routine.
 		 *
 		 * NB: The lock MUST be released before the call to the
 		 * output routine.
 		 */
 		if (la->la_hold != NULL) {
 			struct mbuf *m_hold, *m_hold_next;
 
 			m_hold = la->la_hold;
 			la->la_hold = NULL;
 			la->la_numheld = 0;
 			memcpy(&sa, L3_ADDR(la), sizeof(sa));
 			LLE_WUNLOCK(la);
 			for (; m_hold != NULL; m_hold = m_hold_next) {
 				m_hold_next = m_hold->m_nextpkt;
 				m_hold->m_nextpkt = NULL;
 				/* Avoid confusing lower layers. */
 				m_clrprotoflags(m_hold);
 				(*ifp->if_output)(ifp, m_hold, &sa, NULL);
 			}
 		} else
 			LLE_WUNLOCK(la);
 	}
 reply:
 	if (op != ARPOP_REQUEST)
 		goto drop;
 	ARPSTAT_INC(rxrequests);
 
 	if (itaddr.s_addr == myaddr.s_addr) {
 		/* Shortcut.. the receiving interface is the target. */
 		(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
 		(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
 	} else {
 		struct llentry *lle = NULL;
 
 		sin.sin_addr = itaddr;
 		IF_AFDATA_RLOCK(ifp);
 		lle = lla_lookup(LLTABLE(ifp), 0, (struct sockaddr *)&sin);
 		IF_AFDATA_RUNLOCK(ifp);
 
 		if ((lle != NULL) && (lle->la_flags & LLE_PUB)) {
 			(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
 			(void)memcpy(ar_sha(ah), &lle->ll_addr, ah->ar_hln);
 			LLE_RUNLOCK(lle);
 		} else {
 
 			if (lle != NULL)
 				LLE_RUNLOCK(lle);
 
 			if (!V_arp_proxyall)
 				goto drop;
 
 			sin.sin_addr = itaddr;
 			/* XXX MRT use table 0 for arp reply  */
 			rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0);
 			if (!rt)
 				goto drop;
 
 			/*
 			 * Don't send proxies for nodes on the same interface
 			 * as this one came out of, or we'll get into a fight
 			 * over who claims what Ether address.
 			 */
 			if (!rt->rt_ifp || rt->rt_ifp == ifp) {
 				RTFREE_LOCKED(rt);
 				goto drop;
 			}
 			RTFREE_LOCKED(rt);
 
 			(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
 			(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
 
 			/*
 			 * Also check that the node which sent the ARP packet
 			 * is on the interface we expect it to be on. This
 			 * avoids ARP chaos if an interface is connected to the
 			 * wrong network.
 			 */
 			sin.sin_addr = isaddr;
 
 			/* XXX MRT use table 0 for arp checks */
 			rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0);
 			if (!rt)
 				goto drop;
 			if (rt->rt_ifp != ifp) {
 				ARP_LOG(LOG_INFO, "proxy: ignoring request"
 				    " from %s via %s, expecting %s\n",
 				    inet_ntoa(isaddr), ifp->if_xname,
 				    rt->rt_ifp->if_xname);
 				RTFREE_LOCKED(rt);
 				goto drop;
 			}
 			RTFREE_LOCKED(rt);
 
 #ifdef DEBUG_PROXY
 			printf("arp: proxying for %s\n", inet_ntoa(itaddr));
 #endif
 		}
 	}
 
 	if (itaddr.s_addr == myaddr.s_addr &&
 	    IN_LINKLOCAL(ntohl(itaddr.s_addr))) {
 		/* RFC 3927 link-local IPv4; always reply by broadcast. */
 #ifdef DEBUG_LINKLOCAL
 		printf("arp: sending reply for link-local addr %s\n",
 		    inet_ntoa(itaddr));
 #endif
 		m->m_flags |= M_BCAST;
 		m->m_flags &= ~M_MCAST;
 	} else {
 		/* default behaviour; never reply by broadcast. */
 		m->m_flags &= ~(M_BCAST|M_MCAST);
 	}
 	(void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln);
 	(void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln);
 	ah->ar_op = htons(ARPOP_REPLY);
 	ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */
 	m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln);
 	m->m_pkthdr.len = m->m_len;
 	m->m_pkthdr.rcvif = NULL;
 	sa.sa_family = AF_ARP;
 	sa.sa_len = 2;
 	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 	(*ifp->if_output)(ifp, m, &sa, NULL);
 	ARPSTAT_INC(txreplies);
 	return;
 
 drop:
 	m_freem(m);
 }
 #endif
 
 void
 arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa)
 {
 	struct llentry *lle;
 
 	if (ifa->ifa_carp != NULL)
 		return;
 
 	if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) {
 		arprequest(ifp, &IA_SIN(ifa)->sin_addr,
 				&IA_SIN(ifa)->sin_addr, IF_LLADDR(ifp));
 		/*
 		 * interface address is considered static entry
 		 * because the output of the arp utility shows
 		 * that L2 entry as permanent
 		 */
 		IF_AFDATA_LOCK(ifp);
 		lle = lla_lookup(LLTABLE(ifp), (LLE_CREATE | LLE_IFADDR | LLE_STATIC),
 				 (struct sockaddr *)IA_SIN(ifa));
 		IF_AFDATA_UNLOCK(ifp);
 		if (lle == NULL)
 			log(LOG_INFO, "arp_ifinit: cannot create arp "
 			    "entry for interface address\n");
 		else
 			LLE_RUNLOCK(lle);
 	}
 	ifa->ifa_rtrequest = NULL;
 }
 
 void
 arp_ifinit2(struct ifnet *ifp, struct ifaddr *ifa, u_char *enaddr)
 {
 	if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY)
 		arprequest(ifp, &IA_SIN(ifa)->sin_addr,
 				&IA_SIN(ifa)->sin_addr, enaddr);
 	ifa->ifa_rtrequest = NULL;
 }
 
 static void
 arp_init(void)
 {
 
 	netisr_register(&arp_nh);
 }
 SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0);
Index: user/dchagin/lemul/sys/netinet/igmp.c
===================================================================
--- user/dchagin/lemul/sys/netinet/igmp.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/igmp.c	(revision 274241)
@@ -1,3655 +1,3655 @@
 /*-
  * Copyright (c) 2007-2009 Bruce Simpson.
  * Copyright (c) 1988 Stephen Deering.
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Stephen Deering of Stanford University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)igmp.c	8.1 (Berkeley) 7/19/93
  */
 
 /*
  * Internet Group Management Protocol (IGMP) routines.
  * [RFC1112, RFC2236, RFC3376]
  *
  * Written by Steve Deering, Stanford, May 1988.
  * Modified by Rosen Sharma, Stanford, Aug 1994.
  * Modified by Bill Fenner, Xerox PARC, Feb 1995.
  * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995.
  * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson.
  *
  * MULTICAST Revision: 3.5.1.4
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/module.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/protosw.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/ktr.h>
 #include <sys/condvar.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/netisr.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/igmp.h>
 #include <netinet/igmp_var.h>
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifndef KTR_IGMPV3
 #define KTR_IGMPV3 KTR_INET
 #endif
 
 static struct igmp_ifinfo *
 		igi_alloc_locked(struct ifnet *);
 static void	igi_delete_locked(const struct ifnet *);
 static void	igmp_dispatch_queue(struct ifqueue *, int, const int);
 static void	igmp_fasttimo_vnet(void);
 static void	igmp_final_leave(struct in_multi *, struct igmp_ifinfo *);
 static int	igmp_handle_state_change(struct in_multi *,
 		    struct igmp_ifinfo *);
 static int	igmp_initial_join(struct in_multi *, struct igmp_ifinfo *);
 static int	igmp_input_v1_query(struct ifnet *, const struct ip *,
 		    const struct igmp *);
 static int	igmp_input_v2_query(struct ifnet *, const struct ip *,
 		    const struct igmp *);
 static int	igmp_input_v3_query(struct ifnet *, const struct ip *,
 		    /*const*/ struct igmpv3 *);
 static int	igmp_input_v3_group_query(struct in_multi *,
 		    struct igmp_ifinfo *, int, /*const*/ struct igmpv3 *);
 static int	igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *,
 		    /*const*/ struct igmp *);
 static int	igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *,
 		    /*const*/ struct igmp *);
 static void	igmp_intr(struct mbuf *);
 static int	igmp_isgroupreported(const struct in_addr);
 static struct mbuf *
 		igmp_ra_alloc(void);
 #ifdef KTR
 static char *	igmp_rec_type_to_str(const int);
 #endif
 static void	igmp_set_version(struct igmp_ifinfo *, const int);
 static void	igmp_slowtimo_vnet(void);
 static int	igmp_v1v2_queue_report(struct in_multi *, const int);
 static void	igmp_v1v2_process_group_timer(struct in_multi *, const int);
 static void	igmp_v1v2_process_querier_timers(struct igmp_ifinfo *);
 static void	igmp_v2_update_group(struct in_multi *, const int);
 static void	igmp_v3_cancel_link_timers(struct igmp_ifinfo *);
 static void	igmp_v3_dispatch_general_query(struct igmp_ifinfo *);
 static struct mbuf *
 		igmp_v3_encap_report(struct ifnet *, struct mbuf *);
 static int	igmp_v3_enqueue_group_record(struct ifqueue *,
 		    struct in_multi *, const int, const int, const int);
 static int	igmp_v3_enqueue_filter_change(struct ifqueue *,
 		    struct in_multi *);
 static void	igmp_v3_process_group_timers(struct igmp_ifinfo *,
 		    struct ifqueue *, struct ifqueue *, struct in_multi *,
 		    const int);
 static int	igmp_v3_merge_state_changes(struct in_multi *,
 		    struct ifqueue *);
 static void	igmp_v3_suppress_group_record(struct in_multi *);
 static int	sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS);
 static int	sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS);
 static int	sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS);
 
 static const struct netisr_handler igmp_nh = {
 	.nh_name = "igmp",
 	.nh_handler = igmp_intr,
 	.nh_proto = NETISR_IGMP,
 	.nh_policy = NETISR_POLICY_SOURCE,
 };
 
 /*
  * System-wide globals.
  *
  * Unlocked access to these is OK, except for the global IGMP output
  * queue. The IGMP subsystem lock ends up being system-wide for the moment,
  * because all VIMAGEs have to share a global output queue, as netisrs
  * themselves are not virtualized.
  *
  * Locking:
  *  * The permitted lock order is: IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK.
  *    Any may be taken independently; if any are held at the same
  *    time, the above lock order must be followed.
  *  * All output is delegated to the netisr.
  *    Now that Giant has been eliminated, the netisr may be inlined.
  *  * IN_MULTI_LOCK covers in_multi.
  *  * IGMP_LOCK covers igmp_ifinfo and any global variables in this file,
  *    including the output queue.
  *  * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of
  *    per-link state iterators.
  *  * igmp_ifinfo is valid as long as PF_INET is attached to the interface,
  *    therefore it is not refcounted.
  *    We allow unlocked reads of igmp_ifinfo when accessed via in_multi.
  *
  * Reference counting
  *  * IGMP acquires its own reference every time an in_multi is passed to
  *    it and the group is being joined for the first time.
  *  * IGMP releases its reference(s) on in_multi in a deferred way,
  *    because the operations which process the release run as part of
  *    a loop whose control variables are directly affected by the release
  *    (that, and not recursing on the IF_ADDR_LOCK).
  *
  * VIMAGE: Each in_multi corresponds to an ifp, and each ifp corresponds
  * to a vnet in ifp->if_vnet.
  *
  * SMPng: XXX We may potentially race operations on ifma_protospec.
  * The problem is that we currently lack a clean way of taking the
  * IF_ADDR_LOCK() between the ifnet and in layers w/o recursing,
  * as anything which modifies ifma needs to be covered by that lock.
  * So check for ifma_protospec being NULL before proceeding.
  */
 struct mtx		 igmp_mtx;
 
 struct mbuf		*m_raopt;		 /* Router Alert option */
 static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state");
 
 /*
  * VIMAGE-wide globals.
  *
  * The IGMPv3 timers themselves need to run per-image, however,
  * protosw timers run globally (see tcp).
  * An ifnet can only be in one vimage at a time, and the loopback
  * ifnet, loif, is itself virtualized.
  * It would otherwise be possible to seriously hose IGMP state,
  * and create inconsistencies in upstream multicast routing, if you have
  * multiple VIMAGEs running on the same link joining different multicast
  * groups, UNLESS the "primary IP address" is different. This is because
  * IGMP for IPv4 does not force link-local addresses to be used for each
  * node, unlike MLD for IPv6.
  * Obviously the IGMPv3 per-interface state has per-vimage granularity
  * also as a result.
  *
  * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection
  * policy to control the address used by IGMP on the link.
  */
 static VNET_DEFINE(int, interface_timers_running);	/* IGMPv3 general
 							 * query response */
 static VNET_DEFINE(int, state_change_timers_running);	/* IGMPv3 state-change
 							 * retransmit */
 static VNET_DEFINE(int, current_state_timers_running);	/* IGMPv1/v2 host
 							 * report; IGMPv3 g/sg
 							 * query response */
 
 #define	V_interface_timers_running	VNET(interface_timers_running)
 #define	V_state_change_timers_running	VNET(state_change_timers_running)
 #define	V_current_state_timers_running	VNET(current_state_timers_running)
 
 static VNET_DEFINE(LIST_HEAD(, igmp_ifinfo), igi_head);
 static VNET_DEFINE(struct igmpstat, igmpstat) = {
 	.igps_version = IGPS_VERSION_3,
 	.igps_len = sizeof(struct igmpstat),
 };
 static VNET_DEFINE(struct timeval, igmp_gsrdelay) = {10, 0};
 
 #define	V_igi_head			VNET(igi_head)
 #define	V_igmpstat			VNET(igmpstat)
 #define	V_igmp_gsrdelay			VNET(igmp_gsrdelay)
 
 static VNET_DEFINE(int, igmp_recvifkludge) = 1;
 static VNET_DEFINE(int, igmp_sendra) = 1;
 static VNET_DEFINE(int, igmp_sendlocal) = 1;
 static VNET_DEFINE(int, igmp_v1enable) = 1;
 static VNET_DEFINE(int, igmp_v2enable) = 1;
 static VNET_DEFINE(int, igmp_legacysupp);
 static VNET_DEFINE(int, igmp_default_version) = IGMP_VERSION_3;
 
 #define	V_igmp_recvifkludge		VNET(igmp_recvifkludge)
 #define	V_igmp_sendra			VNET(igmp_sendra)
 #define	V_igmp_sendlocal		VNET(igmp_sendlocal)
 #define	V_igmp_v1enable			VNET(igmp_v1enable)
 #define	V_igmp_v2enable			VNET(igmp_v2enable)
 #define	V_igmp_legacysupp		VNET(igmp_legacysupp)
 #define	V_igmp_default_version		VNET(igmp_default_version)
 
 /*
  * Virtualized sysctls.
  */
-SYSCTL_VNET_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RW,
+SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmpstat), igmpstat, "");
-SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_recvifkludge), 0,
     "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address");
-SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_sendra), 0,
     "Send IP Router Alert option in IGMPv2/v3 messages");
-SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_sendlocal), 0,
     "Send IGMP membership reports for 224.0.0.0/24 groups");
-SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_v1enable), 0,
     "Enable backwards compatibility with IGMPv1");
-SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_v2enable), 0,
     "Enable backwards compatibility with IGMPv2");
-SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_legacysupp), 0,
     "Allow v1/v2 reports to suppress v3 group responses");
-SYSCTL_VNET_PROC(_net_inet_igmp, OID_AUTO, default_version,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+SYSCTL_PROC(_net_inet_igmp, OID_AUTO, default_version,
+    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &VNET_NAME(igmp_default_version), 0, sysctl_igmp_default_version, "I",
     "Default version of IGMP to run on each interface");
-SYSCTL_VNET_PROC(_net_inet_igmp, OID_AUTO, gsrdelay,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+SYSCTL_PROC(_net_inet_igmp, OID_AUTO, gsrdelay,
+    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &VNET_NAME(igmp_gsrdelay.tv_sec), 0, sysctl_igmp_gsr, "I",
     "Rate limit for IGMPv3 Group-and-Source queries in seconds");
 
 /*
  * Non-virtualized sysctls.
  */
 static SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo,
     CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_igmp_ifinfo,
     "Per-interface IGMPv3 state");
 
 static __inline void
 igmp_save_context(struct mbuf *m, struct ifnet *ifp)
 {
 
 #ifdef VIMAGE
 	m->m_pkthdr.PH_loc.ptr = ifp->if_vnet;
 #endif /* VIMAGE */
 	m->m_pkthdr.flowid = ifp->if_index;
 }
 
 static __inline void
 igmp_scrub_context(struct mbuf *m)
 {
 
 	m->m_pkthdr.PH_loc.ptr = NULL;
 	m->m_pkthdr.flowid = 0;
 }
 
 #ifdef KTR
 static __inline char *
 inet_ntoa_haddr(in_addr_t haddr)
 {
 	struct in_addr ia;
 
 	ia.s_addr = htonl(haddr);
 	return (inet_ntoa(ia));
 }
 #endif
 
 /*
  * Restore context from a queued IGMP output chain.
  * Return saved ifindex.
  *
  * VIMAGE: The assertion is there to make sure that we
  * actually called CURVNET_SET() with what's in the mbuf chain.
  */
 static __inline uint32_t
 igmp_restore_context(struct mbuf *m)
 {
 
 #ifdef notyet
 #if defined(VIMAGE) && defined(INVARIANTS)
 	KASSERT(curvnet == (m->m_pkthdr.PH_loc.ptr),
 	    ("%s: called when curvnet was not restored", __func__));
 #endif
 #endif
 	return (m->m_pkthdr.flowid);
 }
 
 /*
  * Retrieve or set default IGMP version.
  *
  * VIMAGE: Assume curvnet set by caller.
  * SMPng: NOTE: Serialized by IGMP lock.
  */
 static int
 sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS)
 {
 	int	 error;
 	int	 new;
 
 	error = sysctl_wire_old_buffer(req, sizeof(int));
 	if (error)
 		return (error);
 
 	IGMP_LOCK();
 
 	new = V_igmp_default_version;
 
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error || !req->newptr)
 		goto out_locked;
 
 	if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) {
 		error = EINVAL;
 		goto out_locked;
 	}
 
 	CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d",
 	     V_igmp_default_version, new);
 
 	V_igmp_default_version = new;
 
 out_locked:
 	IGMP_UNLOCK();
 	return (error);
 }
 
 /*
  * Retrieve or set threshold between group-source queries in seconds.
  *
  * VIMAGE: Assume curvnet set by caller.
  * SMPng: NOTE: Serialized by IGMP lock.
  */
 static int
 sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	int i;
 
 	error = sysctl_wire_old_buffer(req, sizeof(int));
 	if (error)
 		return (error);
 
 	IGMP_LOCK();
 
 	i = V_igmp_gsrdelay.tv_sec;
 
 	error = sysctl_handle_int(oidp, &i, 0, req);
 	if (error || !req->newptr)
 		goto out_locked;
 
 	if (i < -1 || i >= 60) {
 		error = EINVAL;
 		goto out_locked;
 	}
 
 	CTR2(KTR_IGMPV3, "change igmp_gsrdelay from %d to %d",
 	     V_igmp_gsrdelay.tv_sec, i);
 	V_igmp_gsrdelay.tv_sec = i;
 
 out_locked:
 	IGMP_UNLOCK();
 	return (error);
 }
 
 /*
  * Expose struct igmp_ifinfo to userland, keyed by ifindex.
  * For use by ifmcstat(8).
  *
  * SMPng: NOTE: Does an unlocked ifindex space read.
  * VIMAGE: Assume curvnet set by caller. The node handler itself
  * is not directly virtualized.
  */
 static int
 sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS)
 {
 	int			*name;
 	int			 error;
 	u_int			 namelen;
 	struct ifnet		*ifp;
 	struct igmp_ifinfo	*igi;
 
 	name = (int *)arg1;
 	namelen = arg2;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	if (namelen != 1)
 		return (EINVAL);
 
 	error = sysctl_wire_old_buffer(req, sizeof(struct igmp_ifinfo));
 	if (error)
 		return (error);
 
 	IN_MULTI_LOCK();
 	IGMP_LOCK();
 
 	if (name[0] <= 0 || name[0] > V_if_index) {
 		error = ENOENT;
 		goto out_locked;
 	}
 
 	error = ENOENT;
 
 	ifp = ifnet_byindex(name[0]);
 	if (ifp == NULL)
 		goto out_locked;
 
 	LIST_FOREACH(igi, &V_igi_head, igi_link) {
 		if (ifp == igi->igi_ifp) {
 			error = SYSCTL_OUT(req, igi,
 			    sizeof(struct igmp_ifinfo));
 			break;
 		}
 	}
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Dispatch an entire queue of pending packet chains
  * using the netisr.
  * VIMAGE: Assumes the vnet pointer has been set.
  */
 static void
 igmp_dispatch_queue(struct ifqueue *ifq, int limit, const int loop)
 {
 	struct mbuf *m;
 
 	for (;;) {
 		_IF_DEQUEUE(ifq, m);
 		if (m == NULL)
 			break;
 		CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, ifq, m);
 		if (loop)
 			m->m_flags |= M_IGMP_LOOP;
 		netisr_dispatch(NETISR_IGMP, m);
 		if (--limit == 0)
 			break;
 	}
 }
 
 /*
  * Filter outgoing IGMP report state by group.
  *
  * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1).
  * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are
  * disabled for all groups in the 224.0.0.0/24 link-local scope. However,
  * this may break certain IGMP snooping switches which rely on the old
  * report behaviour.
  *
  * Return zero if the given group is one for which IGMP reports
  * should be suppressed, or non-zero if reports should be issued.
  */
 static __inline int
 igmp_isgroupreported(const struct in_addr addr)
 {
 
 	if (in_allhosts(addr) ||
 	    ((!V_igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr)))))
 		return (0);
 
 	return (1);
 }
 
 /*
  * Construct a Router Alert option to use in outgoing packets.
  */
 static struct mbuf *
 igmp_ra_alloc(void)
 {
 	struct mbuf	*m;
 	struct ipoption	*p;
 
 	m = m_get(M_WAITOK, MT_DATA);
 	p = mtod(m, struct ipoption *);
 	p->ipopt_dst.s_addr = INADDR_ANY;
 	p->ipopt_list[0] = IPOPT_RA;	/* Router Alert Option */
 	p->ipopt_list[1] = 0x04;	/* 4 bytes long */
 	p->ipopt_list[2] = IPOPT_EOL;	/* End of IP option list */
 	p->ipopt_list[3] = 0x00;	/* pad byte */
 	m->m_len = sizeof(p->ipopt_dst) + p->ipopt_list[1];
 
 	return (m);
 }
 
 /*
  * Attach IGMP when PF_INET is attached to an interface.
  */
 struct igmp_ifinfo *
 igmp_domifattach(struct ifnet *ifp)
 {
 	struct igmp_ifinfo *igi;
 
 	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)",
 	    __func__, ifp, ifp->if_xname);
 
 	IGMP_LOCK();
 
 	igi = igi_alloc_locked(ifp);
 	if (!(ifp->if_flags & IFF_MULTICAST))
 		igi->igi_flags |= IGIF_SILENT;
 
 	IGMP_UNLOCK();
 
 	return (igi);
 }
 
 /*
  * VIMAGE: assume curvnet set by caller.
  */
 static struct igmp_ifinfo *
 igi_alloc_locked(/*const*/ struct ifnet *ifp)
 {
 	struct igmp_ifinfo *igi;
 
 	IGMP_LOCK_ASSERT();
 
 	igi = malloc(sizeof(struct igmp_ifinfo), M_IGMP, M_NOWAIT|M_ZERO);
 	if (igi == NULL)
 		goto out;
 
 	igi->igi_ifp = ifp;
 	igi->igi_version = V_igmp_default_version;
 	igi->igi_flags = 0;
 	igi->igi_rv = IGMP_RV_INIT;
 	igi->igi_qi = IGMP_QI_INIT;
 	igi->igi_qri = IGMP_QRI_INIT;
 	igi->igi_uri = IGMP_URI_INIT;
 
 	SLIST_INIT(&igi->igi_relinmhead);
 
 	/*
 	 * Responses to general queries are subject to bounds.
 	 */
 	IFQ_SET_MAXLEN(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS);
 
 	LIST_INSERT_HEAD(&V_igi_head, igi, igi_link);
 
 	CTR2(KTR_IGMPV3, "allocate igmp_ifinfo for ifp %p(%s)",
 	     ifp, ifp->if_xname);
 
 out:
 	return (igi);
 }
 
 /*
  * Hook for ifdetach.
  *
  * NOTE: Some finalization tasks need to run before the protocol domain
  * is detached, but also before the link layer does its cleanup.
  *
  * SMPNG: igmp_ifdetach() needs to take IF_ADDR_LOCK().
  * XXX This is also bitten by unlocked ifma_protospec access.
  */
 void
 igmp_ifdetach(struct ifnet *ifp)
 {
 	struct igmp_ifinfo	*igi;
 	struct ifmultiaddr	*ifma;
 	struct in_multi		*inm, *tinm;
 
 	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp,
 	    ifp->if_xname);
 
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	if (igi->igi_version == IGMP_VERSION_3) {
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (ifma->ifma_addr->sa_family != AF_INET ||
 			    ifma->ifma_protospec == NULL)
 				continue;
 #if 0
 			KASSERT(ifma->ifma_protospec != NULL,
 			    ("%s: ifma_protospec is NULL", __func__));
 #endif
 			inm = (struct in_multi *)ifma->ifma_protospec;
 			if (inm->inm_state == IGMP_LEAVING_MEMBER) {
 				SLIST_INSERT_HEAD(&igi->igi_relinmhead,
 				    inm, inm_nrele);
 			}
 			inm_clear_recorded(inm);
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		/*
 		 * Free the in_multi reference(s) for this IGMP lifecycle.
 		 */
 		SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele,
 		    tinm) {
 			SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele);
 			inm_release_locked(inm);
 		}
 	}
 
 	IGMP_UNLOCK();
 }
 
 /*
  * Hook for domifdetach.
  */
 void
 igmp_domifdetach(struct ifnet *ifp)
 {
 	struct igmp_ifinfo *igi;
 
 	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)",
 	    __func__, ifp, ifp->if_xname);
 
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	igi_delete_locked(ifp);
 
 	IGMP_UNLOCK();
 }
 
 static void
 igi_delete_locked(const struct ifnet *ifp)
 {
 	struct igmp_ifinfo *igi, *tigi;
 
 	CTR3(KTR_IGMPV3, "%s: freeing igmp_ifinfo for ifp %p(%s)",
 	    __func__, ifp, ifp->if_xname);
 
 	IGMP_LOCK_ASSERT();
 
 	LIST_FOREACH_SAFE(igi, &V_igi_head, igi_link, tigi) {
 		if (igi->igi_ifp == ifp) {
 			/*
 			 * Free deferred General Query responses.
 			 */
 			_IF_DRAIN(&igi->igi_gq);
 
 			LIST_REMOVE(igi, igi_link);
 
 			KASSERT(SLIST_EMPTY(&igi->igi_relinmhead),
 			    ("%s: there are dangling in_multi references",
 			    __func__));
 
 			free(igi, M_IGMP);
 			return;
 		}
 	}
 
 #ifdef INVARIANTS
 	panic("%s: igmp_ifinfo not found for ifp %p\n", __func__,  ifp);
 #endif
 }
 
 /*
  * Process a received IGMPv1 query.
  * Return non-zero if the message should be dropped.
  *
  * VIMAGE: The curvnet pointer is derived from the input ifp.
  */
 static int
 igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip,
     const struct igmp *igmp)
 {
 	struct ifmultiaddr	*ifma;
 	struct igmp_ifinfo	*igi;
 	struct in_multi		*inm;
 
 	/*
 	 * IGMPv1 Host Mmembership Queries SHOULD always be addressed to
 	 * 224.0.0.1. They are always treated as General Queries.
 	 * igmp_group is always ignored. Do not drop it as a userland
 	 * daemon may wish to see it.
 	 * XXX SMPng: unlocked increments in igmpstat assumed atomic.
 	 */
 	if (!in_allhosts(ip->ip_dst) || !in_nullhost(igmp->igmp_group)) {
 		IGMPSTAT_INC(igps_rcv_badqueries);
 		return (0);
 	}
 	IGMPSTAT_INC(igps_rcv_gen_queries);
 
 	IN_MULTI_LOCK();
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
 
 	if (igi->igi_flags & IGIF_LOOPBACK) {
 		CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		goto out_locked;
 	}
 
 	/*
 	 * Switch to IGMPv1 host compatibility mode.
 	 */
 	igmp_set_version(igi, IGMP_VERSION_1);
 
 	CTR2(KTR_IGMPV3, "process v1 query on ifp %p(%s)", ifp, ifp->if_xname);
 
 	/*
 	 * Start the timers in all of our group records
 	 * for the interface on which the query arrived,
 	 * except those which are already running.
 	 */
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_INET ||
 		    ifma->ifma_protospec == NULL)
 			continue;
 		inm = (struct in_multi *)ifma->ifma_protospec;
 		if (inm->inm_timer != 0)
 			continue;
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 			break;
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 		case IGMP_REPORTING_MEMBER:
 		case IGMP_IDLE_MEMBER:
 		case IGMP_LAZY_MEMBER:
 		case IGMP_SLEEPING_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			inm->inm_state = IGMP_REPORTING_MEMBER;
 			inm->inm_timer = IGMP_RANDOM_DELAY(
 			    IGMP_V1V2_MAX_RI * PR_FASTHZ);
 			V_current_state_timers_running = 1;
 			break;
 		case IGMP_LEAVING_MEMBER:
 			break;
 		}
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Process a received IGMPv2 general or group-specific query.
  */
 static int
 igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip,
     const struct igmp *igmp)
 {
 	struct ifmultiaddr	*ifma;
 	struct igmp_ifinfo	*igi;
 	struct in_multi		*inm;
 	int			 is_general_query;
 	uint16_t		 timer;
 
 	is_general_query = 0;
 
 	/*
 	 * Validate address fields upfront.
 	 * XXX SMPng: unlocked increments in igmpstat assumed atomic.
 	 */
 	if (in_nullhost(igmp->igmp_group)) {
 		/*
 		 * IGMPv2 General Query.
 		 * If this was not sent to the all-hosts group, ignore it.
 		 */
 		if (!in_allhosts(ip->ip_dst))
 			return (0);
 		IGMPSTAT_INC(igps_rcv_gen_queries);
 		is_general_query = 1;
 	} else {
 		/* IGMPv2 Group-Specific Query. */
 		IGMPSTAT_INC(igps_rcv_group_queries);
 	}
 
 	IN_MULTI_LOCK();
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
 
 	if (igi->igi_flags & IGIF_LOOPBACK) {
 		CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		goto out_locked;
 	}
 
 	/*
 	 * Ignore v2 query if in v1 Compatibility Mode.
 	 */
 	if (igi->igi_version == IGMP_VERSION_1)
 		goto out_locked;
 
 	igmp_set_version(igi, IGMP_VERSION_2);
 
 	timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE;
 	if (timer == 0)
 		timer = 1;
 
 	if (is_general_query) {
 		/*
 		 * For each reporting group joined on this
 		 * interface, kick the report timer.
 		 */
 		CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (ifma->ifma_addr->sa_family != AF_INET ||
 			    ifma->ifma_protospec == NULL)
 				continue;
 			inm = (struct in_multi *)ifma->ifma_protospec;
 			igmp_v2_update_group(inm, timer);
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	} else {
 		/*
 		 * Group-specific IGMPv2 query, we need only
 		 * look up the single group to process it.
 		 */
 		inm = inm_lookup(ifp, igmp->igmp_group);
 		if (inm != NULL) {
 			CTR3(KTR_IGMPV3, "process v2 query %s on ifp %p(%s)",
 			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 			igmp_v2_update_group(inm, timer);
 		}
 	}
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Update the report timer on a group in response to an IGMPv2 query.
  *
  * If we are becoming the reporting member for this group, start the timer.
  * If we already are the reporting member for this group, and timer is
  * below the threshold, reset it.
  *
  * We may be updating the group for the first time since we switched
  * to IGMPv3. If we are, then we must clear any recorded source lists,
  * and transition to REPORTING state; the group timer is overloaded
  * for group and group-source query responses. 
  *
  * Unlike IGMPv3, the delay per group should be jittered
  * to avoid bursts of IGMPv2 reports.
  */
 static void
 igmp_v2_update_group(struct in_multi *inm, const int timer)
 {
 
 	CTR4(KTR_IGMPV3, "%s: %s/%s timer=%d", __func__,
 	    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname, timer);
 
 	IN_MULTI_LOCK_ASSERT();
 
 	switch (inm->inm_state) {
 	case IGMP_NOT_MEMBER:
 	case IGMP_SILENT_MEMBER:
 		break;
 	case IGMP_REPORTING_MEMBER:
 		if (inm->inm_timer != 0 &&
 		    inm->inm_timer <= timer) {
 			CTR1(KTR_IGMPV3, "%s: REPORTING and timer running, "
 			    "skipping.", __func__);
 			break;
 		}
 		/* FALLTHROUGH */
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_IDLE_MEMBER:
 	case IGMP_LAZY_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 		CTR1(KTR_IGMPV3, "%s: ->REPORTING", __func__);
 		inm->inm_state = IGMP_REPORTING_MEMBER;
 		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
 		V_current_state_timers_running = 1;
 		break;
 	case IGMP_SLEEPING_MEMBER:
 		CTR1(KTR_IGMPV3, "%s: ->AWAKENING", __func__);
 		inm->inm_state = IGMP_AWAKENING_MEMBER;
 		break;
 	case IGMP_LEAVING_MEMBER:
 		break;
 	}
 }
 
 /*
  * Process a received IGMPv3 general, group-specific or
  * group-and-source-specific query.
  * Assumes m has already been pulled up to the full IGMP message length.
  * Return 0 if successful, otherwise an appropriate error code is returned.
  */
 static int
 igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip,
     /*const*/ struct igmpv3 *igmpv3)
 {
 	struct igmp_ifinfo	*igi;
 	struct in_multi		*inm;
 	int			 is_general_query;
 	uint32_t		 maxresp, nsrc, qqi;
 	uint16_t		 timer;
 	uint8_t			 qrv;
 
 	is_general_query = 0;
 
 	CTR2(KTR_IGMPV3, "process v3 query on ifp %p(%s)", ifp, ifp->if_xname);
 
 	maxresp = igmpv3->igmp_code;	/* in 1/10ths of a second */
 	if (maxresp >= 128) {
 		maxresp = IGMP_MANT(igmpv3->igmp_code) <<
 			  (IGMP_EXP(igmpv3->igmp_code) + 3);
 	}
 
 	/*
 	 * Robustness must never be less than 2 for on-wire IGMPv3.
 	 * FUTURE: Check if ifp has IGIF_LOOPBACK set, as we will make
 	 * an exception for interfaces whose IGMPv3 state changes
 	 * are redirected to loopback (e.g. MANET).
 	 */
 	qrv = IGMP_QRV(igmpv3->igmp_misc);
 	if (qrv < 2) {
 		CTR3(KTR_IGMPV3, "%s: clamping qrv %d to %d", __func__,
 		    qrv, IGMP_RV_INIT);
 		qrv = IGMP_RV_INIT;
 	}
 
 	qqi = igmpv3->igmp_qqi;
 	if (qqi >= 128) {
 		qqi = IGMP_MANT(igmpv3->igmp_qqi) <<
 		     (IGMP_EXP(igmpv3->igmp_qqi) + 3);
 	}
 
 	timer = maxresp * PR_FASTHZ / IGMP_TIMER_SCALE;
 	if (timer == 0)
 		timer = 1;
 
 	nsrc = ntohs(igmpv3->igmp_numsrc);
 
 	/*
 	 * Validate address fields and versions upfront before
 	 * accepting v3 query.
 	 * XXX SMPng: Unlocked access to igmpstat counters here.
 	 */
 	if (in_nullhost(igmpv3->igmp_group)) {
 		/*
 		 * IGMPv3 General Query.
 		 *
 		 * General Queries SHOULD be directed to 224.0.0.1.
 		 * A general query with a source list has undefined
 		 * behaviour; discard it.
 		 */
 		IGMPSTAT_INC(igps_rcv_gen_queries);
 		if (!in_allhosts(ip->ip_dst) || nsrc > 0) {
 			IGMPSTAT_INC(igps_rcv_badqueries);
 			return (0);
 		}
 		is_general_query = 1;
 	} else {
 		/* Group or group-source specific query. */
 		if (nsrc == 0)
 			IGMPSTAT_INC(igps_rcv_group_queries);
 		else
 			IGMPSTAT_INC(igps_rcv_gsr_queries);
 	}
 
 	IN_MULTI_LOCK();
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
 
 	if (igi->igi_flags & IGIF_LOOPBACK) {
 		CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		goto out_locked;
 	}
 
 	/*
 	 * Discard the v3 query if we're in Compatibility Mode.
 	 * The RFC is not obviously worded that hosts need to stay in
 	 * compatibility mode until the Old Version Querier Present
 	 * timer expires.
 	 */
 	if (igi->igi_version != IGMP_VERSION_3) {
 		CTR3(KTR_IGMPV3, "ignore v3 query in v%d mode on ifp %p(%s)",
 		    igi->igi_version, ifp, ifp->if_xname);
 		goto out_locked;
 	}
 
 	igmp_set_version(igi, IGMP_VERSION_3);
 	igi->igi_rv = qrv;
 	igi->igi_qi = qqi;
 	igi->igi_qri = maxresp;
 
 	CTR4(KTR_IGMPV3, "%s: qrv %d qi %d qri %d", __func__, qrv, qqi,
 	    maxresp);
 
 	if (is_general_query) {
 		/*
 		 * Schedule a current-state report on this ifp for
 		 * all groups, possibly containing source lists.
 		 * If there is a pending General Query response
 		 * scheduled earlier than the selected delay, do
 		 * not schedule any other reports.
 		 * Otherwise, reset the interface timer.
 		 */
 		CTR2(KTR_IGMPV3, "process v3 general query on ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) {
 			igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer);
 			V_interface_timers_running = 1;
 		}
 	} else {
 		/*
 		 * Group-source-specific queries are throttled on
 		 * a per-group basis to defeat denial-of-service attempts.
 		 * Queries for groups we are not a member of on this
 		 * link are simply ignored.
 		 */
 		inm = inm_lookup(ifp, igmpv3->igmp_group);
 		if (inm == NULL)
 			goto out_locked;
 		if (nsrc > 0) {
 			if (!ratecheck(&inm->inm_lastgsrtv,
 			    &V_igmp_gsrdelay)) {
 				CTR1(KTR_IGMPV3, "%s: GS query throttled.",
 				    __func__);
 				IGMPSTAT_INC(igps_drop_gsr_queries);
 				goto out_locked;
 			}
 		}
 		CTR3(KTR_IGMPV3, "process v3 %s query on ifp %p(%s)",
 		     inet_ntoa(igmpv3->igmp_group), ifp, ifp->if_xname);
 		/*
 		 * If there is a pending General Query response
 		 * scheduled sooner than the selected delay, no
 		 * further report need be scheduled.
 		 * Otherwise, prepare to respond to the
 		 * group-specific or group-and-source query.
 		 */
 		if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer)
 			igmp_input_v3_group_query(inm, igi, timer, igmpv3);
 	}
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Process a recieved IGMPv3 group-specific or group-and-source-specific
  * query.
  * Return <0 if any error occured. Currently this is ignored.
  */
 static int
 igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifinfo *igi,
     int timer, /*const*/ struct igmpv3 *igmpv3)
 {
 	int			 retval;
 	uint16_t		 nsrc;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	retval = 0;
 
 	switch (inm->inm_state) {
 	case IGMP_NOT_MEMBER:
 	case IGMP_SILENT_MEMBER:
 	case IGMP_SLEEPING_MEMBER:
 	case IGMP_LAZY_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 	case IGMP_IDLE_MEMBER:
 	case IGMP_LEAVING_MEMBER:
 		return (retval);
 		break;
 	case IGMP_REPORTING_MEMBER:
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 		break;
 	}
 
 	nsrc = ntohs(igmpv3->igmp_numsrc);
 
 	/*
 	 * Deal with group-specific queries upfront.
 	 * If any group query is already pending, purge any recorded
 	 * source-list state if it exists, and schedule a query response
 	 * for this group-specific query.
 	 */
 	if (nsrc == 0) {
 		if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER ||
 		    inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) {
 			inm_clear_recorded(inm);
 			timer = min(inm->inm_timer, timer);
 		}
 		inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER;
 		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
 		V_current_state_timers_running = 1;
 		return (retval);
 	}
 
 	/*
 	 * Deal with the case where a group-and-source-specific query has
 	 * been received but a group-specific query is already pending.
 	 */
 	if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) {
 		timer = min(inm->inm_timer, timer);
 		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
 		V_current_state_timers_running = 1;
 		return (retval);
 	}
 
 	/*
 	 * Finally, deal with the case where a group-and-source-specific
 	 * query has been received, where a response to a previous g-s-r
 	 * query exists, or none exists.
 	 * In this case, we need to parse the source-list which the Querier
 	 * has provided us with and check if we have any source list filter
 	 * entries at T1 for these sources. If we do not, there is no need
 	 * schedule a report and the query may be dropped.
 	 * If we do, we must record them and schedule a current-state
 	 * report for those sources.
 	 * FIXME: Handling source lists larger than 1 mbuf requires that
 	 * we pass the mbuf chain pointer down to this function, and use
 	 * m_getptr() to walk the chain.
 	 */
 	if (inm->inm_nsrc > 0) {
 		const struct in_addr	*ap;
 		int			 i, nrecorded;
 
 		ap = (const struct in_addr *)(igmpv3 + 1);
 		nrecorded = 0;
 		for (i = 0; i < nsrc; i++, ap++) {
 			retval = inm_record_source(inm, ap->s_addr);
 			if (retval < 0)
 				break;
 			nrecorded += retval;
 		}
 		if (nrecorded > 0) {
 			CTR1(KTR_IGMPV3,
 			    "%s: schedule response to SG query", __func__);
 			inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER;
 			inm->inm_timer = IGMP_RANDOM_DELAY(timer);
 			V_current_state_timers_running = 1;
 		}
 	}
 
 	return (retval);
 }
 
 /*
  * Process a received IGMPv1 host membership report.
  *
  * NOTE: 0.0.0.0 workaround breaks const correctness.
  */
 static int
 igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip,
     /*const*/ struct igmp *igmp)
 {
 	struct in_ifaddr *ia;
 	struct in_multi *inm;
 
 	IGMPSTAT_INC(igps_rcv_reports);
 
 	if (ifp->if_flags & IFF_LOOPBACK)
 		return (0);
 
 	if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) ||
 	    !in_hosteq(igmp->igmp_group, ip->ip_dst)) {
 		IGMPSTAT_INC(igps_rcv_badreports);
 		return (EINVAL);
 	}
 
 	/*
 	 * RFC 3376, Section 4.2.13, 9.2, 9.3:
 	 * Booting clients may use the source address 0.0.0.0. Some
 	 * IGMP daemons may not know how to use IP_RECVIF to determine
 	 * the interface upon which this message was received.
 	 * Replace 0.0.0.0 with the subnet address if told to do so.
 	 */
 	if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) {
 		IFP_TO_IA(ifp, ia);
 		if (ia != NULL) {
 			ip->ip_src.s_addr = htonl(ia->ia_subnet);
 			ifa_free(&ia->ia_ifa);
 		}
 	}
 
 	CTR3(KTR_IGMPV3, "process v1 report %s on ifp %p(%s)",
 	     inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 
 	/*
 	 * IGMPv1 report suppression.
 	 * If we are a member of this group, and our membership should be
 	 * reported, stop our group timer and transition to the 'lazy' state.
 	 */
 	IN_MULTI_LOCK();
 	inm = inm_lookup(ifp, igmp->igmp_group);
 	if (inm != NULL) {
 		struct igmp_ifinfo *igi;
 
 		igi = inm->inm_igi;
 		if (igi == NULL) {
 			KASSERT(igi != NULL,
 			    ("%s: no igi for ifp %p", __func__, ifp));
 			goto out_locked;
 		}
 
 		IGMPSTAT_INC(igps_rcv_ourreports);
 
 		/*
 		 * If we are in IGMPv3 host mode, do not allow the
 		 * other host's IGMPv1 report to suppress our reports
 		 * unless explicitly configured to do so.
 		 */
 		if (igi->igi_version == IGMP_VERSION_3) {
 			if (V_igmp_legacysupp)
 				igmp_v3_suppress_group_record(inm);
 			goto out_locked;
 		}
 
 		inm->inm_timer = 0;
 
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 			break;
 		case IGMP_IDLE_MEMBER:
 		case IGMP_LAZY_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			CTR3(KTR_IGMPV3,
 			    "report suppressed for %s on ifp %p(%s)",
 			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 		case IGMP_SLEEPING_MEMBER:
 			inm->inm_state = IGMP_SLEEPING_MEMBER;
 			break;
 		case IGMP_REPORTING_MEMBER:
 			CTR3(KTR_IGMPV3,
 			    "report suppressed for %s on ifp %p(%s)",
 			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 			if (igi->igi_version == IGMP_VERSION_1)
 				inm->inm_state = IGMP_LAZY_MEMBER;
 			else if (igi->igi_version == IGMP_VERSION_2)
 				inm->inm_state = IGMP_SLEEPING_MEMBER;
 			break;
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 		case IGMP_LEAVING_MEMBER:
 			break;
 		}
 	}
 
 out_locked:
 	IN_MULTI_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Process a received IGMPv2 host membership report.
  *
  * NOTE: 0.0.0.0 workaround breaks const correctness.
  */
 static int
 igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip,
     /*const*/ struct igmp *igmp)
 {
 	struct in_ifaddr *ia;
 	struct in_multi *inm;
 
 	/*
 	 * Make sure we don't hear our own membership report.  Fast
 	 * leave requires knowing that we are the only member of a
 	 * group.
 	 */
 	IFP_TO_IA(ifp, ia);
 	if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) {
 		ifa_free(&ia->ia_ifa);
 		return (0);
 	}
 
 	IGMPSTAT_INC(igps_rcv_reports);
 
 	if (ifp->if_flags & IFF_LOOPBACK) {
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		return (0);
 	}
 
 	if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) ||
 	    !in_hosteq(igmp->igmp_group, ip->ip_dst)) {
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		IGMPSTAT_INC(igps_rcv_badreports);
 		return (EINVAL);
 	}
 
 	/*
 	 * RFC 3376, Section 4.2.13, 9.2, 9.3:
 	 * Booting clients may use the source address 0.0.0.0. Some
 	 * IGMP daemons may not know how to use IP_RECVIF to determine
 	 * the interface upon which this message was received.
 	 * Replace 0.0.0.0 with the subnet address if told to do so.
 	 */
 	if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) {
 		if (ia != NULL)
 			ip->ip_src.s_addr = htonl(ia->ia_subnet);
 	}
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 
 	CTR3(KTR_IGMPV3, "process v2 report %s on ifp %p(%s)",
 	     inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 
 	/*
 	 * IGMPv2 report suppression.
 	 * If we are a member of this group, and our membership should be
 	 * reported, and our group timer is pending or about to be reset,
 	 * stop our group timer by transitioning to the 'lazy' state.
 	 */
 	IN_MULTI_LOCK();
 	inm = inm_lookup(ifp, igmp->igmp_group);
 	if (inm != NULL) {
 		struct igmp_ifinfo *igi;
 
 		igi = inm->inm_igi;
 		KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp));
 
 		IGMPSTAT_INC(igps_rcv_ourreports);
 
 		/*
 		 * If we are in IGMPv3 host mode, do not allow the
 		 * other host's IGMPv1 report to suppress our reports
 		 * unless explicitly configured to do so.
 		 */
 		if (igi->igi_version == IGMP_VERSION_3) {
 			if (V_igmp_legacysupp)
 				igmp_v3_suppress_group_record(inm);
 			goto out_locked;
 		}
 
 		inm->inm_timer = 0;
 
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 		case IGMP_SLEEPING_MEMBER:
 			break;
 		case IGMP_REPORTING_MEMBER:
 		case IGMP_IDLE_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			CTR3(KTR_IGMPV3,
 			    "report suppressed for %s on ifp %p(%s)",
 			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
 		case IGMP_LAZY_MEMBER:
 			inm->inm_state = IGMP_LAZY_MEMBER;
 			break;
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 		case IGMP_LEAVING_MEMBER:
 			break;
 		}
 	}
 
 out_locked:
 	IN_MULTI_UNLOCK();
 
 	return (0);
 }
 
 int
 igmp_input(struct mbuf **mp, int *offp, int proto)
 {
 	int iphlen;
 	struct ifnet *ifp;
 	struct igmp *igmp;
 	struct ip *ip;
 	struct mbuf *m;
 	int igmplen;
 	int minlen;
 	int queryver;
 
 	CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, *mp, *offp);
 
 	m = *mp;
 	ifp = m->m_pkthdr.rcvif;
 	*mp = NULL;
 
 	IGMPSTAT_INC(igps_rcv_total);
 
 	ip = mtod(m, struct ip *);
 	iphlen = *offp;
 	igmplen = ntohs(ip->ip_len) - iphlen;
 
 	/*
 	 * Validate lengths.
 	 */
 	if (igmplen < IGMP_MINLEN) {
 		IGMPSTAT_INC(igps_rcv_tooshort);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/*
 	 * Always pullup to the minimum size for v1/v2 or v3
 	 * to amortize calls to m_pullup().
 	 */
 	minlen = iphlen;
 	if (igmplen >= IGMP_V3_QUERY_MINLEN)
 		minlen += IGMP_V3_QUERY_MINLEN;
 	else
 		minlen += IGMP_MINLEN;
 	if ((!M_WRITABLE(m) || m->m_len < minlen) &&
 	    (m = m_pullup(m, minlen)) == 0) {
 		IGMPSTAT_INC(igps_rcv_tooshort);
 		return (IPPROTO_DONE);
 	}
 	ip = mtod(m, struct ip *);
 
 	/*
 	 * Validate checksum.
 	 */
 	m->m_data += iphlen;
 	m->m_len -= iphlen;
 	igmp = mtod(m, struct igmp *);
 	if (in_cksum(m, igmplen)) {
 		IGMPSTAT_INC(igps_rcv_badsum);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 	m->m_data -= iphlen;
 	m->m_len += iphlen;
 
 	/*
 	 * IGMP control traffic is link-scope, and must have a TTL of 1.
 	 * DVMRP traffic (e.g. mrinfo, mtrace) is an exception;
 	 * probe packets may come from beyond the LAN.
 	 */
 	if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) {
 		IGMPSTAT_INC(igps_rcv_badttl);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	switch (igmp->igmp_type) {
 	case IGMP_HOST_MEMBERSHIP_QUERY:
 		if (igmplen == IGMP_MINLEN) {
 			if (igmp->igmp_code == 0)
 				queryver = IGMP_VERSION_1;
 			else
 				queryver = IGMP_VERSION_2;
 		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
 			queryver = IGMP_VERSION_3;
 		} else {
 			IGMPSTAT_INC(igps_rcv_tooshort);
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 
 		switch (queryver) {
 		case IGMP_VERSION_1:
 			IGMPSTAT_INC(igps_rcv_v1v2_queries);
 			if (!V_igmp_v1enable)
 				break;
 			if (igmp_input_v1_query(ifp, ip, igmp) != 0) {
 				m_freem(m);
 				return (IPPROTO_DONE);
 			}
 			break;
 
 		case IGMP_VERSION_2:
 			IGMPSTAT_INC(igps_rcv_v1v2_queries);
 			if (!V_igmp_v2enable)
 				break;
 			if (igmp_input_v2_query(ifp, ip, igmp) != 0) {
 				m_freem(m);
 				return (IPPROTO_DONE);
 			}
 			break;
 
 		case IGMP_VERSION_3: {
 				struct igmpv3 *igmpv3;
 				uint16_t igmpv3len;
 				uint16_t srclen;
 				int nsrc;
 
 				IGMPSTAT_INC(igps_rcv_v3_queries);
 				igmpv3 = (struct igmpv3 *)igmp;
 				/*
 				 * Validate length based on source count.
 				 */
 				nsrc = ntohs(igmpv3->igmp_numsrc);
 				srclen = sizeof(struct in_addr) * nsrc;
 				if (nsrc * sizeof(in_addr_t) > srclen) {
 					IGMPSTAT_INC(igps_rcv_tooshort);
 					return (IPPROTO_DONE);
 				}
 				/*
 				 * m_pullup() may modify m, so pullup in
 				 * this scope.
 				 */
 				igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN +
 				    srclen;
 				if ((!M_WRITABLE(m) ||
 				     m->m_len < igmpv3len) &&
 				    (m = m_pullup(m, igmpv3len)) == NULL) {
 					IGMPSTAT_INC(igps_rcv_tooshort);
 					return (IPPROTO_DONE);
 				}
 				igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *)
 				    + iphlen);
 				if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) {
 					m_freem(m);
 					return (IPPROTO_DONE);
 				}
 			}
 			break;
 		}
 		break;
 
 	case IGMP_v1_HOST_MEMBERSHIP_REPORT:
 		if (!V_igmp_v1enable)
 			break;
 		if (igmp_input_v1_report(ifp, ip, igmp) != 0) {
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 		break;
 
 	case IGMP_v2_HOST_MEMBERSHIP_REPORT:
 		if (!V_igmp_v2enable)
 			break;
 		if (!ip_checkrouteralert(m))
 			IGMPSTAT_INC(igps_rcv_nora);
 		if (igmp_input_v2_report(ifp, ip, igmp) != 0) {
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 		break;
 
 	case IGMP_v3_HOST_MEMBERSHIP_REPORT:
 		/*
 		 * Hosts do not need to process IGMPv3 membership reports,
 		 * as report suppression is no longer required.
 		 */
 		if (!ip_checkrouteralert(m))
 			IGMPSTAT_INC(igps_rcv_nora);
 		break;
 
 	default:
 		break;
 	}
 
 	/*
 	 * Pass all valid IGMP packets up to any process(es) listening on a
 	 * raw IGMP socket.
 	 */
 	*mp = m;
 	return (rip_input(mp, offp, proto));
 }
 
 
 /*
  * Fast timeout handler (global).
  * VIMAGE: Timeout handlers are expected to service all vimages.
  */
 void
 igmp_fasttimo(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		igmp_fasttimo_vnet();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Fast timeout handler (per-vnet).
  * Sends are shuffled off to a netisr to deal with Giant.
  *
  * VIMAGE: Assume caller has set up our curvnet.
  */
 static void
 igmp_fasttimo_vnet(void)
 {
 	struct ifqueue		 scq;	/* State-change packets */
 	struct ifqueue		 qrq;	/* Query response packets */
 	struct ifnet		*ifp;
 	struct igmp_ifinfo	*igi;
 	struct ifmultiaddr	*ifma;
 	struct in_multi		*inm;
 	int			 loop, uri_fasthz;
 
 	loop = 0;
 	uri_fasthz = 0;
 
 	/*
 	 * Quick check to see if any work needs to be done, in order to
 	 * minimize the overhead of fasttimo processing.
 	 * SMPng: XXX Unlocked reads.
 	 */
 	if (!V_current_state_timers_running &&
 	    !V_interface_timers_running &&
 	    !V_state_change_timers_running)
 		return;
 
 	IN_MULTI_LOCK();
 	IGMP_LOCK();
 
 	/*
 	 * IGMPv3 General Query response timer processing.
 	 */
 	if (V_interface_timers_running) {
 		CTR1(KTR_IGMPV3, "%s: interface timers running", __func__);
 
 		V_interface_timers_running = 0;
 		LIST_FOREACH(igi, &V_igi_head, igi_link) {
 			if (igi->igi_v3_timer == 0) {
 				/* Do nothing. */
 			} else if (--igi->igi_v3_timer == 0) {
 				igmp_v3_dispatch_general_query(igi);
 			} else {
 				V_interface_timers_running = 1;
 			}
 		}
 	}
 
 	if (!V_current_state_timers_running &&
 	    !V_state_change_timers_running)
 		goto out_locked;
 
 	V_current_state_timers_running = 0;
 	V_state_change_timers_running = 0;
 
 	CTR1(KTR_IGMPV3, "%s: state change timers running", __func__);
 
 	/*
 	 * IGMPv1/v2/v3 host report and state-change timer processing.
 	 * Note: Processing a v3 group timer may remove a node.
 	 */
 	LIST_FOREACH(igi, &V_igi_head, igi_link) {
 		ifp = igi->igi_ifp;
 
 		if (igi->igi_version == IGMP_VERSION_3) {
 			loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0;
 			uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri *
 			    PR_FASTHZ);
 
 			memset(&qrq, 0, sizeof(struct ifqueue));
 			IFQ_SET_MAXLEN(&qrq, IGMP_MAX_G_GS_PACKETS);
 
 			memset(&scq, 0, sizeof(struct ifqueue));
 			IFQ_SET_MAXLEN(&scq, IGMP_MAX_STATE_CHANGE_PACKETS);
 		}
 
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (ifma->ifma_addr->sa_family != AF_INET ||
 			    ifma->ifma_protospec == NULL)
 				continue;
 			inm = (struct in_multi *)ifma->ifma_protospec;
 			switch (igi->igi_version) {
 			case IGMP_VERSION_1:
 			case IGMP_VERSION_2:
 				igmp_v1v2_process_group_timer(inm,
 				    igi->igi_version);
 				break;
 			case IGMP_VERSION_3:
 				igmp_v3_process_group_timers(igi, &qrq,
 				    &scq, inm, uri_fasthz);
 				break;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 
 		if (igi->igi_version == IGMP_VERSION_3) {
 			struct in_multi		*tinm;
 
 			igmp_dispatch_queue(&qrq, 0, loop);
 			igmp_dispatch_queue(&scq, 0, loop);
 
 			/*
 			 * Free the in_multi reference(s) for this
 			 * IGMP lifecycle.
 			 */
 			SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead,
 			    inm_nrele, tinm) {
 				SLIST_REMOVE_HEAD(&igi->igi_relinmhead,
 				    inm_nrele);
 				inm_release_locked(inm);
 			}
 		}
 	}
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_UNLOCK();
 }
 
 /*
  * Update host report group timer for IGMPv1/v2.
  * Will update the global pending timer flags.
  */
 static void
 igmp_v1v2_process_group_timer(struct in_multi *inm, const int version)
 {
 	int report_timer_expired;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	if (inm->inm_timer == 0) {
 		report_timer_expired = 0;
 	} else if (--inm->inm_timer == 0) {
 		report_timer_expired = 1;
 	} else {
 		V_current_state_timers_running = 1;
 		return;
 	}
 
 	switch (inm->inm_state) {
 	case IGMP_NOT_MEMBER:
 	case IGMP_SILENT_MEMBER:
 	case IGMP_IDLE_MEMBER:
 	case IGMP_LAZY_MEMBER:
 	case IGMP_SLEEPING_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 		break;
 	case IGMP_REPORTING_MEMBER:
 		if (report_timer_expired) {
 			inm->inm_state = IGMP_IDLE_MEMBER;
 			(void)igmp_v1v2_queue_report(inm,
 			    (version == IGMP_VERSION_2) ?
 			     IGMP_v2_HOST_MEMBERSHIP_REPORT :
 			     IGMP_v1_HOST_MEMBERSHIP_REPORT);
 		}
 		break;
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 	case IGMP_LEAVING_MEMBER:
 		break;
 	}
 }
 
 /*
  * Update a group's timers for IGMPv3.
  * Will update the global pending timer flags.
  * Note: Unlocked read from igi.
  */
 static void
 igmp_v3_process_group_timers(struct igmp_ifinfo *igi,
     struct ifqueue *qrq, struct ifqueue *scq,
     struct in_multi *inm, const int uri_fasthz)
 {
 	int query_response_timer_expired;
 	int state_change_retransmit_timer_expired;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	query_response_timer_expired = 0;
 	state_change_retransmit_timer_expired = 0;
 
 	/*
 	 * During a transition from v1/v2 compatibility mode back to v3,
 	 * a group record in REPORTING state may still have its group
 	 * timer active. This is a no-op in this function; it is easier
 	 * to deal with it here than to complicate the slow-timeout path.
 	 */
 	if (inm->inm_timer == 0) {
 		query_response_timer_expired = 0;
 	} else if (--inm->inm_timer == 0) {
 		query_response_timer_expired = 1;
 	} else {
 		V_current_state_timers_running = 1;
 	}
 
 	if (inm->inm_sctimer == 0) {
 		state_change_retransmit_timer_expired = 0;
 	} else if (--inm->inm_sctimer == 0) {
 		state_change_retransmit_timer_expired = 1;
 	} else {
 		V_state_change_timers_running = 1;
 	}
 
 	/* We are in fasttimo, so be quick about it. */
 	if (!state_change_retransmit_timer_expired &&
 	    !query_response_timer_expired)
 		return;
 
 	switch (inm->inm_state) {
 	case IGMP_NOT_MEMBER:
 	case IGMP_SILENT_MEMBER:
 	case IGMP_SLEEPING_MEMBER:
 	case IGMP_LAZY_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 	case IGMP_IDLE_MEMBER:
 		break;
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 		/*
 		 * Respond to a previously pending Group-Specific
 		 * or Group-and-Source-Specific query by enqueueing
 		 * the appropriate Current-State report for
 		 * immediate transmission.
 		 */
 		if (query_response_timer_expired) {
 			int retval;
 
 			retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1,
 			    (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER));
 			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
 			    __func__, retval);
 			inm->inm_state = IGMP_REPORTING_MEMBER;
 			/* XXX Clear recorded sources for next time. */
 			inm_clear_recorded(inm);
 		}
 		/* FALLTHROUGH */
 	case IGMP_REPORTING_MEMBER:
 	case IGMP_LEAVING_MEMBER:
 		if (state_change_retransmit_timer_expired) {
 			/*
 			 * State-change retransmission timer fired.
 			 * If there are any further pending retransmissions,
 			 * set the global pending state-change flag, and
 			 * reset the timer.
 			 */
 			if (--inm->inm_scrv > 0) {
 				inm->inm_sctimer = uri_fasthz;
 				V_state_change_timers_running = 1;
 			}
 			/*
 			 * Retransmit the previously computed state-change
 			 * report. If there are no further pending
 			 * retransmissions, the mbuf queue will be consumed.
 			 * Update T0 state to T1 as we have now sent
 			 * a state-change.
 			 */
 			(void)igmp_v3_merge_state_changes(inm, scq);
 
 			inm_commit(inm);
 			CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
 			    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
 
 			/*
 			 * If we are leaving the group for good, make sure
 			 * we release IGMP's reference to it.
 			 * This release must be deferred using a SLIST,
 			 * as we are called from a loop which traverses
 			 * the in_ifmultiaddr TAILQ.
 			 */
 			if (inm->inm_state == IGMP_LEAVING_MEMBER &&
 			    inm->inm_scrv == 0) {
 				inm->inm_state = IGMP_NOT_MEMBER;
 				SLIST_INSERT_HEAD(&igi->igi_relinmhead,
 				    inm, inm_nrele);
 			}
 		}
 		break;
 	}
 }
 
 
 /*
  * Suppress a group's pending response to a group or source/group query.
  *
  * Do NOT suppress state changes. This leads to IGMPv3 inconsistency.
  * Do NOT update ST1/ST0 as this operation merely suppresses
  * the currently pending group record.
  * Do NOT suppress the response to a general query. It is possible but
  * it would require adding another state or flag.
  */
 static void
 igmp_v3_suppress_group_record(struct in_multi *inm)
 {
 
 	IN_MULTI_LOCK_ASSERT();
 
 	KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3,
 		("%s: not IGMPv3 mode on link", __func__));
 
 	if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER ||
 	    inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER)
 		return;
 
 	if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)
 		inm_clear_recorded(inm);
 
 	inm->inm_timer = 0;
 	inm->inm_state = IGMP_REPORTING_MEMBER;
 }
 
 /*
  * Switch to a different IGMP version on the given interface,
  * as per Section 7.2.1.
  */
 static void
 igmp_set_version(struct igmp_ifinfo *igi, const int version)
 {
 	int old_version_timer;
 
 	IGMP_LOCK_ASSERT();
 
 	CTR4(KTR_IGMPV3, "%s: switching to v%d on ifp %p(%s)", __func__,
 	    version, igi->igi_ifp, igi->igi_ifp->if_xname);
 
 	if (version == IGMP_VERSION_1 || version == IGMP_VERSION_2) {
 		/*
 		 * Compute the "Older Version Querier Present" timer as per
 		 * Section 8.12.
 		 */
 		old_version_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri;
 		old_version_timer *= PR_SLOWHZ;
 
 		if (version == IGMP_VERSION_1) {
 			igi->igi_v1_timer = old_version_timer;
 			igi->igi_v2_timer = 0;
 		} else if (version == IGMP_VERSION_2) {
 			igi->igi_v1_timer = 0;
 			igi->igi_v2_timer = old_version_timer;
 		}
 	}
 
 	if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) {
 		if (igi->igi_version != IGMP_VERSION_2) {
 			igi->igi_version = IGMP_VERSION_2;
 			igmp_v3_cancel_link_timers(igi);
 		}
 	} else if (igi->igi_v1_timer > 0) {
 		if (igi->igi_version != IGMP_VERSION_1) {
 			igi->igi_version = IGMP_VERSION_1;
 			igmp_v3_cancel_link_timers(igi);
 		}
 	}
 }
 
 /*
  * Cancel pending IGMPv3 timers for the given link and all groups
  * joined on it; state-change, general-query, and group-query timers.
  *
  * Only ever called on a transition from v3 to Compatibility mode. Kill
  * the timers stone dead (this may be expensive for large N groups), they
  * will be restarted if Compatibility Mode deems that they must be due to
  * query processing.
  */
 static void
 igmp_v3_cancel_link_timers(struct igmp_ifinfo *igi)
 {
 	struct ifmultiaddr	*ifma;
 	struct ifnet		*ifp;
 	struct in_multi		*inm, *tinm;
 
 	CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__,
 	    igi->igi_ifp, igi->igi_ifp->if_xname);
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	/*
 	 * Stop the v3 General Query Response on this link stone dead.
 	 * If fasttimo is woken up due to V_interface_timers_running,
 	 * the flag will be cleared if there are no pending link timers.
 	 */
 	igi->igi_v3_timer = 0;
 
 	/*
 	 * Now clear the current-state and state-change report timers
 	 * for all memberships scoped to this link.
 	 */
 	ifp = igi->igi_ifp;
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_INET ||
 		    ifma->ifma_protospec == NULL)
 			continue;
 		inm = (struct in_multi *)ifma->ifma_protospec;
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 		case IGMP_IDLE_MEMBER:
 		case IGMP_LAZY_MEMBER:
 		case IGMP_SLEEPING_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			/*
 			 * These states are either not relevant in v3 mode,
 			 * or are unreported. Do nothing.
 			 */
 			break;
 		case IGMP_LEAVING_MEMBER:
 			/*
 			 * If we are leaving the group and switching to
 			 * compatibility mode, we need to release the final
 			 * reference held for issuing the INCLUDE {}, and
 			 * transition to REPORTING to ensure the host leave
 			 * message is sent upstream to the old querier --
 			 * transition to NOT would lose the leave and race.
 			 */
 			SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele);
 			/* FALLTHROUGH */
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 			inm_clear_recorded(inm);
 			/* FALLTHROUGH */
 		case IGMP_REPORTING_MEMBER:
 			inm->inm_state = IGMP_REPORTING_MEMBER;
 			break;
 		}
 		/*
 		 * Always clear state-change and group report timers.
 		 * Free any pending IGMPv3 state-change records.
 		 */
 		inm->inm_sctimer = 0;
 		inm->inm_timer = 0;
 		_IF_DRAIN(&inm->inm_scq);
 	}
 	IF_ADDR_RUNLOCK(ifp);
 	SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) {
 		SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele);
 		inm_release_locked(inm);
 	}
 }
 
 /*
  * Update the Older Version Querier Present timers for a link.
  * See Section 7.2.1 of RFC 3376.
  */
 static void
 igmp_v1v2_process_querier_timers(struct igmp_ifinfo *igi)
 {
 
 	IGMP_LOCK_ASSERT();
 
 	if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) {
 		/*
 		 * IGMPv1 and IGMPv2 Querier Present timers expired.
 		 *
 		 * Revert to IGMPv3.
 		 */
 		if (igi->igi_version != IGMP_VERSION_3) {
 			CTR5(KTR_IGMPV3,
 			    "%s: transition from v%d -> v%d on %p(%s)",
 			    __func__, igi->igi_version, IGMP_VERSION_3,
 			    igi->igi_ifp, igi->igi_ifp->if_xname);
 			igi->igi_version = IGMP_VERSION_3;
 		}
 	} else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) {
 		/*
 		 * IGMPv1 Querier Present timer expired,
 		 * IGMPv2 Querier Present timer running.
 		 * If IGMPv2 was disabled since last timeout,
 		 * revert to IGMPv3.
 		 * If IGMPv2 is enabled, revert to IGMPv2.
 		 */
 		if (!V_igmp_v2enable) {
 			CTR5(KTR_IGMPV3,
 			    "%s: transition from v%d -> v%d on %p(%s)",
 			    __func__, igi->igi_version, IGMP_VERSION_3,
 			    igi->igi_ifp, igi->igi_ifp->if_xname);
 			igi->igi_v2_timer = 0;
 			igi->igi_version = IGMP_VERSION_3;
 		} else {
 			--igi->igi_v2_timer;
 			if (igi->igi_version != IGMP_VERSION_2) {
 				CTR5(KTR_IGMPV3,
 				    "%s: transition from v%d -> v%d on %p(%s)",
 				    __func__, igi->igi_version, IGMP_VERSION_2,
 				    igi->igi_ifp, igi->igi_ifp->if_xname);
 				igi->igi_version = IGMP_VERSION_2;
 				igmp_v3_cancel_link_timers(igi);
 			}
 		}
 	} else if (igi->igi_v1_timer > 0) {
 		/*
 		 * IGMPv1 Querier Present timer running.
 		 * Stop IGMPv2 timer if running.
 		 *
 		 * If IGMPv1 was disabled since last timeout,
 		 * revert to IGMPv3.
 		 * If IGMPv1 is enabled, reset IGMPv2 timer if running.
 		 */
 		if (!V_igmp_v1enable) {
 			CTR5(KTR_IGMPV3,
 			    "%s: transition from v%d -> v%d on %p(%s)",
 			    __func__, igi->igi_version, IGMP_VERSION_3,
 			    igi->igi_ifp, igi->igi_ifp->if_xname);
 			igi->igi_v1_timer = 0;
 			igi->igi_version = IGMP_VERSION_3;
 		} else {
 			--igi->igi_v1_timer;
 		}
 		if (igi->igi_v2_timer > 0) {
 			CTR3(KTR_IGMPV3,
 			    "%s: cancel v2 timer on %p(%s)",
 			    __func__, igi->igi_ifp, igi->igi_ifp->if_xname);
 			igi->igi_v2_timer = 0;
 		}
 	}
 }
 
 /*
  * Global slowtimo handler.
  * VIMAGE: Timeout handlers are expected to service all vimages.
  */
 void
 igmp_slowtimo(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		igmp_slowtimo_vnet();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Per-vnet slowtimo handler.
  */
 static void
 igmp_slowtimo_vnet(void)
 {
 	struct igmp_ifinfo *igi;
 
 	IGMP_LOCK();
 
 	LIST_FOREACH(igi, &V_igi_head, igi_link) {
 		igmp_v1v2_process_querier_timers(igi);
 	}
 
 	IGMP_UNLOCK();
 }
 
 /*
  * Dispatch an IGMPv1/v2 host report or leave message.
  * These are always small enough to fit inside a single mbuf.
  */
 static int
 igmp_v1v2_queue_report(struct in_multi *inm, const int type)
 {
 	struct ifnet		*ifp;
 	struct igmp		*igmp;
 	struct ip		*ip;
 	struct mbuf		*m;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	ifp = inm->inm_ifp;
 
 	m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (ENOMEM);
 	MH_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp));
 
 	m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp);
 
 	m->m_data += sizeof(struct ip);
 	m->m_len = sizeof(struct igmp);
 
 	igmp = mtod(m, struct igmp *);
 	igmp->igmp_type = type;
 	igmp->igmp_code = 0;
 	igmp->igmp_group = inm->inm_addr;
 	igmp->igmp_cksum = 0;
 	igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp));
 
 	m->m_data -= sizeof(struct ip);
 	m->m_len += sizeof(struct ip);
 
 	ip = mtod(m, struct ip *);
 	ip->ip_tos = 0;
 	ip->ip_len = htons(sizeof(struct ip) + sizeof(struct igmp));
 	ip->ip_off = 0;
 	ip->ip_p = IPPROTO_IGMP;
 	ip->ip_src.s_addr = INADDR_ANY;
 
 	if (type == IGMP_HOST_LEAVE_MESSAGE)
 		ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP);
 	else
 		ip->ip_dst = inm->inm_addr;
 
 	igmp_save_context(m, ifp);
 
 	m->m_flags |= M_IGMPV2;
 	if (inm->inm_igi->igi_flags & IGIF_LOOPBACK)
 		m->m_flags |= M_IGMP_LOOP;
 
 	CTR2(KTR_IGMPV3, "%s: netisr_dispatch(NETISR_IGMP, %p)", __func__, m);
 	netisr_dispatch(NETISR_IGMP, m);
 
 	return (0);
 }
 
 /*
  * Process a state change from the upper layer for the given IPv4 group.
  *
  * Each socket holds a reference on the in_multi in its own ip_moptions.
  * The socket layer will have made the necessary updates to.the group
  * state, it is now up to IGMP to issue a state change report if there
  * has been any change between T0 (when the last state-change was issued)
  * and T1 (now).
  *
  * We use the IGMPv3 state machine at group level. The IGMP module
  * however makes the decision as to which IGMP protocol version to speak.
  * A state change *from* INCLUDE {} always means an initial join.
  * A state change *to* INCLUDE {} always means a final leave.
  *
  * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can
  * save ourselves a bunch of work; any exclusive mode groups need not
  * compute source filter lists.
  *
  * VIMAGE: curvnet should have been set by caller, as this routine
  * is called from the socket option handlers.
  */
 int
 igmp_change_state(struct in_multi *inm)
 {
 	struct igmp_ifinfo *igi;
 	struct ifnet *ifp;
 	int error;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	error = 0;
 
 	/*
 	 * Try to detect if the upper layer just asked us to change state
 	 * for an interface which has now gone away.
 	 */
 	KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__));
 	ifp = inm->inm_ifma->ifma_ifp;
 	/*
 	 * Sanity check that netinet's notion of ifp is the
 	 * same as net's.
 	 */
 	KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__));
 
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
 
 	/*
 	 * If we detect a state transition to or from MCAST_UNDEFINED
 	 * for this group, then we are starting or finishing an IGMP
 	 * life cycle for this group.
 	 */
 	if (inm->inm_st[1].iss_fmode != inm->inm_st[0].iss_fmode) {
 		CTR3(KTR_IGMPV3, "%s: inm transition %d -> %d", __func__,
 		    inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode);
 		if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) {
 			CTR1(KTR_IGMPV3, "%s: initial join", __func__);
 			error = igmp_initial_join(inm, igi);
 			goto out_locked;
 		} else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) {
 			CTR1(KTR_IGMPV3, "%s: final leave", __func__);
 			igmp_final_leave(inm, igi);
 			goto out_locked;
 		}
 	} else {
 		CTR1(KTR_IGMPV3, "%s: filter set change", __func__);
 	}
 
 	error = igmp_handle_state_change(inm, igi);
 
 out_locked:
 	IGMP_UNLOCK();
 	return (error);
 }
 
 /*
  * Perform the initial join for an IGMP group.
  *
  * When joining a group:
  *  If the group should have its IGMP traffic suppressed, do nothing.
  *  IGMPv1 starts sending IGMPv1 host membership reports.
  *  IGMPv2 starts sending IGMPv2 host membership reports.
  *  IGMPv3 will schedule an IGMPv3 state-change report containing the
  *  initial state of the membership.
  */
 static int
 igmp_initial_join(struct in_multi *inm, struct igmp_ifinfo *igi)
 {
 	struct ifnet		*ifp;
 	struct ifqueue		*ifq;
 	int			 error, retval, syncstates;
 
 	CTR4(KTR_IGMPV3, "%s: initial join %s on ifp %p(%s)",
 	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
 	    inm->inm_ifp->if_xname);
 
 	error = 0;
 	syncstates = 1;
 
 	ifp = inm->inm_ifp;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__));
 
 	/*
 	 * Groups joined on loopback or marked as 'not reported',
 	 * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and
 	 * are never reported in any IGMP protocol exchanges.
 	 * All other groups enter the appropriate IGMP state machine
 	 * for the version in use on this link.
 	 * A link marked as IGIF_SILENT causes IGMP to be completely
 	 * disabled for the link.
 	 */
 	if ((ifp->if_flags & IFF_LOOPBACK) ||
 	    (igi->igi_flags & IGIF_SILENT) ||
 	    !igmp_isgroupreported(inm->inm_addr)) {
 		CTR1(KTR_IGMPV3,
 "%s: not kicking state machine for silent group", __func__);
 		inm->inm_state = IGMP_SILENT_MEMBER;
 		inm->inm_timer = 0;
 	} else {
 		/*
 		 * Deal with overlapping in_multi lifecycle.
 		 * If this group was LEAVING, then make sure
 		 * we drop the reference we picked up to keep the
 		 * group around for the final INCLUDE {} enqueue.
 		 */
 		if (igi->igi_version == IGMP_VERSION_3 &&
 		    inm->inm_state == IGMP_LEAVING_MEMBER)
 			inm_release_locked(inm);
 
 		inm->inm_state = IGMP_REPORTING_MEMBER;
 
 		switch (igi->igi_version) {
 		case IGMP_VERSION_1:
 		case IGMP_VERSION_2:
 			inm->inm_state = IGMP_IDLE_MEMBER;
 			error = igmp_v1v2_queue_report(inm,
 			    (igi->igi_version == IGMP_VERSION_2) ?
 			     IGMP_v2_HOST_MEMBERSHIP_REPORT :
 			     IGMP_v1_HOST_MEMBERSHIP_REPORT);
 			if (error == 0) {
 				inm->inm_timer = IGMP_RANDOM_DELAY(
 				    IGMP_V1V2_MAX_RI * PR_FASTHZ);
 				V_current_state_timers_running = 1;
 			}
 			break;
 
 		case IGMP_VERSION_3:
 			/*
 			 * Defer update of T0 to T1, until the first copy
 			 * of the state change has been transmitted.
 			 */
 			syncstates = 0;
 
 			/*
 			 * Immediately enqueue a State-Change Report for
 			 * this interface, freeing any previous reports.
 			 * Don't kick the timers if there is nothing to do,
 			 * or if an error occurred.
 			 */
 			ifq = &inm->inm_scq;
 			_IF_DRAIN(ifq);
 			retval = igmp_v3_enqueue_group_record(ifq, inm, 1,
 			    0, 0);
 			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
 			    __func__, retval);
 			if (retval <= 0) {
 				error = retval * -1;
 				break;
 			}
 
 			/*
 			 * Schedule transmission of pending state-change
 			 * report up to RV times for this link. The timer
 			 * will fire at the next igmp_fasttimo (~200ms),
 			 * giving us an opportunity to merge the reports.
 			 */
 			if (igi->igi_flags & IGIF_LOOPBACK) {
 				inm->inm_scrv = 1;
 			} else {
 				KASSERT(igi->igi_rv > 1,
 				   ("%s: invalid robustness %d", __func__,
 				    igi->igi_rv));
 				inm->inm_scrv = igi->igi_rv;
 			}
 			inm->inm_sctimer = 1;
 			V_state_change_timers_running = 1;
 
 			error = 0;
 			break;
 		}
 	}
 
 	/*
 	 * Only update the T0 state if state change is atomic,
 	 * i.e. we don't need to wait for a timer to fire before we
 	 * can consider the state change to have been communicated.
 	 */
 	if (syncstates) {
 		inm_commit(inm);
 		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
 		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
 	}
 
 	return (error);
 }
 
 /*
  * Issue an intermediate state change during the IGMP life-cycle.
  */
 static int
 igmp_handle_state_change(struct in_multi *inm, struct igmp_ifinfo *igi)
 {
 	struct ifnet		*ifp;
 	int			 retval;
 
 	CTR4(KTR_IGMPV3, "%s: state change for %s on ifp %p(%s)",
 	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
 	    inm->inm_ifp->if_xname);
 
 	ifp = inm->inm_ifp;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__));
 
 	if ((ifp->if_flags & IFF_LOOPBACK) ||
 	    (igi->igi_flags & IGIF_SILENT) ||
 	    !igmp_isgroupreported(inm->inm_addr) ||
 	    (igi->igi_version != IGMP_VERSION_3)) {
 		if (!igmp_isgroupreported(inm->inm_addr)) {
 			CTR1(KTR_IGMPV3,
 "%s: not kicking state machine for silent group", __func__);
 		}
 		CTR1(KTR_IGMPV3, "%s: nothing to do", __func__);
 		inm_commit(inm);
 		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
 		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
 		return (0);
 	}
 
 	_IF_DRAIN(&inm->inm_scq);
 
 	retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0);
 	CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval);
 	if (retval <= 0)
 		return (-retval);
 
 	/*
 	 * If record(s) were enqueued, start the state-change
 	 * report timer for this group.
 	 */
 	inm->inm_scrv = ((igi->igi_flags & IGIF_LOOPBACK) ? 1 : igi->igi_rv);
 	inm->inm_sctimer = 1;
 	V_state_change_timers_running = 1;
 
 	return (0);
 }
 
 /*
  * Perform the final leave for an IGMP group.
  *
  * When leaving a group:
  *  IGMPv1 does nothing.
  *  IGMPv2 sends a host leave message, if and only if we are the reporter.
  *  IGMPv3 enqueues a state-change report containing a transition
  *  to INCLUDE {} for immediate transmission.
  */
 static void
 igmp_final_leave(struct in_multi *inm, struct igmp_ifinfo *igi)
 {
 	int syncstates;
 
 	syncstates = 1;
 
 	CTR4(KTR_IGMPV3, "%s: final leave %s on ifp %p(%s)",
 	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
 	    inm->inm_ifp->if_xname);
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	switch (inm->inm_state) {
 	case IGMP_NOT_MEMBER:
 	case IGMP_SILENT_MEMBER:
 	case IGMP_LEAVING_MEMBER:
 		/* Already leaving or left; do nothing. */
 		CTR1(KTR_IGMPV3,
 "%s: not kicking state machine for silent group", __func__);
 		break;
 	case IGMP_REPORTING_MEMBER:
 	case IGMP_IDLE_MEMBER:
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 		if (igi->igi_version == IGMP_VERSION_2) {
 #ifdef INVARIANTS
 			if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER ||
 			    inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)
 			panic("%s: IGMPv3 state reached, not IGMPv3 mode",
 			     __func__);
 #endif
 			igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE);
 			inm->inm_state = IGMP_NOT_MEMBER;
 		} else if (igi->igi_version == IGMP_VERSION_3) {
 			/*
 			 * Stop group timer and all pending reports.
 			 * Immediately enqueue a state-change report
 			 * TO_IN {} to be sent on the next fast timeout,
 			 * giving us an opportunity to merge reports.
 			 */
 			_IF_DRAIN(&inm->inm_scq);
 			inm->inm_timer = 0;
 			if (igi->igi_flags & IGIF_LOOPBACK) {
 				inm->inm_scrv = 1;
 			} else {
 				inm->inm_scrv = igi->igi_rv;
 			}
 			CTR4(KTR_IGMPV3, "%s: Leaving %s/%s with %d "
 			    "pending retransmissions.", __func__,
 			    inet_ntoa(inm->inm_addr),
 			    inm->inm_ifp->if_xname, inm->inm_scrv);
 			if (inm->inm_scrv == 0) {
 				inm->inm_state = IGMP_NOT_MEMBER;
 				inm->inm_sctimer = 0;
 			} else {
 				int retval;
 
 				inm_acquire_locked(inm);
 
 				retval = igmp_v3_enqueue_group_record(
 				    &inm->inm_scq, inm, 1, 0, 0);
 				KASSERT(retval != 0,
 				    ("%s: enqueue record = %d", __func__,
 				     retval));
 
 				inm->inm_state = IGMP_LEAVING_MEMBER;
 				inm->inm_sctimer = 1;
 				V_state_change_timers_running = 1;
 				syncstates = 0;
 			}
 			break;
 		}
 		break;
 	case IGMP_LAZY_MEMBER:
 	case IGMP_SLEEPING_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 		/* Our reports are suppressed; do nothing. */
 		break;
 	}
 
 	if (syncstates) {
 		inm_commit(inm);
 		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
 		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
 		inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
 		CTR3(KTR_IGMPV3, "%s: T1 now MCAST_UNDEFINED for %s/%s",
 		    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
 	}
 }
 
 /*
  * Enqueue an IGMPv3 group record to the given output queue.
  *
  * XXX This function could do with having the allocation code
  * split out, and the multiple-tree-walks coalesced into a single
  * routine as has been done in igmp_v3_enqueue_filter_change().
  *
  * If is_state_change is zero, a current-state record is appended.
  * If is_state_change is non-zero, a state-change report is appended.
  *
  * If is_group_query is non-zero, an mbuf packet chain is allocated.
  * If is_group_query is zero, and if there is a packet with free space
  * at the tail of the queue, it will be appended to providing there
  * is enough free space.
  * Otherwise a new mbuf packet chain is allocated.
  *
  * If is_source_query is non-zero, each source is checked to see if
  * it was recorded for a Group-Source query, and will be omitted if
  * it is not both in-mode and recorded.
  *
  * The function will attempt to allocate leading space in the packet
  * for the IP/IGMP header to be prepended without fragmenting the chain.
  *
  * If successful the size of all data appended to the queue is returned,
  * otherwise an error code less than zero is returned, or zero if
  * no record(s) were appended.
  */
 static int
 igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm,
     const int is_state_change, const int is_group_query,
     const int is_source_query)
 {
 	struct igmp_grouprec	 ig;
 	struct igmp_grouprec	*pig;
 	struct ifnet		*ifp;
 	struct ip_msource	*ims, *nims;
 	struct mbuf		*m0, *m, *md;
 	int			 error, is_filter_list_change;
 	int			 minrec0len, m0srcs, msrcs, nbytes, off;
 	int			 record_has_sources;
 	int			 now;
 	int			 type;
 	in_addr_t		 naddr;
 	uint8_t			 mode;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	error = 0;
 	ifp = inm->inm_ifp;
 	is_filter_list_change = 0;
 	m = NULL;
 	m0 = NULL;
 	m0srcs = 0;
 	msrcs = 0;
 	nbytes = 0;
 	nims = NULL;
 	record_has_sources = 1;
 	pig = NULL;
 	type = IGMP_DO_NOTHING;
 	mode = inm->inm_st[1].iss_fmode;
 
 	/*
 	 * If we did not transition out of ASM mode during t0->t1,
 	 * and there are no source nodes to process, we can skip
 	 * the generation of source records.
 	 */
 	if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 &&
 	    inm->inm_nsrc == 0)
 		record_has_sources = 0;
 
 	if (is_state_change) {
 		/*
 		 * Queue a state change record.
 		 * If the mode did not change, and there are non-ASM
 		 * listeners or source filters present,
 		 * we potentially need to issue two records for the group.
 		 * If we are transitioning to MCAST_UNDEFINED, we need
 		 * not send any sources.
 		 * If there are ASM listeners, and there was no filter
 		 * mode transition of any kind, do nothing.
 		 */
 		if (mode != inm->inm_st[0].iss_fmode) {
 			if (mode == MCAST_EXCLUDE) {
 				CTR1(KTR_IGMPV3, "%s: change to EXCLUDE",
 				    __func__);
 				type = IGMP_CHANGE_TO_EXCLUDE_MODE;
 			} else {
 				CTR1(KTR_IGMPV3, "%s: change to INCLUDE",
 				    __func__);
 				type = IGMP_CHANGE_TO_INCLUDE_MODE;
 				if (mode == MCAST_UNDEFINED)
 					record_has_sources = 0;
 			}
 		} else {
 			if (record_has_sources) {
 				is_filter_list_change = 1;
 			} else {
 				type = IGMP_DO_NOTHING;
 			}
 		}
 	} else {
 		/*
 		 * Queue a current state record.
 		 */
 		if (mode == MCAST_EXCLUDE) {
 			type = IGMP_MODE_IS_EXCLUDE;
 		} else if (mode == MCAST_INCLUDE) {
 			type = IGMP_MODE_IS_INCLUDE;
 			KASSERT(inm->inm_st[1].iss_asm == 0,
 			    ("%s: inm %p is INCLUDE but ASM count is %d",
 			     __func__, inm, inm->inm_st[1].iss_asm));
 		}
 	}
 
 	/*
 	 * Generate the filter list changes using a separate function.
 	 */
 	if (is_filter_list_change)
 		return (igmp_v3_enqueue_filter_change(ifq, inm));
 
 	if (type == IGMP_DO_NOTHING) {
 		CTR3(KTR_IGMPV3, "%s: nothing to do for %s/%s",
 		    __func__, inet_ntoa(inm->inm_addr),
 		    inm->inm_ifp->if_xname);
 		return (0);
 	}
 
 	/*
 	 * If any sources are present, we must be able to fit at least
 	 * one in the trailing space of the tail packet's mbuf,
 	 * ideally more.
 	 */
 	minrec0len = sizeof(struct igmp_grouprec);
 	if (record_has_sources)
 		minrec0len += sizeof(in_addr_t);
 
 	CTR4(KTR_IGMPV3, "%s: queueing %s for %s/%s", __func__,
 	    igmp_rec_type_to_str(type), inet_ntoa(inm->inm_addr),
 	    inm->inm_ifp->if_xname);
 
 	/*
 	 * Check if we have a packet in the tail of the queue for this
 	 * group into which the first group record for this group will fit.
 	 * Otherwise allocate a new packet.
 	 * Always allocate leading space for IP+RA_OPT+IGMP+REPORT.
 	 * Note: Group records for G/GSR query responses MUST be sent
 	 * in their own packet.
 	 */
 	m0 = ifq->ifq_tail;
 	if (!is_group_query &&
 	    m0 != NULL &&
 	    (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) &&
 	    (m0->m_pkthdr.len + minrec0len) <
 	     (ifp->if_mtu - IGMP_LEADINGSPACE)) {
 		m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
 			    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
 		m = m0;
 		CTR1(KTR_IGMPV3, "%s: use existing packet", __func__);
 	} else {
 		if (_IF_QFULL(ifq)) {
 			CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__);
 			return (-ENOMEM);
 		}
 		m = NULL;
 		m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
 		    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
 		if (!is_state_change && !is_group_query) {
 			m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 			if (m)
 				m->m_data += IGMP_LEADINGSPACE;
 		}
 		if (m == NULL) {
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 			if (m)
 				MH_ALIGN(m, IGMP_LEADINGSPACE);
 		}
 		if (m == NULL)
 			return (-ENOMEM);
 
 		igmp_save_context(m, ifp);
 
 		CTR1(KTR_IGMPV3, "%s: allocated first packet", __func__);
 	}
 
 	/*
 	 * Append group record.
 	 * If we have sources, we don't know how many yet.
 	 */
 	ig.ig_type = type;
 	ig.ig_datalen = 0;
 	ig.ig_numsrc = 0;
 	ig.ig_group = inm->inm_addr;
 	if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) {
 		if (m != m0)
 			m_freem(m);
 		CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__);
 		return (-ENOMEM);
 	}
 	nbytes += sizeof(struct igmp_grouprec);
 
 	/*
 	 * Append as many sources as will fit in the first packet.
 	 * If we are appending to a new packet, the chain allocation
 	 * may potentially use clusters; use m_getptr() in this case.
 	 * If we are appending to an existing packet, we need to obtain
 	 * a pointer to the group record after m_append(), in case a new
 	 * mbuf was allocated.
 	 * Only append sources which are in-mode at t1. If we are
 	 * transitioning to MCAST_UNDEFINED state on the group, do not
 	 * include source entries.
 	 * Only report recorded sources in our filter set when responding
 	 * to a group-source query.
 	 */
 	if (record_has_sources) {
 		if (m == m0) {
 			md = m_last(m);
 			pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) +
 			    md->m_len - nbytes);
 		} else {
 			md = m_getptr(m, 0, &off);
 			pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) +
 			    off);
 		}
 		msrcs = 0;
 		RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) {
 			CTR2(KTR_IGMPV3, "%s: visit node %s", __func__,
 			    inet_ntoa_haddr(ims->ims_haddr));
 			now = ims_get_mode(inm, ims, 1);
 			CTR2(KTR_IGMPV3, "%s: node is %d", __func__, now);
 			if ((now != mode) ||
 			    (now == mode && mode == MCAST_UNDEFINED)) {
 				CTR1(KTR_IGMPV3, "%s: skip node", __func__);
 				continue;
 			}
 			if (is_source_query && ims->ims_stp == 0) {
 				CTR1(KTR_IGMPV3, "%s: skip unrecorded node",
 				    __func__);
 				continue;
 			}
 			CTR1(KTR_IGMPV3, "%s: append node", __func__);
 			naddr = htonl(ims->ims_haddr);
 			if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) {
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_IGMPV3, "%s: m_append() failed.",
 				    __func__);
 				return (-ENOMEM);
 			}
 			nbytes += sizeof(in_addr_t);
 			++msrcs;
 			if (msrcs == m0srcs)
 				break;
 		}
 		CTR2(KTR_IGMPV3, "%s: msrcs is %d this packet", __func__,
 		    msrcs);
 		pig->ig_numsrc = htons(msrcs);
 		nbytes += (msrcs * sizeof(in_addr_t));
 	}
 
 	if (is_source_query && msrcs == 0) {
 		CTR1(KTR_IGMPV3, "%s: no recorded sources to report", __func__);
 		if (m != m0)
 			m_freem(m);
 		return (0);
 	}
 
 	/*
 	 * We are good to go with first packet.
 	 */
 	if (m != m0) {
 		CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__);
 		m->m_pkthdr.PH_vt.vt_nrecs = 1;
 		_IF_ENQUEUE(ifq, m);
 	} else
 		m->m_pkthdr.PH_vt.vt_nrecs++;
 
 	/*
 	 * No further work needed if no source list in packet(s).
 	 */
 	if (!record_has_sources)
 		return (nbytes);
 
 	/*
 	 * Whilst sources remain to be announced, we need to allocate
 	 * a new packet and fill out as many sources as will fit.
 	 * Always try for a cluster first.
 	 */
 	while (nims != NULL) {
 		if (_IF_QFULL(ifq)) {
 			CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__);
 			return (-ENOMEM);
 		}
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		if (m)
 			m->m_data += IGMP_LEADINGSPACE;
 		if (m == NULL) {
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 			if (m)
 				MH_ALIGN(m, IGMP_LEADINGSPACE);
 		}
 		if (m == NULL)
 			return (-ENOMEM);
 		igmp_save_context(m, ifp);
 		md = m_getptr(m, 0, &off);
 		pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off);
 		CTR1(KTR_IGMPV3, "%s: allocated next packet", __func__);
 
 		if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) {
 			if (m != m0)
 				m_freem(m);
 			CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__);
 			return (-ENOMEM);
 		}
 		m->m_pkthdr.PH_vt.vt_nrecs = 1;
 		nbytes += sizeof(struct igmp_grouprec);
 
 		m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
 		    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
 
 		msrcs = 0;
 		RB_FOREACH_FROM(ims, ip_msource_tree, nims) {
 			CTR2(KTR_IGMPV3, "%s: visit node %s", __func__,
 			    inet_ntoa_haddr(ims->ims_haddr));
 			now = ims_get_mode(inm, ims, 1);
 			if ((now != mode) ||
 			    (now == mode && mode == MCAST_UNDEFINED)) {
 				CTR1(KTR_IGMPV3, "%s: skip node", __func__);
 				continue;
 			}
 			if (is_source_query && ims->ims_stp == 0) {
 				CTR1(KTR_IGMPV3, "%s: skip unrecorded node",
 				    __func__);
 				continue;
 			}
 			CTR1(KTR_IGMPV3, "%s: append node", __func__);
 			naddr = htonl(ims->ims_haddr);
 			if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) {
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_IGMPV3, "%s: m_append() failed.",
 				    __func__);
 				return (-ENOMEM);
 			}
 			++msrcs;
 			if (msrcs == m0srcs)
 				break;
 		}
 		pig->ig_numsrc = htons(msrcs);
 		nbytes += (msrcs * sizeof(in_addr_t));
 
 		CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__);
 		_IF_ENQUEUE(ifq, m);
 	}
 
 	return (nbytes);
 }
 
 /*
  * Type used to mark record pass completion.
  * We exploit the fact we can cast to this easily from the
  * current filter modes on each ip_msource node.
  */
 typedef enum {
 	REC_NONE = 0x00,	/* MCAST_UNDEFINED */
 	REC_ALLOW = 0x01,	/* MCAST_INCLUDE */
 	REC_BLOCK = 0x02,	/* MCAST_EXCLUDE */
 	REC_FULL = REC_ALLOW | REC_BLOCK
 } rectype_t;
 
 /*
  * Enqueue an IGMPv3 filter list change to the given output queue.
  *
  * Source list filter state is held in an RB-tree. When the filter list
  * for a group is changed without changing its mode, we need to compute
  * the deltas between T0 and T1 for each source in the filter set,
  * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records.
  *
  * As we may potentially queue two record types, and the entire R-B tree
  * needs to be walked at once, we break this out into its own function
  * so we can generate a tightly packed queue of packets.
  *
  * XXX This could be written to only use one tree walk, although that makes
  * serializing into the mbuf chains a bit harder. For now we do two walks
  * which makes things easier on us, and it may or may not be harder on
  * the L2 cache.
  *
  * If successful the size of all data appended to the queue is returned,
  * otherwise an error code less than zero is returned, or zero if
  * no record(s) were appended.
  */
 static int
 igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm)
 {
 	static const int MINRECLEN =
 	    sizeof(struct igmp_grouprec) + sizeof(in_addr_t);
 	struct ifnet		*ifp;
 	struct igmp_grouprec	 ig;
 	struct igmp_grouprec	*pig;
 	struct ip_msource	*ims, *nims;
 	struct mbuf		*m, *m0, *md;
 	in_addr_t		 naddr;
 	int			 m0srcs, nbytes, npbytes, off, rsrcs, schanged;
 	int			 nallow, nblock;
 	uint8_t			 mode, now, then;
 	rectype_t		 crt, drt, nrt;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	if (inm->inm_nsrc == 0 ||
 	    (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0))
 		return (0);
 
 	ifp = inm->inm_ifp;			/* interface */
 	mode = inm->inm_st[1].iss_fmode;	/* filter mode at t1 */
 	crt = REC_NONE;	/* current group record type */
 	drt = REC_NONE;	/* mask of completed group record types */
 	nrt = REC_NONE;	/* record type for current node */
 	m0srcs = 0;	/* # source which will fit in current mbuf chain */
 	nbytes = 0;	/* # of bytes appended to group's state-change queue */
 	npbytes = 0;	/* # of bytes appended this packet */
 	rsrcs = 0;	/* # sources encoded in current record */
 	schanged = 0;	/* # nodes encoded in overall filter change */
 	nallow = 0;	/* # of source entries in ALLOW_NEW */
 	nblock = 0;	/* # of source entries in BLOCK_OLD */
 	nims = NULL;	/* next tree node pointer */
 
 	/*
 	 * For each possible filter record mode.
 	 * The first kind of source we encounter tells us which
 	 * is the first kind of record we start appending.
 	 * If a node transitioned to UNDEFINED at t1, its mode is treated
 	 * as the inverse of the group's filter mode.
 	 */
 	while (drt != REC_FULL) {
 		do {
 			m0 = ifq->ifq_tail;
 			if (m0 != NULL &&
 			    (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <=
 			     IGMP_V3_REPORT_MAXRECS) &&
 			    (m0->m_pkthdr.len + MINRECLEN) <
 			     (ifp->if_mtu - IGMP_LEADINGSPACE)) {
 				m = m0;
 				m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
 					    sizeof(struct igmp_grouprec)) /
 				    sizeof(in_addr_t);
 				CTR1(KTR_IGMPV3,
 				    "%s: use previous packet", __func__);
 			} else {
 				m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 				if (m)
 					m->m_data += IGMP_LEADINGSPACE;
 				if (m == NULL) {
 					m = m_gethdr(M_NOWAIT, MT_DATA);
 					if (m)
 						MH_ALIGN(m, IGMP_LEADINGSPACE);
 				}
 				if (m == NULL) {
 					CTR1(KTR_IGMPV3,
 					    "%s: m_get*() failed", __func__);
 					return (-ENOMEM);
 				}
 				m->m_pkthdr.PH_vt.vt_nrecs = 0;
 				igmp_save_context(m, ifp);
 				m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
 				    sizeof(struct igmp_grouprec)) /
 				    sizeof(in_addr_t);
 				npbytes = 0;
 				CTR1(KTR_IGMPV3,
 				    "%s: allocated new packet", __func__);
 			}
 			/*
 			 * Append the IGMP group record header to the
 			 * current packet's data area.
 			 * Recalculate pointer to free space for next
 			 * group record, in case m_append() allocated
 			 * a new mbuf or cluster.
 			 */
 			memset(&ig, 0, sizeof(ig));
 			ig.ig_group = inm->inm_addr;
 			if (!m_append(m, sizeof(ig), (void *)&ig)) {
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_IGMPV3,
 				    "%s: m_append() failed", __func__);
 				return (-ENOMEM);
 			}
 			npbytes += sizeof(struct igmp_grouprec);
 			if (m != m0) {
 				/* new packet; offset in c hain */
 				md = m_getptr(m, npbytes -
 				    sizeof(struct igmp_grouprec), &off);
 				pig = (struct igmp_grouprec *)(mtod(md,
 				    uint8_t *) + off);
 			} else {
 				/* current packet; offset from last append */
 				md = m_last(m);
 				pig = (struct igmp_grouprec *)(mtod(md,
 				    uint8_t *) + md->m_len -
 				    sizeof(struct igmp_grouprec));
 			}
 			/*
 			 * Begin walking the tree for this record type
 			 * pass, or continue from where we left off
 			 * previously if we had to allocate a new packet.
 			 * Only report deltas in-mode at t1.
 			 * We need not report included sources as allowed
 			 * if we are in inclusive mode on the group,
 			 * however the converse is not true.
 			 */
 			rsrcs = 0;
 			if (nims == NULL)
 				nims = RB_MIN(ip_msource_tree, &inm->inm_srcs);
 			RB_FOREACH_FROM(ims, ip_msource_tree, nims) {
 				CTR2(KTR_IGMPV3, "%s: visit node %s",
 				    __func__, inet_ntoa_haddr(ims->ims_haddr));
 				now = ims_get_mode(inm, ims, 1);
 				then = ims_get_mode(inm, ims, 0);
 				CTR3(KTR_IGMPV3, "%s: mode: t0 %d, t1 %d",
 				    __func__, then, now);
 				if (now == then) {
 					CTR1(KTR_IGMPV3,
 					    "%s: skip unchanged", __func__);
 					continue;
 				}
 				if (mode == MCAST_EXCLUDE &&
 				    now == MCAST_INCLUDE) {
 					CTR1(KTR_IGMPV3,
 					    "%s: skip IN src on EX group",
 					    __func__);
 					continue;
 				}
 				nrt = (rectype_t)now;
 				if (nrt == REC_NONE)
 					nrt = (rectype_t)(~mode & REC_FULL);
 				if (schanged++ == 0) {
 					crt = nrt;
 				} else if (crt != nrt)
 					continue;
 				naddr = htonl(ims->ims_haddr);
 				if (!m_append(m, sizeof(in_addr_t),
 				    (void *)&naddr)) {
 					if (m != m0)
 						m_freem(m);
 					CTR1(KTR_IGMPV3,
 					    "%s: m_append() failed", __func__);
 					return (-ENOMEM);
 				}
 				nallow += !!(crt == REC_ALLOW);
 				nblock += !!(crt == REC_BLOCK);
 				if (++rsrcs == m0srcs)
 					break;
 			}
 			/*
 			 * If we did not append any tree nodes on this
 			 * pass, back out of allocations.
 			 */
 			if (rsrcs == 0) {
 				npbytes -= sizeof(struct igmp_grouprec);
 				if (m != m0) {
 					CTR1(KTR_IGMPV3,
 					    "%s: m_free(m)", __func__);
 					m_freem(m);
 				} else {
 					CTR1(KTR_IGMPV3,
 					    "%s: m_adj(m, -ig)", __func__);
 					m_adj(m, -((int)sizeof(
 					    struct igmp_grouprec)));
 				}
 				continue;
 			}
 			npbytes += (rsrcs * sizeof(in_addr_t));
 			if (crt == REC_ALLOW)
 				pig->ig_type = IGMP_ALLOW_NEW_SOURCES;
 			else if (crt == REC_BLOCK)
 				pig->ig_type = IGMP_BLOCK_OLD_SOURCES;
 			pig->ig_numsrc = htons(rsrcs);
 			/*
 			 * Count the new group record, and enqueue this
 			 * packet if it wasn't already queued.
 			 */
 			m->m_pkthdr.PH_vt.vt_nrecs++;
 			if (m != m0)
 				_IF_ENQUEUE(ifq, m);
 			nbytes += npbytes;
 		} while (nims != NULL);
 		drt |= crt;
 		crt = (~crt & REC_FULL);
 	}
 
 	CTR3(KTR_IGMPV3, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__,
 	    nallow, nblock);
 
 	return (nbytes);
 }
 
 static int
 igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq)
 {
 	struct ifqueue	*gq;
 	struct mbuf	*m;		/* pending state-change */
 	struct mbuf	*m0;		/* copy of pending state-change */
 	struct mbuf	*mt;		/* last state-change in packet */
 	int		 docopy, domerge;
 	u_int		 recslen;
 
 	docopy = 0;
 	domerge = 0;
 	recslen = 0;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	/*
 	 * If there are further pending retransmissions, make a writable
 	 * copy of each queued state-change message before merging.
 	 */
 	if (inm->inm_scrv > 0)
 		docopy = 1;
 
 	gq = &inm->inm_scq;
 #ifdef KTR
 	if (gq->ifq_head == NULL) {
 		CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty",
 		    __func__, inm);
 	}
 #endif
 
 	m = gq->ifq_head;
 	while (m != NULL) {
 		/*
 		 * Only merge the report into the current packet if
 		 * there is sufficient space to do so; an IGMPv3 report
 		 * packet may only contain 65,535 group records.
 		 * Always use a simple mbuf chain concatentation to do this,
 		 * as large state changes for single groups may have
 		 * allocated clusters.
 		 */
 		domerge = 0;
 		mt = ifscq->ifq_tail;
 		if (mt != NULL) {
 			recslen = m_length(m, NULL);
 
 			if ((mt->m_pkthdr.PH_vt.vt_nrecs +
 			    m->m_pkthdr.PH_vt.vt_nrecs <=
 			    IGMP_V3_REPORT_MAXRECS) &&
 			    (mt->m_pkthdr.len + recslen <=
 			    (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE)))
 				domerge = 1;
 		}
 
 		if (!domerge && _IF_QFULL(gq)) {
 			CTR2(KTR_IGMPV3,
 			    "%s: outbound queue full, skipping whole packet %p",
 			    __func__, m);
 			mt = m->m_nextpkt;
 			if (!docopy)
 				m_freem(m);
 			m = mt;
 			continue;
 		}
 
 		if (!docopy) {
 			CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m);
 			_IF_DEQUEUE(gq, m0);
 			m = m0->m_nextpkt;
 		} else {
 			CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m);
 			m0 = m_dup(m, M_NOWAIT);
 			if (m0 == NULL)
 				return (ENOMEM);
 			m0->m_nextpkt = NULL;
 			m = m->m_nextpkt;
 		}
 
 		if (!domerge) {
 			CTR3(KTR_IGMPV3, "%s: queueing %p to ifscq %p)",
 			    __func__, m0, ifscq);
 			_IF_ENQUEUE(ifscq, m0);
 		} else {
 			struct mbuf *mtl;	/* last mbuf of packet mt */
 
 			CTR3(KTR_IGMPV3, "%s: merging %p with ifscq tail %p)",
 			    __func__, m0, mt);
 
 			mtl = m_last(mt);
 			m0->m_flags &= ~M_PKTHDR;
 			mt->m_pkthdr.len += recslen;
 			mt->m_pkthdr.PH_vt.vt_nrecs +=
 			    m0->m_pkthdr.PH_vt.vt_nrecs;
 
 			mtl->m_next = m0;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Respond to a pending IGMPv3 General Query.
  */
 static void
 igmp_v3_dispatch_general_query(struct igmp_ifinfo *igi)
 {
 	struct ifmultiaddr	*ifma;
 	struct ifnet		*ifp;
 	struct in_multi		*inm;
 	int			 retval, loop;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	KASSERT(igi->igi_version == IGMP_VERSION_3,
 	    ("%s: called when version %d", __func__, igi->igi_version));
 
 	ifp = igi->igi_ifp;
 
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_INET ||
 		    ifma->ifma_protospec == NULL)
 			continue;
 
 		inm = (struct in_multi *)ifma->ifma_protospec;
 		KASSERT(ifp == inm->inm_ifp,
 		    ("%s: inconsistent ifp", __func__));
 
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 			break;
 		case IGMP_REPORTING_MEMBER:
 		case IGMP_IDLE_MEMBER:
 		case IGMP_LAZY_MEMBER:
 		case IGMP_SLEEPING_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			inm->inm_state = IGMP_REPORTING_MEMBER;
 			retval = igmp_v3_enqueue_group_record(&igi->igi_gq,
 			    inm, 0, 0, 0);
 			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
 			    __func__, retval);
 			break;
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 		case IGMP_LEAVING_MEMBER:
 			break;
 		}
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0;
 	igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop);
 
 	/*
 	 * Slew transmission of bursts over 500ms intervals.
 	 */
 	if (igi->igi_gq.ifq_head != NULL) {
 		igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY(
 		    IGMP_RESPONSE_BURST_INTERVAL);
 		V_interface_timers_running = 1;
 	}
 }
 
 /*
  * Transmit the next pending IGMP message in the output queue.
  *
  * We get called from netisr_processqueue(). A mutex private to igmpoq
  * will be acquired and released around this routine.
  *
  * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis.
  * MRT: Nothing needs to be done, as IGMP traffic is always local to
  * a link and uses a link-scope multicast address.
  */
 static void
 igmp_intr(struct mbuf *m)
 {
 	struct ip_moptions	 imo;
 	struct ifnet		*ifp;
 	struct mbuf		*ipopts, *m0;
 	int			 error;
 	uint32_t		 ifindex;
 
 	CTR2(KTR_IGMPV3, "%s: transmit %p", __func__, m);
 
 	/*
 	 * Set VNET image pointer from enqueued mbuf chain
 	 * before doing anything else. Whilst we use interface
 	 * indexes to guard against interface detach, they are
 	 * unique to each VIMAGE and must be retrieved.
 	 */
 	CURVNET_SET((struct vnet *)(m->m_pkthdr.PH_loc.ptr));
 	ifindex = igmp_restore_context(m);
 
 	/*
 	 * Check if the ifnet still exists. This limits the scope of
 	 * any race in the absence of a global ifp lock for low cost
 	 * (an array lookup).
 	 */
 	ifp = ifnet_byindex(ifindex);
 	if (ifp == NULL) {
 		CTR3(KTR_IGMPV3, "%s: dropped %p as ifindex %u went away.",
 		    __func__, m, ifindex);
 		m_freem(m);
 		IPSTAT_INC(ips_noroute);
 		goto out;
 	}
 
 	ipopts = V_igmp_sendra ? m_raopt : NULL;
 
 	imo.imo_multicast_ttl  = 1;
 	imo.imo_multicast_vif  = -1;
 	imo.imo_multicast_loop = (V_ip_mrouter != NULL);
 
 	/*
 	 * If the user requested that IGMP traffic be explicitly
 	 * redirected to the loopback interface (e.g. they are running a
 	 * MANET interface and the routing protocol needs to see the
 	 * updates), handle this now.
 	 */
 	if (m->m_flags & M_IGMP_LOOP)
 		imo.imo_multicast_ifp = V_loif;
 	else
 		imo.imo_multicast_ifp = ifp;
 
 	if (m->m_flags & M_IGMPV2) {
 		m0 = m;
 	} else {
 		m0 = igmp_v3_encap_report(ifp, m);
 		if (m0 == NULL) {
 			CTR2(KTR_IGMPV3, "%s: dropped %p", __func__, m);
 			m_freem(m);
 			IPSTAT_INC(ips_odropped);
 			goto out;
 		}
 	}
 
 	igmp_scrub_context(m0);
 	m_clrprotoflags(m);
 	m0->m_pkthdr.rcvif = V_loif;
 #ifdef MAC
 	mac_netinet_igmp_send(ifp, m0);
 #endif
 	error = ip_output(m0, ipopts, NULL, 0, &imo, NULL);
 	if (error) {
 		CTR3(KTR_IGMPV3, "%s: ip_output(%p) = %d", __func__, m0, error);
 		goto out;
 	}
 
 	IGMPSTAT_INC(igps_snd_reports);
 
 out:
 	/*
 	 * We must restore the existing vnet pointer before
 	 * continuing as we are run from netisr context.
 	 */
 	CURVNET_RESTORE();
 }
 
 /*
  * Encapsulate an IGMPv3 report.
  *
  * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf
  * chain has already had its IP/IGMPv3 header prepended. In this case
  * the function will not attempt to prepend; the lengths and checksums
  * will however be re-computed.
  *
  * Returns a pointer to the new mbuf chain head, or NULL if the
  * allocation failed.
  */
 static struct mbuf *
 igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m)
 {
 	struct igmp_report	*igmp;
 	struct ip		*ip;
 	int			 hdrlen, igmpreclen;
 
 	KASSERT((m->m_flags & M_PKTHDR),
 	    ("%s: mbuf chain %p is !M_PKTHDR", __func__, m));
 
 	igmpreclen = m_length(m, NULL);
 	hdrlen = sizeof(struct ip) + sizeof(struct igmp_report);
 
 	if (m->m_flags & M_IGMPV3_HDR) {
 		igmpreclen -= hdrlen;
 	} else {
 		M_PREPEND(m, hdrlen, M_NOWAIT);
 		if (m == NULL)
 			return (NULL);
 		m->m_flags |= M_IGMPV3_HDR;
 	}
 
 	CTR2(KTR_IGMPV3, "%s: igmpreclen is %d", __func__, igmpreclen);
 
 	m->m_data += sizeof(struct ip);
 	m->m_len -= sizeof(struct ip);
 
 	igmp = mtod(m, struct igmp_report *);
 	igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT;
 	igmp->ir_rsv1 = 0;
 	igmp->ir_rsv2 = 0;
 	igmp->ir_numgrps = htons(m->m_pkthdr.PH_vt.vt_nrecs);
 	igmp->ir_cksum = 0;
 	igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen);
 	m->m_pkthdr.PH_vt.vt_nrecs = 0;
 
 	m->m_data -= sizeof(struct ip);
 	m->m_len += sizeof(struct ip);
 
 	ip = mtod(m, struct ip *);
 	ip->ip_tos = IPTOS_PREC_INTERNETCONTROL;
 	ip->ip_len = htons(hdrlen + igmpreclen);
 	ip->ip_off = htons(IP_DF);
 	ip->ip_p = IPPROTO_IGMP;
 	ip->ip_sum = 0;
 
 	ip->ip_src.s_addr = INADDR_ANY;
 
 	if (m->m_flags & M_IGMP_LOOP) {
 		struct in_ifaddr *ia;
 
 		IFP_TO_IA(ifp, ia);
 		if (ia != NULL) {
 			ip->ip_src = ia->ia_addr.sin_addr;
 			ifa_free(&ia->ia_ifa);
 		}
 	}
 
 	ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP);
 
 	return (m);
 }
 
 #ifdef KTR
 static char *
 igmp_rec_type_to_str(const int type)
 {
 
 	switch (type) {
 		case IGMP_CHANGE_TO_EXCLUDE_MODE:
 			return "TO_EX";
 			break;
 		case IGMP_CHANGE_TO_INCLUDE_MODE:
 			return "TO_IN";
 			break;
 		case IGMP_MODE_IS_EXCLUDE:
 			return "MODE_EX";
 			break;
 		case IGMP_MODE_IS_INCLUDE:
 			return "MODE_IN";
 			break;
 		case IGMP_ALLOW_NEW_SOURCES:
 			return "ALLOW_NEW";
 			break;
 		case IGMP_BLOCK_OLD_SOURCES:
 			return "BLOCK_OLD";
 			break;
 		default:
 			break;
 	}
 	return "unknown";
 }
 #endif
 
 static void
 igmp_init(void *unused __unused)
 {
 
 	CTR1(KTR_IGMPV3, "%s: initializing", __func__);
 
 	IGMP_LOCK_INIT();
 
 	m_raopt = igmp_ra_alloc();
 
 	netisr_register(&igmp_nh);
 }
 SYSINIT(igmp_init, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, igmp_init, NULL);
 
 static void
 igmp_uninit(void *unused __unused)
 {
 
 	CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
 
 	netisr_unregister(&igmp_nh);
 
 	m_free(m_raopt);
 	m_raopt = NULL;
 
 	IGMP_LOCK_DESTROY();
 }
 SYSUNINIT(igmp_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, igmp_uninit, NULL);
 
 static void
 vnet_igmp_init(const void *unused __unused)
 {
 
 	CTR1(KTR_IGMPV3, "%s: initializing", __func__);
 
 	LIST_INIT(&V_igi_head);
 }
 VNET_SYSINIT(vnet_igmp_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_igmp_init,
     NULL);
 
 static void
 vnet_igmp_uninit(const void *unused __unused)
 {
 
 	CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
 
 	KASSERT(LIST_EMPTY(&V_igi_head),
 	    ("%s: igi list not empty; ifnets not detached?", __func__));
 }
 VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY,
     vnet_igmp_uninit, NULL);
 
 static int
 igmp_modevent(module_t mod, int type, void *unused __unused)
 {
 
     switch (type) {
     case MOD_LOAD:
     case MOD_UNLOAD:
 	break;
     default:
 	return (EOPNOTSUPP);
     }
     return (0);
 }
 
 static moduledata_t igmp_mod = {
     "igmp",
     igmp_modevent,
     0
 };
 DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
Index: user/dchagin/lemul/sys/netinet/in.c
===================================================================
--- user/dchagin/lemul/sys/netinet/in.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/in.c	(revision 274241)
@@ -1,1269 +1,1269 @@
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * Copyright (C) 2001 WIDE Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in.c	8.4 (Berkeley) 1/9/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mpath.h"
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/systm.h>
 #include <sys/sockio.h>
 #include <sys/malloc.h>
 #include <sys/priv.h>
 #include <sys/socket.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/sx.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_arp.h>
 #include <net/if_dl.h>
 #include <net/if_llatbl.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/if_ether.h>
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_carp.h>
 #include <netinet/igmp_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 static int in_aifaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *);
 static int in_difaddr_ioctl(caddr_t, struct ifnet *, struct thread *);
 
 static void	in_socktrim(struct sockaddr_in *);
 static void	in_purgemaddrs(struct ifnet *);
 
 static VNET_DEFINE(int, nosameprefix);
 #define	V_nosameprefix			VNET(nosameprefix)
-SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(nosameprefix), 0,
 	"Refuse to create same prefixes on different interfaces");
 
 VNET_DECLARE(struct inpcbinfo, ripcbinfo);
 #define	V_ripcbinfo			VNET(ripcbinfo)
 
 static struct sx in_control_sx;
 SX_SYSINIT(in_control_sx, &in_control_sx, "in_control");
 
 /*
  * Return 1 if an internet address is for a ``local'' host
  * (one to which we have a connection).
  */
 int
 in_localaddr(struct in_addr in)
 {
 	register u_long i = ntohl(in.s_addr);
 	register struct in_ifaddr *ia;
 
 	IN_IFADDR_RLOCK();
 	TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
 		if ((i & ia->ia_subnetmask) == ia->ia_subnet) {
 			IN_IFADDR_RUNLOCK();
 			return (1);
 		}
 	}
 	IN_IFADDR_RUNLOCK();
 	return (0);
 }
 
 /*
  * Return 1 if an internet address is for the local host and configured
  * on one of its interfaces.
  */
 int
 in_localip(struct in_addr in)
 {
 	struct in_ifaddr *ia;
 
 	IN_IFADDR_RLOCK();
 	LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) {
 		if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr) {
 			IN_IFADDR_RUNLOCK();
 			return (1);
 		}
 	}
 	IN_IFADDR_RUNLOCK();
 	return (0);
 }
 
 /*
  * Return a reference to the interface address which is different to
  * the supplied one but with same IP address value.
  */
 static struct in_ifaddr *
 in_localip_more(struct in_ifaddr *ia)
 {
 	in_addr_t in = IA_SIN(ia)->sin_addr.s_addr;
 	struct in_ifaddr *it;
 
 	IN_IFADDR_RLOCK();
 	LIST_FOREACH(it, INADDR_HASH(in), ia_hash) {
 		if (it != ia && IA_SIN(it)->sin_addr.s_addr == in) {
 			ifa_ref(&it->ia_ifa);
 			IN_IFADDR_RUNLOCK();
 			return (it);
 		}
 	}
 	IN_IFADDR_RUNLOCK();
 
 	return (NULL);
 }
 
 /*
  * Determine whether an IP address is in a reserved set of addresses
  * that may not be forwarded, or whether datagrams to that destination
  * may be forwarded.
  */
 int
 in_canforward(struct in_addr in)
 {
 	register u_long i = ntohl(in.s_addr);
 	register u_long net;
 
 	if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i) || IN_LINKLOCAL(i))
 		return (0);
 	if (IN_CLASSA(i)) {
 		net = i & IN_CLASSA_NET;
 		if (net == 0 || net == (IN_LOOPBACKNET << IN_CLASSA_NSHIFT))
 			return (0);
 	}
 	return (1);
 }
 
 /*
  * Trim a mask in a sockaddr
  */
 static void
 in_socktrim(struct sockaddr_in *ap)
 {
     register char *cplim = (char *) &ap->sin_addr;
     register char *cp = (char *) (&ap->sin_addr + 1);
 
     ap->sin_len = 0;
     while (--cp >= cplim)
 	if (*cp) {
 	    (ap)->sin_len = cp - (char *) (ap) + 1;
 	    break;
 	}
 }
 
 /*
  * Generic internet control operations (ioctl's).
  */
 int
 in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
     struct thread *td)
 {
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct sockaddr_in *addr = (struct sockaddr_in *)&ifr->ifr_addr;
 	struct ifaddr *ifa;
 	struct in_ifaddr *ia;
 	int error;
 
 	if (ifp == NULL)
 		return (EADDRNOTAVAIL);
 
 	/*
 	 * Filter out 4 ioctls we implement directly.  Forward the rest
 	 * to specific functions and ifp->if_ioctl().
 	 */
 	switch (cmd) {
 	case SIOCGIFADDR:
 	case SIOCGIFBRDADDR:
 	case SIOCGIFDSTADDR:
 	case SIOCGIFNETMASK:
 		break;
 	case SIOCDIFADDR:
 		sx_xlock(&in_control_sx);
 		error = in_difaddr_ioctl(data, ifp, td);
 		sx_xunlock(&in_control_sx);
 		return (error);
 	case OSIOCAIFADDR:	/* 9.x compat */
 	case SIOCAIFADDR:
 		sx_xlock(&in_control_sx);
 		error = in_aifaddr_ioctl(cmd, data, ifp, td);
 		sx_xunlock(&in_control_sx);
 		return (error);
 	case SIOCSIFADDR:
 	case SIOCSIFBRDADDR:
 	case SIOCSIFDSTADDR:
 	case SIOCSIFNETMASK:
 		/* We no longer support that old commands. */
 		return (EINVAL);
 	default:
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		return ((*ifp->if_ioctl)(ifp, cmd, data));
 	}
 
 	if (addr->sin_addr.s_addr != INADDR_ANY &&
 	    prison_check_ip4(td->td_ucred, &addr->sin_addr) != 0)
 		return (EADDRNOTAVAIL);
 
 	/*
 	 * Find address for this interface, if it exists.  If an
 	 * address was specified, find that one instead of the
 	 * first one on the interface, if possible.
 	 */
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET)
 			continue;
 		ia = (struct in_ifaddr *)ifa;
 		if (ia->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr)
 			break;
 	}
 	if (ifa == NULL)
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 			if (ifa->ifa_addr->sa_family == AF_INET) {
 				ia = (struct in_ifaddr *)ifa;
 				if (prison_check_ip4(td->td_ucred,
 				    &ia->ia_addr.sin_addr) == 0)
 					break;
 			}
 
 	if (ifa == NULL) {
 		IF_ADDR_RUNLOCK(ifp);
 		return (EADDRNOTAVAIL);
 	}
 
 	error = 0;
 	switch (cmd) {
 	case SIOCGIFADDR:
 		*addr = ia->ia_addr;
 		break;
 
 	case SIOCGIFBRDADDR:
 		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
 			error = EINVAL;
 			break;
 		}
 		*addr = ia->ia_broadaddr;
 		break;
 
 	case SIOCGIFDSTADDR:
 		if ((ifp->if_flags & IFF_POINTOPOINT) == 0) {
 			error = EINVAL;
 			break;
 		}
 		*addr = ia->ia_dstaddr;
 		break;
 
 	case SIOCGIFNETMASK:
 		*addr = ia->ia_sockmask;
 		break;
 	}
 
 	IF_ADDR_RUNLOCK(ifp);
 
 	return (error);
 }
 
 static int
 in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td)
 {
 	const struct in_aliasreq *ifra = (struct in_aliasreq *)data;
 	const struct sockaddr_in *addr = &ifra->ifra_addr;
 	const struct sockaddr_in *broadaddr = &ifra->ifra_broadaddr;
 	const struct sockaddr_in *mask = &ifra->ifra_mask;
 	const struct sockaddr_in *dstaddr = &ifra->ifra_dstaddr;
 	const int vhid = (cmd == SIOCAIFADDR) ? ifra->ifra_vhid : 0;
 	struct ifaddr *ifa;
 	struct in_ifaddr *ia;
 	bool iaIsFirst;
 	int error = 0;
 
 	error = priv_check(td, PRIV_NET_ADDIFADDR);
 	if (error)
 		return (error);
 
 	/*
 	 * ifra_addr must be present and be of INET family.
 	 * ifra_broadaddr/ifra_dstaddr and ifra_mask are optional.
 	 */
 	if (addr->sin_len != sizeof(struct sockaddr_in) ||
 	    addr->sin_family != AF_INET)
 		return (EINVAL);
 	if (broadaddr->sin_len != 0 &&
 	    (broadaddr->sin_len != sizeof(struct sockaddr_in) ||
 	    broadaddr->sin_family != AF_INET))
 		return (EINVAL);
 	if (mask->sin_len != 0 &&
 	    (mask->sin_len != sizeof(struct sockaddr_in) ||
 	    mask->sin_family != AF_INET))
 		return (EINVAL);
 	if ((ifp->if_flags & IFF_POINTOPOINT) &&
 	    (dstaddr->sin_len != sizeof(struct sockaddr_in) ||
 	     dstaddr->sin_addr.s_addr == INADDR_ANY))
 		return (EDESTADDRREQ);
 	if (vhid > 0 && carp_attach_p == NULL)
 		return (EPROTONOSUPPORT);
 
 	/*
 	 * See whether address already exist.
 	 */
 	iaIsFirst = true;
 	ia = NULL;
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		struct in_ifaddr *it;
 
 		if (ifa->ifa_addr->sa_family != AF_INET)
 			continue;
 
 		it = (struct in_ifaddr *)ifa;
 		iaIsFirst = false;
 		if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr &&
 		    prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0)
 			ia = it;
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	if (ia != NULL)
 		(void )in_difaddr_ioctl(data, ifp, td);
 
 	ifa = ifa_alloc(sizeof(struct in_ifaddr), M_WAITOK);
 	ia = (struct in_ifaddr *)ifa;
 	ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr;
 	ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr;
 	ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask;
 
 	ia->ia_ifp = ifp;
 	ia->ia_addr = *addr;
 	if (mask->sin_len != 0) {
 		ia->ia_sockmask = *mask;
 		ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr);
 	} else {
 		in_addr_t i = ntohl(addr->sin_addr.s_addr);
 
 		/*
 	 	 * Be compatible with network classes, if netmask isn't
 		 * supplied, guess it based on classes.
 	 	 */
 		if (IN_CLASSA(i))
 			ia->ia_subnetmask = IN_CLASSA_NET;
 		else if (IN_CLASSB(i))
 			ia->ia_subnetmask = IN_CLASSB_NET;
 		else
 			ia->ia_subnetmask = IN_CLASSC_NET;
 		ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask);
 	}
 	ia->ia_subnet = ntohl(addr->sin_addr.s_addr) & ia->ia_subnetmask;
 	in_socktrim(&ia->ia_sockmask);
 
 	if (ifp->if_flags & IFF_BROADCAST) {
 		if (broadaddr->sin_len != 0) {
 			ia->ia_broadaddr = *broadaddr;
 		} else if (ia->ia_subnetmask == IN_RFC3021_MASK) {
 			ia->ia_broadaddr.sin_addr.s_addr = INADDR_BROADCAST;
 			ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in);
 			ia->ia_broadaddr.sin_family = AF_INET;
 		} else {
 			ia->ia_broadaddr.sin_addr.s_addr =
 			    htonl(ia->ia_subnet | ~ia->ia_subnetmask);
 			ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in);
 			ia->ia_broadaddr.sin_family = AF_INET;
 		}
 	}
 
 	if (ifp->if_flags & IFF_POINTOPOINT)
 		ia->ia_dstaddr = *dstaddr;
 
 	/* XXXGL: rtinit() needs this strange assignment. */
 	if (ifp->if_flags & IFF_LOOPBACK)
                 ia->ia_dstaddr = ia->ia_addr;
 
 	if (vhid != 0) {
 		error = (*carp_attach_p)(&ia->ia_ifa, vhid);
 		if (error)
 			return (error);
 	}
 
 	/* if_addrhead is already referenced by ifa_alloc() */
 	IF_ADDR_WLOCK(ifp);
 	TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link);
 	IF_ADDR_WUNLOCK(ifp);
 
 	ifa_ref(ifa);			/* in_ifaddrhead */
 	IN_IFADDR_WLOCK();
 	TAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link);
 	LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash);
 	IN_IFADDR_WUNLOCK();
 
 	/*
 	 * Give the interface a chance to initialize
 	 * if this is its first address,
 	 * and to validate the address if necessary.
 	 */
 	if (ifp->if_ioctl != NULL) {
 		error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia);
 		if (error)
 			goto fail1;
 	}
 
 	/*
 	 * Add route for the network.
 	 */
 	if (vhid == 0) {
 		int flags = RTF_UP;
 
 		if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT))
 			flags |= RTF_HOST;
 
 		error = in_addprefix(ia, flags);
 		if (error)
 			goto fail1;
 	}
 
 	/*
 	 * Add a loopback route to self.
 	 */
 	if (vhid == 0 && (ifp->if_flags & IFF_LOOPBACK) == 0 &&
 	    ia->ia_addr.sin_addr.s_addr != INADDR_ANY &&
 	    !((ifp->if_flags & IFF_POINTOPOINT) &&
 	     ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr)) {
 		struct in_ifaddr *eia;
 
 		eia = in_localip_more(ia);
 
 		if (eia == NULL) {
 			error = ifa_add_loopback_route((struct ifaddr *)ia,
 			    (struct sockaddr *)&ia->ia_addr);
 			if (error)
 				goto fail2;
 		} else
 			ifa_free(&eia->ia_ifa);
 	}
 
 	if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST)) {
 		struct in_addr allhosts_addr;
 		struct in_ifinfo *ii;
 
 		ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]);
 		allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP);
 
 		error = in_joingroup(ifp, &allhosts_addr, NULL,
 			&ii->ii_allhosts);
 	}
 
 	EVENTHANDLER_INVOKE(ifaddr_event, ifp);
 
 	return (error);
 
 fail2:
 	if (vhid == 0)
 		(void )in_scrubprefix(ia, LLE_STATIC);
 
 fail1:
 	if (ia->ia_ifa.ifa_carp)
 		(*carp_detach_p)(&ia->ia_ifa);
 
 	IF_ADDR_WLOCK(ifp);
 	TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link);
 	IF_ADDR_WUNLOCK(ifp);
 	ifa_free(&ia->ia_ifa);		/* if_addrhead */
 
 	IN_IFADDR_WLOCK();
 	TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link);
 	LIST_REMOVE(ia, ia_hash);
 	IN_IFADDR_WUNLOCK();
 	ifa_free(&ia->ia_ifa);		/* in_ifaddrhead */
 
 	return (error);
 }
 
 static int
 in_difaddr_ioctl(caddr_t data, struct ifnet *ifp, struct thread *td)
 {
 	const struct ifreq *ifr = (struct ifreq *)data;
 	const struct sockaddr_in *addr = (const struct sockaddr_in *)
 	    &ifr->ifr_addr;
 	struct ifaddr *ifa;
 	struct in_ifaddr *ia;
 	bool deleteAny, iaIsLast;
 	int error;
 
 	if (td != NULL) {
 		error = priv_check(td, PRIV_NET_DELIFADDR);
 		if (error)
 			return (error);
 	}
 
 	if (addr->sin_len != sizeof(struct sockaddr_in) ||
 	    addr->sin_family != AF_INET)
 		deleteAny = true;
 	else
 		deleteAny = false;
 
 	iaIsLast = true;
 	ia = NULL;
 	IF_ADDR_WLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		struct in_ifaddr *it;
 
 		if (ifa->ifa_addr->sa_family != AF_INET)
 			continue;
 
 		it = (struct in_ifaddr *)ifa;
 		if (deleteAny && ia == NULL && (td == NULL ||
 		    prison_check_ip4(td->td_ucred, &it->ia_addr.sin_addr) == 0))
 			ia = it;
 
 		if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr &&
 		    (td == NULL || prison_check_ip4(td->td_ucred,
 		    &addr->sin_addr) == 0))
 			ia = it;
 
 		if (it != ia)
 			iaIsLast = false;
 	}
 
 	if (ia == NULL) {
 		IF_ADDR_WUNLOCK(ifp);
 		return (EADDRNOTAVAIL);
 	}
 
 	TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link);
 	IF_ADDR_WUNLOCK(ifp);
 	ifa_free(&ia->ia_ifa);		/* if_addrhead */
 
 	IN_IFADDR_WLOCK();
 	TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link);
 	LIST_REMOVE(ia, ia_hash);
 	IN_IFADDR_WUNLOCK();
 
 	/*
 	 * in_scrubprefix() kills the interface route.
 	 */
 	in_scrubprefix(ia, LLE_STATIC);
 
 	/*
 	 * in_ifadown gets rid of all the rest of
 	 * the routes.  This is not quite the right
 	 * thing to do, but at least if we are running
 	 * a routing process they will come back.
 	 */
 	in_ifadown(&ia->ia_ifa, 1);
 
 	if (ia->ia_ifa.ifa_carp)
 		(*carp_detach_p)(&ia->ia_ifa);
 
 	/*
 	 * If this is the last IPv4 address configured on this
 	 * interface, leave the all-hosts group.
 	 * No state-change report need be transmitted.
 	 */
 	if (iaIsLast && (ifp->if_flags & IFF_MULTICAST)) {
 		struct in_ifinfo *ii;
 
 		ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]);
 		IN_MULTI_LOCK();
 		if (ii->ii_allhosts) {
 			(void)in_leavegroup_locked(ii->ii_allhosts, NULL);
 			ii->ii_allhosts = NULL;
 		}
 		IN_MULTI_UNLOCK();
 	}
 
 	EVENTHANDLER_INVOKE(ifaddr_event, ifp);
 	ifa_free(&ia->ia_ifa);		/* in_ifaddrhead */
 
 	return (0);
 }
 
 #define rtinitflags(x) \
 	((((x)->ia_ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) != 0) \
 	    ? RTF_HOST : 0)
 
 /*
  * Check if we have a route for the given prefix already or add one accordingly.
  */
 int
 in_addprefix(struct in_ifaddr *target, int flags)
 {
 	struct in_ifaddr *ia;
 	struct in_addr prefix, mask, p, m;
 	int error;
 
 	if ((flags & RTF_HOST) != 0) {
 		prefix = target->ia_dstaddr.sin_addr;
 		mask.s_addr = 0;
 	} else {
 		prefix = target->ia_addr.sin_addr;
 		mask = target->ia_sockmask.sin_addr;
 		prefix.s_addr &= mask.s_addr;
 	}
 
 	IN_IFADDR_RLOCK();
 	/* Look for an existing address with the same prefix, mask, and fib */
 	TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
 		if (rtinitflags(ia)) {
 			p = ia->ia_dstaddr.sin_addr;
 
 			if (prefix.s_addr != p.s_addr)
 				continue;
 		} else {
 			p = ia->ia_addr.sin_addr;
 			m = ia->ia_sockmask.sin_addr;
 			p.s_addr &= m.s_addr;
 
 			if (prefix.s_addr != p.s_addr ||
 			    mask.s_addr != m.s_addr)
 				continue;
 		}
 		if (target->ia_ifp->if_fib != ia->ia_ifp->if_fib)
 			continue;
 
 		/*
 		 * If we got a matching prefix route inserted by other
 		 * interface address, we are done here.
 		 */
 		if (ia->ia_flags & IFA_ROUTE) {
 #ifdef RADIX_MPATH
 			if (ia->ia_addr.sin_addr.s_addr ==
 			    target->ia_addr.sin_addr.s_addr) {
 				IN_IFADDR_RUNLOCK();
 				return (EEXIST);
 			} else
 				break;
 #endif
 			if (V_nosameprefix) {
 				IN_IFADDR_RUNLOCK();
 				return (EEXIST);
 			} else {
 				int fibnum;
 
 				fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS :
 					target->ia_ifp->if_fib;
 				rt_addrmsg(RTM_ADD, &target->ia_ifa, fibnum);
 				IN_IFADDR_RUNLOCK();
 				return (0);
 			}
 		}
 	}
 	IN_IFADDR_RUNLOCK();
 
 	/*
 	 * No-one seem to have this prefix route, so we try to insert it.
 	 */
 	error = rtinit(&target->ia_ifa, (int)RTM_ADD, flags);
 	if (!error)
 		target->ia_flags |= IFA_ROUTE;
 	return (error);
 }
 
 /*
  * If there is no other address in the system that can serve a route to the
  * same prefix, remove the route.  Hand over the route to the new address
  * otherwise.
  */
 int
 in_scrubprefix(struct in_ifaddr *target, u_int flags)
 {
 	struct in_ifaddr *ia;
 	struct in_addr prefix, mask, p, m;
 	int error = 0;
 	struct sockaddr_in prefix0, mask0;
 
 	/*
 	 * Remove the loopback route to the interface address.
 	 */
 	if ((target->ia_addr.sin_addr.s_addr != INADDR_ANY) &&
 	    !(target->ia_ifp->if_flags & IFF_LOOPBACK) &&
 	    (flags & LLE_STATIC)) {
 		struct in_ifaddr *eia;
 
 		eia = in_localip_more(target);
 
 		if (eia != NULL) {
 			int fibnum = target->ia_ifp->if_fib;
 
 			error = ifa_switch_loopback_route((struct ifaddr *)eia,
 			    (struct sockaddr *)&target->ia_addr, fibnum);
 			ifa_free(&eia->ia_ifa);
 		} else {
 			error = ifa_del_loopback_route((struct ifaddr *)target,
 			    (struct sockaddr *)&target->ia_addr);
 		}
 
 		if (!(target->ia_ifp->if_flags & IFF_NOARP))
 			/* remove arp cache */
 			arp_ifscrub(target->ia_ifp,
 			    IA_SIN(target)->sin_addr.s_addr);
 	}
 
 	if (rtinitflags(target)) {
 		prefix = target->ia_dstaddr.sin_addr;
 		mask.s_addr = 0;
 	} else {
 		prefix = target->ia_addr.sin_addr;
 		mask = target->ia_sockmask.sin_addr;
 		prefix.s_addr &= mask.s_addr;
 	}
 
 	if ((target->ia_flags & IFA_ROUTE) == 0) {
 		int fibnum;
 		
 		fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS :
 			target->ia_ifp->if_fib;
 		rt_addrmsg(RTM_DELETE, &target->ia_ifa, fibnum);
 		return (0);
 	}
 
 	IN_IFADDR_RLOCK();
 	TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
 		if (rtinitflags(ia)) {
 			p = ia->ia_dstaddr.sin_addr;
 
 			if (prefix.s_addr != p.s_addr)
 				continue;
 		} else {
 			p = ia->ia_addr.sin_addr;
 			m = ia->ia_sockmask.sin_addr;
 			p.s_addr &= m.s_addr;
 
 			if (prefix.s_addr != p.s_addr ||
 			    mask.s_addr != m.s_addr)
 				continue;
 		}
 
 		if ((ia->ia_ifp->if_flags & IFF_UP) == 0)
 			continue;
 
 		/*
 		 * If we got a matching prefix address, move IFA_ROUTE and
 		 * the route itself to it.  Make sure that routing daemons
 		 * get a heads-up.
 		 */
 		if ((ia->ia_flags & IFA_ROUTE) == 0) {
 			ifa_ref(&ia->ia_ifa);
 			IN_IFADDR_RUNLOCK();
 			error = rtinit(&(target->ia_ifa), (int)RTM_DELETE,
 			    rtinitflags(target));
 			if (error == 0)
 				target->ia_flags &= ~IFA_ROUTE;
 			else
 				log(LOG_INFO, "in_scrubprefix: err=%d, old prefix delete failed\n",
 					error);
 			error = rtinit(&ia->ia_ifa, (int)RTM_ADD,
 			    rtinitflags(ia) | RTF_UP);
 			if (error == 0)
 				ia->ia_flags |= IFA_ROUTE;
 			else
 				log(LOG_INFO, "in_scrubprefix: err=%d, new prefix add failed\n",
 					error);
 			ifa_free(&ia->ia_ifa);
 			return (error);
 		}
 	}
 	IN_IFADDR_RUNLOCK();
 
 	/*
 	 * remove all L2 entries on the given prefix
 	 */
 	bzero(&prefix0, sizeof(prefix0));
 	prefix0.sin_len = sizeof(prefix0);
 	prefix0.sin_family = AF_INET;
 	prefix0.sin_addr.s_addr = target->ia_subnet;
 	bzero(&mask0, sizeof(mask0));
 	mask0.sin_len = sizeof(mask0);
 	mask0.sin_family = AF_INET;
 	mask0.sin_addr.s_addr = target->ia_subnetmask;
 	lltable_prefix_free(AF_INET, (struct sockaddr *)&prefix0,
 	    (struct sockaddr *)&mask0, flags);
 
 	/*
 	 * As no-one seem to have this prefix, we can remove the route.
 	 */
 	error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target));
 	if (error == 0)
 		target->ia_flags &= ~IFA_ROUTE;
 	else
 		log(LOG_INFO, "in_scrubprefix: err=%d, prefix delete failed\n", error);
 	return (error);
 }
 
 #undef rtinitflags
 
 /*
  * Return 1 if the address might be a local broadcast address.
  */
 int
 in_broadcast(struct in_addr in, struct ifnet *ifp)
 {
 	register struct ifaddr *ifa;
 	u_long t;
 
 	if (in.s_addr == INADDR_BROADCAST ||
 	    in.s_addr == INADDR_ANY)
 		return (1);
 	if ((ifp->if_flags & IFF_BROADCAST) == 0)
 		return (0);
 	t = ntohl(in.s_addr);
 	/*
 	 * Look through the list of addresses for a match
 	 * with a broadcast address.
 	 */
 #define ia ((struct in_ifaddr *)ifa)
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 		if (ifa->ifa_addr->sa_family == AF_INET &&
 		    (in.s_addr == ia->ia_broadaddr.sin_addr.s_addr ||
 		     /*
 		      * Check for old-style (host 0) broadcast, but
 		      * taking into account that RFC 3021 obsoletes it.
 		      */
 		    (ia->ia_subnetmask != IN_RFC3021_MASK &&
 		    t == ia->ia_subnet)) &&
 		     /*
 		      * Check for an all one subnetmask. These
 		      * only exist when an interface gets a secondary
 		      * address.
 		      */
 		    ia->ia_subnetmask != (u_long)0xffffffff)
 			    return (1);
 	return (0);
 #undef ia
 }
 
 /*
  * On interface removal, clean up IPv4 data structures hung off of the ifnet.
  */
 void
 in_ifdetach(struct ifnet *ifp)
 {
 
 	in_pcbpurgeif0(&V_ripcbinfo, ifp);
 	in_pcbpurgeif0(&V_udbinfo, ifp);
 	in_pcbpurgeif0(&V_ulitecbinfo, ifp);
 	in_purgemaddrs(ifp);
 }
 
 /*
  * Delete all IPv4 multicast address records, and associated link-layer
  * multicast address records, associated with ifp.
  * XXX It looks like domifdetach runs AFTER the link layer cleanup.
  * XXX This should not race with ifma_protospec being set during
  * a new allocation, if it does, we have bigger problems.
  */
 static void
 in_purgemaddrs(struct ifnet *ifp)
 {
 	LIST_HEAD(,in_multi) purgeinms;
 	struct in_multi		*inm, *tinm;
 	struct ifmultiaddr	*ifma;
 
 	LIST_INIT(&purgeinms);
 	IN_MULTI_LOCK();
 
 	/*
 	 * Extract list of in_multi associated with the detaching ifp
 	 * which the PF_INET layer is about to release.
 	 * We need to do this as IF_ADDR_LOCK() may be re-acquired
 	 * by code further down.
 	 */
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_INET ||
 		    ifma->ifma_protospec == NULL)
 			continue;
 #if 0
 		KASSERT(ifma->ifma_protospec != NULL,
 		    ("%s: ifma_protospec is NULL", __func__));
 #endif
 		inm = (struct in_multi *)ifma->ifma_protospec;
 		LIST_INSERT_HEAD(&purgeinms, inm, inm_link);
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	LIST_FOREACH_SAFE(inm, &purgeinms, inm_link, tinm) {
 		LIST_REMOVE(inm, inm_link);
 		inm_release_locked(inm);
 	}
 	igmp_ifdetach(ifp);
 
 	IN_MULTI_UNLOCK();
 }
 
 struct in_llentry {
 	struct llentry		base;
 	struct sockaddr_in	l3_addr4;
 };
 
 /*
  * Deletes an address from the address table.
  * This function is called by the timer functions
  * such as arptimer() and nd6_llinfo_timer(), and
  * the caller does the locking.
  */
 static void
 in_lltable_free(struct lltable *llt, struct llentry *lle)
 {
 	LLE_WUNLOCK(lle);
 	LLE_LOCK_DESTROY(lle);
 	free(lle, M_LLTABLE);
 }
 
 static struct llentry *
 in_lltable_new(const struct sockaddr *l3addr, u_int flags)
 {
 	struct in_llentry *lle;
 
 	lle = malloc(sizeof(struct in_llentry), M_LLTABLE, M_NOWAIT | M_ZERO);
 	if (lle == NULL)		/* NB: caller generates msg */
 		return NULL;
 
 	/*
 	 * For IPv4 this will trigger "arpresolve" to generate
 	 * an ARP request.
 	 */
 	lle->base.la_expire = time_uptime; /* mark expired */
 	lle->l3_addr4 = *(const struct sockaddr_in *)l3addr;
 	lle->base.lle_refcnt = 1;
 	lle->base.lle_free = in_lltable_free;
 	LLE_LOCK_INIT(&lle->base);
 	callout_init_rw(&lle->base.la_timer, &lle->base.lle_lock,
 	    CALLOUT_RETURNUNLOCKED);
 
 	return (&lle->base);
 }
 
 #define IN_ARE_MASKED_ADDR_EQUAL(d, a, m)	(			\
 	    (((ntohl((d)->sin_addr.s_addr) ^ (a)->sin_addr.s_addr) & (m)->sin_addr.s_addr)) == 0 )
 
 static void
 in_lltable_prefix_free(struct lltable *llt, const struct sockaddr *prefix,
     const struct sockaddr *mask, u_int flags)
 {
 	const struct sockaddr_in *pfx = (const struct sockaddr_in *)prefix;
 	const struct sockaddr_in *msk = (const struct sockaddr_in *)mask;
 	struct llentry *lle, *next;
 	int i;
 	size_t pkts_dropped;
 
 	IF_AFDATA_WLOCK(llt->llt_ifp);
 	for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) {
 		LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) {
 			/*
 			 * (flags & LLE_STATIC) means deleting all entries
 			 * including static ARP entries.
 			 */
 			if (IN_ARE_MASKED_ADDR_EQUAL(satosin(L3_ADDR(lle)),
 			    pfx, msk) && ((flags & LLE_STATIC) ||
 			    !(lle->la_flags & LLE_STATIC))) {
 				LLE_WLOCK(lle);
 				if (callout_stop(&lle->la_timer))
 					LLE_REMREF(lle);
 				pkts_dropped = llentry_free(lle);
 				ARPSTAT_ADD(dropped, pkts_dropped);
 			}
 		}
 	}
 	IF_AFDATA_WUNLOCK(llt->llt_ifp);
 }
 
 
 static int
 in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr)
 {
 	struct rtentry *rt;
 
 	KASSERT(l3addr->sa_family == AF_INET,
 	    ("sin_family %d", l3addr->sa_family));
 
 	/* XXX rtalloc1_fib should take a const param */
 	rt = rtalloc1_fib(__DECONST(struct sockaddr *, l3addr), 0, 0,
 	    ifp->if_fib);
 
 	if (rt == NULL)
 		return (EINVAL);
 
 	/*
 	 * If the gateway for an existing host route matches the target L3
 	 * address, which is a special route inserted by some implementation
 	 * such as MANET, and the interface is of the correct type, then
 	 * allow for ARP to proceed.
 	 */
 	if (rt->rt_flags & RTF_GATEWAY) {
 		if (!(rt->rt_flags & RTF_HOST) || !rt->rt_ifp ||
 		    rt->rt_ifp->if_type != IFT_ETHER ||
 		    (rt->rt_ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) != 0 ||
 		    memcmp(rt->rt_gateway->sa_data, l3addr->sa_data,
 		    sizeof(in_addr_t)) != 0) {
 			RTFREE_LOCKED(rt);
 			return (EINVAL);
 		}
 	}
 
 	/*
 	 * Make sure that at least the destination address is covered
 	 * by the route. This is for handling the case where 2 or more
 	 * interfaces have the same prefix. An incoming packet arrives
 	 * on one interface and the corresponding outgoing packet leaves
 	 * another interface.
 	 */
 	if (!(rt->rt_flags & RTF_HOST) && rt->rt_ifp != ifp) {
 		const char *sa, *mask, *addr, *lim;
 		int len;
 
 		mask = (const char *)rt_mask(rt);
 		/*
 		 * Just being extra cautious to avoid some custom
 		 * code getting into trouble.
 		 */
 		if (mask == NULL) {
 			RTFREE_LOCKED(rt);
 			return (EINVAL);
 		}
 
 		sa = (const char *)rt_key(rt);
 		addr = (const char *)l3addr;
 		len = ((const struct sockaddr_in *)l3addr)->sin_len;
 		lim = addr + len;
 
 		for ( ; addr < lim; sa++, mask++, addr++) {
 			if ((*sa ^ *addr) & *mask) {
 #ifdef DIAGNOSTIC
 				log(LOG_INFO, "IPv4 address: \"%s\" is not on the network\n",
 				    inet_ntoa(((const struct sockaddr_in *)l3addr)->sin_addr));
 #endif
 				RTFREE_LOCKED(rt);
 				return (EINVAL);
 			}
 		}
 	}
 
 	RTFREE_LOCKED(rt);
 	return (0);
 }
 
 /*
  * Return NULL if not found or marked for deletion.
  * If found return lle read locked.
  */
 static struct llentry *
 in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr)
 {
 	const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr;
 	struct ifnet *ifp = llt->llt_ifp;
 	struct llentry *lle;
 	struct llentries *lleh;
 	u_int hashkey;
 
 	IF_AFDATA_LOCK_ASSERT(ifp);
 	KASSERT(l3addr->sa_family == AF_INET,
 	    ("sin_family %d", l3addr->sa_family));
 
 	hashkey = sin->sin_addr.s_addr;
 	lleh = &llt->lle_head[LLATBL_HASH(hashkey, LLTBL_HASHMASK)];
 	LIST_FOREACH(lle, lleh, lle_next) {
 		struct sockaddr_in *sa2 = satosin(L3_ADDR(lle));
 		if (lle->la_flags & LLE_DELETED)
 			continue;
 		if (sa2->sin_addr.s_addr == sin->sin_addr.s_addr)
 			break;
 	}
 	if (lle == NULL) {
 #ifdef DIAGNOSTIC
 		if (flags & LLE_DELETE)
 			log(LOG_INFO, "interface address is missing from cache = %p  in delete\n", lle);
 #endif
 		if (!(flags & LLE_CREATE))
 			return (NULL);
 		IF_AFDATA_WLOCK_ASSERT(ifp);
 		/*
 		 * A route that covers the given address must have
 		 * been installed 1st because we are doing a resolution,
 		 * verify this.
 		 */
 		if (!(flags & LLE_IFADDR) &&
 		    in_lltable_rtcheck(ifp, flags, l3addr) != 0)
 			goto done;
 
 		lle = in_lltable_new(l3addr, flags);
 		if (lle == NULL) {
 			log(LOG_INFO, "lla_lookup: new lle malloc failed\n");
 			goto done;
 		}
 		lle->la_flags = flags & ~LLE_CREATE;
 		if ((flags & (LLE_CREATE | LLE_IFADDR)) == (LLE_CREATE | LLE_IFADDR)) {
 			bcopy(IF_LLADDR(ifp), &lle->ll_addr, ifp->if_addrlen);
 			lle->la_flags |= (LLE_VALID | LLE_STATIC);
 		}
 
 		lle->lle_tbl  = llt;
 		lle->lle_head = lleh;
 		lle->la_flags |= LLE_LINKED;
 		LIST_INSERT_HEAD(lleh, lle, lle_next);
 	} else if (flags & LLE_DELETE) {
 		if (!(lle->la_flags & LLE_IFADDR) || (flags & LLE_IFADDR)) {
 			LLE_WLOCK(lle);
 			lle->la_flags |= LLE_DELETED;
 			EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED);
 #ifdef DIAGNOSTIC
 			log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle);
 #endif
 			if ((lle->la_flags &
 			    (LLE_STATIC | LLE_IFADDR)) == LLE_STATIC)
 				llentry_free(lle);
 			else
 				LLE_WUNLOCK(lle);
 		}
 		lle = (void *)-1;
 
 	}
 	if (LLE_IS_VALID(lle)) {
 		if (flags & LLE_EXCLUSIVE)
 			LLE_WLOCK(lle);
 		else
 			LLE_RLOCK(lle);
 	}
 done:
 	return (lle);
 }
 
 static int
 in_lltable_dump(struct lltable *llt, struct sysctl_req *wr)
 {
 #define	SIN(lle)	((struct sockaddr_in *) L3_ADDR(lle))
 	struct ifnet *ifp = llt->llt_ifp;
 	struct llentry *lle;
 	/* XXX stack use */
 	struct {
 		struct rt_msghdr	rtm;
 		struct sockaddr_in	sin;
 		struct sockaddr_dl	sdl;
 	} arpc;
 	int error, i;
 
 	LLTABLE_LOCK_ASSERT();
 
 	error = 0;
 	for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) {
 		LIST_FOREACH(lle, &llt->lle_head[i], lle_next) {
 			struct sockaddr_dl *sdl;
 
 			/* skip deleted entries */
 			if ((lle->la_flags & LLE_DELETED) == LLE_DELETED)
 				continue;
 			/* Skip if jailed and not a valid IP of the prison. */
 			if (prison_if(wr->td->td_ucred, L3_ADDR(lle)) != 0)
 				continue;
 			/*
 			 * produce a msg made of:
 			 *  struct rt_msghdr;
 			 *  struct sockaddr_in; (IPv4)
 			 *  struct sockaddr_dl;
 			 */
 			bzero(&arpc, sizeof(arpc));
 			arpc.rtm.rtm_msglen = sizeof(arpc);
 			arpc.rtm.rtm_version = RTM_VERSION;
 			arpc.rtm.rtm_type = RTM_GET;
 			arpc.rtm.rtm_flags = RTF_UP;
 			arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY;
 			arpc.sin.sin_family = AF_INET;
 			arpc.sin.sin_len = sizeof(arpc.sin);
 			arpc.sin.sin_addr.s_addr = SIN(lle)->sin_addr.s_addr;
 
 			/* publish */
 			if (lle->la_flags & LLE_PUB)
 				arpc.rtm.rtm_flags |= RTF_ANNOUNCE;
 
 			sdl = &arpc.sdl;
 			sdl->sdl_family = AF_LINK;
 			sdl->sdl_len = sizeof(*sdl);
 			sdl->sdl_index = ifp->if_index;
 			sdl->sdl_type = ifp->if_type;
 			if ((lle->la_flags & LLE_VALID) == LLE_VALID) {
 				sdl->sdl_alen = ifp->if_addrlen;
 				bcopy(&lle->ll_addr, LLADDR(sdl), ifp->if_addrlen);
 			} else {
 				sdl->sdl_alen = 0;
 				bzero(LLADDR(sdl), ifp->if_addrlen);
 			}
 
 			arpc.rtm.rtm_rmx.rmx_expire =
 			    lle->la_flags & LLE_STATIC ? 0 : lle->la_expire;
 			arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA);
 			if (lle->la_flags & LLE_STATIC)
 				arpc.rtm.rtm_flags |= RTF_STATIC;
 			arpc.rtm.rtm_index = ifp->if_index;
 			error = SYSCTL_OUT(wr, &arpc, sizeof(arpc));
 			if (error)
 				break;
 		}
 	}
 	return error;
 #undef SIN
 }
 
 void *
 in_domifattach(struct ifnet *ifp)
 {
 	struct in_ifinfo *ii;
 	struct lltable *llt;
 
 	ii = malloc(sizeof(struct in_ifinfo), M_IFADDR, M_WAITOK|M_ZERO);
 
 	llt = lltable_init(ifp, AF_INET);
 	if (llt != NULL) {
 		llt->llt_prefix_free = in_lltable_prefix_free;
 		llt->llt_lookup = in_lltable_lookup;
 		llt->llt_dump = in_lltable_dump;
 	}
 	ii->ii_llt = llt;
 
 	ii->ii_igmp = igmp_domifattach(ifp);
 
 	return ii;
 }
 
 void
 in_domifdetach(struct ifnet *ifp, void *aux)
 {
 	struct in_ifinfo *ii = (struct in_ifinfo *)aux;
 
 	igmp_domifdetach(ifp);
 	lltable_free(ii->ii_llt);
 	free(ii, M_IFADDR);
 }
Index: user/dchagin/lemul/sys/netinet/in_gif.c
===================================================================
--- user/dchagin/lemul/sys/netinet/in_gif.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/in_gif.c	(revision 274241)
@@ -1,240 +1,240 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: in_gif.c,v 1.54 2001/05/14 14:02:16 itojun Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/systm.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/mbuf.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/protosw.h>
 #include <sys/malloc.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/in_gif.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_encap.h>
 #include <netinet/ip_ecn.h>
 
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif
 
 #include <net/if_gif.h>
 
 static int gif_validate4(const struct ip *, struct gif_softc *,
 	struct ifnet *);
 
 extern  struct domain inetdomain;
 struct protosw in_gif_protosw = {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inetdomain,
 	.pr_protocol =		0/* IPPROTO_IPV[46] */,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_input =		in_gif_input,
 	.pr_output =		rip_output,
 	.pr_ctloutput =		rip_ctloutput,
 	.pr_usrreqs =		&rip_usrreqs
 };
 
 VNET_DEFINE(int, ip_gif_ttl) = GIF_TTL;
 #define	V_ip_gif_ttl		VNET(ip_gif_ttl)
-SYSCTL_VNET_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ip_gif_ttl), 0, "");
 
 int
 in_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn)
 {
 	GIF_RLOCK_TRACKER;
 	struct gif_softc *sc = ifp->if_softc;
 	struct ip *ip;
 	int len;
 
 	/* prepend new IP header */
 	len = sizeof(struct ip);
 #ifndef __NO_STRICT_ALIGNMENT
 	if (proto == IPPROTO_ETHERIP)
 		len += ETHERIP_ALIGN;
 #endif
 	M_PREPEND(m, len, M_NOWAIT);
 	if (m == NULL)
 		return (ENOBUFS);
 #ifndef __NO_STRICT_ALIGNMENT
 	if (proto == IPPROTO_ETHERIP) {
 		len = mtod(m, vm_offset_t) & 3;
 		KASSERT(len == 0 || len == ETHERIP_ALIGN,
 		    ("in_gif_output: unexpected misalignment"));
 		m->m_data += len;
 		m->m_len -= ETHERIP_ALIGN;
 	}
 #endif
 	ip = mtod(m, struct ip *);
 	GIF_RLOCK(sc);
 	if (sc->gif_family != AF_INET) {
 		m_freem(m);
 		GIF_RUNLOCK(sc);
 		return (ENETDOWN);
 	}
 	bcopy(sc->gif_iphdr, ip, sizeof(struct ip));
 	GIF_RUNLOCK(sc);
 
 	ip->ip_p = proto;
 	/* version will be set in ip_output() */
 	ip->ip_ttl = V_ip_gif_ttl;
 	ip->ip_len = htons(m->m_pkthdr.len);
 	ip->ip_tos = ecn;
 
 	return (ip_output(m, NULL, NULL, 0, NULL, NULL));
 }
 
 int
 in_gif_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m = *mp;
 	struct gif_softc *sc;
 	struct ifnet *gifp;
 	struct ip *ip;
 	uint8_t ecn;
 
 	sc = encap_getarg(m);
 	if (sc == NULL) {
 		m_freem(m);
 		KMOD_IPSTAT_INC(ips_nogif);
 		return (IPPROTO_DONE);
 	}
 	gifp = GIF2IFP(sc);
 	if ((gifp->if_flags & IFF_UP) != 0) {
 		ip = mtod(m, struct ip *);
 		ecn = ip->ip_tos;
 		m_adj(m, *offp);
 		gif_input(m, gifp, proto, ecn);
 	} else {
 		m_freem(m);
 		KMOD_IPSTAT_INC(ips_nogif);
 	}
 	return (IPPROTO_DONE);
 }
 
 /*
  * validate outer address.
  */
 static int
 gif_validate4(const struct ip *ip, struct gif_softc *sc, struct ifnet *ifp)
 {
 
 	GIF_RLOCK_ASSERT(sc);
 
 	/* check for address match */
 	if (sc->gif_iphdr->ip_src.s_addr != ip->ip_dst.s_addr ||
 	    sc->gif_iphdr->ip_dst.s_addr != ip->ip_src.s_addr)
 		return (0);
 
 	/* martian filters on outer source - NOT done in ip_input! */
 	if (IN_MULTICAST(ntohl(ip->ip_src.s_addr)))
 		return (0);
 	switch ((ntohl(ip->ip_src.s_addr) & 0xff000000) >> 24) {
 	case 0:
 	case 127:
 	case 255:
 		return (0);
 	}
 
 	/* ingress filters on outer source */
 	if ((GIF2IFP(sc)->if_flags & IFF_LINK2) == 0 && ifp) {
 		struct sockaddr_in sin;
 		struct rtentry *rt;
 
 		bzero(&sin, sizeof(sin));
 		sin.sin_family = AF_INET;
 		sin.sin_len = sizeof(struct sockaddr_in);
 		sin.sin_addr = ip->ip_src;
 		/* XXX MRT  check for the interface we would use on output */
 		rt = in_rtalloc1((struct sockaddr *)&sin, 0,
 		    0UL, sc->gif_fibnum);
 		if (!rt || rt->rt_ifp != ifp) {
 			if (rt)
 				RTFREE_LOCKED(rt);
 			return (0);
 		}
 		RTFREE_LOCKED(rt);
 	}
 	return (32 * 2);
 }
 
 /*
  * we know that we are in IFF_UP, outer address available, and outer family
  * matched the physical addr family.  see gif_encapcheck().
  */
 int
 in_gif_encapcheck(const struct mbuf *m, int off, int proto, void *arg)
 {
 	struct ip ip;
 	struct gif_softc *sc;
 	struct ifnet *ifp;
 
 	/* sanity check done in caller */
 	sc = (struct gif_softc *)arg;
 	GIF_RLOCK_ASSERT(sc);
 
 	m_copydata(m, 0, sizeof(ip), (caddr_t)&ip);
 	ifp = ((m->m_flags & M_PKTHDR) != 0) ? m->m_pkthdr.rcvif : NULL;
 	return (gif_validate4(&ip, sc, ifp));
 }
 
 int
 in_gif_attach(struct gif_softc *sc)
 {
 
 	KASSERT(sc->gif_ecookie == NULL, ("gif_ecookie isn't NULL"));
 	sc->gif_ecookie = encap_attach_func(AF_INET, -1, gif_encapcheck,
 	    &in_gif_protosw, sc);
 	if (sc->gif_ecookie == NULL)
 		return (EEXIST);
 	return (0);
 }
Index: user/dchagin/lemul/sys/netinet/in_pcb.c
===================================================================
--- user/dchagin/lemul/sys/netinet/in_pcb.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/in_pcb.c	(revision 274241)
@@ -1,2633 +1,2637 @@
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993, 1995
  *	The Regents of the University of California.
  * Copyright (c) 2007-2009 Robert N. M. Watson
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_pcb.c	8.4 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_ipsec.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_pcbgroup.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/callout.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/refcount.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_rss.h>
 #include <netinet/ip_var.h>
 #include <netinet/tcp_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #endif
 #ifdef INET
 #include <netinet/in_var.h>
 #endif
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #endif /* INET6 */
 
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/key.h>
 #endif /* IPSEC */
 
 #include <security/mac/mac_framework.h>
 
 static struct callout	ipport_tick_callout;
 
 /*
  * These configure the range of local port addresses assigned to
  * "unspecified" outgoing connections/packets/whatever.
  */
 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;	/* 1023 */
 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;	/* 600 */
 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;	/* 10000 */
 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;	/* 65535 */
 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;	/* 49152 */
 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;	/* 65535 */
 
 /*
  * Reserved ports accessible only to root. There are significant
  * security considerations that must be accounted for when changing these,
  * but the security benefits can be great. Please be careful.
  */
 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;	/* 1023 */
 VNET_DEFINE(int, ipport_reservedlow);
 
 /* Variables dealing with random ephemeral port allocation. */
 VNET_DEFINE(int, ipport_randomized) = 1;	/* user controlled via sysctl */
 VNET_DEFINE(int, ipport_randomcps) = 10;	/* user controlled via sysctl */
 VNET_DEFINE(int, ipport_randomtime) = 45;	/* user controlled via sysctl */
 VNET_DEFINE(int, ipport_stoprandom);		/* toggled by ipport_tick */
 VNET_DEFINE(int, ipport_tcpallocs);
 static VNET_DEFINE(int, ipport_tcplastcount);
 
 #define	V_ipport_tcplastcount		VNET(ipport_tcplastcount)
 
 static void	in_pcbremlists(struct inpcb *inp);
 #ifdef INET
 static struct inpcb	*in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
 			    struct in_addr faddr, u_int fport_arg,
 			    struct in_addr laddr, u_int lport_arg,
 			    int lookupflags, struct ifnet *ifp);
 
 #define RANGECHK(var, min, max) \
 	if ((var) < (min)) { (var) = (min); } \
 	else if ((var) > (max)) { (var) = (max); }
 
 static int
 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 
 	error = sysctl_handle_int(oidp, arg1, arg2, req);
 	if (error == 0) {
 		RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
 		RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
 		RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
 		RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
 		RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
 		RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
 	}
 	return (error);
 }
 
 #undef RANGECHK
 
 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0,
     "IP Ports");
 
-SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
-	CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowfirstauto), 0,
-	&sysctl_net_ipport_check, "I", "");
-SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
-	CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowlastauto), 0,
-	&sysctl_net_ipport_check, "I", "");
-SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, first,
-	CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_firstauto), 0,
-	&sysctl_net_ipport_check, "I", "");
-SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, last,
-	CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lastauto), 0,
-	&sysctl_net_ipport_check, "I", "");
-SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
-	CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_hifirstauto), 0,
-	&sysctl_net_ipport_check, "I", "");
-SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
-	CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_hilastauto), 0,
-	&sysctl_net_ipport_check, "I", "");
-SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
-	CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedhigh), 0, "");
-SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
+SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
+	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
+	&VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", "");
+SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
+	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
+	&VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", "");
+SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
+	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
+	&VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", "");
+SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
+	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
+	&VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", "");
+SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
+	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
+	&VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", "");
+SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
+	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
+	&VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", "");
+SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
+	CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
+	&VNET_NAME(ipport_reservedhigh), 0, "");
+SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
 	CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
-SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
+	CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
-SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps,
+	CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
 	"allocations before switching to a sequental one");
-SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
+	CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ipport_randomtime), 0,
 	"Minimum time to keep sequental port "
 	"allocation before switching to a random one");
 #endif /* INET */
 
 /*
  * in_pcb.c: manage the Protocol Control Blocks.
  *
  * NOTE: It is assumed that most of these functions will be called with
  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
  * functions often modify hash chains or addresses in pcbs.
  */
 
 /*
  * Initialize an inpcbinfo -- we should be able to reduce the number of
  * arguments in time.
  */
 void
 in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
     struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
     char *inpcbzone_name, uma_init inpcbzone_init, uma_fini inpcbzone_fini,
     uint32_t inpcbzone_flags, u_int hashfields)
 {
 
 	INP_INFO_LOCK_INIT(pcbinfo, name);
 	INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash");	/* XXXRW: argument? */
 #ifdef VIMAGE
 	pcbinfo->ipi_vnet = curvnet;
 #endif
 	pcbinfo->ipi_listhead = listhead;
 	LIST_INIT(pcbinfo->ipi_listhead);
 	pcbinfo->ipi_count = 0;
 	pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
 	    &pcbinfo->ipi_hashmask);
 	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
 	    &pcbinfo->ipi_porthashmask);
 #ifdef PCBGROUP
 	in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
 #endif
 	pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
 	    NULL, NULL, inpcbzone_init, inpcbzone_fini, UMA_ALIGN_PTR,
 	    inpcbzone_flags);
 	uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
 	uma_zone_set_warning(pcbinfo->ipi_zone,
 	    "kern.ipc.maxsockets limit reached");
 }
 
 /*
  * Destroy an inpcbinfo.
  */
 void
 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
 {
 
 	KASSERT(pcbinfo->ipi_count == 0,
 	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));
 
 	hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
 	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
 	    pcbinfo->ipi_porthashmask);
 #ifdef PCBGROUP
 	in_pcbgroup_destroy(pcbinfo);
 #endif
 	uma_zdestroy(pcbinfo->ipi_zone);
 	INP_HASH_LOCK_DESTROY(pcbinfo);
 	INP_INFO_LOCK_DESTROY(pcbinfo);
 }
 
 /*
  * Allocate a PCB and associate it with the socket.
  * On success return with the PCB locked.
  */
 int
 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
 {
 	struct inpcb *inp;
 	int error;
 
 	INP_INFO_WLOCK_ASSERT(pcbinfo);
 	error = 0;
 	inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
 	if (inp == NULL)
 		return (ENOBUFS);
 	bzero(inp, inp_zero_size);
 	inp->inp_pcbinfo = pcbinfo;
 	inp->inp_socket = so;
 	inp->inp_cred = crhold(so->so_cred);
 	inp->inp_inc.inc_fibnum = so->so_fibnum;
 #ifdef MAC
 	error = mac_inpcb_init(inp, M_NOWAIT);
 	if (error != 0)
 		goto out;
 	mac_inpcb_create(so, inp);
 #endif
 #ifdef IPSEC
 	error = ipsec_init_policy(so, &inp->inp_sp);
 	if (error != 0) {
 #ifdef MAC
 		mac_inpcb_destroy(inp);
 #endif
 		goto out;
 	}
 #endif /*IPSEC*/
 #ifdef INET6
 	if (INP_SOCKAF(so) == AF_INET6) {
 		inp->inp_vflag |= INP_IPV6PROTO;
 		if (V_ip6_v6only)
 			inp->inp_flags |= IN6P_IPV6_V6ONLY;
 	}
 #endif
 	LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
 	pcbinfo->ipi_count++;
 	so->so_pcb = (caddr_t)inp;
 #ifdef INET6
 	if (V_ip6_auto_flowlabel)
 		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
 #endif
 	INP_WLOCK(inp);
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 	refcount_init(&inp->inp_refcount, 1);	/* Reference from inpcbinfo */
 #if defined(IPSEC) || defined(MAC)
 out:
 	if (error != 0) {
 		crfree(inp->inp_cred);
 		uma_zfree(pcbinfo->ipi_zone, inp);
 	}
 #endif
 	return (error);
 }
 
 #ifdef INET
 int
 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
 {
 	int anonport, error;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
 		return (EINVAL);
 	anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0;
 	error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
 	    &inp->inp_lport, cred);
 	if (error)
 		return (error);
 	if (in_pcbinshash(inp) != 0) {
 		inp->inp_laddr.s_addr = INADDR_ANY;
 		inp->inp_lport = 0;
 		return (EAGAIN);
 	}
 	if (anonport)
 		inp->inp_flags |= INP_ANONPORT;
 	return (0);
 }
 #endif
 
 /*
  * Select a local port (number) to use.
  */
 #if defined(INET) || defined(INET6)
 int
 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
     struct ucred *cred, int lookupflags)
 {
 	struct inpcbinfo *pcbinfo;
 	struct inpcb *tmpinp;
 	unsigned short *lastport;
 	int count, dorandom, error;
 	u_short aux, first, last, lport;
 #ifdef INET
 	struct in_addr laddr;
 #endif
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	/*
 	 * Because no actual state changes occur here, a global write lock on
 	 * the pcbinfo isn't required.
 	 */
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if (inp->inp_flags & INP_HIGHPORT) {
 		first = V_ipport_hifirstauto;	/* sysctl */
 		last  = V_ipport_hilastauto;
 		lastport = &pcbinfo->ipi_lasthi;
 	} else if (inp->inp_flags & INP_LOWPORT) {
 		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0);
 		if (error)
 			return (error);
 		first = V_ipport_lowfirstauto;	/* 1023 */
 		last  = V_ipport_lowlastauto;	/* 600 */
 		lastport = &pcbinfo->ipi_lastlow;
 	} else {
 		first = V_ipport_firstauto;	/* sysctl */
 		last  = V_ipport_lastauto;
 		lastport = &pcbinfo->ipi_lastport;
 	}
 	/*
 	 * For UDP(-Lite), use random port allocation as long as the user
 	 * allows it.  For TCP (and as of yet unknown) connections,
 	 * use random port allocation only if the user allows it AND
 	 * ipport_tick() allows it.
 	 */
 	if (V_ipport_randomized &&
 		(!V_ipport_stoprandom || pcbinfo == &V_udbinfo ||
 		pcbinfo == &V_ulitecbinfo))
 		dorandom = 1;
 	else
 		dorandom = 0;
 	/*
 	 * It makes no sense to do random port allocation if
 	 * we have the only port available.
 	 */
 	if (first == last)
 		dorandom = 0;
 	/* Make sure to not include UDP(-Lite) packets in the count. */
 	if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo)
 		V_ipport_tcpallocs++;
 	/*
 	 * Instead of having two loops further down counting up or down
 	 * make sure that first is always <= last and go with only one
 	 * code path implementing all logic.
 	 */
 	if (first > last) {
 		aux = first;
 		first = last;
 		last = aux;
 	}
 
 #ifdef INET
 	/* Make the compiler happy. */
 	laddr.s_addr = 0;
 	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
 		KASSERT(laddrp != NULL, ("%s: laddrp NULL for v4 inp %p",
 		    __func__, inp));
 		laddr = *laddrp;
 	}
 #endif
 	tmpinp = NULL;	/* Make compiler happy. */
 	lport = *lportp;
 
 	if (dorandom)
 		*lastport = first + (arc4random() % (last - first));
 
 	count = last - first;
 
 	do {
 		if (count-- < 0)	/* completely used? */
 			return (EADDRNOTAVAIL);
 		++*lastport;
 		if (*lastport < first || *lastport > last)
 			*lastport = first;
 		lport = htons(*lastport);
 
 #ifdef INET6
 		if ((inp->inp_vflag & INP_IPV6) != 0)
 			tmpinp = in6_pcblookup_local(pcbinfo,
 			    &inp->in6p_laddr, lport, lookupflags, cred);
 #endif
 #if defined(INET) && defined(INET6)
 		else
 #endif
 #ifdef INET
 			tmpinp = in_pcblookup_local(pcbinfo, laddr,
 			    lport, lookupflags, cred);
 #endif
 	} while (tmpinp != NULL);
 
 #ifdef INET
 	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4)
 		laddrp->s_addr = laddr.s_addr;
 #endif
 	*lportp = lport;
 
 	return (0);
 }
 
 /*
  * Return cached socket options.
  */
 short
 inp_so_options(const struct inpcb *inp)
 {
    short so_options;
 
    so_options = 0;
 
    if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
 	   so_options |= SO_REUSEPORT;
    if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
 	   so_options |= SO_REUSEADDR;
    return (so_options);
 }
 #endif /* INET || INET6 */
 
 /*
  * Check if a new BINDMULTI socket is allowed to be created.
  *
  * ni points to the new inp.
  * oi points to the exisitng inp.
  *
  * This checks whether the existing inp also has BINDMULTI and
  * whether the credentials match.
  */
 int
 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
 {
 	/* Check permissions match */
 	if ((ni->inp_flags2 & INP_BINDMULTI) &&
 	    (ni->inp_cred->cr_uid !=
 	    oi->inp_cred->cr_uid))
 		return (0);
 
 	/* Check the existing inp has BINDMULTI set */
 	if ((ni->inp_flags2 & INP_BINDMULTI) &&
 	    ((oi->inp_flags2 & INP_BINDMULTI) == 0))
 		return (0);
 
 	/*
 	 * We're okay - either INP_BINDMULTI isn't set on ni, or
 	 * it is and it matches the checks.
 	 */
 	return (1);
 }
 
 #ifdef INET
 /*
  * Set up a bind operation on a PCB, performing port allocation
  * as required, but do not actually modify the PCB. Callers can
  * either complete the bind by setting inp_laddr/inp_lport and
  * calling in_pcbinshash(), or they can just use the resulting
  * port and address to authorise the sending of a once-off packet.
  *
  * On error, the values of *laddrp and *lportp are not changed.
  */
 int
 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
     u_short *lportp, struct ucred *cred)
 {
 	struct socket *so = inp->inp_socket;
 	struct sockaddr_in *sin;
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct in_addr laddr;
 	u_short lport = 0;
 	int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
 	int error;
 
 	/*
 	 * No state changes, so read locks are sufficient here.
 	 */
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
 		return (EADDRNOTAVAIL);
 	laddr.s_addr = *laddrp;
 	if (nam != NULL && laddr.s_addr != INADDR_ANY)
 		return (EINVAL);
 	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
 		lookupflags = INPLOOKUP_WILDCARD;
 	if (nam == NULL) {
 		if ((error = prison_local_ip4(cred, &laddr)) != 0)
 			return (error);
 	} else {
 		sin = (struct sockaddr_in *)nam;
 		if (nam->sa_len != sizeof (*sin))
 			return (EINVAL);
 #ifdef notdef
 		/*
 		 * We should check the family, but old programs
 		 * incorrectly fail to initialize it.
 		 */
 		if (sin->sin_family != AF_INET)
 			return (EAFNOSUPPORT);
 #endif
 		error = prison_local_ip4(cred, &sin->sin_addr);
 		if (error)
 			return (error);
 		if (sin->sin_port != *lportp) {
 			/* Don't allow the port to change. */
 			if (*lportp != 0)
 				return (EINVAL);
 			lport = sin->sin_port;
 		}
 		/* NB: lport is left as 0 if the port isn't being changed. */
 		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
 			/*
 			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
 			 * allow complete duplication of binding if
 			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
 			 * and a multicast address is bound on both
 			 * new and duplicated sockets.
 			 */
 			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
 				reuseport = SO_REUSEADDR|SO_REUSEPORT;
 		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
 			sin->sin_port = 0;		/* yech... */
 			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
 			/*
 			 * Is the address a local IP address? 
 			 * If INP_BINDANY is set, then the socket may be bound
 			 * to any endpoint address, local or not.
 			 */
 			if ((inp->inp_flags & INP_BINDANY) == 0 &&
 			    ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) 
 				return (EADDRNOTAVAIL);
 		}
 		laddr = sin->sin_addr;
 		if (lport) {
 			struct inpcb *t;
 			struct tcptw *tw;
 
 			/* GROSS */
 			if (ntohs(lport) <= V_ipport_reservedhigh &&
 			    ntohs(lport) >= V_ipport_reservedlow &&
 			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT,
 			    0))
 				return (EACCES);
 			if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
 			    priv_check_cred(inp->inp_cred,
 			    PRIV_NETINET_REUSEPORT, 0) != 0) {
 				t = in_pcblookup_local(pcbinfo, sin->sin_addr,
 				    lport, INPLOOKUP_WILDCARD, cred);
 	/*
 	 * XXX
 	 * This entire block sorely needs a rewrite.
 	 */
 				if (t &&
 				    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
 				    ((t->inp_flags & INP_TIMEWAIT) == 0) &&
 				    (so->so_type != SOCK_STREAM ||
 				     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
 				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
 				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
 				     (t->inp_flags2 & INP_REUSEPORT) == 0) &&
 				    (inp->inp_cred->cr_uid !=
 				     t->inp_cred->cr_uid))
 					return (EADDRINUSE);
 
 				/*
 				 * If the socket is a BINDMULTI socket, then
 				 * the credentials need to match and the
 				 * original socket also has to have been bound
 				 * with BINDMULTI.
 				 */
 				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 					return (EADDRINUSE);
 			}
 			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
 			    lport, lookupflags, cred);
 			if (t && (t->inp_flags & INP_TIMEWAIT)) {
 				/*
 				 * XXXRW: If an incpb has had its timewait
 				 * state recycled, we treat the address as
 				 * being in use (for now).  This is better
 				 * than a panic, but not desirable.
 				 */
 				tw = intotw(t);
 				if (tw == NULL ||
 				    (reuseport & tw->tw_so_options) == 0)
 					return (EADDRINUSE);
 			} else if (t &&
 			    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
 			    (reuseport & inp_so_options(t)) == 0) {
 #ifdef INET6
 				if (ntohl(sin->sin_addr.s_addr) !=
 				    INADDR_ANY ||
 				    ntohl(t->inp_laddr.s_addr) !=
 				    INADDR_ANY ||
 				    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
 				    (t->inp_vflag & INP_IPV6PROTO) == 0)
 #endif
 				return (EADDRINUSE);
 				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 					return (EADDRINUSE);
 			}
 		}
 	}
 	if (*lportp != 0)
 		lport = *lportp;
 	if (lport == 0) {
 		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
 		if (error != 0)
 			return (error);
 
 	}
 	*laddrp = laddr.s_addr;
 	*lportp = lport;
 	return (0);
 }
 
 /*
  * Connect from a socket to a specified address.
  * Both address and port must be specified in argument sin.
  * If don't have a local address for this socket yet,
  * then pick one.
  */
 int
 in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam,
     struct ucred *cred, struct mbuf *m)
 {
 	u_short lport, fport;
 	in_addr_t laddr, faddr;
 	int anonport, error;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	lport = inp->inp_lport;
 	laddr = inp->inp_laddr.s_addr;
 	anonport = (lport == 0);
 	error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
 	    NULL, cred);
 	if (error)
 		return (error);
 
 	/* Do the initial binding of the local address if required. */
 	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
 		inp->inp_lport = lport;
 		inp->inp_laddr.s_addr = laddr;
 		if (in_pcbinshash(inp) != 0) {
 			inp->inp_laddr.s_addr = INADDR_ANY;
 			inp->inp_lport = 0;
 			return (EAGAIN);
 		}
 	}
 
 	/* Commit the remaining changes. */
 	inp->inp_lport = lport;
 	inp->inp_laddr.s_addr = laddr;
 	inp->inp_faddr.s_addr = faddr;
 	inp->inp_fport = fport;
 	in_pcbrehash_mbuf(inp, m);
 
 	if (anonport)
 		inp->inp_flags |= INP_ANONPORT;
 	return (0);
 }
 
 int
 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
 {
 
 	return (in_pcbconnect_mbuf(inp, nam, cred, NULL));
 }
 
 /*
  * Do proper source address selection on an unbound socket in case
  * of connect. Take jails into account as well.
  */
 int
 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
     struct ucred *cred)
 {
 	struct ifaddr *ifa;
 	struct sockaddr *sa;
 	struct sockaddr_in *sin;
 	struct route sro;
 	int error;
 
 	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
 
 	/*
 	 * Bypass source address selection and use the primary jail IP
 	 * if requested.
 	 */
 	if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
 		return (0);
 
 	error = 0;
 	bzero(&sro, sizeof(sro));
 
 	sin = (struct sockaddr_in *)&sro.ro_dst;
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(struct sockaddr_in);
 	sin->sin_addr.s_addr = faddr->s_addr;
 
 	/*
 	 * If route is known our src addr is taken from the i/f,
 	 * else punt.
 	 *
 	 * Find out route to destination.
 	 */
 	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
 		in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum);
 
 	/*
 	 * If we found a route, use the address corresponding to
 	 * the outgoing interface.
 	 * 
 	 * Otherwise assume faddr is reachable on a directly connected
 	 * network and try to find a corresponding interface to take
 	 * the source address from.
 	 */
 	if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) {
 		struct in_ifaddr *ia;
 		struct ifnet *ifp;
 
 		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
 					inp->inp_socket->so_fibnum));
 		if (ia == NULL)
 			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
 						inp->inp_socket->so_fibnum));
 		if (ia == NULL) {
 			error = ENETUNREACH;
 			goto done;
 		}
 
 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			ifa_free(&ia->ia_ifa);
 			goto done;
 		}
 
 		ifp = ia->ia_ifp;
 		ifa_free(&ia->ia_ifa);
 		ia = NULL;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			sin = (struct sockaddr_in *)sa;
 			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 				ia = (struct in_ifaddr *)ifa;
 				break;
 			}
 		}
 		if (ia != NULL) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			IF_ADDR_RUNLOCK(ifp);
 			goto done;
 		}
 		IF_ADDR_RUNLOCK(ifp);
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 	/*
 	 * If the outgoing interface on the route found is not
 	 * a loopback interface, use the address from that interface.
 	 * In case of jails do those three steps:
 	 * 1. check if the interface address belongs to the jail. If so use it.
 	 * 2. check if we have any address on the outgoing interface
 	 *    belonging to this jail. If so use it.
 	 * 3. as a last resort return the 'default' jail address.
 	 */
 	if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) {
 		struct in_ifaddr *ia;
 		struct ifnet *ifp;
 
 		/* If not jailed, use the default returned. */
 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 			ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* Jailed. */
 		/* 1. Check if the iface address belongs to the jail. */
 		sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr;
 		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 			ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/*
 		 * 2. Check if we have any address on the outgoing interface
 		 *    belonging to this jail.
 		 */
 		ia = NULL;
 		ifp = sro.ro_rt->rt_ifp;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			sin = (struct sockaddr_in *)sa;
 			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 				ia = (struct in_ifaddr *)ifa;
 				break;
 			}
 		}
 		if (ia != NULL) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			IF_ADDR_RUNLOCK(ifp);
 			goto done;
 		}
 		IF_ADDR_RUNLOCK(ifp);
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 	/*
 	 * The outgoing interface is marked with 'loopback net', so a route
 	 * to ourselves is here.
 	 * Try to find the interface of the destination address and then
 	 * take the address from there. That interface is not necessarily
 	 * a loopback interface.
 	 * In case of jails, check that it is an address of the jail
 	 * and if we cannot find, fall back to the 'default' jail address.
 	 */
 	if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
 		struct sockaddr_in sain;
 		struct in_ifaddr *ia;
 
 		bzero(&sain, sizeof(struct sockaddr_in));
 		sain.sin_family = AF_INET;
 		sain.sin_len = sizeof(struct sockaddr_in);
 		sain.sin_addr.s_addr = faddr->s_addr;
 
 		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain),
 					inp->inp_socket->so_fibnum));
 		if (ia == NULL)
 			ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0,
 						inp->inp_socket->so_fibnum));
 		if (ia == NULL)
 			ia = ifatoia(ifa_ifwithaddr(sintosa(&sain)));
 
 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 			if (ia == NULL) {
 				error = ENETUNREACH;
 				goto done;
 			}
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			ifa_free(&ia->ia_ifa);
 			goto done;
 		}
 
 		/* Jailed. */
 		if (ia != NULL) {
 			struct ifnet *ifp;
 
 			ifp = ia->ia_ifp;
 			ifa_free(&ia->ia_ifa);
 			ia = NULL;
 			IF_ADDR_RLOCK(ifp);
 			TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 
 				sa = ifa->ifa_addr;
 				if (sa->sa_family != AF_INET)
 					continue;
 				sin = (struct sockaddr_in *)sa;
 				if (prison_check_ip4(cred,
 				    &sin->sin_addr) == 0) {
 					ia = (struct in_ifaddr *)ifa;
 					break;
 				}
 			}
 			if (ia != NULL) {
 				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 				IF_ADDR_RUNLOCK(ifp);
 				goto done;
 			}
 			IF_ADDR_RUNLOCK(ifp);
 		}
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 done:
 	if (sro.ro_rt != NULL)
 		RTFREE(sro.ro_rt);
 	return (error);
 }
 
 /*
  * Set up for a connect from a socket to the specified address.
  * On entry, *laddrp and *lportp should contain the current local
  * address and port for the PCB; these are updated to the values
  * that should be placed in inp_laddr and inp_lport to complete
  * the connect.
  *
  * On success, *faddrp and *fportp will be set to the remote address
  * and port. These are not updated in the error case.
  *
  * If the operation fails because the connection already exists,
  * *oinpp will be set to the PCB of that connection so that the
  * caller can decide to override it. In all other cases, *oinpp
  * is set to NULL.
  */
 int
 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
     in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
     struct inpcb **oinpp, struct ucred *cred)
 {
 	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
 	struct in_ifaddr *ia;
 	struct inpcb *oinp;
 	struct in_addr laddr, faddr;
 	u_short lport, fport;
 	int error;
 
 	/*
 	 * Because a global state change doesn't actually occur here, a read
 	 * lock is sufficient.
 	 */
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
 
 	if (oinpp != NULL)
 		*oinpp = NULL;
 	if (nam->sa_len != sizeof (*sin))
 		return (EINVAL);
 	if (sin->sin_family != AF_INET)
 		return (EAFNOSUPPORT);
 	if (sin->sin_port == 0)
 		return (EADDRNOTAVAIL);
 	laddr.s_addr = *laddrp;
 	lport = *lportp;
 	faddr = sin->sin_addr;
 	fport = sin->sin_port;
 
 	if (!TAILQ_EMPTY(&V_in_ifaddrhead)) {
 		/*
 		 * If the destination address is INADDR_ANY,
 		 * use the primary local address.
 		 * If the supplied address is INADDR_BROADCAST,
 		 * and the primary interface supports broadcast,
 		 * choose the broadcast address for that interface.
 		 */
 		if (faddr.s_addr == INADDR_ANY) {
 			IN_IFADDR_RLOCK();
 			faddr =
 			    IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
 			IN_IFADDR_RUNLOCK();
 			if (cred != NULL &&
 			    (error = prison_get_ip4(cred, &faddr)) != 0)
 				return (error);
 		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
 			IN_IFADDR_RLOCK();
 			if (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
 			    IFF_BROADCAST)
 				faddr = satosin(&TAILQ_FIRST(
 				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
 			IN_IFADDR_RUNLOCK();
 		}
 	}
 	if (laddr.s_addr == INADDR_ANY) {
 		error = in_pcbladdr(inp, &faddr, &laddr, cred);
 		/*
 		 * If the destination address is multicast and an outgoing
 		 * interface has been set as a multicast option, prefer the
 		 * address of that interface as our source address.
 		 */
 		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
 		    inp->inp_moptions != NULL) {
 			struct ip_moptions *imo;
 			struct ifnet *ifp;
 
 			imo = inp->inp_moptions;
 			if (imo->imo_multicast_ifp != NULL) {
 				ifp = imo->imo_multicast_ifp;
 				IN_IFADDR_RLOCK();
 				TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
 					if ((ia->ia_ifp == ifp) &&
 					    (cred == NULL ||
 					    prison_check_ip4(cred,
 					    &ia->ia_addr.sin_addr) == 0))
 						break;
 				}
 				if (ia == NULL)
 					error = EADDRNOTAVAIL;
 				else {
 					laddr = ia->ia_addr.sin_addr;
 					error = 0;
 				}
 				IN_IFADDR_RUNLOCK();
 			}
 		}
 		if (error)
 			return (error);
 	}
 	oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport,
 	    laddr, lport, 0, NULL);
 	if (oinp != NULL) {
 		if (oinpp != NULL)
 			*oinpp = oinp;
 		return (EADDRINUSE);
 	}
 	if (lport == 0) {
 		error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
 		    cred);
 		if (error)
 			return (error);
 	}
 	*laddrp = laddr.s_addr;
 	*lportp = lport;
 	*faddrp = faddr.s_addr;
 	*fportp = fport;
 	return (0);
 }
 
 void
 in_pcbdisconnect(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	inp->inp_faddr.s_addr = INADDR_ANY;
 	inp->inp_fport = 0;
 	in_pcbrehash(inp);
 }
 #endif /* INET */
 
 /*
  * in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
  * For most protocols, this will be invoked immediately prior to calling
  * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
  * socket, in which case in_pcbfree() is deferred.
  */
 void
 in_pcbdetach(struct inpcb *inp)
 {
 
 	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
 
 	inp->inp_socket->so_pcb = NULL;
 	inp->inp_socket = NULL;
 }
 
 /*
  * in_pcbref() bumps the reference count on an inpcb in order to maintain
  * stability of an inpcb pointer despite the inpcb lock being released.  This
  * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
  * but where the inpcb lock may already held, or when acquiring a reference
  * via a pcbgroup.
  *
  * in_pcbref() should be used only to provide brief memory stability, and
  * must always be followed by a call to INP_WLOCK() and in_pcbrele() to
  * garbage collect the inpcb if it has been in_pcbfree()'d from another
  * context.  Until in_pcbrele() has returned that the inpcb is still valid,
  * lock and rele are the *only* safe operations that may be performed on the
  * inpcb.
  *
  * While the inpcb will not be freed, releasing the inpcb lock means that the
  * connection's state may change, so the caller should be careful to
  * revalidate any cached state on reacquiring the lock.  Drop the reference
  * using in_pcbrele().
  */
 void
 in_pcbref(struct inpcb *inp)
 {
 
 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 
 	refcount_acquire(&inp->inp_refcount);
 }
 
 /*
  * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
  * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
  * return a flag indicating whether or not the inpcb remains valid.  If it is
  * valid, we return with the inpcb lock held.
  *
  * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a
  * reference on an inpcb.  Historically more work was done here (actually, in
  * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the
  * need for the pcbinfo lock in in_pcbrele().  Deferring the free is entirely
  * about memory stability (and continued use of the write lock).
  */
 int
 in_pcbrele_rlocked(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo;
 
 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 
 	INP_RLOCK_ASSERT(inp);
 
 	if (refcount_release(&inp->inp_refcount) == 0) {
 		/*
 		 * If the inpcb has been freed, let the caller know, even if
 		 * this isn't the last reference.
 		 */
 		if (inp->inp_flags2 & INP_FREED) {
 			INP_RUNLOCK(inp);
 			return (1);
 		}
 		return (0);
 	}
 
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 
 	INP_RUNLOCK(inp);
 	pcbinfo = inp->inp_pcbinfo;
 	uma_zfree(pcbinfo->ipi_zone, inp);
 	return (1);
 }
 
 int
 in_pcbrele_wlocked(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo;
 
 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 
 	INP_WLOCK_ASSERT(inp);
 
 	if (refcount_release(&inp->inp_refcount) == 0)
 		return (0);
 
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 
 	INP_WUNLOCK(inp);
 	pcbinfo = inp->inp_pcbinfo;
 	uma_zfree(pcbinfo->ipi_zone, inp);
 	return (1);
 }
 
 /*
  * Temporary wrapper.
  */
 int
 in_pcbrele(struct inpcb *inp)
 {
 
 	return (in_pcbrele_wlocked(inp));
 }
 
 /*
  * Unconditionally schedule an inpcb to be freed by decrementing its
  * reference count, which should occur only after the inpcb has been detached
  * from its socket.  If another thread holds a temporary reference (acquired
  * using in_pcbref()) then the free is deferred until that reference is
  * released using in_pcbrele(), but the inpcb is still unlocked.  Almost all
  * work, including removal from global lists, is done in this context, where
  * the pcbinfo lock is held.
  */
 void
 in_pcbfree(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 
 	INP_INFO_WLOCK_ASSERT(pcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	/* XXXRW: Do as much as possible here. */
 #ifdef IPSEC
 	if (inp->inp_sp != NULL)
 		ipsec_delete_pcbpolicy(inp);
 #endif
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 	in_pcbremlists(inp);
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6PROTO) {
 		ip6_freepcbopts(inp->in6p_outputopts);
 		if (inp->in6p_moptions != NULL)
 			ip6_freemoptions(inp->in6p_moptions);
 	}
 #endif
 	if (inp->inp_options)
 		(void)m_free(inp->inp_options);
 #ifdef INET
 	if (inp->inp_moptions != NULL)
 		inp_freemoptions(inp->inp_moptions);
 #endif
 	inp->inp_vflag = 0;
 	inp->inp_flags2 |= INP_FREED;
 	crfree(inp->inp_cred);
 #ifdef MAC
 	mac_inpcb_destroy(inp);
 #endif
 	if (!in_pcbrele_wlocked(inp))
 		INP_WUNLOCK(inp);
 }
 
 /*
  * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
  * port reservation, and preventing it from being returned by inpcb lookups.
  *
  * It is used by TCP to mark an inpcb as unused and avoid future packet
  * delivery or event notification when a socket remains open but TCP has
  * closed.  This might occur as a result of a shutdown()-initiated TCP close
  * or a RST on the wire, and allows the port binding to be reused while still
  * maintaining the invariant that so_pcb always points to a valid inpcb until
  * in_pcbdetach().
  *
  * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
  * in_pcbnotifyall() and in_pcbpurgeif0()?
  */
 void
 in_pcbdrop(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * XXXRW: Possibly we should protect the setting of INP_DROPPED with
 	 * the hash lock...?
 	 */
 	inp->inp_flags |= INP_DROPPED;
 	if (inp->inp_flags & INP_INHASHLIST) {
 		struct inpcbport *phd = inp->inp_phd;
 
 		INP_HASH_WLOCK(inp->inp_pcbinfo);
 		LIST_REMOVE(inp, inp_hash);
 		LIST_REMOVE(inp, inp_portlist);
 		if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
 			LIST_REMOVE(phd, phd_hash);
 			free(phd, M_PCB);
 		}
 		INP_HASH_WUNLOCK(inp->inp_pcbinfo);
 		inp->inp_flags &= ~INP_INHASHLIST;
 #ifdef PCBGROUP
 		in_pcbgroup_remove(inp);
 #endif
 	}
 }
 
 #ifdef INET
 /*
  * Common routines to return the socket addresses associated with inpcbs.
  */
 struct sockaddr *
 in_sockaddr(in_port_t port, struct in_addr *addr_p)
 {
 	struct sockaddr_in *sin;
 
 	sin = malloc(sizeof *sin, M_SONAME,
 		M_WAITOK | M_ZERO);
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(*sin);
 	sin->sin_addr = *addr_p;
 	sin->sin_port = port;
 
 	return (struct sockaddr *)sin;
 }
 
 int
 in_getsockaddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_lport;
 	addr = inp->inp_laddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in_sockaddr(port, &addr);
 	return 0;
 }
 
 int
 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_fport;
 	addr = inp->inp_faddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in_sockaddr(port, &addr);
 	return 0;
 }
 
 void
 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
     struct inpcb *(*notify)(struct inpcb *, int))
 {
 	struct inpcb *inp, *inp_temp;
 
 	INP_INFO_WLOCK(pcbinfo);
 	LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
 		INP_WLOCK(inp);
 #ifdef INET6
 		if ((inp->inp_vflag & INP_IPV4) == 0) {
 			INP_WUNLOCK(inp);
 			continue;
 		}
 #endif
 		if (inp->inp_faddr.s_addr != faddr.s_addr ||
 		    inp->inp_socket == NULL) {
 			INP_WUNLOCK(inp);
 			continue;
 		}
 		if ((*notify)(inp, errno))
 			INP_WUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(pcbinfo);
 }
 
 void
 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
 {
 	struct inpcb *inp;
 	struct ip_moptions *imo;
 	int i, gap;
 
 	INP_INFO_RLOCK(pcbinfo);
 	LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
 		INP_WLOCK(inp);
 		imo = inp->inp_moptions;
 		if ((inp->inp_vflag & INP_IPV4) &&
 		    imo != NULL) {
 			/*
 			 * Unselect the outgoing interface if it is being
 			 * detached.
 			 */
 			if (imo->imo_multicast_ifp == ifp)
 				imo->imo_multicast_ifp = NULL;
 
 			/*
 			 * Drop multicast group membership if we joined
 			 * through the interface being detached.
 			 */
 			for (i = 0, gap = 0; i < imo->imo_num_memberships;
 			    i++) {
 				if (imo->imo_membership[i]->inm_ifp == ifp) {
 					in_delmulti(imo->imo_membership[i]);
 					gap++;
 				} else if (gap != 0)
 					imo->imo_membership[i - gap] =
 					    imo->imo_membership[i];
 			}
 			imo->imo_num_memberships -= gap;
 		}
 		INP_WUNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(pcbinfo);
 }
 
 /*
  * Lookup a PCB based on the local address and port.  Caller must hold the
  * hash lock.  No inpcb locks or references are acquired.
  */
 #define INP_LOOKUP_MAPPED_PCB_COST	3
 struct inpcb *
 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
     u_short lport, int lookupflags, struct ucred *cred)
 {
 	struct inpcb *inp;
 #ifdef INET6
 	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
 #else
 	int matchwild = 3;
 #endif
 	int wildcard;
 
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
 		struct inpcbhead *head;
 		/*
 		 * Look for an unconnected (wildcard foreign addr) PCB that
 		 * matches the local address and port we're looking for.
 		 */
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
 		    0, pcbinfo->ipi_hashmask)];
 		LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr == INADDR_ANY &&
 			    inp->inp_laddr.s_addr == laddr.s_addr &&
 			    inp->inp_lport == lport) {
 				/*
 				 * Found?
 				 */
 				if (cred == NULL ||
 				    prison_equal_ip4(cred->cr_prison,
 					inp->inp_cred->cr_prison))
 					return (inp);
 			}
 		}
 		/*
 		 * Not found.
 		 */
 		return (NULL);
 	} else {
 		struct inpcbporthead *porthash;
 		struct inpcbport *phd;
 		struct inpcb *match = NULL;
 		/*
 		 * Best fit PCB lookup.
 		 *
 		 * First see if this local port is in use by looking on the
 		 * port hash list.
 		 */
 		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
 		    pcbinfo->ipi_porthashmask)];
 		LIST_FOREACH(phd, porthash, phd_hash) {
 			if (phd->phd_port == lport)
 				break;
 		}
 		if (phd != NULL) {
 			/*
 			 * Port is in use by one or more PCBs. Look for best
 			 * fit.
 			 */
 			LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
 				wildcard = 0;
 				if (cred != NULL &&
 				    !prison_equal_ip4(inp->inp_cred->cr_prison,
 					cred->cr_prison))
 					continue;
 #ifdef INET6
 				/* XXX inp locking */
 				if ((inp->inp_vflag & INP_IPV4) == 0)
 					continue;
 				/*
 				 * We never select the PCB that has
 				 * INP_IPV6 flag and is bound to :: if
 				 * we have another PCB which is bound
 				 * to 0.0.0.0.  If a PCB has the
 				 * INP_IPV6 flag, then we set its cost
 				 * higher than IPv4 only PCBs.
 				 *
 				 * Note that the case only happens
 				 * when a socket is bound to ::, under
 				 * the condition that the use of the
 				 * mapped address is allowed.
 				 */
 				if ((inp->inp_vflag & INP_IPV6) != 0)
 					wildcard += INP_LOOKUP_MAPPED_PCB_COST;
 #endif
 				if (inp->inp_faddr.s_addr != INADDR_ANY)
 					wildcard++;
 				if (inp->inp_laddr.s_addr != INADDR_ANY) {
 					if (laddr.s_addr == INADDR_ANY)
 						wildcard++;
 					else if (inp->inp_laddr.s_addr != laddr.s_addr)
 						continue;
 				} else {
 					if (laddr.s_addr != INADDR_ANY)
 						wildcard++;
 				}
 				if (wildcard < matchwild) {
 					match = inp;
 					matchwild = wildcard;
 					if (matchwild == 0)
 						break;
 				}
 			}
 		}
 		return (match);
 	}
 }
 #undef INP_LOOKUP_MAPPED_PCB_COST
 
 #ifdef PCBGROUP
 /*
  * Lookup PCB in hash list, using pcbgroup tables.
  */
 static struct inpcb *
 in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
     struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
     u_int lport_arg, int lookupflags, struct ifnet *ifp)
 {
 	struct inpcbhead *head;
 	struct inpcb *inp, *tmpinp;
 	u_short fport = fport_arg, lport = lport_arg;
 
 	/*
 	 * First look for an exact match.
 	 */
 	tmpinp = NULL;
 	INP_GROUP_LOCK(pcbgroup);
 	head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
 	    pcbgroup->ipg_hashmask)];
 	LIST_FOREACH(inp, head, inp_pcbgrouphash) {
 #ifdef INET6
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV4) == 0)
 			continue;
 #endif
 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
 		    inp->inp_laddr.s_addr == laddr.s_addr &&
 		    inp->inp_fport == fport &&
 		    inp->inp_lport == lport) {
 			/*
 			 * XXX We should be able to directly return
 			 * the inp here, without any checks.
 			 * Well unless both bound with SO_REUSEPORT?
 			 */
 			if (prison_flag(inp->inp_cred, PR_IP4))
 				goto found;
 			if (tmpinp == NULL)
 				tmpinp = inp;
 		}
 	}
 	if (tmpinp != NULL) {
 		inp = tmpinp;
 		goto found;
 	}
 
 #ifdef	RSS
 	/*
 	 * For incoming connections, we may wish to do a wildcard
 	 * match for an RSS-local socket.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 #ifdef INET6
 		struct inpcb *local_wild_mapped = NULL;
 #endif
 		struct inpcb *jail_wild = NULL;
 		struct inpcbhead *head;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 
 		head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY,
 		    lport, 0, pcbgroup->ipg_hashmask)];
 		LIST_FOREACH(inp, head, inp_pcbgrouphash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
 			    inp->inp_lport != lport)
 				continue;
 
 			/* XXX inp locking */
 			if (ifp && ifp->if_type == IFT_FAITH &&
 			    (inp->inp_flags & INP_FAITH) == 0)
 				continue;
 
 			injail = prison_flag(inp->inp_cred, PR_IP4);
 			if (injail) {
 				if (prison_check_ip4(inp->inp_cred,
 				    &laddr) != 0)
 					continue;
 			} else {
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
 				if (injail)
 					goto found;
 				else
 					local_exact = inp;
 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 #ifdef INET6
 				/* XXX inp locking, NULL check */
 				if (inp->inp_vflag & INP_IPV6PROTO)
 					local_wild_mapped = inp;
 				else
 #endif
 					if (injail)
 						jail_wild = inp;
 					else
 						local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 
 		inp = jail_wild;
 		if (inp == NULL)
 			inp = local_exact;
 		if (inp == NULL)
 			inp = local_wild;
 #ifdef INET6
 		if (inp == NULL)
 			inp = local_wild_mapped;
 #endif
 		if (inp != NULL)
 			goto found;
 	}
 #endif
 
 	/*
 	 * Then look for a wildcard match, if requested.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 #ifdef INET6
 		struct inpcb *local_wild_mapped = NULL;
 #endif
 		struct inpcb *jail_wild = NULL;
 		struct inpcbhead *head;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 		head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport,
 		    0, pcbinfo->ipi_wildmask)];
 		LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
 			    inp->inp_lport != lport)
 				continue;
 
 			/* XXX inp locking */
 			if (ifp && ifp->if_type == IFT_FAITH &&
 			    (inp->inp_flags & INP_FAITH) == 0)
 				continue;
 
 			injail = prison_flag(inp->inp_cred, PR_IP4);
 			if (injail) {
 				if (prison_check_ip4(inp->inp_cred,
 				    &laddr) != 0)
 					continue;
 			} else {
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
 				if (injail)
 					goto found;
 				else
 					local_exact = inp;
 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 #ifdef INET6
 				/* XXX inp locking, NULL check */
 				if (inp->inp_vflag & INP_IPV6PROTO)
 					local_wild_mapped = inp;
 				else
 #endif
 					if (injail)
 						jail_wild = inp;
 					else
 						local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 		inp = jail_wild;
 		if (inp == NULL)
 			inp = local_exact;
 		if (inp == NULL)
 			inp = local_wild;
 #ifdef INET6
 		if (inp == NULL)
 			inp = local_wild_mapped;
 #endif
 		if (inp != NULL)
 			goto found;
 	} /* if (lookupflags & INPLOOKUP_WILDCARD) */
 	INP_GROUP_UNLOCK(pcbgroup);
 	return (NULL);
 
 found:
 	in_pcbref(inp);
 	INP_GROUP_UNLOCK(pcbgroup);
 	if (lookupflags & INPLOOKUP_WLOCKPCB) {
 		INP_WLOCK(inp);
 		if (in_pcbrele_wlocked(inp))
 			return (NULL);
 	} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
 		INP_RLOCK(inp);
 		if (in_pcbrele_rlocked(inp))
 			return (NULL);
 	} else
 		panic("%s: locking bug", __func__);
 	return (inp);
 }
 #endif /* PCBGROUP */
 
 /*
  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
  * that the caller has locked the hash list, and will not perform any further
  * locking or reference operations on either the hash list or the connection.
  */
 static struct inpcb *
 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
     struct ifnet *ifp)
 {
 	struct inpcbhead *head;
 	struct inpcb *inp, *tmpinp;
 	u_short fport = fport_arg, lport = lport_arg;
 
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	/*
 	 * First look for an exact match.
 	 */
 	tmpinp = NULL;
 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
 	    pcbinfo->ipi_hashmask)];
 	LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV4) == 0)
 			continue;
 #endif
 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
 		    inp->inp_laddr.s_addr == laddr.s_addr &&
 		    inp->inp_fport == fport &&
 		    inp->inp_lport == lport) {
 			/*
 			 * XXX We should be able to directly return
 			 * the inp here, without any checks.
 			 * Well unless both bound with SO_REUSEPORT?
 			 */
 			if (prison_flag(inp->inp_cred, PR_IP4))
 				return (inp);
 			if (tmpinp == NULL)
 				tmpinp = inp;
 		}
 	}
 	if (tmpinp != NULL)
 		return (tmpinp);
 
 	/*
 	 * Then look for a wildcard match, if requested.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 #ifdef INET6
 		struct inpcb *local_wild_mapped = NULL;
 #endif
 		struct inpcb *jail_wild = NULL;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
 		    0, pcbinfo->ipi_hashmask)];
 		LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
 			    inp->inp_lport != lport)
 				continue;
 
 			/* XXX inp locking */
 			if (ifp && ifp->if_type == IFT_FAITH &&
 			    (inp->inp_flags & INP_FAITH) == 0)
 				continue;
 
 			injail = prison_flag(inp->inp_cred, PR_IP4);
 			if (injail) {
 				if (prison_check_ip4(inp->inp_cred,
 				    &laddr) != 0)
 					continue;
 			} else {
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
 				if (injail)
 					return (inp);
 				else
 					local_exact = inp;
 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 #ifdef INET6
 				/* XXX inp locking, NULL check */
 				if (inp->inp_vflag & INP_IPV6PROTO)
 					local_wild_mapped = inp;
 				else
 #endif
 					if (injail)
 						jail_wild = inp;
 					else
 						local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 		if (jail_wild != NULL)
 			return (jail_wild);
 		if (local_exact != NULL)
 			return (local_exact);
 		if (local_wild != NULL)
 			return (local_wild);
 #ifdef INET6
 		if (local_wild_mapped != NULL)
 			return (local_wild_mapped);
 #endif
 	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
 
 	return (NULL);
 }
 
 /*
  * Lookup PCB in hash list, using pcbinfo tables.  This variation locks the
  * hash list lock, and will return the inpcb locked (i.e., requires
  * INPLOOKUP_LOCKPCB).
  */
 static struct inpcb *
 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
     struct ifnet *ifp)
 {
 	struct inpcb *inp;
 
 	INP_HASH_RLOCK(pcbinfo);
 	inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
 	    (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp);
 	if (inp != NULL) {
 		in_pcbref(inp);
 		INP_HASH_RUNLOCK(pcbinfo);
 		if (lookupflags & INPLOOKUP_WLOCKPCB) {
 			INP_WLOCK(inp);
 			if (in_pcbrele_wlocked(inp))
 				return (NULL);
 		} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
 			INP_RLOCK(inp);
 			if (in_pcbrele_rlocked(inp))
 				return (NULL);
 		} else
 			panic("%s: locking bug", __func__);
 	} else
 		INP_HASH_RUNLOCK(pcbinfo);
 	return (inp);
 }
 
 /*
  * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
  * from which a pre-calculated hash value may be extracted.
  *
  * Possibly more of this logic should be in in_pcbgroup.c.
  */
 struct inpcb *
 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
     struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
 {
 #if defined(PCBGROUP) && !defined(RSS)
 	struct inpcbgroup *pcbgroup;
 #endif
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 	/*
 	 * When not using RSS, use connection groups in preference to the
 	 * reservation table when looking up 4-tuples.  When using RSS, just
 	 * use the reservation table, due to the cost of the Toeplitz hash
 	 * in software.
 	 *
 	 * XXXRW: This policy belongs in the pcbgroup code, as in principle
 	 * we could be doing RSS with a non-Toeplitz hash that is affordable
 	 * in software.
 	 */
 #if defined(PCBGROUP) && !defined(RSS)
 	if (in_pcbgroup_enabled(pcbinfo)) {
 		pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
 		    fport);
 		return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
 		    laddr, lport, lookupflags, ifp));
 	}
 #endif
 	return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp));
 }
 
 struct inpcb *
 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
     struct ifnet *ifp, struct mbuf *m)
 {
 #ifdef PCBGROUP
 	struct inpcbgroup *pcbgroup;
 #endif
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 #ifdef PCBGROUP
 	/*
 	 * If we can use a hardware-generated hash to look up the connection
 	 * group, use that connection group to find the inpcb.  Otherwise
 	 * fall back on a software hash -- or the reservation table if we're
 	 * using RSS.
 	 *
 	 * XXXRW: As above, that policy belongs in the pcbgroup code.
 	 */
 	if (in_pcbgroup_enabled(pcbinfo) &&
 	    !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) {
 		pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
 		    m->m_pkthdr.flowid);
 		if (pcbgroup != NULL)
 			return (in_pcblookup_group(pcbinfo, pcbgroup, faddr,
 			    fport, laddr, lport, lookupflags, ifp));
 #ifndef RSS
 		pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
 		    fport);
 		return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
 		    laddr, lport, lookupflags, ifp));
 #endif
 	}
 #endif
 	return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp));
 }
 #endif /* INET */
 
 /*
  * Insert PCB onto various hash lists.
  */
 static int
 in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update)
 {
 	struct inpcbhead *pcbhash;
 	struct inpcbporthead *pcbporthash;
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct inpcbport *phd;
 	u_int32_t hashkey_faddr;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
 	    ("in_pcbinshash: INP_INHASHLIST"));
 
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6)
 		hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
 	else
 #endif
 	hashkey_faddr = inp->inp_faddr.s_addr;
 
 	pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
 		 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 
 	pcbporthash = &pcbinfo->ipi_porthashbase[
 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
 
 	/*
 	 * Go through port list and look for a head for this lport.
 	 */
 	LIST_FOREACH(phd, pcbporthash, phd_hash) {
 		if (phd->phd_port == inp->inp_lport)
 			break;
 	}
 	/*
 	 * If none exists, malloc one and tack it on.
 	 */
 	if (phd == NULL) {
 		phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT);
 		if (phd == NULL) {
 			return (ENOBUFS); /* XXX */
 		}
 		phd->phd_port = inp->inp_lport;
 		LIST_INIT(&phd->phd_pcblist);
 		LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
 	}
 	inp->inp_phd = phd;
 	LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
 	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
 	inp->inp_flags |= INP_INHASHLIST;
 #ifdef PCBGROUP
 	if (do_pcbgroup_update)
 		in_pcbgroup_update(inp);
 #endif
 	return (0);
 }
 
 /*
  * For now, there are two public interfaces to insert an inpcb into the hash
  * lists -- one that does update pcbgroups, and one that doesn't.  The latter
  * is used only in the TCP syncache, where in_pcbinshash is called before the
  * full 4-tuple is set for the inpcb, and we don't want to install in the
  * pcbgroup until later.
  *
  * XXXRW: This seems like a misfeature.  in_pcbinshash should always update
  * connection groups, and partially initialised inpcbs should not be exposed
  * to either reservation hash tables or pcbgroups.
  */
 int
 in_pcbinshash(struct inpcb *inp)
 {
 
 	return (in_pcbinshash_internal(inp, 1));
 }
 
 int
 in_pcbinshash_nopcbgroup(struct inpcb *inp)
 {
 
 	return (in_pcbinshash_internal(inp, 0));
 }
 
 /*
  * Move PCB to the proper hash bucket when { faddr, fport } have  been
  * changed. NOTE: This does not handle the case of the lport changing (the
  * hashed port list would have to be updated as well), so the lport must
  * not change after in_pcbinshash() has been called.
  */
 void
 in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct inpcbhead *head;
 	u_int32_t hashkey_faddr;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	KASSERT(inp->inp_flags & INP_INHASHLIST,
 	    ("in_pcbrehash: !INP_INHASHLIST"));
 
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6)
 		hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
 	else
 #endif
 	hashkey_faddr = inp->inp_faddr.s_addr;
 
 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
 		inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 
 	LIST_REMOVE(inp, inp_hash);
 	LIST_INSERT_HEAD(head, inp, inp_hash);
 
 #ifdef PCBGROUP
 	if (m != NULL)
 		in_pcbgroup_update_mbuf(inp, m);
 	else
 		in_pcbgroup_update(inp);
 #endif
 }
 
 void
 in_pcbrehash(struct inpcb *inp)
 {
 
 	in_pcbrehash_mbuf(inp, NULL);
 }
 
 /*
  * Remove PCB from various lists.
  */
 static void
 in_pcbremlists(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 
 	INP_INFO_WLOCK_ASSERT(pcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 	if (inp->inp_flags & INP_INHASHLIST) {
 		struct inpcbport *phd = inp->inp_phd;
 
 		INP_HASH_WLOCK(pcbinfo);
 		LIST_REMOVE(inp, inp_hash);
 		LIST_REMOVE(inp, inp_portlist);
 		if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
 			LIST_REMOVE(phd, phd_hash);
 			free(phd, M_PCB);
 		}
 		INP_HASH_WUNLOCK(pcbinfo);
 		inp->inp_flags &= ~INP_INHASHLIST;
 	}
 	LIST_REMOVE(inp, inp_list);
 	pcbinfo->ipi_count--;
 #ifdef PCBGROUP
 	in_pcbgroup_remove(inp);
 #endif
 }
 
 /*
  * A set label operation has occurred at the socket layer, propagate the
  * label change into the in_pcb for the socket.
  */
 void
 in_pcbsosetlabel(struct socket *so)
 {
 #ifdef MAC
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));
 
 	INP_WLOCK(inp);
 	SOCK_LOCK(so);
 	mac_inpcb_sosetlabel(so, inp);
 	SOCK_UNLOCK(so);
 	INP_WUNLOCK(inp);
 #endif
 }
 
 /*
  * ipport_tick runs once per second, determining if random port allocation
  * should be continued.  If more than ipport_randomcps ports have been
  * allocated in the last second, then we return to sequential port
  * allocation. We return to random allocation only once we drop below
  * ipport_randomcps for at least ipport_randomtime seconds.
  */
 static void
 ipport_tick(void *xtp)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);	/* XXX appease INVARIANTS here */
 		if (V_ipport_tcpallocs <=
 		    V_ipport_tcplastcount + V_ipport_randomcps) {
 			if (V_ipport_stoprandom > 0)
 				V_ipport_stoprandom--;
 		} else
 			V_ipport_stoprandom = V_ipport_randomtime;
 		V_ipport_tcplastcount = V_ipport_tcpallocs;
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 	callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
 }
 
 static void
 ip_fini(void *xtp)
 {
 
 	callout_stop(&ipport_tick_callout);
 }
 
 /* 
  * The ipport_callout should start running at about the time we attach the
  * inet or inet6 domains.
  */
 static void
 ipport_tick_init(const void *unused __unused)
 {
 
 	/* Start ipport_tick. */
 	callout_init(&ipport_tick_callout, CALLOUT_MPSAFE);
 	callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
 		SHUTDOWN_PRI_DEFAULT);
 }
 SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, 
     ipport_tick_init, NULL);
 
 void
 inp_wlock(struct inpcb *inp)
 {
 
 	INP_WLOCK(inp);
 }
 
 void
 inp_wunlock(struct inpcb *inp)
 {
 
 	INP_WUNLOCK(inp);
 }
 
 void
 inp_rlock(struct inpcb *inp)
 {
 
 	INP_RLOCK(inp);
 }
 
 void
 inp_runlock(struct inpcb *inp)
 {
 
 	INP_RUNLOCK(inp);
 }
 
 #ifdef INVARIANTS
 void
 inp_lock_assert(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 }
 
 void
 inp_unlock_assert(struct inpcb *inp)
 {
 
 	INP_UNLOCK_ASSERT(inp);
 }
 #endif
 
 void
 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
 {
 	struct inpcb *inp;
 
 	INP_INFO_RLOCK(&V_tcbinfo);
 	LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
 		INP_WLOCK(inp);
 		func(inp, arg);
 		INP_WUNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 }
 
 struct socket *
 inp_inpcbtosocket(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	return (inp->inp_socket);
 }
 
 struct tcpcb *
 inp_inpcbtotcpcb(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	return ((struct tcpcb *)inp->inp_ppcb);
 }
 
 int
 inp_ip_tos_get(const struct inpcb *inp)
 {
 
 	return (inp->inp_ip_tos);
 }
 
 void
 inp_ip_tos_set(struct inpcb *inp, int val)
 {
 
 	inp->inp_ip_tos = val;
 }
 
 void
 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
     uint32_t *faddr, uint16_t *fp)
 {
 
 	INP_LOCK_ASSERT(inp);
 	*laddr = inp->inp_laddr.s_addr;
 	*faddr = inp->inp_faddr.s_addr;
 	*lp = inp->inp_lport;
 	*fp = inp->inp_fport;
 }
 
 struct inpcb *
 so_sotoinpcb(struct socket *so)
 {
 
 	return (sotoinpcb(so));
 }
 
 struct tcpcb *
 so_sototcpcb(struct socket *so)
 {
 
 	return (sototcpcb(so));
 }
 
 #ifdef DDB
 static void
 db_print_indent(int indent)
 {
 	int i;
 
 	for (i = 0; i < indent; i++)
 		db_printf(" ");
 }
 
 static void
 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
 {
 	char faddr_str[48], laddr_str[48];
 
 	db_print_indent(indent);
 	db_printf("%s at %p\n", name, inc);
 
 	indent += 2;
 
 #ifdef INET6
 	if (inc->inc_flags & INC_ISIPV6) {
 		/* IPv6. */
 		ip6_sprintf(laddr_str, &inc->inc6_laddr);
 		ip6_sprintf(faddr_str, &inc->inc6_faddr);
 	} else
 #endif
 	{
 		/* IPv4. */
 		inet_ntoa_r(inc->inc_laddr, laddr_str);
 		inet_ntoa_r(inc->inc_faddr, faddr_str);
 	}
 	db_print_indent(indent);
 	db_printf("inc_laddr %s   inc_lport %u\n", laddr_str,
 	    ntohs(inc->inc_lport));
 	db_print_indent(indent);
 	db_printf("inc_faddr %s   inc_fport %u\n", faddr_str,
 	    ntohs(inc->inc_fport));
 }
 
 static void
 db_print_inpflags(int inp_flags)
 {
 	int comma;
 
 	comma = 0;
 	if (inp_flags & INP_RECVOPTS) {
 		db_printf("%sINP_RECVOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVRETOPTS) {
 		db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVDSTADDR) {
 		db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_HDRINCL) {
 		db_printf("%sINP_HDRINCL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_HIGHPORT) {
 		db_printf("%sINP_HIGHPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_LOWPORT) {
 		db_printf("%sINP_LOWPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_ANONPORT) {
 		db_printf("%sINP_ANONPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVIF) {
 		db_printf("%sINP_RECVIF", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_MTUDISC) {
 		db_printf("%sINP_MTUDISC", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_FAITH) {
 		db_printf("%sINP_FAITH", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVTTL) {
 		db_printf("%sINP_RECVTTL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_DONTFRAG) {
 		db_printf("%sINP_DONTFRAG", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVTOS) {
 		db_printf("%sINP_RECVTOS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_IPV6_V6ONLY) {
 		db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_PKTINFO) {
 		db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_HOPLIMIT) {
 		db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_HOPOPTS) {
 		db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_DSTOPTS) {
 		db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_RTHDR) {
 		db_printf("%sIN6P_RTHDR", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_RTHDRDSTOPTS) {
 		db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_TCLASS) {
 		db_printf("%sIN6P_TCLASS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_AUTOFLOWLABEL) {
 		db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_TIMEWAIT) {
 		db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & INP_ONESBCAST) {
 		db_printf("%sINP_ONESBCAST", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & INP_DROPPED) {
 		db_printf("%sINP_DROPPED", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & INP_SOCKREF) {
 		db_printf("%sINP_SOCKREF", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & IN6P_RFC2292) {
 		db_printf("%sIN6P_RFC2292", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_MTU) {
 		db_printf("IN6P_MTU%s", comma ? ", " : "");
 		comma = 1;
 	}
 }
 
 static void
 db_print_inpvflag(u_char inp_vflag)
 {
 	int comma;
 
 	comma = 0;
 	if (inp_vflag & INP_IPV4) {
 		db_printf("%sINP_IPV4", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_vflag & INP_IPV6) {
 		db_printf("%sINP_IPV6", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_vflag & INP_IPV6PROTO) {
 		db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
 		comma  = 1;
 	}
 }
 
 static void
 db_print_inpcb(struct inpcb *inp, const char *name, int indent)
 {
 
 	db_print_indent(indent);
 	db_printf("%s at %p\n", name, inp);
 
 	indent += 2;
 
 	db_print_indent(indent);
 	db_printf("inp_flow: 0x%x\n", inp->inp_flow);
 
 	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
 
 	db_print_indent(indent);
 	db_printf("inp_ppcb: %p   inp_pcbinfo: %p   inp_socket: %p\n",
 	    inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
 
 	db_print_indent(indent);
 	db_printf("inp_label: %p   inp_flags: 0x%x (",
 	   inp->inp_label, inp->inp_flags);
 	db_print_inpflags(inp->inp_flags);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("inp_sp: %p   inp_vflag: 0x%x (", inp->inp_sp,
 	    inp->inp_vflag);
 	db_print_inpvflag(inp->inp_vflag);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
 	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
 
 	db_print_indent(indent);
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6) {
 		db_printf("in6p_options: %p   in6p_outputopts: %p   "
 		    "in6p_moptions: %p\n", inp->in6p_options,
 		    inp->in6p_outputopts, inp->in6p_moptions);
 		db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
 		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
 		    inp->in6p_hops);
 	} else
 #endif
 	{
 		db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
 		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
 		    inp->inp_options, inp->inp_moptions);
 	}
 
 	db_print_indent(indent);
 	db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
 	    (uintmax_t)inp->inp_gencnt);
 }
 
 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
 {
 	struct inpcb *inp;
 
 	if (!have_addr) {
 		db_printf("usage: show inpcb <addr>\n");
 		return;
 	}
 	inp = (struct inpcb *)addr;
 
 	db_print_inpcb(inp, "inpcb", 0);
 }
 #endif /* DDB */
Index: user/dchagin/lemul/sys/netinet/in_rmx.c
===================================================================
--- user/dchagin/lemul/sys/netinet/in_rmx.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/in_rmx.c	(revision 274241)
@@ -1,508 +1,508 @@
 /*-
  * Copyright 1994, 1995 Massachusetts Institute of Technology
  *
  * Permission to use, copy, modify, and distribute this software and
  * its documentation for any purpose and without fee is hereby
  * granted, provided that both the above copyright notice and this
  * permission notice appear in all copies, that both the above
  * copyright notice and this permission notice appear in all
  * supporting documentation, and that the name of M.I.T. not be used
  * in advertising or publicity pertaining to distribution of the
  * software without specific, written prior permission.  M.I.T. makes
  * no representations about the suitability of this software for any
  * purpose.  It is provided "as is" without express or implied
  * warranty.
  *
  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
  * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/socket.h>
 #include <sys/mbuf.h>
 #include <sys/syslog.h>
 #include <sys/callout.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_var.h>
 
 extern int	in_inithead(void **head, int off);
 #ifdef VIMAGE
 extern int	in_detachhead(void **head, int off);
 #endif
 
 #define RTPRF_OURS		RTF_PROTO3	/* set on routes we manage */
 
 /*
  * Do what we need to do when inserting a route.
  */
 static struct radix_node *
 in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
     struct radix_node *treenodes)
 {
 	struct rtentry *rt = (struct rtentry *)treenodes;
 	struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt);
 
 	RADIX_NODE_HEAD_WLOCK_ASSERT(head);
 	/*
 	 * A little bit of help for both IP output and input:
 	 *   For host routes, we make sure that RTF_BROADCAST
 	 *   is set for anything that looks like a broadcast address.
 	 *   This way, we can avoid an expensive call to in_broadcast()
 	 *   in ip_output() most of the time (because the route passed
 	 *   to ip_output() is almost always a host route).
 	 *
 	 *   We also do the same for local addresses, with the thought
 	 *   that this might one day be used to speed up ip_input().
 	 *
 	 * We also mark routes to multicast addresses as such, because
 	 * it's easy to do and might be useful (but this is much more
 	 * dubious since it's so easy to inspect the address).
 	 */
 	if (rt->rt_flags & RTF_HOST) {
 		if (in_broadcast(sin->sin_addr, rt->rt_ifp)) {
 			rt->rt_flags |= RTF_BROADCAST;
 		} else if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr ==
 		    sin->sin_addr.s_addr) {
 			rt->rt_flags |= RTF_LOCAL;
 		}
 	}
 	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
 		rt->rt_flags |= RTF_MULTICAST;
 
 	if (rt->rt_ifp != NULL) {
 
 		/*
 		 * Check route MTU:
 		 * inherit interface MTU if not set or
 		 * check if MTU is too large.
 		 */
 		if (rt->rt_mtu == 0) {
 			rt->rt_mtu = rt->rt_ifp->if_mtu;
 		} else if (rt->rt_mtu > rt->rt_ifp->if_mtu)
 			rt->rt_mtu = rt->rt_ifp->if_mtu;
 	}
 
 	return (rn_addroute(v_arg, n_arg, head, treenodes));
 }
 
 /*
  * This code is the inverse of in_clsroute: on first reference, if we
  * were managing the route, stop doing so and set the expiration timer
  * back off again.
  */
 static struct radix_node *
 in_matroute(void *v_arg, struct radix_node_head *head)
 {
 	struct radix_node *rn = rn_match(v_arg, head);
 	struct rtentry *rt = (struct rtentry *)rn;
 
 	if (rt) {
 		RT_LOCK(rt);
 		if (rt->rt_flags & RTPRF_OURS) {
 			rt->rt_flags &= ~RTPRF_OURS;
 			rt->rt_expire = 0;
 		}
 		RT_UNLOCK(rt);
 	}
 	return rn;
 }
 
 static VNET_DEFINE(int, rtq_reallyold) = 60*60; /* one hour is "really old" */
 #define	V_rtq_reallyold		VNET(rtq_reallyold)
-SYSCTL_VNET_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(rtq_reallyold), 0,
     "Default expiration time on dynamically learned routes");
 
 /* never automatically crank down to less */
 static VNET_DEFINE(int, rtq_minreallyold) = 10;
 #define	V_rtq_minreallyold	VNET(rtq_minreallyold)
-SYSCTL_VNET_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(rtq_minreallyold), 0,
     "Minimum time to attempt to hold onto dynamically learned routes");
 
 /* 128 cached routes is "too many" */
 static VNET_DEFINE(int, rtq_toomany) = 128;
 #define	V_rtq_toomany		VNET(rtq_toomany)
-SYSCTL_VNET_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(rtq_toomany), 0,
     "Upper limit on dynamically learned routes");
 
 /*
  * On last reference drop, mark the route as belong to us so that it can be
  * timed out.
  */
 static void
 in_clsroute(struct radix_node *rn, struct radix_node_head *head)
 {
 	struct rtentry *rt = (struct rtentry *)rn;
 
 	RT_LOCK_ASSERT(rt);
 
 	if (!(rt->rt_flags & RTF_UP))
 		return;			/* prophylactic measures */
 
 	if (rt->rt_flags & RTPRF_OURS)
 		return;
 
 	if (!(rt->rt_flags & RTF_DYNAMIC))
 		return;
 
 	/*
 	 * If rtq_reallyold is 0, just delete the route without
 	 * waiting for a timeout cycle to kill it.
 	 */
 	if (V_rtq_reallyold != 0) {
 		rt->rt_flags |= RTPRF_OURS;
 		rt->rt_expire = time_uptime + V_rtq_reallyold;
 	} else
 		rt_expunge(head, rt);
 }
 
 struct rtqk_arg {
 	struct radix_node_head *rnh;
 	int draining;
 	int killed;
 	int found;
 	int updating;
 	time_t nextstop;
 };
 
 /*
  * Get rid of old routes.  When draining, this deletes everything, even when
  * the timeout is not expired yet.  When updating, this makes sure that
  * nothing has a timeout longer than the current value of rtq_reallyold.
  */
 static int
 in_rtqkill(struct radix_node *rn, void *rock)
 {
 	struct rtqk_arg *ap = rock;
 	struct rtentry *rt = (struct rtentry *)rn;
 	int err;
 
 	RADIX_NODE_HEAD_WLOCK_ASSERT(ap->rnh);
 
 	if (rt->rt_flags & RTPRF_OURS) {
 		ap->found++;
 
 		if (ap->draining || rt->rt_expire <= time_uptime) {
 			if (rt->rt_refcnt > 0)
 				panic("rtqkill route really not free");
 
 			err = in_rtrequest(RTM_DELETE,
 					(struct sockaddr *)rt_key(rt),
 					rt->rt_gateway, rt_mask(rt),
 					rt->rt_flags | RTF_RNH_LOCKED, 0,
 					rt->rt_fibnum);
 			if (err) {
 				log(LOG_WARNING, "in_rtqkill: error %d\n", err);
 			} else {
 				ap->killed++;
 			}
 		} else {
 			if (ap->updating &&
 			    (rt->rt_expire - time_uptime > V_rtq_reallyold))
 				rt->rt_expire = time_uptime + V_rtq_reallyold;
 			ap->nextstop = lmin(ap->nextstop, rt->rt_expire);
 		}
 	}
 
 	return 0;
 }
 
 #define RTQ_TIMEOUT	60*10	/* run no less than once every ten minutes */
 static VNET_DEFINE(int, rtq_timeout) = RTQ_TIMEOUT;
 static VNET_DEFINE(struct callout, rtq_timer);
 
 #define	V_rtq_timeout		VNET(rtq_timeout)
 #define	V_rtq_timer		VNET(rtq_timer)
 
 static void in_rtqtimo_one(void *rock);
 
 static void
 in_rtqtimo(void *rock)
 {
 	CURVNET_SET((struct vnet *) rock);
 	int fibnum;
 	void *newrock;
 	struct timeval atv;
 
 	for (fibnum = 0; fibnum < rt_numfibs; fibnum++) {
 		newrock = rt_tables_get_rnh(fibnum, AF_INET);
 		if (newrock != NULL)
 			in_rtqtimo_one(newrock);
 	}
 	atv.tv_usec = 0;
 	atv.tv_sec = V_rtq_timeout;
 	callout_reset(&V_rtq_timer, tvtohz(&atv), in_rtqtimo, rock);
 	CURVNET_RESTORE();
 }
 
 static void
 in_rtqtimo_one(void *rock)
 {
 	struct radix_node_head *rnh = rock;
 	struct rtqk_arg arg;
 	static time_t last_adjusted_timeout = 0;
 
 	arg.found = arg.killed = 0;
 	arg.rnh = rnh;
 	arg.nextstop = time_uptime + V_rtq_timeout;
 	arg.draining = arg.updating = 0;
 	RADIX_NODE_HEAD_LOCK(rnh);
 	rnh->rnh_walktree(rnh, in_rtqkill, &arg);
 	RADIX_NODE_HEAD_UNLOCK(rnh);
 
 	/*
 	 * Attempt to be somewhat dynamic about this:
 	 * If there are ``too many'' routes sitting around taking up space,
 	 * then crank down the timeout, and see if we can't make some more
 	 * go away.  However, we make sure that we will never adjust more
 	 * than once in rtq_timeout seconds, to keep from cranking down too
 	 * hard.
 	 */
 	if ((arg.found - arg.killed > V_rtq_toomany) &&
 	    (time_uptime - last_adjusted_timeout >= V_rtq_timeout) &&
 	    V_rtq_reallyold > V_rtq_minreallyold) {
 		V_rtq_reallyold = 2 * V_rtq_reallyold / 3;
 		if (V_rtq_reallyold < V_rtq_minreallyold) {
 			V_rtq_reallyold = V_rtq_minreallyold;
 		}
 
 		last_adjusted_timeout = time_uptime;
 #ifdef DIAGNOSTIC
 		log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n",
 		    V_rtq_reallyold);
 #endif
 		arg.found = arg.killed = 0;
 		arg.updating = 1;
 		RADIX_NODE_HEAD_LOCK(rnh);
 		rnh->rnh_walktree(rnh, in_rtqkill, &arg);
 		RADIX_NODE_HEAD_UNLOCK(rnh);
 	}
 
 }
 
 void
 in_rtqdrain(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 	struct radix_node_head *rnh;
 	struct rtqk_arg arg;
 	int 	fibnum;
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 
 		for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) {
 			rnh = rt_tables_get_rnh(fibnum, AF_INET);
 			arg.found = arg.killed = 0;
 			arg.rnh = rnh;
 			arg.nextstop = 0;
 			arg.draining = 1;
 			arg.updating = 0;
 			RADIX_NODE_HEAD_LOCK(rnh);
 			rnh->rnh_walktree(rnh, in_rtqkill, &arg);
 			RADIX_NODE_HEAD_UNLOCK(rnh);
 		}
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 void
 in_setmatchfunc(struct radix_node_head *rnh, int val)
 {
 
 	rnh->rnh_matchaddr = (val != 0) ? rn_match : in_matroute;
 }
 
 static int _in_rt_was_here;
 /*
  * Initialize our routing tree.
  */
 int
 in_inithead(void **head, int off)
 {
 	struct radix_node_head *rnh;
 
 	if (!rn_inithead(head, 32))
 		return 0;
 
 	rnh = *head;
 	RADIX_NODE_HEAD_LOCK_INIT(rnh);
 
 	rnh->rnh_addaddr = in_addroute;
 	in_setmatchfunc(rnh, V_drop_redirect);
 	rnh->rnh_close = in_clsroute;
 	if (_in_rt_was_here == 0 ) {
 		callout_init(&V_rtq_timer, CALLOUT_MPSAFE);
 		callout_reset(&V_rtq_timer, 1, in_rtqtimo, curvnet);
 		_in_rt_was_here = 1;
 	}
 	return 1;
 }
 
 #ifdef VIMAGE
 int
 in_detachhead(void **head, int off)
 {
 
 	callout_drain(&V_rtq_timer);
 	return (1);
 }
 #endif
 
 /*
  * This zaps old routes when the interface goes down or interface
  * address is deleted.  In the latter case, it deletes static routes
  * that point to this address.  If we don't do this, we may end up
  * using the old address in the future.  The ones we always want to
  * get rid of are things like ARP entries, since the user might down
  * the interface, walk over to a completely different network, and
  * plug back in.
  */
 struct in_ifadown_arg {
 	struct radix_node_head *rnh;
 	struct ifaddr *ifa;
 	int del;
 };
 
 static int
 in_ifadownkill(struct radix_node *rn, void *xap)
 {
 	struct in_ifadown_arg *ap = xap;
 	struct rtentry *rt = (struct rtentry *)rn;
 
 	RT_LOCK(rt);
 	if (rt->rt_ifa == ap->ifa &&
 	    (ap->del || !(rt->rt_flags & RTF_STATIC))) {
 		/*
 		 * Aquire a reference so that it can later be freed
 		 * as the refcount would be 0 here in case of at least
 		 * ap->del.
 		 */
 		RT_ADDREF(rt);
 		/*
 		 * Disconnect it from the tree and permit protocols
 		 * to cleanup.
 		 */
 		rt_expunge(ap->rnh, rt);
 		/*
 		 * At this point it is an rttrash node, and in case
 		 * the above is the only reference we must free it.
 		 * If we do not noone will have a pointer and the
 		 * rtentry will be leaked forever.
 		 * In case someone else holds a reference, we are
 		 * fine as we only decrement the refcount. In that
 		 * case if the other entity calls RT_REMREF, we
 		 * will still be leaking but at least we tried.
 		 */
 		RTFREE_LOCKED(rt);
 		return (0);
 	}
 	RT_UNLOCK(rt);
 	return 0;
 }
 
 void
 in_ifadown(struct ifaddr *ifa, int delete)
 {
 	struct in_ifadown_arg arg;
 	struct radix_node_head *rnh;
 	int	fibnum;
 
 	KASSERT(ifa->ifa_addr->sa_family == AF_INET,
 	    ("%s: wrong family", __func__));
 
 	for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) {
 		rnh = rt_tables_get_rnh(fibnum, AF_INET);
 		arg.rnh = rnh;
 		arg.ifa = ifa;
 		arg.del = delete;
 		RADIX_NODE_HEAD_LOCK(rnh);
 		rnh->rnh_walktree(rnh, in_ifadownkill, &arg);
 		RADIX_NODE_HEAD_UNLOCK(rnh);
 		ifa->ifa_flags &= ~IFA_ROUTE;		/* XXXlocking? */
 	}
 }
 
 /*
  * inet versions of rt functions. These have fib extensions and 
  * for now will just reference the _fib variants.
  * eventually this order will be reversed,
  */
 void
 in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum)
 {
 	rtalloc_ign_fib(ro, ignflags, fibnum);
 }
 
 int
 in_rtrequest( int req,
 	struct sockaddr *dst,
 	struct sockaddr *gateway,
 	struct sockaddr *netmask,
 	int flags,
 	struct rtentry **ret_nrt,
 	u_int fibnum)
 {
 	return (rtrequest_fib(req, dst, gateway, netmask, 
 	    flags, ret_nrt, fibnum));
 }
 
 struct rtentry *
 in_rtalloc1(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum)
 {
 	return (rtalloc1_fib(dst, report, ignflags, fibnum));
 }
 
 void
 in_rtredirect(struct sockaddr *dst,
 	struct sockaddr *gateway,
 	struct sockaddr *netmask,
 	int flags,
 	struct sockaddr *src,
 	u_int fibnum)
 {
 	rtredirect_fib(dst, gateway, netmask, flags, src, fibnum);
 }
  
 void
 in_rtalloc(struct route *ro, u_int fibnum)
 {
 	rtalloc_ign_fib(ro, 0UL, fibnum);
 }
 
 #if 0
 int	 in_rt_getifa(struct rt_addrinfo *, u_int fibnum);
 int	 in_rtioctl(u_long, caddr_t, u_int);
 int	 in_rtrequest1(int, struct rt_addrinfo *, struct rtentry **, u_int);
 #endif
 
 
Index: user/dchagin/lemul/sys/netinet/ip_carp.c
===================================================================
--- user/dchagin/lemul/sys/netinet/ip_carp.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/ip_carp.c	(revision 274241)
@@ -1,2195 +1,2198 @@
 /*-
  * Copyright (c) 2002 Michael Shalayeff.
  * Copyright (c) 2003 Ryan McBride.
  * Copyright (c) 2011 Gleb Smirnoff <glebius@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_bpf.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/taskqueue.h>
 #include <sys/counter.h>
 
 #include <net/ethernet.h>
 #include <net/fddi.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_llatbl.h>
 #include <net/if_types.h>
 #include <net/iso88025.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_carp.h>
 #include <netinet/ip.h>
 #include <machine/in_cksum.h>
 #endif
 #ifdef INET
 #include <netinet/ip_var.h>
 #include <netinet/if_ether.h>
 #endif
 
 #ifdef INET6
 #include <netinet/icmp6.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #endif
 
 #include <crypto/sha1.h>
 
 static MALLOC_DEFINE(M_CARP, "CARP", "CARP addresses");
 
 struct carp_softc {
 	struct ifnet		*sc_carpdev;	/* Pointer to parent ifnet. */
 	struct ifaddr		**sc_ifas;	/* Our ifaddrs. */
 	struct sockaddr_dl	sc_addr;	/* Our link level address. */
 	struct callout		sc_ad_tmo;	/* Advertising timeout. */
 #ifdef INET
 	struct callout		sc_md_tmo;	/* Master down timeout. */
 #endif
 #ifdef INET6
 	struct callout 		sc_md6_tmo;	/* XXX: Master down timeout. */
 #endif
 	struct mtx		sc_mtx;
 
 	int			sc_vhid;
 	int			sc_advskew;
 	int			sc_advbase;
 
 	int			sc_naddrs;
 	int			sc_naddrs6;
 	int			sc_ifasiz;
 	enum { INIT = 0, BACKUP, MASTER }	sc_state;
 	int			sc_suppress;
 	int			sc_sendad_errors;
 #define	CARP_SENDAD_MAX_ERRORS	3
 	int			sc_sendad_success;
 #define	CARP_SENDAD_MIN_SUCCESS 3
 
 	int			sc_init_counter;
 	uint64_t		sc_counter;
 
 	/* authentication */
 #define	CARP_HMAC_PAD	64
 	unsigned char sc_key[CARP_KEY_LEN];
 	unsigned char sc_pad[CARP_HMAC_PAD];
 	SHA1_CTX sc_sha1;
 
 	TAILQ_ENTRY(carp_softc)	sc_list;	/* On the carp_if list. */
 	LIST_ENTRY(carp_softc)	sc_next;	/* On the global list. */
 };
 
 struct carp_if {
 #ifdef INET
 	int	cif_naddrs;
 #endif
 #ifdef INET6
 	int	cif_naddrs6;
 #endif
 	TAILQ_HEAD(, carp_softc) cif_vrs;
 #ifdef INET
 	struct ip_moptions 	 cif_imo;
 #endif
 #ifdef INET6
 	struct ip6_moptions 	 cif_im6o;
 #endif
 	struct ifnet	*cif_ifp;
 	struct mtx	cif_mtx;
 	uint32_t	cif_flags;
 #define	CIF_PROMISC	0x00000001
 };
 
 #define	CARP_INET	0
 #define	CARP_INET6	1
 static int proto_reg[] = {-1, -1};
 
 /*
  * Brief design of carp(4).
  *
  * Any carp-capable ifnet may have a list of carp softcs hanging off
  * its ifp->if_carp pointer. Each softc represents one unique virtual
  * host id, or vhid. The softc has a back pointer to the ifnet. All
  * softcs are joined in a global list, which has quite limited use.
  *
  * Any interface address that takes part in CARP negotiation has a
  * pointer to the softc of its vhid, ifa->ifa_carp. That could be either
  * AF_INET or AF_INET6 address.
  *
  * Although, one can get the softc's backpointer to ifnet and traverse
  * through its ifp->if_addrhead queue to find all interface addresses
  * involved in CARP, we keep a growable array of ifaddr pointers. This
  * allows us to avoid grabbing the IF_ADDR_LOCK() in many traversals that
  * do calls into the network stack, thus avoiding LORs.
  *
  * Locking:
  *
  * Each softc has a lock sc_mtx. It is used to synchronise carp_input_c(),
  * callout-driven events and ioctl()s.
  *
  * To traverse the list of softcs on an ifnet we use CIF_LOCK(), to
  * traverse the global list we use the mutex carp_mtx.
  *
  * Known issues with locking:
  *
  * - There is no protection for races between two ioctl() requests,
  *   neither SIOCSVH, nor SIOCAIFADDR & SIOCAIFADDR_IN6. I think that all
  *   interface ioctl()s should be serialized right in net/if.c.
  * - Sending ad, we put the pointer to the softc in an mtag, and no reference
  *   counting is done on the softc.
  * - On module unload we may race (?) with packet processing thread
  *   dereferencing our function pointers.
  */
 
 /* Accept incoming CARP packets. */
 static VNET_DEFINE(int, carp_allow) = 1;
 #define	V_carp_allow	VNET(carp_allow)
 
 /* Preempt slower nodes. */
 static VNET_DEFINE(int, carp_preempt) = 0;
 #define	V_carp_preempt	VNET(carp_preempt)
 
 /* Log level. */
 static VNET_DEFINE(int, carp_log) = 1;
 #define	V_carp_log	VNET(carp_log)
 
 /* Global advskew demotion. */
 static VNET_DEFINE(int, carp_demotion) = 0;
 #define	V_carp_demotion	VNET(carp_demotion)
 
 /* Send error demotion factor. */
 static VNET_DEFINE(int, carp_senderr_adj) = CARP_MAXSKEW;
 #define	V_carp_senderr_adj	VNET(carp_senderr_adj)
 
 /* Iface down demotion factor. */
 static VNET_DEFINE(int, carp_ifdown_adj) = CARP_MAXSKEW;
 #define	V_carp_ifdown_adj	VNET(carp_ifdown_adj)
 
 static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_NODE(_net_inet, IPPROTO_CARP,	carp,	CTLFLAG_RW, 0,	"CARP");
-SYSCTL_VNET_INT(_net_inet_carp, OID_AUTO, allow, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_carp, OID_AUTO, allow, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(carp_allow), 0, "Accept incoming CARP packets");
-SYSCTL_VNET_INT(_net_inet_carp, OID_AUTO, preempt, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_carp, OID_AUTO, preempt, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(carp_preempt), 0, "High-priority backup preemption mode");
-SYSCTL_VNET_INT(_net_inet_carp, OID_AUTO, log, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_carp, OID_AUTO, log, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(carp_log), 0, "CARP log level");
-SYSCTL_VNET_PROC(_net_inet_carp, OID_AUTO, demotion, CTLTYPE_INT|CTLFLAG_RW,
+SYSCTL_PROC(_net_inet_carp, OID_AUTO, demotion,
+    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
     0, 0, carp_demote_adj_sysctl, "I",
     "Adjust demotion factor (skew of advskew)");
-SYSCTL_VNET_INT(_net_inet_carp, OID_AUTO, senderr_demotion_factor, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_carp, OID_AUTO, senderr_demotion_factor,
+    CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(carp_senderr_adj), 0, "Send error demotion factor adjustment");
-SYSCTL_VNET_INT(_net_inet_carp, OID_AUTO, ifdown_demotion_factor, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_carp, OID_AUTO, ifdown_demotion_factor,
+    CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(carp_ifdown_adj), 0,
     "Interface down demotion factor adjustment");
 
 VNET_PCPUSTAT_DEFINE(struct carpstats, carpstats);
 VNET_PCPUSTAT_SYSINIT(carpstats);
 VNET_PCPUSTAT_SYSUNINIT(carpstats);
 
 #define	CARPSTATS_ADD(name, val)	\
     counter_u64_add(VNET(carpstats)[offsetof(struct carpstats, name) / \
 	sizeof(uint64_t)], (val))
 #define	CARPSTATS_INC(name)		CARPSTATS_ADD(name, 1)
 
 SYSCTL_VNET_PCPUSTAT(_net_inet_carp, OID_AUTO, stats, struct carpstats,
     carpstats, "CARP statistics (struct carpstats, netinet/ip_carp.h)");
 
 #define	CARP_LOCK_INIT(sc)	mtx_init(&(sc)->sc_mtx, "carp_softc",   \
 	NULL, MTX_DEF)
 #define	CARP_LOCK_DESTROY(sc)	mtx_destroy(&(sc)->sc_mtx)
 #define	CARP_LOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_mtx, MA_OWNED)
 #define	CARP_LOCK(sc)		mtx_lock(&(sc)->sc_mtx)
 #define	CARP_UNLOCK(sc)		mtx_unlock(&(sc)->sc_mtx)
 #define	CIF_LOCK_INIT(cif)	mtx_init(&(cif)->cif_mtx, "carp_if",   \
 	NULL, MTX_DEF)
 #define	CIF_LOCK_DESTROY(cif)	mtx_destroy(&(cif)->cif_mtx)
 #define	CIF_LOCK_ASSERT(cif)	mtx_assert(&(cif)->cif_mtx, MA_OWNED)
 #define	CIF_LOCK(cif)		mtx_lock(&(cif)->cif_mtx)
 #define	CIF_UNLOCK(cif)		mtx_unlock(&(cif)->cif_mtx)
 #define	CIF_FREE(cif)	do {				\
 		CIF_LOCK_ASSERT(cif);			\
 		if (TAILQ_EMPTY(&(cif)->cif_vrs))	\
 			carp_free_if(cif);		\
 		else					\
 			CIF_UNLOCK(cif);		\
 } while (0)
 
 #define	CARP_LOG(...)	do {				\
 	if (V_carp_log > 0)				\
 		log(LOG_INFO, "carp: " __VA_ARGS__);	\
 } while (0)
 
 #define	CARP_DEBUG(...)	do {				\
 	if (V_carp_log > 1)				\
 		log(LOG_DEBUG, __VA_ARGS__);		\
 } while (0)
 
 #define	IFNET_FOREACH_IFA(ifp, ifa)					\
 	IF_ADDR_LOCK_ASSERT(ifp);					\
 	TAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link)		\
 		if ((ifa)->ifa_carp != NULL)
 
 #define	CARP_FOREACH_IFA(sc, ifa)					\
 	CARP_LOCK_ASSERT(sc);						\
 	for (int _i = 0;						\
 		_i < (sc)->sc_naddrs + (sc)->sc_naddrs6 &&		\
 		((ifa) = sc->sc_ifas[_i]) != NULL;			\
 		++_i)
 
 #define	IFNET_FOREACH_CARP(ifp, sc)					\
 	CIF_LOCK_ASSERT(ifp->if_carp);					\
 	TAILQ_FOREACH((sc), &(ifp)->if_carp->cif_vrs, sc_list)
 
 #define	DEMOTE_ADVSKEW(sc)					\
     (((sc)->sc_advskew + V_carp_demotion > CARP_MAXSKEW) ?	\
     CARP_MAXSKEW : ((sc)->sc_advskew + V_carp_demotion))
 
 static void	carp_input_c(struct mbuf *, struct carp_header *, sa_family_t);
 static struct carp_softc
 		*carp_alloc(struct ifnet *);
 static void	carp_detach_locked(struct ifaddr *);
 static void	carp_destroy(struct carp_softc *);
 static struct carp_if
 		*carp_alloc_if(struct ifnet *);
 static void	carp_free_if(struct carp_if *);
 static void	carp_set_state(struct carp_softc *, int);
 static void	carp_sc_state(struct carp_softc *);
 static void	carp_setrun(struct carp_softc *, sa_family_t);
 static void	carp_master_down(void *);
 static void	carp_master_down_locked(struct carp_softc *);
 static void	carp_send_ad(void *);
 static void	carp_send_ad_locked(struct carp_softc *);
 static void	carp_addroute(struct carp_softc *);
 static void	carp_ifa_addroute(struct ifaddr *);
 static void	carp_delroute(struct carp_softc *);
 static void	carp_ifa_delroute(struct ifaddr *);
 static void	carp_send_ad_all(void *, int);
 static void	carp_demote_adj(int, char *);
 
 static LIST_HEAD(, carp_softc) carp_list;
 static struct mtx carp_mtx;
 static struct task carp_sendall_task =
     TASK_INITIALIZER(0, carp_send_ad_all, NULL);
 
 static void
 carp_hmac_prepare(struct carp_softc *sc)
 {
 	uint8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT;
 	uint8_t vhid = sc->sc_vhid & 0xff;
 	struct ifaddr *ifa;
 	int i, found;
 #ifdef INET
 	struct in_addr last, cur, in;
 #endif
 #ifdef INET6
 	struct in6_addr last6, cur6, in6;
 #endif
 
 	CARP_LOCK_ASSERT(sc);
 
 	/* Compute ipad from key. */
 	bzero(sc->sc_pad, sizeof(sc->sc_pad));
 	bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key));
 	for (i = 0; i < sizeof(sc->sc_pad); i++)
 		sc->sc_pad[i] ^= 0x36;
 
 	/* Precompute first part of inner hash. */
 	SHA1Init(&sc->sc_sha1);
 	SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad));
 	SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version));
 	SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type));
 	SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid));
 #ifdef INET
 	cur.s_addr = 0;
 	do {
 		found = 0;
 		last = cur;
 		cur.s_addr = 0xffffffff;
 		CARP_FOREACH_IFA(sc, ifa) {
 			in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr;
 			if (ifa->ifa_addr->sa_family == AF_INET &&
 			    ntohl(in.s_addr) > ntohl(last.s_addr) &&
 			    ntohl(in.s_addr) < ntohl(cur.s_addr)) {
 				cur.s_addr = in.s_addr;
 				found++;
 			}
 		}
 		if (found)
 			SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur));
 	} while (found);
 #endif /* INET */
 #ifdef INET6
 	memset(&cur6, 0, sizeof(cur6));
 	do {
 		found = 0;
 		last6 = cur6;
 		memset(&cur6, 0xff, sizeof(cur6));
 		CARP_FOREACH_IFA(sc, ifa) {
 			in6 = ifatoia6(ifa)->ia_addr.sin6_addr;
 			if (IN6_IS_SCOPE_EMBED(&in6))
 				in6.s6_addr16[1] = 0;
 			if (ifa->ifa_addr->sa_family == AF_INET6 &&
 			    memcmp(&in6, &last6, sizeof(in6)) > 0 &&
 			    memcmp(&in6, &cur6, sizeof(in6)) < 0) {
 				cur6 = in6;
 				found++;
 			}
 		}
 		if (found)
 			SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6));
 	} while (found);
 #endif /* INET6 */
 
 	/* convert ipad to opad */
 	for (i = 0; i < sizeof(sc->sc_pad); i++)
 		sc->sc_pad[i] ^= 0x36 ^ 0x5c;
 }
 
 static void
 carp_hmac_generate(struct carp_softc *sc, uint32_t counter[2],
     unsigned char md[20])
 {
 	SHA1_CTX sha1ctx;
 
 	CARP_LOCK_ASSERT(sc);
 
 	/* fetch first half of inner hash */
 	bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx));
 
 	SHA1Update(&sha1ctx, (void *)counter, sizeof(sc->sc_counter));
 	SHA1Final(md, &sha1ctx);
 
 	/* outer hash */
 	SHA1Init(&sha1ctx);
 	SHA1Update(&sha1ctx, sc->sc_pad, sizeof(sc->sc_pad));
 	SHA1Update(&sha1ctx, md, 20);
 	SHA1Final(md, &sha1ctx);
 }
 
 static int
 carp_hmac_verify(struct carp_softc *sc, uint32_t counter[2],
     unsigned char md[20])
 {
 	unsigned char md2[20];
 
 	CARP_LOCK_ASSERT(sc);
 
 	carp_hmac_generate(sc, counter, md2);
 
 	return (bcmp(md, md2, sizeof(md2)));
 }
 
 /*
  * process input packet.
  * we have rearranged checks order compared to the rfc,
  * but it seems more efficient this way or not possible otherwise.
  */
 #ifdef INET
 int
 carp_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m = *mp;
 	struct ip *ip = mtod(m, struct ip *);
 	struct carp_header *ch;
 	int iplen, len;
 
 	iplen = *offp;
 	*mp = NULL;
 
 	CARPSTATS_INC(carps_ipackets);
 
 	if (!V_carp_allow) {
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/* verify that the IP TTL is 255.  */
 	if (ip->ip_ttl != CARP_DFLTTL) {
 		CARPSTATS_INC(carps_badttl);
 		CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__,
 		    ip->ip_ttl,
 		    m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	iplen = ip->ip_hl << 2;
 
 	if (m->m_pkthdr.len < iplen + sizeof(*ch)) {
 		CARPSTATS_INC(carps_badlen);
 		CARP_DEBUG("%s: received len %zd < sizeof(struct carp_header) "
 		    "on %s\n", __func__, m->m_len - sizeof(struct ip),
 		    m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	if (iplen + sizeof(*ch) < m->m_len) {
 		if ((m = m_pullup(m, iplen + sizeof(*ch))) == NULL) {
 			CARPSTATS_INC(carps_hdrops);
 			CARP_DEBUG("%s: pullup failed\n", __func__);
 			return (IPPROTO_DONE);
 		}
 		ip = mtod(m, struct ip *);
 	}
 	ch = (struct carp_header *)((char *)ip + iplen);
 
 	/*
 	 * verify that the received packet length is
 	 * equal to the CARP header
 	 */
 	len = iplen + sizeof(*ch);
 	if (len > m->m_pkthdr.len) {
 		CARPSTATS_INC(carps_badlen);
 		CARP_DEBUG("%s: packet too short %d on %s\n", __func__,
 		    m->m_pkthdr.len,
 		    m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	if ((m = m_pullup(m, len)) == NULL) {
 		CARPSTATS_INC(carps_hdrops);
 		return (IPPROTO_DONE);
 	}
 	ip = mtod(m, struct ip *);
 	ch = (struct carp_header *)((char *)ip + iplen);
 
 	/* verify the CARP checksum */
 	m->m_data += iplen;
 	if (in_cksum(m, len - iplen)) {
 		CARPSTATS_INC(carps_badsum);
 		CARP_DEBUG("%s: checksum failed on %s\n", __func__,
 		    m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 	m->m_data -= iplen;
 
 	carp_input_c(m, ch, AF_INET);
 	return (IPPROTO_DONE);
 }
 #endif
 
 #ifdef INET6
 int
 carp6_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m = *mp;
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	struct carp_header *ch;
 	u_int len;
 
 	CARPSTATS_INC(carps_ipackets6);
 
 	if (!V_carp_allow) {
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/* check if received on a valid carp interface */
 	if (m->m_pkthdr.rcvif->if_carp == NULL) {
 		CARPSTATS_INC(carps_badif);
 		CARP_DEBUG("%s: packet received on non-carp interface: %s\n",
 		    __func__, m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/* verify that the IP TTL is 255 */
 	if (ip6->ip6_hlim != CARP_DFLTTL) {
 		CARPSTATS_INC(carps_badttl);
 		CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__,
 		    ip6->ip6_hlim, m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/* verify that we have a complete carp packet */
 	len = m->m_len;
 	IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch));
 	if (ch == NULL) {
 		CARPSTATS_INC(carps_badlen);
 		CARP_DEBUG("%s: packet size %u too small\n", __func__, len);
 		return (IPPROTO_DONE);
 	}
 
 
 	/* verify the CARP checksum */
 	m->m_data += *offp;
 	if (in_cksum(m, sizeof(*ch))) {
 		CARPSTATS_INC(carps_badsum);
 		CARP_DEBUG("%s: checksum failed, on %s\n", __func__,
 		    m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 	m->m_data -= *offp;
 
 	carp_input_c(m, ch, AF_INET6);
 	return (IPPROTO_DONE);
 }
 #endif /* INET6 */
 
 static void
 carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af)
 {
 	struct ifnet *ifp = m->m_pkthdr.rcvif;
 	struct ifaddr *ifa;
 	struct carp_softc *sc;
 	uint64_t tmp_counter;
 	struct timeval sc_tv, ch_tv;
 
 	/* verify that the VHID is valid on the receiving interface */
 	IF_ADDR_RLOCK(ifp);
 	IFNET_FOREACH_IFA(ifp, ifa)
 		if (ifa->ifa_addr->sa_family == af &&
 		    ifa->ifa_carp->sc_vhid == ch->carp_vhid) {
 			ifa_ref(ifa);
 			break;
 		}
 	IF_ADDR_RUNLOCK(ifp);
 
 	if (ifa == NULL) {
 		CARPSTATS_INC(carps_badvhid);
 		m_freem(m);
 		return;
 	}
 
 	/* verify the CARP version. */
 	if (ch->carp_version != CARP_VERSION) {
 		CARPSTATS_INC(carps_badver);
 		CARP_DEBUG("%s: invalid version %d\n", ifp->if_xname,
 		    ch->carp_version);
 		ifa_free(ifa);
 		m_freem(m);
 		return;
 	}
 
 	sc = ifa->ifa_carp;
 	CARP_LOCK(sc);
 	ifa_free(ifa);
 
 	if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) {
 		CARPSTATS_INC(carps_badauth);
 		CARP_DEBUG("%s: incorrect hash for VHID %u@%s\n", __func__,
 		    sc->sc_vhid, ifp->if_xname);
 		goto out;
 	}
 
 	tmp_counter = ntohl(ch->carp_counter[0]);
 	tmp_counter = tmp_counter<<32;
 	tmp_counter += ntohl(ch->carp_counter[1]);
 
 	/* XXX Replay protection goes here */
 
 	sc->sc_init_counter = 0;
 	sc->sc_counter = tmp_counter;
 
 	sc_tv.tv_sec = sc->sc_advbase;
 	sc_tv.tv_usec = DEMOTE_ADVSKEW(sc) * 1000000 / 256;
 	ch_tv.tv_sec = ch->carp_advbase;
 	ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256;
 
 	switch (sc->sc_state) {
 	case INIT:
 		break;
 	case MASTER:
 		/*
 		 * If we receive an advertisement from a master who's going to
 		 * be more frequent than us, go into BACKUP state.
 		 */
 		if (timevalcmp(&sc_tv, &ch_tv, >) ||
 		    timevalcmp(&sc_tv, &ch_tv, ==)) {
 			callout_stop(&sc->sc_ad_tmo);
 			CARP_LOG("VHID %u@%s: MASTER -> BACKUP "
 			    "(more frequent advertisement received)\n",
 			    sc->sc_vhid,
 			    sc->sc_carpdev->if_xname);
 			carp_set_state(sc, BACKUP);
 			carp_setrun(sc, 0);
 			carp_delroute(sc);
 		}
 		break;
 	case BACKUP:
 		/*
 		 * If we're pre-empting masters who advertise slower than us,
 		 * and this one claims to be slower, treat him as down.
 		 */
 		if (V_carp_preempt && timevalcmp(&sc_tv, &ch_tv, <)) {
 			CARP_LOG("VHID %u@%s: BACKUP -> MASTER "
 			    "(preempting a slower master)\n",
 			    sc->sc_vhid,
 			    sc->sc_carpdev->if_xname);
 			carp_master_down_locked(sc);
 			break;
 		}
 
 		/*
 		 *  If the master is going to advertise at such a low frequency
 		 *  that he's guaranteed to time out, we'd might as well just
 		 *  treat him as timed out now.
 		 */
 		sc_tv.tv_sec = sc->sc_advbase * 3;
 		if (timevalcmp(&sc_tv, &ch_tv, <)) {
 			CARP_LOG("VHID %u@%s: BACKUP -> MASTER "
 			    "(master timed out)\n",
 			    sc->sc_vhid,
 			    sc->sc_carpdev->if_xname);
 			carp_master_down_locked(sc);
 			break;
 		}
 
 		/*
 		 * Otherwise, we reset the counter and wait for the next
 		 * advertisement.
 		 */
 		carp_setrun(sc, af);
 		break;
 	}
 
 out:
 	CARP_UNLOCK(sc);
 	m_freem(m);
 }
 
 static int
 carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch)
 {
 	struct m_tag *mtag;
 
 	if (sc->sc_init_counter) {
 		/* this could also be seconds since unix epoch */
 		sc->sc_counter = arc4random();
 		sc->sc_counter = sc->sc_counter << 32;
 		sc->sc_counter += arc4random();
 	} else
 		sc->sc_counter++;
 
 	ch->carp_counter[0] = htonl((sc->sc_counter>>32)&0xffffffff);
 	ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff);
 
 	carp_hmac_generate(sc, ch->carp_counter, ch->carp_md);
 
 	/* Tag packet for carp_output */
 	if ((mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *),
 	    M_NOWAIT)) == NULL) {
 		m_freem(m);
 		CARPSTATS_INC(carps_onomem);
 		return (ENOMEM);
 	}
 	bcopy(&sc, mtag + 1, sizeof(sc));
 	m_tag_prepend(m, mtag);
 
 	return (0);
 }
 
 /*
  * To avoid LORs and possible recursions this function shouldn't
  * be called directly, but scheduled via taskqueue.
  */
 static void
 carp_send_ad_all(void *ctx __unused, int pending __unused)
 {
 	struct carp_softc *sc;
 
 	mtx_lock(&carp_mtx);
 	LIST_FOREACH(sc, &carp_list, sc_next)
 		if (sc->sc_state == MASTER) {
 			CARP_LOCK(sc);
 			CURVNET_SET(sc->sc_carpdev->if_vnet);
 			carp_send_ad_locked(sc);
 			CURVNET_RESTORE();
 			CARP_UNLOCK(sc);
 		}
 	mtx_unlock(&carp_mtx);
 }
 
 /* Send a periodic advertisement, executed in callout context. */
 static void
 carp_send_ad(void *v)
 {
 	struct carp_softc *sc = v;
 
 	CARP_LOCK_ASSERT(sc);
 	CURVNET_SET(sc->sc_carpdev->if_vnet);
 	carp_send_ad_locked(sc);
 	CURVNET_RESTORE();
 	CARP_UNLOCK(sc);
 }
 
 static void
 carp_send_ad_error(struct carp_softc *sc, int error)
 {
 
 	if (error) {
 		if (sc->sc_sendad_errors < INT_MAX)
 			sc->sc_sendad_errors++;
 		if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
 			static const char fmt[] = "send error %d on %s";
 			char msg[sizeof(fmt) + IFNAMSIZ];
 
 			sprintf(msg, fmt, error, sc->sc_carpdev->if_xname);
 			carp_demote_adj(V_carp_senderr_adj, msg);
 		}
 		sc->sc_sendad_success = 0;
 	} else {
 		if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS &&
 		    ++sc->sc_sendad_success >= CARP_SENDAD_MIN_SUCCESS) {
 			static const char fmt[] = "send ok on %s";
 			char msg[sizeof(fmt) + IFNAMSIZ];
 
 			sprintf(msg, fmt, sc->sc_carpdev->if_xname);
 			carp_demote_adj(-V_carp_senderr_adj, msg);
 			sc->sc_sendad_errors = 0;
 		} else
 			sc->sc_sendad_errors = 0;
 	}
 }
 
 static void
 carp_send_ad_locked(struct carp_softc *sc)
 {
 	struct carp_header ch;
 	struct timeval tv;
 	struct sockaddr sa;
 	struct ifaddr *ifa;
 	struct carp_header *ch_ptr;
 	struct mbuf *m;
 	int len, advskew;
 
 	CARP_LOCK_ASSERT(sc);
 
 	advskew = DEMOTE_ADVSKEW(sc);
 	tv.tv_sec = sc->sc_advbase;
 	tv.tv_usec = advskew * 1000000 / 256;
 
 	ch.carp_version = CARP_VERSION;
 	ch.carp_type = CARP_ADVERTISEMENT;
 	ch.carp_vhid = sc->sc_vhid;
 	ch.carp_advbase = sc->sc_advbase;
 	ch.carp_advskew = advskew;
 	ch.carp_authlen = 7;	/* XXX DEFINE */
 	ch.carp_pad1 = 0;	/* must be zero */
 	ch.carp_cksum = 0;
 
 	/* XXXGL: OpenBSD picks first ifaddr with needed family. */
 
 #ifdef INET
 	if (sc->sc_naddrs) {
 		struct ip *ip;
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			CARPSTATS_INC(carps_onomem);
 			goto resched;
 		}
 		len = sizeof(*ip) + sizeof(ch);
 		m->m_pkthdr.len = len;
 		m->m_pkthdr.rcvif = NULL;
 		m->m_len = len;
 		MH_ALIGN(m, m->m_len);
 		m->m_flags |= M_MCAST;
 		ip = mtod(m, struct ip *);
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = sizeof(*ip) >> 2;
 		ip->ip_tos = IPTOS_LOWDELAY;
 		ip->ip_len = htons(len);
 		ip->ip_id = ip_newid();
 		ip->ip_off = htons(IP_DF);
 		ip->ip_ttl = CARP_DFLTTL;
 		ip->ip_p = IPPROTO_CARP;
 		ip->ip_sum = 0;
 
 		bzero(&sa, sizeof(sa));
 		sa.sa_family = AF_INET;
 		ifa = ifaof_ifpforaddr(&sa, sc->sc_carpdev);
 		if (ifa != NULL) {
 			ip->ip_src.s_addr =
 			    ifatoia(ifa)->ia_addr.sin_addr.s_addr;
 			ifa_free(ifa);
 		} else
 			ip->ip_src.s_addr = 0;
 		ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP);
 
 		ch_ptr = (struct carp_header *)(&ip[1]);
 		bcopy(&ch, ch_ptr, sizeof(ch));
 		if (carp_prepare_ad(m, sc, ch_ptr))
 			goto resched;
 
 		m->m_data += sizeof(*ip);
 		ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip));
 		m->m_data -= sizeof(*ip);
 
 		CARPSTATS_INC(carps_opackets);
 
 		carp_send_ad_error(sc, ip_output(m, NULL, NULL, IP_RAWOUTPUT,
 		    &sc->sc_carpdev->if_carp->cif_imo, NULL));
 	}
 #endif /* INET */
 #ifdef INET6
 	if (sc->sc_naddrs6) {
 		struct ip6_hdr *ip6;
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			CARPSTATS_INC(carps_onomem);
 			goto resched;
 		}
 		len = sizeof(*ip6) + sizeof(ch);
 		m->m_pkthdr.len = len;
 		m->m_pkthdr.rcvif = NULL;
 		m->m_len = len;
 		MH_ALIGN(m, m->m_len);
 		m->m_flags |= M_MCAST;
 		ip6 = mtod(m, struct ip6_hdr *);
 		bzero(ip6, sizeof(*ip6));
 		ip6->ip6_vfc |= IPV6_VERSION;
 		ip6->ip6_hlim = CARP_DFLTTL;
 		ip6->ip6_nxt = IPPROTO_CARP;
 		bzero(&sa, sizeof(sa));
 
 		/* set the source address */
 		sa.sa_family = AF_INET6;
 		ifa = ifaof_ifpforaddr(&sa, sc->sc_carpdev);
 		if (ifa != NULL) {
 			bcopy(IFA_IN6(ifa), &ip6->ip6_src,
 			    sizeof(struct in6_addr));
 			ifa_free(ifa);
 		} else
 			/* This should never happen with IPv6. */
 			bzero(&ip6->ip6_src, sizeof(struct in6_addr));
 
 		/* Set the multicast destination. */
 		ip6->ip6_dst.s6_addr16[0] = htons(0xff02);
 		ip6->ip6_dst.s6_addr8[15] = 0x12;
 		if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) {
 			m_freem(m);
 			CARP_DEBUG("%s: in6_setscope failed\n", __func__);
 			goto resched;
 		}
 
 		ch_ptr = (struct carp_header *)(&ip6[1]);
 		bcopy(&ch, ch_ptr, sizeof(ch));
 		if (carp_prepare_ad(m, sc, ch_ptr))
 			goto resched;
 
 		m->m_data += sizeof(*ip6);
 		ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip6));
 		m->m_data -= sizeof(*ip6);
 
 		CARPSTATS_INC(carps_opackets6);
 
 		carp_send_ad_error(sc, ip6_output(m, NULL, NULL, 0,
 		    &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL));
 	}
 #endif /* INET6 */
 
 resched:
 	callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc);
 }
 
 static void
 carp_addroute(struct carp_softc *sc)
 {
 	struct ifaddr *ifa;
 
 	CARP_FOREACH_IFA(sc, ifa)
 		carp_ifa_addroute(ifa);
 }
 
 static void
 carp_ifa_addroute(struct ifaddr *ifa)
 {
 
 	switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 	case AF_INET:
 		in_addprefix(ifatoia(ifa), RTF_UP);
 		ifa_add_loopback_route(ifa,
 		    (struct sockaddr *)&ifatoia(ifa)->ia_addr);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		ifa_add_loopback_route(ifa,
 		    (struct sockaddr *)&ifatoia6(ifa)->ia_addr);
 		nd6_add_ifa_lle(ifatoia6(ifa));
 		break;
 #endif
 	}
 }
 
 static void
 carp_delroute(struct carp_softc *sc)
 {
 	struct ifaddr *ifa;
 
 	CARP_FOREACH_IFA(sc, ifa)
 		carp_ifa_delroute(ifa);
 }
 
 static void
 carp_ifa_delroute(struct ifaddr *ifa)
 {
 
 	switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 	case AF_INET:
 		ifa_del_loopback_route(ifa,
 		    (struct sockaddr *)&ifatoia(ifa)->ia_addr);
 		in_scrubprefix(ifatoia(ifa), LLE_STATIC);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		ifa_del_loopback_route(ifa,
 		    (struct sockaddr *)&ifatoia6(ifa)->ia_addr);
 		nd6_rem_ifa_lle(ifatoia6(ifa));
 		break;
 #endif
 	}
 }
 
 int
 carp_master(struct ifaddr *ifa)
 {
 	struct carp_softc *sc = ifa->ifa_carp;
 
 	return (sc->sc_state == MASTER);
 }
 
 #ifdef INET
 /*
  * Broadcast a gratuitous ARP request containing
  * the virtual router MAC address for each IP address
  * associated with the virtual router.
  */
 static void
 carp_send_arp(struct carp_softc *sc)
 {
 	struct ifaddr *ifa;
 
 	CARP_FOREACH_IFA(sc, ifa)
 		if (ifa->ifa_addr->sa_family == AF_INET)
 			arp_ifinit2(sc->sc_carpdev, ifa, LLADDR(&sc->sc_addr));
 }
 
 int
 carp_iamatch(struct ifaddr *ifa, uint8_t **enaddr)
 {
 	struct carp_softc *sc = ifa->ifa_carp;
 
 	if (sc->sc_state == MASTER) {
 		*enaddr = LLADDR(&sc->sc_addr);
 		return (1);
 	}
 
 	return (0);
 }
 #endif
 
 #ifdef INET6
 static void
 carp_send_na(struct carp_softc *sc)
 {
 	static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
 	struct ifaddr *ifa;
 	struct in6_addr *in6;
 
 	CARP_FOREACH_IFA(sc, ifa) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 
 		in6 = IFA_IN6(ifa);
 		nd6_na_output(sc->sc_carpdev, &mcast, in6,
 		    ND_NA_FLAG_OVERRIDE, 1, NULL);
 		DELAY(1000);	/* XXX */
 	}
 }
 
 /*
  * Returns ifa in case it's a carp address and it is MASTER, or if the address
  * matches and is not a carp address.  Returns NULL otherwise.
  */
 struct ifaddr *
 carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr)
 {
 	struct ifaddr *ifa;
 
 	ifa = NULL;
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		if (!IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa)))
 			continue;
 		if (ifa->ifa_carp && ifa->ifa_carp->sc_state != MASTER)
 			ifa = NULL;
 		else
 			ifa_ref(ifa);
 		break;
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	return (ifa);
 }
 
 caddr_t
 carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr)
 {
 	struct ifaddr *ifa;
 
 	IF_ADDR_RLOCK(ifp);
 	IFNET_FOREACH_IFA(ifp, ifa)
 		if (ifa->ifa_addr->sa_family == AF_INET6 &&
 		    IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) {
 			struct carp_softc *sc = ifa->ifa_carp;
 			struct m_tag *mtag;
 
 			IF_ADDR_RUNLOCK(ifp);
 
 			mtag = m_tag_get(PACKET_TAG_CARP,
 			    sizeof(struct carp_softc *), M_NOWAIT);
 			if (mtag == NULL)
 				/* Better a bit than nothing. */
 				return (LLADDR(&sc->sc_addr));
 
 			bcopy(&sc, mtag + 1, sizeof(sc));
 			m_tag_prepend(m, mtag);
 
 			return (LLADDR(&sc->sc_addr));
 		}
 	IF_ADDR_RUNLOCK(ifp);
 
 	return (NULL);
 }
 #endif /* INET6 */
 
 int
 carp_forus(struct ifnet *ifp, u_char *dhost)
 {
 	struct carp_softc *sc;
 	uint8_t *ena = dhost;
 
 	if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1)
 		return (0);
 
 	CIF_LOCK(ifp->if_carp);
 	IFNET_FOREACH_CARP(ifp, sc) {
 		CARP_LOCK(sc);
 		if (sc->sc_state == MASTER && !bcmp(dhost, LLADDR(&sc->sc_addr),
 		    ETHER_ADDR_LEN)) {
 			CARP_UNLOCK(sc);
 			CIF_UNLOCK(ifp->if_carp);
 			return (1);
 		}
 		CARP_UNLOCK(sc);
 	}
 	CIF_UNLOCK(ifp->if_carp);
 
 	return (0);
 }
 
 /* Master down timeout event, executed in callout context. */
 static void
 carp_master_down(void *v)
 {
 	struct carp_softc *sc = v;
 
 	CARP_LOCK_ASSERT(sc);
 
 	CURVNET_SET(sc->sc_carpdev->if_vnet);
 	if (sc->sc_state == BACKUP) {
 		CARP_LOG("VHID %u@%s: BACKUP -> MASTER (master down)\n",
 		    sc->sc_vhid,
 		    sc->sc_carpdev->if_xname);
 		carp_master_down_locked(sc);
 	}
 	CURVNET_RESTORE();
 
 	CARP_UNLOCK(sc);
 }
 
 static void
 carp_master_down_locked(struct carp_softc *sc)
 {
 
 	CARP_LOCK_ASSERT(sc);
 
 	switch (sc->sc_state) {
 	case BACKUP:
 		carp_set_state(sc, MASTER);
 		carp_send_ad_locked(sc);
 #ifdef INET
 		carp_send_arp(sc);
 #endif
 #ifdef INET6
 		carp_send_na(sc);
 #endif
 		carp_setrun(sc, 0);
 		carp_addroute(sc);
 		break;
 	case INIT:
 	case MASTER:
 #ifdef INVARIANTS
 		panic("carp: VHID %u@%s: master_down event in %s state\n",
 		    sc->sc_vhid,
 		    sc->sc_carpdev->if_xname,
 		    sc->sc_state ? "MASTER" : "INIT");
 #endif
 		break;
 	}
 }
 
 /*
  * When in backup state, af indicates whether to reset the master down timer
  * for v4 or v6. If it's set to zero, reset the ones which are already pending.
  */
 static void
 carp_setrun(struct carp_softc *sc, sa_family_t af)
 {
 	struct timeval tv;
 
 	CARP_LOCK_ASSERT(sc);
 
 	if ((sc->sc_carpdev->if_flags & IFF_UP) == 0 ||
 	    sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
 	    (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0))
 		return;
 
 	switch (sc->sc_state) {
 	case INIT:
 		CARP_LOG("VHID %u@%s: INIT -> BACKUP\n",
 		    sc->sc_vhid,
 		    sc->sc_carpdev->if_xname);
 		carp_set_state(sc, BACKUP);
 		carp_setrun(sc, 0);
 		break;
 	case BACKUP:
 		callout_stop(&sc->sc_ad_tmo);
 		tv.tv_sec = 3 * sc->sc_advbase;
 		tv.tv_usec = sc->sc_advskew * 1000000 / 256;
 		switch (af) {
 #ifdef INET
 		case AF_INET:
 			callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
 			    carp_master_down, sc);
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
 			    carp_master_down, sc);
 			break;
 #endif
 		default:
 #ifdef INET
 			if (sc->sc_naddrs)
 				callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
 				    carp_master_down, sc);
 #endif
 #ifdef INET6
 			if (sc->sc_naddrs6)
 				callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
 				    carp_master_down, sc);
 #endif
 			break;
 		}
 		break;
 	case MASTER:
 		tv.tv_sec = sc->sc_advbase;
 		tv.tv_usec = sc->sc_advskew * 1000000 / 256;
 		callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
 		    carp_send_ad, sc);
 		break;
 	}
 }
 
 /*
  * Setup multicast structures.
  */
 static int
 carp_multicast_setup(struct carp_if *cif, sa_family_t sa)
 {
 	struct ifnet *ifp = cif->cif_ifp;
 	int error = 0;
 
 	CIF_LOCK_ASSERT(cif);
 
 	switch (sa) {
 #ifdef INET
 	case AF_INET:
 	    {
 		struct ip_moptions *imo = &cif->cif_imo;
 		struct in_addr addr;
 
 		if (imo->imo_membership)
 			return (0);
 
 		imo->imo_membership = (struct in_multi **)malloc(
 		    (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP,
 		    M_NOWAIT);
 		if (imo->imo_membership == NULL)
 			return (ENOMEM);
 		imo->imo_mfilters = NULL;
 		imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
 		imo->imo_multicast_vif = -1;
 
 		addr.s_addr = htonl(INADDR_CARP_GROUP);
 		if ((error = in_joingroup(ifp, &addr, NULL,
 		    &imo->imo_membership[0])) != 0) {
 			free(imo->imo_membership, M_CARP);
 			break;
 		}
 		imo->imo_num_memberships++;
 		imo->imo_multicast_ifp = ifp;
 		imo->imo_multicast_ttl = CARP_DFLTTL;
 		imo->imo_multicast_loop = 0;
 		break;
 	   }
 #endif
 #ifdef INET6
 	case AF_INET6:
 	    {
 		struct ip6_moptions *im6o = &cif->cif_im6o;
 		struct in6_addr in6;
 		struct in6_multi *in6m;
 
 		if (im6o->im6o_membership)
 			return (0);
 
 		im6o->im6o_membership = (struct in6_multi **)malloc(
 		    (sizeof(struct in6_multi *) * IPV6_MIN_MEMBERSHIPS), M_CARP,
 		    M_ZERO | M_NOWAIT);
 		if (im6o->im6o_membership == NULL)
 			return (ENOMEM);
 		im6o->im6o_mfilters = NULL;
 		im6o->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS;
 		im6o->im6o_multicast_hlim = CARP_DFLTTL;
 		im6o->im6o_multicast_ifp = ifp;
 
 		/* Join IPv6 CARP multicast group. */
 		bzero(&in6, sizeof(in6));
 		in6.s6_addr16[0] = htons(0xff02);
 		in6.s6_addr8[15] = 0x12;
 		if ((error = in6_setscope(&in6, ifp, NULL)) != 0) {
 			free(im6o->im6o_membership, M_CARP);
 			break;
 		}
 		in6m = NULL;
 		if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) {
 			free(im6o->im6o_membership, M_CARP);
 			break;
 		}
 		im6o->im6o_membership[0] = in6m;
 		im6o->im6o_num_memberships++;
 
 		/* Join solicited multicast address. */
 		bzero(&in6, sizeof(in6));
 		in6.s6_addr16[0] = htons(0xff02);
 		in6.s6_addr32[1] = 0;
 		in6.s6_addr32[2] = htonl(1);
 		in6.s6_addr32[3] = 0;
 		in6.s6_addr8[12] = 0xff;
 		if ((error = in6_setscope(&in6, ifp, NULL)) != 0) {
 			in6_mc_leave(im6o->im6o_membership[0], NULL);
 			free(im6o->im6o_membership, M_CARP);
 			break;
 		}
 		in6m = NULL;
 		if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) {
 			in6_mc_leave(im6o->im6o_membership[0], NULL);
 			free(im6o->im6o_membership, M_CARP);
 			break;
 		}
 		im6o->im6o_membership[1] = in6m;
 		im6o->im6o_num_memberships++;
 		break;
 	    }
 #endif
 	}
 
 	return (error);
 }
 
 /*
  * Free multicast structures.
  */
 static void
 carp_multicast_cleanup(struct carp_if *cif, sa_family_t sa)
 {
 
 	CIF_LOCK_ASSERT(cif);
 	switch (sa) {
 #ifdef INET
 	case AF_INET:
 		if (cif->cif_naddrs == 0) {
 			struct ip_moptions *imo = &cif->cif_imo;
 
 			in_leavegroup(imo->imo_membership[0], NULL);
 			KASSERT(imo->imo_mfilters == NULL,
 			    ("%s: imo_mfilters != NULL", __func__));
 			free(imo->imo_membership, M_CARP);
 			imo->imo_membership = NULL;
 
 		}
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		if (cif->cif_naddrs6 == 0) {
 			struct ip6_moptions *im6o = &cif->cif_im6o;
 
 			in6_mc_leave(im6o->im6o_membership[0], NULL);
 			in6_mc_leave(im6o->im6o_membership[1], NULL);
 			KASSERT(im6o->im6o_mfilters == NULL,
 			    ("%s: im6o_mfilters != NULL", __func__));
 			free(im6o->im6o_membership, M_CARP);
 			im6o->im6o_membership = NULL;
 		}
 		break;
 #endif
 	}
 }
 
 int
 carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa)
 {
 	struct m_tag *mtag;
 	struct carp_softc *sc;
 
 	if (!sa)
 		return (0);
 
 	switch (sa->sa_family) {
 #ifdef INET
 	case AF_INET:
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		break;
 #endif
 	default:
 		return (0);
 	}
 
 	mtag = m_tag_find(m, PACKET_TAG_CARP, NULL);
 	if (mtag == NULL)
 		return (0);
 
 	bcopy(mtag + 1, &sc, sizeof(sc));
 
 	/* Set the source MAC address to the Virtual Router MAC Address. */
 	switch (ifp->if_type) {
 	case IFT_ETHER:
 	case IFT_BRIDGE:
 	case IFT_L2VLAN: {
 			struct ether_header *eh;
 
 			eh = mtod(m, struct ether_header *);
 			eh->ether_shost[0] = 0;
 			eh->ether_shost[1] = 0;
 			eh->ether_shost[2] = 0x5e;
 			eh->ether_shost[3] = 0;
 			eh->ether_shost[4] = 1;
 			eh->ether_shost[5] = sc->sc_vhid;
 		}
 		break;
 	case IFT_FDDI: {
 			struct fddi_header *fh;
 
 			fh = mtod(m, struct fddi_header *);
 			fh->fddi_shost[0] = 0;
 			fh->fddi_shost[1] = 0;
 			fh->fddi_shost[2] = 0x5e;
 			fh->fddi_shost[3] = 0;
 			fh->fddi_shost[4] = 1;
 			fh->fddi_shost[5] = sc->sc_vhid;
 		}
 		break;
 	case IFT_ISO88025: {
  			struct iso88025_header *th;
  			th = mtod(m, struct iso88025_header *);
 			th->iso88025_shost[0] = 3;
 			th->iso88025_shost[1] = 0;
 			th->iso88025_shost[2] = 0x40 >> (sc->sc_vhid - 1);
 			th->iso88025_shost[3] = 0x40000 >> (sc->sc_vhid - 1);
 			th->iso88025_shost[4] = 0;
 			th->iso88025_shost[5] = 0;
 		}
 		break;
 	default:
 		printf("%s: carp is not supported for the %d interface type\n",
 		    ifp->if_xname, ifp->if_type);
 		return (EOPNOTSUPP);
 	}
 
 	return (0);
 }
 
 static struct carp_softc*
 carp_alloc(struct ifnet *ifp)
 {
 	struct carp_softc *sc;
 	struct carp_if *cif;
 
 	if ((cif = ifp->if_carp) == NULL)
 		cif = carp_alloc_if(ifp);
 
 	sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO);
 
 	sc->sc_advbase = CARP_DFLTINTV;
 	sc->sc_vhid = -1;	/* required setting */
 	sc->sc_init_counter = 1;
 	sc->sc_state = INIT;
 
 	sc->sc_ifasiz = sizeof(struct ifaddr *);
 	sc->sc_ifas = malloc(sc->sc_ifasiz, M_CARP, M_WAITOK|M_ZERO);
 	sc->sc_carpdev = ifp;
 
 	CARP_LOCK_INIT(sc);
 #ifdef INET
 	callout_init_mtx(&sc->sc_md_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
 #endif
 #ifdef INET6
 	callout_init_mtx(&sc->sc_md6_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
 #endif
 	callout_init_mtx(&sc->sc_ad_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
 
 	CIF_LOCK(cif);
 	TAILQ_INSERT_TAIL(&cif->cif_vrs, sc, sc_list);
 	CIF_UNLOCK(cif);
 
 	mtx_lock(&carp_mtx);
 	LIST_INSERT_HEAD(&carp_list, sc, sc_next);
 	mtx_unlock(&carp_mtx);
 
 	return (sc);
 }
 
 static int
 carp_grow_ifas(struct carp_softc *sc)
 {
 	struct ifaddr **new;
 
 	CARP_LOCK_ASSERT(sc);
 
 	new = malloc(sc->sc_ifasiz * 2, M_CARP, M_NOWAIT|M_ZERO);
 	if (new == NULL)
 		return (ENOMEM);
 	bcopy(sc->sc_ifas, new, sc->sc_ifasiz);
 	free(sc->sc_ifas, M_CARP);
 	sc->sc_ifas = new;
 	sc->sc_ifasiz *= 2;
 
 	return (0);
 }
 
 static void
 carp_destroy(struct carp_softc *sc)
 {
 	struct ifnet *ifp = sc->sc_carpdev;
 	struct carp_if *cif = ifp->if_carp;
 
 	CIF_LOCK_ASSERT(cif);
 
 	TAILQ_REMOVE(&cif->cif_vrs, sc, sc_list);
 
 	mtx_lock(&carp_mtx);
 	LIST_REMOVE(sc, sc_next);
 	mtx_unlock(&carp_mtx);
 
 	CARP_LOCK(sc);
 	if (sc->sc_suppress)
 		carp_demote_adj(-V_carp_ifdown_adj, "vhid removed");
 	callout_drain(&sc->sc_ad_tmo);
 #ifdef INET
 	callout_drain(&sc->sc_md_tmo);
 #endif
 #ifdef INET6
 	callout_drain(&sc->sc_md6_tmo);
 #endif
 	CARP_LOCK_DESTROY(sc);
 
 	free(sc->sc_ifas, M_CARP);
 	free(sc, M_CARP);
 }
 
 static struct carp_if*
 carp_alloc_if(struct ifnet *ifp)
 {
 	struct carp_if *cif;
 	int error;
 
 	cif = malloc(sizeof(*cif), M_CARP, M_WAITOK|M_ZERO);
 
 	if ((error = ifpromisc(ifp, 1)) != 0)
 		printf("%s: ifpromisc(%s) failed: %d\n",
 		    __func__, ifp->if_xname, error);
 	else
 		cif->cif_flags |= CIF_PROMISC;
 
 	CIF_LOCK_INIT(cif);
 	cif->cif_ifp = ifp;
 	TAILQ_INIT(&cif->cif_vrs);
 
 	IF_ADDR_WLOCK(ifp);
 	ifp->if_carp = cif;
 	if_ref(ifp);
 	IF_ADDR_WUNLOCK(ifp);
 
 	return (cif);
 }
 
 static void
 carp_free_if(struct carp_if *cif)
 {
 	struct ifnet *ifp = cif->cif_ifp;
 
 	CIF_LOCK_ASSERT(cif);
 	KASSERT(TAILQ_EMPTY(&cif->cif_vrs), ("%s: softc list not empty",
 	    __func__));
 
 	IF_ADDR_WLOCK(ifp);
 	ifp->if_carp = NULL;
 	IF_ADDR_WUNLOCK(ifp);
 
 	CIF_LOCK_DESTROY(cif);
 
 	if (cif->cif_flags & CIF_PROMISC)
 		ifpromisc(ifp, 0);
 	if_rele(ifp);
 
 	free(cif, M_CARP);
 }
 
 static void
 carp_carprcp(struct carpreq *carpr, struct carp_softc *sc, int priv)
 {
 
 	CARP_LOCK(sc);
 	carpr->carpr_state = sc->sc_state;
 	carpr->carpr_vhid = sc->sc_vhid;
 	carpr->carpr_advbase = sc->sc_advbase;
 	carpr->carpr_advskew = sc->sc_advskew;
 	if (priv)
 		bcopy(sc->sc_key, carpr->carpr_key, sizeof(carpr->carpr_key));
 	else
 		bzero(carpr->carpr_key, sizeof(carpr->carpr_key));
 	CARP_UNLOCK(sc);
 }
 
 int
 carp_ioctl(struct ifreq *ifr, u_long cmd, struct thread *td)
 {
 	struct carpreq carpr;
 	struct ifnet *ifp;
 	struct carp_softc *sc = NULL;
 	int error = 0, locked = 0;
 
 	if ((error = copyin(ifr->ifr_data, &carpr, sizeof carpr)))
 		return (error);
 
 	ifp = ifunit_ref(ifr->ifr_name);
 	if (ifp == NULL)
 		return (ENXIO);
 
 	switch (ifp->if_type) {
 	case IFT_ETHER:
 	case IFT_L2VLAN:
 	case IFT_BRIDGE:
 	case IFT_FDDI:
 	case IFT_ISO88025:
 		break;
 	default:
 		error = EOPNOTSUPP;
 		goto out;
 	}
 
 	if ((ifp->if_flags & IFF_MULTICAST) == 0) {
 		error = EADDRNOTAVAIL;
 		goto out;
 	}
 
 	switch (cmd) {
 	case SIOCSVH:
 		if ((error = priv_check(td, PRIV_NETINET_CARP)))
 			break;
 		if (carpr.carpr_vhid <= 0 || carpr.carpr_vhid > CARP_MAXVHID ||
 		    carpr.carpr_advbase < 0 || carpr.carpr_advskew < 0) {
 			error = EINVAL;
 			break;
 		}
 
 		if (ifp->if_carp) {
 			CIF_LOCK(ifp->if_carp);
 			IFNET_FOREACH_CARP(ifp, sc)
 				if (sc->sc_vhid == carpr.carpr_vhid)
 					break;
 			CIF_UNLOCK(ifp->if_carp);
 		}
 		if (sc == NULL) {
 			sc = carp_alloc(ifp);
 			CARP_LOCK(sc);
 			sc->sc_vhid = carpr.carpr_vhid;
 			LLADDR(&sc->sc_addr)[0] = 0;
 			LLADDR(&sc->sc_addr)[1] = 0;
 			LLADDR(&sc->sc_addr)[2] = 0x5e;
 			LLADDR(&sc->sc_addr)[3] = 0;
 			LLADDR(&sc->sc_addr)[4] = 1;
 			LLADDR(&sc->sc_addr)[5] = sc->sc_vhid;
 		} else
 			CARP_LOCK(sc);
 		locked = 1;
 		if (carpr.carpr_advbase > 0) {
 			if (carpr.carpr_advbase > 255 ||
 			    carpr.carpr_advbase < CARP_DFLTINTV) {
 				error = EINVAL;
 				break;
 			}
 			sc->sc_advbase = carpr.carpr_advbase;
 		}
 		if (carpr.carpr_advskew > 0) {
 			if (carpr.carpr_advskew >= 255) {
 				error = EINVAL;
 				break;
 			}
 			sc->sc_advskew = carpr.carpr_advskew;
 		}
 		if (carpr.carpr_key[0] != '\0') {
 			bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key));
 			carp_hmac_prepare(sc);
 		}
 		if (sc->sc_state != INIT &&
 		    carpr.carpr_state != sc->sc_state) {
 			switch (carpr.carpr_state) {
 			case BACKUP:
 				callout_stop(&sc->sc_ad_tmo);
 				carp_set_state(sc, BACKUP);
 				carp_setrun(sc, 0);
 				carp_delroute(sc);
 				break;
 			case MASTER:
 				carp_master_down_locked(sc);
 				break;
 			default:
 				break;
 			}
 		}
 		break;
 
 	case SIOCGVH:
 	    {
 		int priveleged;
 
 		if (carpr.carpr_vhid < 0 || carpr.carpr_vhid > CARP_MAXVHID) {
 			error = EINVAL;
 			break;
 		}
 		if (carpr.carpr_count < 1) {
 			error = EMSGSIZE;
 			break;
 		}
 		if (ifp->if_carp == NULL) {
 			error = ENOENT;
 			break;
 		}
 
 		priveleged = (priv_check(td, PRIV_NETINET_CARP) == 0);
 		if (carpr.carpr_vhid != 0) {
 			CIF_LOCK(ifp->if_carp);
 			IFNET_FOREACH_CARP(ifp, sc)
 				if (sc->sc_vhid == carpr.carpr_vhid)
 					break;
 			CIF_UNLOCK(ifp->if_carp);
 			if (sc == NULL) {
 				error = ENOENT;
 				break;
 			}
 			carp_carprcp(&carpr, sc, priveleged);
 			error = copyout(&carpr, ifr->ifr_data, sizeof(carpr));
 		} else  {
 			int i, count;
 
 			count = 0;
 			CIF_LOCK(ifp->if_carp);
 			IFNET_FOREACH_CARP(ifp, sc)
 				count++;
 
 			if (count > carpr.carpr_count) {
 				CIF_UNLOCK(ifp->if_carp);
 				error = EMSGSIZE;
 				break;
 			}
 
 			i = 0;
 			IFNET_FOREACH_CARP(ifp, sc) {
 				carp_carprcp(&carpr, sc, priveleged);
 				carpr.carpr_count = count;
 				error = copyout(&carpr, ifr->ifr_data +
 				    (i * sizeof(carpr)), sizeof(carpr));
 				if (error) {
 					CIF_UNLOCK(ifp->if_carp);
 					break;
 				}
 				i++;
 			}
 			CIF_UNLOCK(ifp->if_carp);
 		}
 		break;
 	    }
 	default:
 		error = EINVAL;
 	}
 
 out:
 	if (locked)
 		CARP_UNLOCK(sc);
 	if_rele(ifp);
 
 	return (error);
 }
 
 static int
 carp_get_vhid(struct ifaddr *ifa)
 {
 
 	if (ifa == NULL || ifa->ifa_carp == NULL)
 		return (0);
 
 	return (ifa->ifa_carp->sc_vhid);
 }
 
 int
 carp_attach(struct ifaddr *ifa, int vhid)
 {
 	struct ifnet *ifp = ifa->ifa_ifp;
 	struct carp_if *cif = ifp->if_carp;
 	struct carp_softc *sc;
 	int index, error;
 
 	if (ifp->if_carp == NULL)
 		return (ENOPROTOOPT);
 
 	switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 	case AF_INET:
 #endif
 #ifdef INET6
 	case AF_INET6:
 #endif
 		break;
 	default:
 		return (EPROTOTYPE);
 	}
 
 	CIF_LOCK(cif);
 	IFNET_FOREACH_CARP(ifp, sc)
 		if (sc->sc_vhid == vhid)
 			break;
 	if (sc == NULL) {
 		CIF_UNLOCK(cif);
 		return (ENOENT);
 	}
 
 	if (ifa->ifa_carp) {
 		if (ifa->ifa_carp->sc_vhid != vhid)
 			carp_detach_locked(ifa);
 		else {
 			CIF_UNLOCK(cif);
 			return (0);
 		}
 	}
 
 	error = carp_multicast_setup(cif, ifa->ifa_addr->sa_family);
 	if (error) {
 		CIF_FREE(cif);
 		return (error);
 	}
 
 	CARP_LOCK(sc);
 	index = sc->sc_naddrs + sc->sc_naddrs6 + 1;
 	if (index > sc->sc_ifasiz / sizeof(struct ifaddr *))
 		if ((error = carp_grow_ifas(sc)) != 0) {
 			carp_multicast_cleanup(cif,
 			    ifa->ifa_addr->sa_family);
 			CARP_UNLOCK(sc);
 			CIF_FREE(cif);
 			return (error);
 		}
 
 	switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 	case AF_INET:
 		cif->cif_naddrs++;
 		sc->sc_naddrs++;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		cif->cif_naddrs6++;
 		sc->sc_naddrs6++;
 		break;
 #endif
 	}
 
 	ifa_ref(ifa);
 	sc->sc_ifas[index - 1] = ifa;
 	ifa->ifa_carp = sc;
 
 	carp_hmac_prepare(sc);
 	carp_sc_state(sc);
 
 	CARP_UNLOCK(sc);
 	CIF_UNLOCK(cif);
 
 	return (0);
 }
 
 void
 carp_detach(struct ifaddr *ifa)
 {
 	struct ifnet *ifp = ifa->ifa_ifp;
 	struct carp_if *cif = ifp->if_carp;
 
 	CIF_LOCK(cif);
 	carp_detach_locked(ifa);
 	CIF_FREE(cif);
 }
 
 static void
 carp_detach_locked(struct ifaddr *ifa)
 {
 	struct ifnet *ifp = ifa->ifa_ifp;
 	struct carp_if *cif = ifp->if_carp;
 	struct carp_softc *sc = ifa->ifa_carp;
 	int i, index;
 
 	KASSERT(sc != NULL, ("%s: %p not attached", __func__, ifa));
 
 	CIF_LOCK_ASSERT(cif);
 	CARP_LOCK(sc);
 
 	/* Shift array. */
 	index = sc->sc_naddrs + sc->sc_naddrs6;
 	for (i = 0; i < index; i++)
 		if (sc->sc_ifas[i] == ifa)
 			break;
 	KASSERT(i < index, ("%s: %p no backref", __func__, ifa));
 	for (; i < index - 1; i++)
 		sc->sc_ifas[i] = sc->sc_ifas[i+1];
 	sc->sc_ifas[index - 1] = NULL;
 
 	switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 	case AF_INET:
 		cif->cif_naddrs--;
 		sc->sc_naddrs--;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		cif->cif_naddrs6--;
 		sc->sc_naddrs6--;
 		break;
 #endif
 	}
 
 	carp_ifa_delroute(ifa);
 	carp_multicast_cleanup(cif, ifa->ifa_addr->sa_family);
 
 	ifa->ifa_carp = NULL;
 	ifa_free(ifa);
 
 	carp_hmac_prepare(sc);
 	carp_sc_state(sc);
 
 	if (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) {
 		CARP_UNLOCK(sc);
 		carp_destroy(sc);
 	} else
 		CARP_UNLOCK(sc);
 }
 
 static void
 carp_set_state(struct carp_softc *sc, int state)
 {
 
 	CARP_LOCK_ASSERT(sc);
 
 	if (sc->sc_state != state) {
 		const char *carp_states[] = { CARP_STATES };
 		char subsys[IFNAMSIZ+5];
 
 		sc->sc_state = state;
 
 		snprintf(subsys, IFNAMSIZ+5, "%u@%s", sc->sc_vhid,
 		    sc->sc_carpdev->if_xname);
 		devctl_notify("CARP", subsys, carp_states[state], NULL);
 	}
 }
 
 static void
 carp_linkstate(struct ifnet *ifp)
 {
 	struct carp_softc *sc;
 
 	CIF_LOCK(ifp->if_carp);
 	IFNET_FOREACH_CARP(ifp, sc) {
 		CARP_LOCK(sc);
 		carp_sc_state(sc);
 		CARP_UNLOCK(sc);
 	}
 	CIF_UNLOCK(ifp->if_carp);
 }
 
 static void
 carp_sc_state(struct carp_softc *sc)
 {
 
 	CARP_LOCK_ASSERT(sc);
 
 	if (sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
 	    !(sc->sc_carpdev->if_flags & IFF_UP)) {
 		callout_stop(&sc->sc_ad_tmo);
 #ifdef INET
 		callout_stop(&sc->sc_md_tmo);
 #endif
 #ifdef INET6
 		callout_stop(&sc->sc_md6_tmo);
 #endif
 		carp_set_state(sc, INIT);
 		carp_setrun(sc, 0);
 		if (!sc->sc_suppress)
 			carp_demote_adj(V_carp_ifdown_adj, "interface down");
 		sc->sc_suppress = 1;
 	} else {
 		carp_set_state(sc, INIT);
 		carp_setrun(sc, 0);
 		if (sc->sc_suppress)
 			carp_demote_adj(-V_carp_ifdown_adj, "interface up");
 		sc->sc_suppress = 0;
 	}
 }
 
 static void
 carp_demote_adj(int adj, char *reason)
 {
 	atomic_add_int(&V_carp_demotion, adj);
 	CARP_LOG("demoted by %d to %d (%s)\n", adj, V_carp_demotion, reason);
 	taskqueue_enqueue(taskqueue_swi, &carp_sendall_task);
 }
 
 static int
 carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	int new, error;
 
 	new = V_carp_demotion;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	carp_demote_adj(new, "sysctl");
 
 	return (0);
 }
 
 #ifdef INET
 extern  struct domain inetdomain;
 static struct protosw in_carp_protosw = {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inetdomain,
 	.pr_protocol =		IPPROTO_CARP,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_input =		carp_input,
 	.pr_output =		rip_output,
 	.pr_ctloutput =		rip_ctloutput,
 	.pr_usrreqs =		&rip_usrreqs
 };
 #endif
 
 #ifdef INET6
 extern	struct domain inet6domain;
 static struct protosw in6_carp_protosw = {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inet6domain,
 	.pr_protocol =		IPPROTO_CARP,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_input =		carp6_input,
 	.pr_output =		rip6_output,
 	.pr_ctloutput =		rip6_ctloutput,
 	.pr_usrreqs =		&rip6_usrreqs
 };
 #endif
 
 static void
 carp_mod_cleanup(void)
 {
 
 #ifdef INET
 	if (proto_reg[CARP_INET] == 0) {
 		(void)ipproto_unregister(IPPROTO_CARP);
 		pf_proto_unregister(PF_INET, IPPROTO_CARP, SOCK_RAW);
 		proto_reg[CARP_INET] = -1;
 	}
 	carp_iamatch_p = NULL;
 #endif
 #ifdef INET6
 	if (proto_reg[CARP_INET6] == 0) {
 		(void)ip6proto_unregister(IPPROTO_CARP);
 		pf_proto_unregister(PF_INET6, IPPROTO_CARP, SOCK_RAW);
 		proto_reg[CARP_INET6] = -1;
 	}
 	carp_iamatch6_p = NULL;
 	carp_macmatch6_p = NULL;
 #endif
 	carp_ioctl_p = NULL;
 	carp_attach_p = NULL;
 	carp_detach_p = NULL;
 	carp_get_vhid_p = NULL;
 	carp_linkstate_p = NULL;
 	carp_forus_p = NULL;
 	carp_output_p = NULL;
 	carp_demote_adj_p = NULL;
 	carp_master_p = NULL;
 	mtx_unlock(&carp_mtx);
 	taskqueue_drain(taskqueue_swi, &carp_sendall_task);
 	mtx_destroy(&carp_mtx);
 }
 
 static int
 carp_mod_load(void)
 {
 	int err;
 
 	mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF);
 	LIST_INIT(&carp_list);
 	carp_get_vhid_p = carp_get_vhid;
 	carp_forus_p = carp_forus;
 	carp_output_p = carp_output;
 	carp_linkstate_p = carp_linkstate;
 	carp_ioctl_p = carp_ioctl;
 	carp_attach_p = carp_attach;
 	carp_detach_p = carp_detach;
 	carp_demote_adj_p = carp_demote_adj;
 	carp_master_p = carp_master;
 #ifdef INET6
 	carp_iamatch6_p = carp_iamatch6;
 	carp_macmatch6_p = carp_macmatch6;
 	proto_reg[CARP_INET6] = pf_proto_register(PF_INET6,
 	    (struct protosw *)&in6_carp_protosw);
 	if (proto_reg[CARP_INET6]) {
 		printf("carp: error %d attaching to PF_INET6\n",
 		    proto_reg[CARP_INET6]);
 		carp_mod_cleanup();
 		return (proto_reg[CARP_INET6]);
 	}
 	err = ip6proto_register(IPPROTO_CARP);
 	if (err) {
 		printf("carp: error %d registering with INET6\n", err);
 		carp_mod_cleanup();
 		return (err);
 	}
 #endif
 #ifdef INET
 	carp_iamatch_p = carp_iamatch;
 	proto_reg[CARP_INET] = pf_proto_register(PF_INET, &in_carp_protosw);
 	if (proto_reg[CARP_INET]) {
 		printf("carp: error %d attaching to PF_INET\n",
 		    proto_reg[CARP_INET]);
 		carp_mod_cleanup();
 		return (proto_reg[CARP_INET]);
 	}
 	err = ipproto_register(IPPROTO_CARP);
 	if (err) {
 		printf("carp: error %d registering with INET\n", err);
 		carp_mod_cleanup();
 		return (err);
 	}
 #endif
 	return (0);
 }
 
 static int
 carp_modevent(module_t mod, int type, void *data)
 {
 	switch (type) {
 	case MOD_LOAD:
 		return carp_mod_load();
 		/* NOTREACHED */
 	case MOD_UNLOAD:
 		mtx_lock(&carp_mtx);
 		if (LIST_EMPTY(&carp_list))
 			carp_mod_cleanup();
 		else {
 			mtx_unlock(&carp_mtx);
 			return (EBUSY);
 		}
 		break;
 
 	default:
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 static moduledata_t carp_mod = {
 	"carp",
 	carp_modevent,
 	0
 };
 
 DECLARE_MODULE(carp, carp_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
Index: user/dchagin/lemul/sys/netinet/ip_fastfwd.c
===================================================================
--- user/dchagin/lemul/sys/netinet/ip_fastfwd.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/ip_fastfwd.c	(revision 274241)
@@ -1,599 +1,599 @@
 /*-
  * Copyright (c) 2003 Andre Oppermann, Internet Business Solutions AG
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * ip_fastforward gets its speed from processing the forwarded packet to
  * completion (if_output on the other side) without any queues or netisr's.
  * The receiving interface DMAs the packet into memory, the upper half of
  * driver calls ip_fastforward, we do our routing table lookup and directly
  * send it off to the outgoing interface, which DMAs the packet to the
  * network card. The only part of the packet we touch with the CPU is the
  * IP header (unless there are complex firewall rules touching other parts
  * of the packet, but that is up to you). We are essentially limited by bus
  * bandwidth and how fast the network card/driver can set up receives and
  * transmits.
  *
  * We handle basic errors, IP header errors, checksum errors,
  * destination unreachable, fragmentation and fragmentation needed and
  * report them via ICMP to the sender.
  *
  * Else if something is not pure IPv4 unicast forwarding we fall back to
  * the normal ip_input processing path. We should only be called from
  * interfaces connected to the outside world.
  *
  * Firewalling is fully supported including divert, ipfw fwd and ipfilter
  * ipnat and address rewrite.
  *
  * IPSEC is not supported if this host is a tunnel broker. IPSEC is
  * supported for connections to/from local host.
  *
  * We try to do the least expensive (in CPU ops) checks and operations
  * first to catch junk with as little overhead as possible.
  * 
  * We take full advantage of hardware support for IP checksum and
  * fragmentation offloading.
  *
  * We don't do ICMP redirect in the fast forwarding path. I have had my own
  * cases where two core routers with Zebra routing suite would send millions
  * ICMP redirects to connected hosts if the destination router was not the
  * default gateway. In one case it was filling the routing table of a host
  * with approximately 300.000 cloned redirect entries until it ran out of
  * kernel memory. However the networking code proved very robust and it didn't
  * crash or fail in other ways.
  */
 
 /*
  * Many thanks to Matt Thomas of NetBSD for basic structure of ip_flow.c which
  * is being followed here.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ipfw.h"
 #include "opt_ipstealth.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 
 #include <net/pfil.h>
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_options.h>
 
 #include <machine/in_cksum.h>
 
 static VNET_DEFINE(int, ipfastforward_active);
 #define	V_ipfastforward_active		VNET(ipfastforward_active)
 
-SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, fastforwarding, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip, OID_AUTO, fastforwarding, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ipfastforward_active), 0, "Enable fast IP forwarding");
 
 static struct sockaddr_in *
 ip_findroute(struct route *ro, struct in_addr dest, struct mbuf *m)
 {
 	struct sockaddr_in *dst;
 	struct rtentry *rt;
 
 	/*
 	 * Find route to destination.
 	 */
 	bzero(ro, sizeof(*ro));
 	dst = (struct sockaddr_in *)&ro->ro_dst;
 	dst->sin_family = AF_INET;
 	dst->sin_len = sizeof(*dst);
 	dst->sin_addr.s_addr = dest.s_addr;
 	in_rtalloc_ign(ro, 0, M_GETFIB(m));
 
 	/*
 	 * Route there and interface still up?
 	 */
 	rt = ro->ro_rt;
 	if (rt && (rt->rt_flags & RTF_UP) &&
 	    (rt->rt_ifp->if_flags & IFF_UP) &&
 	    (rt->rt_ifp->if_drv_flags & IFF_DRV_RUNNING)) {
 		if (rt->rt_flags & RTF_GATEWAY)
 			dst = (struct sockaddr_in *)rt->rt_gateway;
 	} else {
 		IPSTAT_INC(ips_noroute);
 		IPSTAT_INC(ips_cantforward);
 		if (rt)
 			RTFREE(rt);
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
 		return NULL;
 	}
 	return dst;
 }
 
 /*
  * Try to forward a packet based on the destination address.
  * This is a fast path optimized for the plain forwarding case.
  * If the packet is handled (and consumed) here then we return NULL;
  * otherwise mbuf is returned and the packet should be delivered
  * to ip_input for full processing.
  */
 struct mbuf *
 ip_fastforward(struct mbuf *m)
 {
 	struct ip *ip;
 	struct mbuf *m0 = NULL;
 	struct route ro;
 	struct sockaddr_in *dst = NULL;
 	struct ifnet *ifp;
 	struct in_addr odest, dest;
 	uint16_t sum, ip_len, ip_off;
 	int error = 0;
 	int hlen, mtu;
 	struct m_tag *fwd_tag = NULL;
 
 	/*
 	 * Are we active and forwarding packets?
 	 */
 	if (!V_ipfastforward_active || !V_ipforwarding)
 		return m;
 
 	M_ASSERTVALID(m);
 	M_ASSERTPKTHDR(m);
 
 	bzero(&ro, sizeof(ro));
 
 	/*
 	 * Step 1: check for packet drop conditions (and sanity checks)
 	 */
 
 	/*
 	 * Is entire packet big enough?
 	 */
 	if (m->m_pkthdr.len < sizeof(struct ip)) {
 		IPSTAT_INC(ips_tooshort);
 		goto drop;
 	}
 
 	/*
 	 * Is first mbuf large enough for ip header and is header present?
 	 */
 	if (m->m_len < sizeof (struct ip) &&
 	   (m = m_pullup(m, sizeof (struct ip))) == NULL) {
 		IPSTAT_INC(ips_toosmall);
 		return NULL;	/* mbuf already free'd */
 	}
 
 	ip = mtod(m, struct ip *);
 
 	/*
 	 * Is it IPv4?
 	 */
 	if (ip->ip_v != IPVERSION) {
 		IPSTAT_INC(ips_badvers);
 		goto drop;
 	}
 
 	/*
 	 * Is IP header length correct and is it in first mbuf?
 	 */
 	hlen = ip->ip_hl << 2;
 	if (hlen < sizeof(struct ip)) {	/* minimum header length */
 		IPSTAT_INC(ips_badhlen);
 		goto drop;
 	}
 	if (hlen > m->m_len) {
 		if ((m = m_pullup(m, hlen)) == NULL) {
 			IPSTAT_INC(ips_badhlen);
 			return NULL;	/* mbuf already free'd */
 		}
 		ip = mtod(m, struct ip *);
 	}
 
 	/*
 	 * Checksum correct?
 	 */
 	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED)
 		sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
 	else {
 		if (hlen == sizeof(struct ip))
 			sum = in_cksum_hdr(ip);
 		else
 			sum = in_cksum(m, hlen);
 	}
 	if (sum) {
 		IPSTAT_INC(ips_badsum);
 		goto drop;
 	}
 
 	/*
 	 * Remember that we have checked the IP header and found it valid.
 	 */
 	m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID);
 
 	ip_len = ntohs(ip->ip_len);
 
 	/*
 	 * Is IP length longer than packet we have got?
 	 */
 	if (m->m_pkthdr.len < ip_len) {
 		IPSTAT_INC(ips_tooshort);
 		goto drop;
 	}
 
 	/*
 	 * Is packet longer than IP header tells us? If yes, truncate packet.
 	 */
 	if (m->m_pkthdr.len > ip_len) {
 		if (m->m_len == m->m_pkthdr.len) {
 			m->m_len = ip_len;
 			m->m_pkthdr.len = ip_len;
 		} else
 			m_adj(m, ip_len - m->m_pkthdr.len);
 	}
 
 	/*
 	 * Is packet from or to 127/8?
 	 */
 	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
 	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
 		IPSTAT_INC(ips_badaddr);
 		goto drop;
 	}
 
 #ifdef ALTQ
 	/*
 	 * Is packet dropped by traffic conditioner?
 	 */
 	if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0)
 		goto drop;
 #endif
 
 	/*
 	 * Step 2: fallback conditions to normal ip_input path processing
 	 */
 
 	/*
 	 * Only IP packets without options
 	 */
 	if (ip->ip_hl != (sizeof(struct ip) >> 2)) {
 		if (V_ip_doopts == 1)
 			return m;
 		else if (V_ip_doopts == 2) {
 			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_FILTER_PROHIB,
 				0, 0);
 			return NULL;	/* mbuf already free'd */
 		}
 		/* else ignore IP options and continue */
 	}
 
 	/*
 	 * Only unicast IP, not from loopback, no L2 or IP broadcast,
 	 * no multicast, no INADDR_ANY
 	 *
 	 * XXX: Probably some of these checks could be direct drop
 	 * conditions.  However it is not clear whether there are some
 	 * hacks or obscure behaviours which make it neccessary to
 	 * let ip_input handle it.  We play safe here and let ip_input
 	 * deal with it until it is proven that we can directly drop it.
 	 */
 	if ((m->m_flags & (M_BCAST|M_MCAST)) ||
 	    (m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) ||
 	    ntohl(ip->ip_src.s_addr) == (u_long)INADDR_BROADCAST ||
 	    ntohl(ip->ip_dst.s_addr) == (u_long)INADDR_BROADCAST ||
 	    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
 	    IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
 	    IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) ||
 	    IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) ||
 	    ip->ip_src.s_addr == INADDR_ANY ||
 	    ip->ip_dst.s_addr == INADDR_ANY )
 		return m;
 
 	/*
 	 * Is it for a local address on this host?
 	 */
 	if (in_localip(ip->ip_dst))
 		return m;
 
 	IPSTAT_INC(ips_total);
 
 	/*
 	 * Step 3: incoming packet firewall processing
 	 */
 
 	odest.s_addr = dest.s_addr = ip->ip_dst.s_addr;
 
 	/*
 	 * Run through list of ipfilter hooks for input packets
 	 */
 	if (!PFIL_HOOKED(&V_inet_pfil_hook))
 		goto passin;
 
 	if (pfil_run_hooks(
 	    &V_inet_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN, NULL) ||
 	    m == NULL)
 		goto drop;
 
 	M_ASSERTVALID(m);
 	M_ASSERTPKTHDR(m);
 
 	ip = mtod(m, struct ip *);	/* m may have changed by pfil hook */
 	dest.s_addr = ip->ip_dst.s_addr;
 
 	/*
 	 * Destination address changed?
 	 */
 	if (odest.s_addr != dest.s_addr) {
 		/*
 		 * Is it now for a local address on this host?
 		 */
 		if (in_localip(dest))
 			goto forwardlocal;
 		/*
 		 * Go on with new destination address
 		 */
 	}
 
 	if (m->m_flags & M_FASTFWD_OURS) {
 		/*
 		 * ipfw changed it for a local address on this host.
 		 */
 		goto forwardlocal;
 	}
 
 passin:
 	/*
 	 * Step 4: decrement TTL and look up route
 	 */
 
 	/*
 	 * Check TTL
 	 */
 #ifdef IPSTEALTH
 	if (!V_ipstealth) {
 #endif
 	if (ip->ip_ttl <= IPTTLDEC) {
 		icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0);
 		return NULL;	/* mbuf already free'd */
 	}
 
 	/*
 	 * Decrement the TTL and incrementally change the IP header checksum.
 	 * Don't bother doing this with hw checksum offloading, it's faster
 	 * doing it right here.
 	 */
 	ip->ip_ttl -= IPTTLDEC;
 	if (ip->ip_sum >= (u_int16_t) ~htons(IPTTLDEC << 8))
 		ip->ip_sum -= ~htons(IPTTLDEC << 8);
 	else
 		ip->ip_sum += htons(IPTTLDEC << 8);
 #ifdef IPSTEALTH
 	}
 #endif
 
 	/*
 	 * Find route to destination.
 	 */
 	if ((dst = ip_findroute(&ro, dest, m)) == NULL)
 		return NULL;	/* icmp unreach already sent */
 	ifp = ro.ro_rt->rt_ifp;
 
 	/*
 	 * Immediately drop blackholed traffic, and directed broadcasts
 	 * for either the all-ones or all-zero subnet addresses on
 	 * locally attached networks.
 	 */
 	if ((ro.ro_rt->rt_flags & (RTF_BLACKHOLE|RTF_BROADCAST)) != 0)
 		goto drop;
 
 	/*
 	 * Step 5: outgoing firewall packet processing
 	 */
 
 	/*
 	 * Run through list of hooks for output packets.
 	 */
 	if (!PFIL_HOOKED(&V_inet_pfil_hook))
 		goto passout;
 
 	if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_OUT, NULL) || m == NULL) {
 		goto drop;
 	}
 
 	M_ASSERTVALID(m);
 	M_ASSERTPKTHDR(m);
 
 	ip = mtod(m, struct ip *);
 	dest.s_addr = ip->ip_dst.s_addr;
 
 	/*
 	 * Destination address changed?
 	 */
 	if (m->m_flags & M_IP_NEXTHOP)
 		fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
 	if (odest.s_addr != dest.s_addr || fwd_tag != NULL) {
 		/*
 		 * Is it now for a local address on this host?
 		 */
 		if (m->m_flags & M_FASTFWD_OURS || in_localip(dest)) {
 forwardlocal:
 			/*
 			 * Return packet for processing by ip_input().
 			 */
 			m->m_flags |= M_FASTFWD_OURS;
 			if (ro.ro_rt)
 				RTFREE(ro.ro_rt);
 			return m;
 		}
 		/*
 		 * Redo route lookup with new destination address
 		 */
 		if (fwd_tag) {
 			dest.s_addr = ((struct sockaddr_in *)
 				    (fwd_tag + 1))->sin_addr.s_addr;
 			m_tag_delete(m, fwd_tag);
 			m->m_flags &= ~M_IP_NEXTHOP;
 		}
 		RTFREE(ro.ro_rt);
 		if ((dst = ip_findroute(&ro, dest, m)) == NULL)
 			return NULL;	/* icmp unreach already sent */
 		ifp = ro.ro_rt->rt_ifp;
 	}
 
 passout:
 	/*
 	 * Step 6: send off the packet
 	 */
 	ip_len = ntohs(ip->ip_len);
 	ip_off = ntohs(ip->ip_off);
 
 	/*
 	 * Check if route is dampned (when ARP is unable to resolve)
 	 */
 	if ((ro.ro_rt->rt_flags & RTF_REJECT) &&
 	    (ro.ro_rt->rt_expire == 0 || time_uptime < ro.ro_rt->rt_expire)) {
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
 		goto consumed;
 	}
 
 #ifndef ALTQ
 	/*
 	 * Check if there is enough space in the interface queue
 	 */
 	if ((ifp->if_snd.ifq_len + ip_len / ifp->if_mtu + 1) >=
 	    ifp->if_snd.ifq_maxlen) {
 		IPSTAT_INC(ips_odropped);
 		/* would send source quench here but that is depreciated */
 		goto drop;
 	}
 #endif
 
 	/*
 	 * Check if media link state of interface is not down
 	 */
 	if (ifp->if_link_state == LINK_STATE_DOWN) {
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
 		goto consumed;
 	}
 
 	/*
 	 * Check if packet fits MTU or if hardware will fragment for us
 	 */
 	if (ro.ro_rt->rt_mtu)
 		mtu = min(ro.ro_rt->rt_mtu, ifp->if_mtu);
 	else
 		mtu = ifp->if_mtu;
 
 	if (ip_len <= mtu) {
 		/*
 		 * Avoid confusing lower layers.
 		 */
 		m_clrprotoflags(m);
 		/*
 		 * Send off the packet via outgoing interface
 		 */
 		IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
 		error = (*ifp->if_output)(ifp, m,
 				(struct sockaddr *)dst, &ro);
 	} else {
 		/*
 		 * Handle EMSGSIZE with icmp reply needfrag for TCP MTU discovery
 		 */
 		if (ip_off & IP_DF) {
 			IPSTAT_INC(ips_cantfrag);
 			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG,
 				0, mtu);
 			goto consumed;
 		} else {
 			/*
 			 * We have to fragment the packet
 			 */
 			m->m_pkthdr.csum_flags |= CSUM_IP;
 			if (ip_fragment(ip, &m, mtu, ifp->if_hwassist))
 				goto drop;
 			KASSERT(m != NULL, ("null mbuf and no error"));
 			/*
 			 * Send off the fragments via outgoing interface
 			 */
 			error = 0;
 			do {
 				m0 = m->m_nextpkt;
 				m->m_nextpkt = NULL;
 				/*
 				 * Avoid confusing lower layers.
 				 */
 				m_clrprotoflags(m);
 
 				IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
 				error = (*ifp->if_output)(ifp, m,
 					(struct sockaddr *)dst, &ro);
 				if (error)
 					break;
 			} while ((m = m0) != NULL);
 			if (error) {
 				/* Reclaim remaining fragments */
 				for (m = m0; m; m = m0) {
 					m0 = m->m_nextpkt;
 					m_freem(m);
 				}
 			} else
 				IPSTAT_INC(ips_fragmented);
 		}
 	}
 
 	if (error != 0)
 		IPSTAT_INC(ips_odropped);
 	else {
 		counter_u64_add(ro.ro_rt->rt_pksent, 1);
 		IPSTAT_INC(ips_forward);
 		IPSTAT_INC(ips_fastforward);
 	}
 consumed:
 	RTFREE(ro.ro_rt);
 	return NULL;
 drop:
 	if (m)
 		m_freem(m);
 	if (ro.ro_rt)
 		RTFREE(ro.ro_rt);
 	return NULL;
 }
Index: user/dchagin/lemul/sys/netinet/ip_icmp.c
===================================================================
--- user/dchagin/lemul/sys/netinet/ip_icmp.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/ip_icmp.c	(revision 274241)
@@ -1,1022 +1,1022 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_icmp.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_ipsec.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
 #include <netinet/icmp_var.h>
 
 #ifdef INET
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/key.h>
 #endif
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 #endif /* INET */
 
 /*
  * ICMP routines: error generation, receive packet processing, and
  * routines to turnaround packets back to the originator, and
  * host table maintenance routines.
  */
 static VNET_DEFINE(int, icmplim) = 200;
 #define	V_icmplim			VNET(icmplim)
-SYSCTL_VNET_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmplim), 0,
 	"Maximum number of ICMP responses per second");
 
 static VNET_DEFINE(int, icmplim_output) = 1;
 #define	V_icmplim_output		VNET(icmplim_output)
-SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmplim_output), 0,
 	"Enable logging of ICMP response rate limiting");
 
 #ifdef INET
 VNET_PCPUSTAT_DEFINE(struct icmpstat, icmpstat);
 VNET_PCPUSTAT_SYSINIT(icmpstat);
 SYSCTL_VNET_PCPUSTAT(_net_inet_icmp, ICMPCTL_STATS, stats, struct icmpstat,
     icmpstat, "ICMP statistics (struct icmpstat, netinet/icmp_var.h)");
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(icmpstat);
 #endif /* VIMAGE */
 
 static VNET_DEFINE(int, icmpmaskrepl) = 0;
 #define	V_icmpmaskrepl			VNET(icmpmaskrepl)
-SYSCTL_VNET_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmpmaskrepl), 0,
 	"Reply to ICMP Address Mask Request packets.");
 
 static VNET_DEFINE(u_int, icmpmaskfake) = 0;
 #define	V_icmpmaskfake			VNET(icmpmaskfake)
-SYSCTL_VNET_UINT(_net_inet_icmp, OID_AUTO, maskfake, CTLFLAG_RW,
+SYSCTL_UINT(_net_inet_icmp, OID_AUTO, maskfake, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmpmaskfake), 0,
 	"Fake reply to ICMP Address Mask Request packets.");
 
 VNET_DEFINE(int, drop_redirect) = 0;
 
 static VNET_DEFINE(int, log_redirect) = 0;
 #define	V_log_redirect			VNET(log_redirect)
-SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(log_redirect), 0,
 	"Log ICMP redirects to the console");
 
 static VNET_DEFINE(char, reply_src[IFNAMSIZ]);
 #define	V_reply_src			VNET(reply_src)
-SYSCTL_VNET_STRING(_net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_RW,
+SYSCTL_STRING(_net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(reply_src), IFNAMSIZ,
 	"icmp reply source for non-local packets.");
 
 static VNET_DEFINE(int, icmp_rfi) = 0;
 #define	V_icmp_rfi			VNET(icmp_rfi)
-SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, reply_from_interface, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_icmp, OID_AUTO, reply_from_interface, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmp_rfi), 0,
 	"ICMP reply from incoming interface for non-local packets");
 
 static VNET_DEFINE(int, icmp_quotelen) = 8;
 #define	V_icmp_quotelen			VNET(icmp_quotelen)
-SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, quotelen, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_icmp, OID_AUTO, quotelen, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmp_quotelen), 0,
 	"Number of bytes from original packet to quote in ICMP reply");
 
 /*
  * ICMP broadcast echo sysctl
  */
 static VNET_DEFINE(int, icmpbmcastecho) = 0;
 #define	V_icmpbmcastecho		VNET(icmpbmcastecho)
-SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmpbmcastecho), 0,
 	"");
 
 static VNET_DEFINE(int, icmptstamprepl) = 1;
 #define	V_icmptstamprepl		VNET(icmptstamprepl)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, tstamprepl, CTLFLAG_RW,
 	&VNET_NAME(icmptstamprepl), 0, "Respond to ICMP Timestamp packets");
 
 #ifdef ICMPPRINTFS
 int	icmpprintfs = 0;
 #endif
 
 static void	icmp_reflect(struct mbuf *);
 static void	icmp_send(struct mbuf *, struct mbuf *);
 
 extern	struct protosw inetsw[];
 
 static int
 sysctl_net_icmp_drop_redir(SYSCTL_HANDLER_ARGS)
 {
 	int error, new;
 	int i;
 	struct radix_node_head *rnh;
 
 	new = V_drop_redirect;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr) {
 		new = (new != 0) ? 1 : 0;
 
 		if (new == V_drop_redirect)
 			return (0);
 
 		for (i = 0; i < rt_numfibs; i++) {
 			if ((rnh = rt_tables_get_rnh(i, AF_INET)) == NULL)
 				continue;
 			RADIX_NODE_HEAD_LOCK(rnh);
 			in_setmatchfunc(rnh, new);
 			RADIX_NODE_HEAD_UNLOCK(rnh);
 		}
 		
 		V_drop_redirect = new;
 	}
 
 	return (error);
 }
 
-SYSCTL_VNET_PROC(_net_inet_icmp, OID_AUTO, drop_redirect,
-    CTLTYPE_INT|CTLFLAG_RW, 0, 0,
+SYSCTL_PROC(_net_inet_icmp, OID_AUTO, drop_redirect,
+    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, 0, 0,
     sysctl_net_icmp_drop_redir, "I", "Ignore ICMP redirects");
 
 /*
  * Kernel module interface for updating icmpstat.  The argument is an index
  * into icmpstat treated as an array of u_long.  While this encodes the
  * general layout of icmpstat into the caller, it doesn't encode its
  * location, so that future changes to add, for example, per-CPU stats
  * support won't cause binary compatibility problems for kernel modules.
  */
 void
 kmod_icmpstat_inc(int statnum)
 {
 
 	counter_u64_add(VNET(icmpstat)[statnum], 1);
 }
 
 /*
  * Generate an error packet of type error
  * in response to bad packet ip.
  */
 void
 icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu)
 {
 	register struct ip *oip = mtod(n, struct ip *), *nip;
 	register unsigned oiphlen = oip->ip_hl << 2;
 	register struct icmp *icp;
 	register struct mbuf *m;
 	unsigned icmplen, icmpelen, nlen;
 
 	KASSERT((u_int)type <= ICMP_MAXTYPE, ("%s: illegal ICMP type", __func__));
 #ifdef ICMPPRINTFS
 	if (icmpprintfs)
 		printf("icmp_error(%p, %x, %d)\n", oip, type, code);
 #endif
 	if (type != ICMP_REDIRECT)
 		ICMPSTAT_INC(icps_error);
 	/*
 	 * Don't send error:
 	 *  if the original packet was encrypted.
 	 *  if not the first fragment of message.
 	 *  in response to a multicast or broadcast packet.
 	 *  if the old packet protocol was an ICMP error message.
 	 */
 	if (n->m_flags & M_DECRYPTED)
 		goto freeit;
 	if (oip->ip_off & htons(~(IP_MF|IP_DF)))
 		goto freeit;
 	if (n->m_flags & (M_BCAST|M_MCAST))
 		goto freeit;
 	if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT &&
 	  n->m_len >= oiphlen + ICMP_MINLEN &&
 	  !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip + oiphlen))->icmp_type)) {
 		ICMPSTAT_INC(icps_oldicmp);
 		goto freeit;
 	}
 	/* Drop if IP header plus 8 bytes is not contignous in first mbuf. */
 	if (oiphlen + 8 > n->m_len)
 		goto freeit;
 	/*
 	 * Calculate length to quote from original packet and
 	 * prevent the ICMP mbuf from overflowing.
 	 * Unfortunatly this is non-trivial since ip_forward()
 	 * sends us truncated packets.
 	 */
 	nlen = m_length(n, NULL);
 	if (oip->ip_p == IPPROTO_TCP) {
 		struct tcphdr *th;
 		int tcphlen;
 
 		if (oiphlen + sizeof(struct tcphdr) > n->m_len &&
 		    n->m_next == NULL)
 			goto stdreply;
 		if (n->m_len < oiphlen + sizeof(struct tcphdr) &&
 		    ((n = m_pullup(n, oiphlen + sizeof(struct tcphdr))) == NULL))
 			goto freeit;
 		th = (struct tcphdr *)((caddr_t)oip + oiphlen);
 		tcphlen = th->th_off << 2;
 		if (tcphlen < sizeof(struct tcphdr))
 			goto freeit;
 		if (ntohs(oip->ip_len) < oiphlen + tcphlen)
 			goto freeit;
 		if (oiphlen + tcphlen > n->m_len && n->m_next == NULL)
 			goto stdreply;
 		if (n->m_len < oiphlen + tcphlen && 
 		    ((n = m_pullup(n, oiphlen + tcphlen)) == NULL))
 			goto freeit;
 		icmpelen = max(tcphlen, min(V_icmp_quotelen,
 		    ntohs(oip->ip_len) - oiphlen));
 	} else
 stdreply:	icmpelen = max(8, min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen));
 
 	icmplen = min(oiphlen + icmpelen, nlen);
 	if (icmplen < sizeof(struct ip))
 		goto freeit;
 
 	if (MHLEN > sizeof(struct ip) + ICMP_MINLEN + icmplen)
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 	else
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 	if (m == NULL)
 		goto freeit;
 #ifdef MAC
 	mac_netinet_icmp_reply(n, m);
 #endif
 	icmplen = min(icmplen, M_TRAILINGSPACE(m) - sizeof(struct ip) - ICMP_MINLEN);
 	m_align(m, ICMP_MINLEN + icmplen);
 	m->m_len = ICMP_MINLEN + icmplen;
 
 	/* XXX MRT  make the outgoing packet use the same FIB
 	 * that was associated with the incoming packet
 	 */
 	M_SETFIB(m, M_GETFIB(n));
 	icp = mtod(m, struct icmp *);
 	ICMPSTAT_INC(icps_outhist[type]);
 	icp->icmp_type = type;
 	if (type == ICMP_REDIRECT)
 		icp->icmp_gwaddr.s_addr = dest;
 	else {
 		icp->icmp_void = 0;
 		/*
 		 * The following assignments assume an overlay with the
 		 * just zeroed icmp_void field.
 		 */
 		if (type == ICMP_PARAMPROB) {
 			icp->icmp_pptr = code;
 			code = 0;
 		} else if (type == ICMP_UNREACH &&
 			code == ICMP_UNREACH_NEEDFRAG && mtu) {
 			icp->icmp_nextmtu = htons(mtu);
 		}
 	}
 	icp->icmp_code = code;
 
 	/*
 	 * Copy the quotation into ICMP message and
 	 * convert quoted IP header back to network representation.
 	 */
 	m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip);
 	nip = &icp->icmp_ip;
 
 	/*
 	 * Set up ICMP message mbuf and copy old IP header (without options
 	 * in front of ICMP message.
 	 * If the original mbuf was meant to bypass the firewall, the error
 	 * reply should bypass as well.
 	 */
 	m->m_flags |= n->m_flags & M_SKIP_FIREWALL;
 	m->m_data -= sizeof(struct ip);
 	m->m_len += sizeof(struct ip);
 	m->m_pkthdr.len = m->m_len;
 	m->m_pkthdr.rcvif = n->m_pkthdr.rcvif;
 	nip = mtod(m, struct ip *);
 	bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip));
 	nip->ip_len = htons(m->m_len);
 	nip->ip_v = IPVERSION;
 	nip->ip_hl = 5;
 	nip->ip_p = IPPROTO_ICMP;
 	nip->ip_tos = 0;
 	nip->ip_off = 0;
 	icmp_reflect(m);
 
 freeit:
 	m_freem(n);
 }
 
 /*
  * Process a received ICMP message.
  */
 int
 icmp_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct icmp *icp;
 	struct in_ifaddr *ia;
 	struct mbuf *m = *mp;
 	struct ip *ip = mtod(m, struct ip *);
 	struct sockaddr_in icmpsrc, icmpdst, icmpgw;
 	int hlen = *offp;
 	int icmplen = ntohs(ip->ip_len) - *offp;
 	int i, code;
 	void (*ctlfunc)(int, struct sockaddr *, void *);
 	int fibnum;
 
 	*mp = NULL;
 
 	/*
 	 * Locate icmp structure in mbuf, and check
 	 * that not corrupted and of at least minimum length.
 	 */
 #ifdef ICMPPRINTFS
 	if (icmpprintfs) {
 		char buf[4 * sizeof "123"];
 		strcpy(buf, inet_ntoa(ip->ip_src));
 		printf("icmp_input from %s to %s, len %d\n",
 		       buf, inet_ntoa(ip->ip_dst), icmplen);
 	}
 #endif
 	if (icmplen < ICMP_MINLEN) {
 		ICMPSTAT_INC(icps_tooshort);
 		goto freeit;
 	}
 	i = hlen + min(icmplen, ICMP_ADVLENMIN);
 	if (m->m_len < i && (m = m_pullup(m, i)) == NULL)  {
 		ICMPSTAT_INC(icps_tooshort);
 		return (IPPROTO_DONE);
 	}
 	ip = mtod(m, struct ip *);
 	m->m_len -= hlen;
 	m->m_data += hlen;
 	icp = mtod(m, struct icmp *);
 	if (in_cksum(m, icmplen)) {
 		ICMPSTAT_INC(icps_checksum);
 		goto freeit;
 	}
 	m->m_len += hlen;
 	m->m_data -= hlen;
 
 	if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
 		/*
 		 * Deliver very specific ICMP type only.
 		 */
 		switch (icp->icmp_type) {
 		case ICMP_UNREACH:
 		case ICMP_TIMXCEED:
 			break;
 		default:
 			goto freeit;
 		}
 	}
 
 #ifdef ICMPPRINTFS
 	if (icmpprintfs)
 		printf("icmp_input, type %d code %d\n", icp->icmp_type,
 		    icp->icmp_code);
 #endif
 
 	/*
 	 * Message type specific processing.
 	 */
 	if (icp->icmp_type > ICMP_MAXTYPE)
 		goto raw;
 
 	/* Initialize */
 	bzero(&icmpsrc, sizeof(icmpsrc));
 	icmpsrc.sin_len = sizeof(struct sockaddr_in);
 	icmpsrc.sin_family = AF_INET;
 	bzero(&icmpdst, sizeof(icmpdst));
 	icmpdst.sin_len = sizeof(struct sockaddr_in);
 	icmpdst.sin_family = AF_INET;
 	bzero(&icmpgw, sizeof(icmpgw));
 	icmpgw.sin_len = sizeof(struct sockaddr_in);
 	icmpgw.sin_family = AF_INET;
 
 	ICMPSTAT_INC(icps_inhist[icp->icmp_type]);
 	code = icp->icmp_code;
 	switch (icp->icmp_type) {
 
 	case ICMP_UNREACH:
 		switch (code) {
 			case ICMP_UNREACH_NET:
 			case ICMP_UNREACH_HOST:
 			case ICMP_UNREACH_SRCFAIL:
 			case ICMP_UNREACH_NET_UNKNOWN:
 			case ICMP_UNREACH_HOST_UNKNOWN:
 			case ICMP_UNREACH_ISOLATED:
 			case ICMP_UNREACH_TOSNET:
 			case ICMP_UNREACH_TOSHOST:
 			case ICMP_UNREACH_HOST_PRECEDENCE:
 			case ICMP_UNREACH_PRECEDENCE_CUTOFF:
 				code = PRC_UNREACH_NET;
 				break;
 
 			case ICMP_UNREACH_NEEDFRAG:
 				code = PRC_MSGSIZE;
 				break;
 
 			/*
 			 * RFC 1122, Sections 3.2.2.1 and 4.2.3.9.
 			 * Treat subcodes 2,3 as immediate RST
 			 */
 			case ICMP_UNREACH_PROTOCOL:
 			case ICMP_UNREACH_PORT:
 				code = PRC_UNREACH_PORT;
 				break;
 
 			case ICMP_UNREACH_NET_PROHIB:
 			case ICMP_UNREACH_HOST_PROHIB:
 			case ICMP_UNREACH_FILTER_PROHIB:
 				code = PRC_UNREACH_ADMIN_PROHIB;
 				break;
 
 			default:
 				goto badcode;
 		}
 		goto deliver;
 
 	case ICMP_TIMXCEED:
 		if (code > 1)
 			goto badcode;
 		code += PRC_TIMXCEED_INTRANS;
 		goto deliver;
 
 	case ICMP_PARAMPROB:
 		if (code > 1)
 			goto badcode;
 		code = PRC_PARAMPROB;
 		goto deliver;
 
 	case ICMP_SOURCEQUENCH:
 		if (code)
 			goto badcode;
 		code = PRC_QUENCH;
 	deliver:
 		/*
 		 * Problem with datagram; advise higher level routines.
 		 */
 		if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) ||
 		    icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) {
 			ICMPSTAT_INC(icps_badlen);
 			goto freeit;
 		}
 		/* Discard ICMP's in response to multicast packets */
 		if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr)))
 			goto badcode;
 #ifdef ICMPPRINTFS
 		if (icmpprintfs)
 			printf("deliver to protocol %d\n", icp->icmp_ip.ip_p);
 #endif
 		icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
 		/*
 		 * XXX if the packet contains [IPv4 AH TCP], we can't make a
 		 * notification to TCP layer.
 		 */
 		ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput;
 		if (ctlfunc)
 			(*ctlfunc)(code, (struct sockaddr *)&icmpsrc,
 				   (void *)&icp->icmp_ip);
 		break;
 
 	badcode:
 		ICMPSTAT_INC(icps_badcode);
 		break;
 
 	case ICMP_ECHO:
 		if (!V_icmpbmcastecho
 		    && (m->m_flags & (M_MCAST | M_BCAST)) != 0) {
 			ICMPSTAT_INC(icps_bmcastecho);
 			break;
 		}
 		icp->icmp_type = ICMP_ECHOREPLY;
 		if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0)
 			goto freeit;
 		else
 			goto reflect;
 
 	case ICMP_TSTAMP:
 		if (V_icmptstamprepl == 0)
 			break;
 		if (!V_icmpbmcastecho
 		    && (m->m_flags & (M_MCAST | M_BCAST)) != 0) {
 			ICMPSTAT_INC(icps_bmcasttstamp);
 			break;
 		}
 		if (icmplen < ICMP_TSLEN) {
 			ICMPSTAT_INC(icps_badlen);
 			break;
 		}
 		icp->icmp_type = ICMP_TSTAMPREPLY;
 		icp->icmp_rtime = iptime();
 		icp->icmp_ttime = icp->icmp_rtime;	/* bogus, do later! */
 		if (badport_bandlim(BANDLIM_ICMP_TSTAMP) < 0)
 			goto freeit;
 		else
 			goto reflect;
 
 	case ICMP_MASKREQ:
 		if (V_icmpmaskrepl == 0)
 			break;
 		/*
 		 * We are not able to respond with all ones broadcast
 		 * unless we receive it over a point-to-point interface.
 		 */
 		if (icmplen < ICMP_MASKLEN)
 			break;
 		switch (ip->ip_dst.s_addr) {
 
 		case INADDR_BROADCAST:
 		case INADDR_ANY:
 			icmpdst.sin_addr = ip->ip_src;
 			break;
 
 		default:
 			icmpdst.sin_addr = ip->ip_dst;
 		}
 		ia = (struct in_ifaddr *)ifaof_ifpforaddr(
 			    (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif);
 		if (ia == NULL)
 			break;
 		if (ia->ia_ifp == NULL) {
 			ifa_free(&ia->ia_ifa);
 			break;
 		}
 		icp->icmp_type = ICMP_MASKREPLY;
 		if (V_icmpmaskfake == 0)
 			icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr;
 		else
 			icp->icmp_mask = V_icmpmaskfake;
 		if (ip->ip_src.s_addr == 0) {
 			if (ia->ia_ifp->if_flags & IFF_BROADCAST)
 			    ip->ip_src = satosin(&ia->ia_broadaddr)->sin_addr;
 			else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT)
 			    ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr;
 		}
 		ifa_free(&ia->ia_ifa);
 reflect:
 		ICMPSTAT_INC(icps_reflect);
 		ICMPSTAT_INC(icps_outhist[icp->icmp_type]);
 		icmp_reflect(m);
 		return (IPPROTO_DONE);
 
 	case ICMP_REDIRECT:
 		if (V_log_redirect) {
 			u_long src, dst, gw;
 
 			src = ntohl(ip->ip_src.s_addr);
 			dst = ntohl(icp->icmp_ip.ip_dst.s_addr);
 			gw = ntohl(icp->icmp_gwaddr.s_addr);
 			printf("icmp redirect from %d.%d.%d.%d: "
 			       "%d.%d.%d.%d => %d.%d.%d.%d\n",
 			       (int)(src >> 24), (int)((src >> 16) & 0xff),
 			       (int)((src >> 8) & 0xff), (int)(src & 0xff),
 			       (int)(dst >> 24), (int)((dst >> 16) & 0xff),
 			       (int)((dst >> 8) & 0xff), (int)(dst & 0xff),
 			       (int)(gw >> 24), (int)((gw >> 16) & 0xff),
 			       (int)((gw >> 8) & 0xff), (int)(gw & 0xff));
 		}
 		/*
 		 * RFC1812 says we must ignore ICMP redirects if we
 		 * are acting as router.
 		 */
 		if (V_drop_redirect || V_ipforwarding)
 			break;
 		if (code > 3)
 			goto badcode;
 		if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) ||
 		    icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) {
 			ICMPSTAT_INC(icps_badlen);
 			break;
 		}
 		/*
 		 * Short circuit routing redirects to force
 		 * immediate change in the kernel's routing
 		 * tables.  The message is also handed to anyone
 		 * listening on a raw socket (e.g. the routing
 		 * daemon for use in updating its tables).
 		 */
 		icmpgw.sin_addr = ip->ip_src;
 		icmpdst.sin_addr = icp->icmp_gwaddr;
 #ifdef	ICMPPRINTFS
 		if (icmpprintfs) {
 			char buf[4 * sizeof "123"];
 			strcpy(buf, inet_ntoa(icp->icmp_ip.ip_dst));
 
 			printf("redirect dst %s to %s\n",
 			       buf, inet_ntoa(icp->icmp_gwaddr));
 		}
 #endif
 		icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
 		for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) {
 			in_rtredirect((struct sockaddr *)&icmpsrc,
 			  (struct sockaddr *)&icmpdst,
 			  (struct sockaddr *)0, RTF_GATEWAY | RTF_HOST,
 			  (struct sockaddr *)&icmpgw, fibnum);
 		}
 		pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc);
 #ifdef IPSEC
 		key_sa_routechange((struct sockaddr *)&icmpsrc);
 #endif
 		break;
 
 	/*
 	 * No kernel processing for the following;
 	 * just fall through to send to raw listener.
 	 */
 	case ICMP_ECHOREPLY:
 	case ICMP_ROUTERADVERT:
 	case ICMP_ROUTERSOLICIT:
 	case ICMP_TSTAMPREPLY:
 	case ICMP_IREQREPLY:
 	case ICMP_MASKREPLY:
 	default:
 		break;
 	}
 
 raw:
 	*mp = m;
 	rip_input(mp, offp, proto);
 	return (IPPROTO_DONE);
 
 freeit:
 	m_freem(m);
 	return (IPPROTO_DONE);
 }
 
 /*
  * Reflect the ip packet back to the source
  */
 static void
 icmp_reflect(struct mbuf *m)
 {
 	struct ip *ip = mtod(m, struct ip *);
 	struct ifaddr *ifa;
 	struct ifnet *ifp;
 	struct in_ifaddr *ia;
 	struct in_addr t;
 	struct mbuf *opts = 0;
 	int optlen = (ip->ip_hl << 2) - sizeof(struct ip);
 
 	if (IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
 	    IN_EXPERIMENTAL(ntohl(ip->ip_src.s_addr)) ||
 	    IN_ZERONET(ntohl(ip->ip_src.s_addr)) ) {
 		m_freem(m);	/* Bad return address */
 		ICMPSTAT_INC(icps_badaddr);
 		goto done;	/* Ip_output() will check for broadcast */
 	}
 
 	t = ip->ip_dst;
 	ip->ip_dst = ip->ip_src;
 
 	/*
 	 * Source selection for ICMP replies:
 	 *
 	 * If the incoming packet was addressed directly to one of our
 	 * own addresses, use dst as the src for the reply.
 	 */
 	IN_IFADDR_RLOCK();
 	LIST_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash) {
 		if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) {
 			t = IA_SIN(ia)->sin_addr;
 			IN_IFADDR_RUNLOCK();
 			goto match;
 		}
 	}
 	IN_IFADDR_RUNLOCK();
 
 	/*
 	 * If the incoming packet was addressed to one of our broadcast
 	 * addresses, use the first non-broadcast address which corresponds
 	 * to the incoming interface.
 	 */
 	ifp = m->m_pkthdr.rcvif;
 	if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) {
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			ia = ifatoia(ifa);
 			if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
 			    t.s_addr) {
 				t = IA_SIN(ia)->sin_addr;
 				IF_ADDR_RUNLOCK(ifp);
 				goto match;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 	/*
 	 * If the packet was transiting through us, use the address of
 	 * the interface the packet came through in.  If that interface
 	 * doesn't have a suitable IP address, the normal selection
 	 * criteria apply.
 	 */
 	if (V_icmp_rfi && ifp != NULL) {
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			ia = ifatoia(ifa);
 			t = IA_SIN(ia)->sin_addr;
 			IF_ADDR_RUNLOCK(ifp);
 			goto match;
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 	/*
 	 * If the incoming packet was not addressed directly to us, use
 	 * designated interface for icmp replies specified by sysctl
 	 * net.inet.icmp.reply_src (default not set). Otherwise continue
 	 * with normal source selection.
 	 */
 	if (V_reply_src[0] != '\0' && (ifp = ifunit(V_reply_src))) {
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			ia = ifatoia(ifa);
 			t = IA_SIN(ia)->sin_addr;
 			IF_ADDR_RUNLOCK(ifp);
 			goto match;
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 	/*
 	 * If the packet was transiting through us, use the address of
 	 * the interface that is the closest to the packet source.
 	 * When we don't have a route back to the packet source, stop here
 	 * and drop the packet.
 	 */
 	ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m));
 	if (ia == NULL) {
 		m_freem(m);
 		ICMPSTAT_INC(icps_noroute);
 		goto done;
 	}
 	t = IA_SIN(ia)->sin_addr;
 	ifa_free(&ia->ia_ifa);
 match:
 #ifdef MAC
 	mac_netinet_icmp_replyinplace(m);
 #endif
 	ip->ip_src = t;
 	ip->ip_ttl = V_ip_defttl;
 
 	if (optlen > 0) {
 		register u_char *cp;
 		int opt, cnt;
 		u_int len;
 
 		/*
 		 * Retrieve any source routing from the incoming packet;
 		 * add on any record-route or timestamp options.
 		 */
 		cp = (u_char *) (ip + 1);
 		if ((opts = ip_srcroute(m)) == 0 &&
 		    (opts = m_gethdr(M_NOWAIT, MT_DATA))) {
 			opts->m_len = sizeof(struct in_addr);
 			mtod(opts, struct in_addr *)->s_addr = 0;
 		}
 		if (opts) {
 #ifdef ICMPPRINTFS
 		    if (icmpprintfs)
 			    printf("icmp_reflect optlen %d rt %d => ",
 				optlen, opts->m_len);
 #endif
 		    for (cnt = optlen; cnt > 0; cnt -= len, cp += len) {
 			    opt = cp[IPOPT_OPTVAL];
 			    if (opt == IPOPT_EOL)
 				    break;
 			    if (opt == IPOPT_NOP)
 				    len = 1;
 			    else {
 				    if (cnt < IPOPT_OLEN + sizeof(*cp))
 					    break;
 				    len = cp[IPOPT_OLEN];
 				    if (len < IPOPT_OLEN + sizeof(*cp) ||
 				        len > cnt)
 					    break;
 			    }
 			    /*
 			     * Should check for overflow, but it "can't happen"
 			     */
 			    if (opt == IPOPT_RR || opt == IPOPT_TS ||
 				opt == IPOPT_SECURITY) {
 				    bcopy((caddr_t)cp,
 					mtod(opts, caddr_t) + opts->m_len, len);
 				    opts->m_len += len;
 			    }
 		    }
 		    /* Terminate & pad, if necessary */
 		    cnt = opts->m_len % 4;
 		    if (cnt) {
 			    for (; cnt < 4; cnt++) {
 				    *(mtod(opts, caddr_t) + opts->m_len) =
 					IPOPT_EOL;
 				    opts->m_len++;
 			    }
 		    }
 #ifdef ICMPPRINTFS
 		    if (icmpprintfs)
 			    printf("%d\n", opts->m_len);
 #endif
 		}
 		ip_stripoptions(m);
 	}
 	m_tag_delete_nonpersistent(m);
 	m->m_flags &= ~(M_BCAST|M_MCAST);
 	icmp_send(m, opts);
 done:
 	if (opts)
 		(void)m_free(opts);
 }
 
 /*
  * Send an icmp packet back to the ip level,
  * after supplying a checksum.
  */
 static void
 icmp_send(struct mbuf *m, struct mbuf *opts)
 {
 	register struct ip *ip = mtod(m, struct ip *);
 	register int hlen;
 	register struct icmp *icp;
 
 	hlen = ip->ip_hl << 2;
 	m->m_data += hlen;
 	m->m_len -= hlen;
 	icp = mtod(m, struct icmp *);
 	icp->icmp_cksum = 0;
 	icp->icmp_cksum = in_cksum(m, ntohs(ip->ip_len) - hlen);
 	m->m_data -= hlen;
 	m->m_len += hlen;
 	m->m_pkthdr.rcvif = (struct ifnet *)0;
 #ifdef ICMPPRINTFS
 	if (icmpprintfs) {
 		char buf[4 * sizeof "123"];
 		strcpy(buf, inet_ntoa(ip->ip_dst));
 		printf("icmp_send dst %s src %s\n",
 		       buf, inet_ntoa(ip->ip_src));
 	}
 #endif
 	(void) ip_output(m, opts, NULL, 0, NULL, NULL);
 }
 
 /*
  * Return milliseconds since 00:00 GMT in network format.
  */
 uint32_t
 iptime(void)
 {
 	struct timeval atv;
 	u_long t;
 
 	getmicrotime(&atv);
 	t = (atv.tv_sec % (24*60*60)) * 1000 + atv.tv_usec / 1000;
 	return (htonl(t));
 }
 
 /*
  * Return the next larger or smaller MTU plateau (table from RFC 1191)
  * given current value MTU.  If DIR is less than zero, a larger plateau
  * is returned; otherwise, a smaller value is returned.
  */
 int
 ip_next_mtu(int mtu, int dir)
 {
 	static int mtutab[] = {
 		65535, 32000, 17914, 8166, 4352, 2002, 1492, 1280, 1006, 508,
 		296, 68, 0
 	};
 	int i, size;
 
 	size = (sizeof mtutab) / (sizeof mtutab[0]);
 	if (dir >= 0) {
 		for (i = 0; i < size; i++)
 			if (mtu > mtutab[i])
 				return mtutab[i];
 	} else {
 		for (i = size - 1; i >= 0; i--)
 			if (mtu < mtutab[i])
 				return mtutab[i];
 		if (mtu == mtutab[0])
 			return mtutab[0];
 	}
 	return 0;
 }
 #endif /* INET */
 
 
 /*
  * badport_bandlim() - check for ICMP bandwidth limit
  *
  *	Return 0 if it is ok to send an ICMP error response, -1 if we have
  *	hit our bandwidth limit and it is not ok.
  *
  *	If icmplim is <= 0, the feature is disabled and 0 is returned.
  *
  *	For now we separate the TCP and UDP subsystems w/ different 'which'
  *	values.  We may eventually remove this separation (and simplify the
  *	code further).
  *
  *	Note that the printing of the error message is delayed so we can
  *	properly print the icmp error rate that the system was trying to do
  *	(i.e. 22000/100 pps, etc...).  This can cause long delays in printing
  *	the 'final' error, but it doesn't make sense to solve the printing
  *	delay with more complex code.
  */
 
 int
 badport_bandlim(int which)
 {
 
 #define	N(a)	(sizeof (a) / sizeof (a[0]))
 	static struct rate {
 		const char	*type;
 		struct timeval	lasttime;
 		int		curpps;
 	} rates[BANDLIM_MAX+1] = {
 		{ "icmp unreach response" },
 		{ "icmp ping response" },
 		{ "icmp tstamp response" },
 		{ "closed port RST response" },
 		{ "open port RST response" },
 		{ "icmp6 unreach response" },
 		{ "sctp ootb response" }
 	};
 
 	/*
 	 * Return ok status if feature disabled or argument out of range.
 	 */
 	if (V_icmplim > 0 && (u_int) which < N(rates)) {
 		struct rate *r = &rates[which];
 		int opps = r->curpps;
 
 		if (!ppsratecheck(&r->lasttime, &r->curpps, V_icmplim))
 			return -1;	/* discard packet */
 		/*
 		 * If we've dropped below the threshold after having
 		 * rate-limited traffic print the message.  This preserves
 		 * the previous behaviour at the expense of added complexity.
 		 */
 		if (V_icmplim_output && opps > V_icmplim)
 			log(LOG_NOTICE, "Limiting %s from %d to %d packets/sec\n",
 				r->type, opps, V_icmplim);
 	}
 	return 0;			/* okay to send packet */
 #undef N
 }
Index: user/dchagin/lemul/sys/netinet/ip_input.c
===================================================================
--- user/dchagin/lemul/sys/netinet/ip_input.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/ip_input.c	(revision 274241)
@@ -1,1916 +1,1916 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_bootp.h"
 #include "opt_ipfw.h"
 #include "opt_ipstealth.h"
 #include "opt_ipsec.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/malloc.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/sdt.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 
 #include <net/pfil.h>
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/netisr.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_fw.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_options.h>
 #include <machine/in_cksum.h>
 #include <netinet/ip_carp.h>
 #ifdef IPSEC
 #include <netinet/ip_ipsec.h>
 #endif /* IPSEC */
 #include <netinet/in_rss.h>
 
 #include <sys/socketvar.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifdef CTASSERT
 CTASSERT(sizeof(struct ip) == 20);
 #endif
 
 struct	rwlock in_ifaddr_lock;
 RW_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock");
 
 VNET_DEFINE(int, rsvp_on);
 
 VNET_DEFINE(int, ipforwarding);
-SYSCTL_VNET_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ipforwarding), 0,
     "Enable IP forwarding between interfaces");
 
 static VNET_DEFINE(int, ipsendredirects) = 1;	/* XXX */
 #define	V_ipsendredirects	VNET(ipsendredirects)
-SYSCTL_VNET_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ipsendredirects), 0,
     "Enable sending IP redirects");
 
 static VNET_DEFINE(int, ip_keepfaith);
 #define	V_ip_keepfaith		VNET(ip_keepfaith)
-SYSCTL_VNET_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ip_keepfaith), 0,
     "Enable packet capture for FAITH IPv4->IPv6 translater daemon");
 
 static VNET_DEFINE(int, ip_sendsourcequench);
 #define	V_ip_sendsourcequench	VNET(ip_sendsourcequench)
-SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, sendsourcequench, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip, OID_AUTO, sendsourcequench, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ip_sendsourcequench), 0,
     "Enable the transmission of source quench packets");
 
 VNET_DEFINE(int, ip_do_randomid);
-SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ip_do_randomid), 0,
     "Assign random ip_id values");
 
 /*
  * XXX - Setting ip_checkinterface mostly implements the receive side of
  * the Strong ES model described in RFC 1122, but since the routing table
  * and transmit implementation do not implement the Strong ES model,
  * setting this to 1 results in an odd hybrid.
  *
  * XXX - ip_checkinterface currently must be disabled if you use ipnat
  * to translate the destination address to another local interface.
  *
  * XXX - ip_checkinterface must be disabled if you add IP aliases
  * to the loopback interface instead of the interface where the
  * packets for those addresses are received.
  */
 static VNET_DEFINE(int, ip_checkinterface);
 #define	V_ip_checkinterface	VNET(ip_checkinterface)
-SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ip_checkinterface), 0,
     "Verify packet arrives on correct interface");
 
 VNET_DEFINE(struct pfil_head, inet_pfil_hook);	/* Packet filter hooks */
 
 static struct netisr_handler ip_nh = {
 	.nh_name = "ip",
 	.nh_handler = ip_input,
 	.nh_proto = NETISR_IP,
 #ifdef	RSS
 	.nh_m2cpuid = rss_soft_m2cpuid,
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_HYBRID,
 #else
 	.nh_policy = NETISR_POLICY_FLOW,
 #endif
 };
 
 #ifdef	RSS
 /*
  * Directly dispatched frames are currently assumed
  * to have a flowid already calculated.
  *
  * It should likely have something that assert it
  * actually has valid flow details.
  */
 static struct netisr_handler ip_direct_nh = {
 	.nh_name = "ip_direct",
 	.nh_handler = ip_direct_input,
 	.nh_proto = NETISR_IP_DIRECT,
 	.nh_m2cpuid = rss_m2cpuid,
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_HYBRID,
 };
 #endif
 
 extern	struct domain inetdomain;
 extern	struct protosw inetsw[];
 u_char	ip_protox[IPPROTO_MAX];
 VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead);  /* first inet address */
 VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table  */
 VNET_DEFINE(u_long, in_ifaddrhmask);		/* mask for hash table */
 
 static VNET_DEFINE(uma_zone_t, ipq_zone);
 static VNET_DEFINE(TAILQ_HEAD(ipqhead, ipq), ipq[IPREASS_NHASH]);
 static struct mtx ipqlock;
 
 #define	V_ipq_zone		VNET(ipq_zone)
 #define	V_ipq			VNET(ipq)
 
 #define	IPQ_LOCK()	mtx_lock(&ipqlock)
 #define	IPQ_UNLOCK()	mtx_unlock(&ipqlock)
 #define	IPQ_LOCK_INIT()	mtx_init(&ipqlock, "ipqlock", NULL, MTX_DEF)
 #define	IPQ_LOCK_ASSERT()	mtx_assert(&ipqlock, MA_OWNED)
 
 static void	maxnipq_update(void);
 static void	ipq_zone_change(void *);
 static void	ip_drain_locked(void);
 
 static VNET_DEFINE(int, maxnipq);  /* Administrative limit on # reass queues. */
 static VNET_DEFINE(int, nipq);			/* Total # of reass queues */
 #define	V_maxnipq		VNET(maxnipq)
 #define	V_nipq			VNET(nipq)
-SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_RD,
+SYSCTL_INT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_VNET | CTLFLAG_RD,
     &VNET_NAME(nipq), 0,
     "Current number of IPv4 fragment reassembly queue entries");
 
 static VNET_DEFINE(int, maxfragsperpacket);
 #define	V_maxfragsperpacket	VNET(maxfragsperpacket)
-SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(maxfragsperpacket), 0,
     "Maximum number of IPv4 fragments allowed per packet");
 
 #ifdef IPCTL_DEFMTU
 SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW,
     &ip_mtu, 0, "Default MTU");
 #endif
 
 #ifdef IPSTEALTH
 VNET_DEFINE(int, ipstealth);
-SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ipstealth), 0,
     "IP stealth mode, no TTL decrementation on forwarding");
 #endif
 
 static void	ip_freef(struct ipqhead *, struct ipq *);
 
 /*
  * IP statistics are stored in the "array" of counter(9)s.
  */
 VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat);
 VNET_PCPUSTAT_SYSINIT(ipstat);
 SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat,
     "IP statistics (struct ipstat, netinet/ip_var.h)");
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(ipstat);
 #endif /* VIMAGE */
 
 /*
  * Kernel module interface for updating ipstat.  The argument is an index
  * into ipstat treated as an array.
  */
 void
 kmod_ipstat_inc(int statnum)
 {
 
 	counter_u64_add(VNET(ipstat)[statnum], 1);
 }
 
 void
 kmod_ipstat_dec(int statnum)
 {
 
 	counter_u64_add(VNET(ipstat)[statnum], -1);
 }
 
 static int
 sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS)
 {
 	int error, qlimit;
 
 	netisr_getqlimit(&ip_nh, &qlimit);
 	error = sysctl_handle_int(oidp, &qlimit, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&ip_nh, qlimit));
 }
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen,
     CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_queue_maxlen, "I",
     "Maximum size of the IP input queue");
 
 static int
 sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS)
 {
 	u_int64_t qdrops_long;
 	int error, qdrops;
 
 	netisr_getqdrops(&ip_nh, &qdrops_long);
 	qdrops = qdrops_long;
 	error = sysctl_handle_int(oidp, &qdrops, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qdrops != 0)
 		return (EINVAL);
 	netisr_clearqdrops(&ip_nh);
 	return (0);
 }
 
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops,
     CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I",
     "Number of packets dropped from the IP input queue");
 
 #ifdef	RSS
 static int
 sysctl_netinet_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS)
 {
 	int error, qlimit;
 
 	netisr_getqlimit(&ip_direct_nh, &qlimit);
 	error = sysctl_handle_int(oidp, &qlimit, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&ip_direct_nh, qlimit));
 }
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_direct_queue_maxlen,
     CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_direct_queue_maxlen, "I",
     "Maximum size of the IP direct input queue");
 
 static int
 sysctl_netinet_intr_direct_queue_drops(SYSCTL_HANDLER_ARGS)
 {
 	u_int64_t qdrops_long;
 	int error, qdrops;
 
 	netisr_getqdrops(&ip_direct_nh, &qdrops_long);
 	qdrops = qdrops_long;
 	error = sysctl_handle_int(oidp, &qdrops, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qdrops != 0)
 		return (EINVAL);
 	netisr_clearqdrops(&ip_direct_nh);
 	return (0);
 }
 
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_direct_queue_drops,
     CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_direct_queue_drops, "I",
     "Number of packets dropped from the IP direct input queue");
 #endif	/* RSS */
 
 /*
  * IP initialization: fill in IP protocol switch table.
  * All protocols not implemented in kernel go to raw IP protocol handler.
  */
 void
 ip_init(void)
 {
 	struct protosw *pr;
 	int i;
 
 	V_ip_id = time_second & 0xffff;
 
 	TAILQ_INIT(&V_in_ifaddrhead);
 	V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask);
 
 	/* Initialize IP reassembly queue. */
 	for (i = 0; i < IPREASS_NHASH; i++)
 		TAILQ_INIT(&V_ipq[i]);
 	V_maxnipq = nmbclusters / 32;
 	V_maxfragsperpacket = 16;
 	V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
 	    NULL, UMA_ALIGN_PTR, 0);
 	maxnipq_update();
 
 	/* Initialize packet filter hooks. */
 	V_inet_pfil_hook.ph_type = PFIL_TYPE_AF;
 	V_inet_pfil_hook.ph_af = AF_INET;
 	if ((i = pfil_head_register(&V_inet_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to register pfil hook, "
 			"error %d\n", __func__, i);
 
 	/* Skip initialization of globals for non-default instances. */
 	if (!IS_DEFAULT_VNET(curvnet))
 		return;
 
 	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		panic("ip_init: PF_INET not found");
 
 	/* Initialize the entire ip_protox[] array to IPPROTO_RAW. */
 	for (i = 0; i < IPPROTO_MAX; i++)
 		ip_protox[i] = pr - inetsw;
 	/*
 	 * Cycle through IP protocols and put them into the appropriate place
 	 * in ip_protox[].
 	 */
 	for (pr = inetdomain.dom_protosw;
 	    pr < inetdomain.dom_protoswNPROTOSW; pr++)
 		if (pr->pr_domain->dom_family == PF_INET &&
 		    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) {
 			/* Be careful to only index valid IP protocols. */
 			if (pr->pr_protocol < IPPROTO_MAX)
 				ip_protox[pr->pr_protocol] = pr - inetsw;
 		}
 
 	EVENTHANDLER_REGISTER(nmbclusters_change, ipq_zone_change,
 		NULL, EVENTHANDLER_PRI_ANY);
 
 	/* Initialize various other remaining things. */
 	IPQ_LOCK_INIT();
 	netisr_register(&ip_nh);
 #ifdef	RSS
 	netisr_register(&ip_direct_nh);
 #endif
 }
 
 #ifdef VIMAGE
 void
 ip_destroy(void)
 {
 	int i;
 
 	if ((i = pfil_head_unregister(&V_inet_pfil_hook)) != 0)
 		printf("%s: WARNING: unable to unregister pfil hook, "
 		    "error %d\n", __func__, i);
 
 	/* Cleanup in_ifaddr hash table; should be empty. */
 	hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask);
 
 	IPQ_LOCK();
 	ip_drain_locked();
 	IPQ_UNLOCK();
 
 	uma_zdestroy(V_ipq_zone);
 }
 #endif
 
 #ifdef	RSS
 /*
  * IP direct input routine.
  *
  * This is called when reinjecting completed fragments where
  * all of the previous checking and book-keeping has been done.
  */
 void
 ip_direct_input(struct mbuf *m)
 {
 	struct ip *ip;
 	int hlen;
 
 	ip = mtod(m, struct ip *);
 	hlen = ip->ip_hl << 2;
 
 	IPSTAT_INC(ips_delivered);
 	(*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p);
 	return;
 }
 #endif
 
 /*
  * Ip input routine.  Checksum and byte swap header.  If fragmented
  * try to reassemble.  Process options.  Pass to next level.
  */
 void
 ip_input(struct mbuf *m)
 {
 	struct ip *ip = NULL;
 	struct in_ifaddr *ia = NULL;
 	struct ifaddr *ifa;
 	struct ifnet *ifp;
 	int    checkif, hlen = 0;
 	uint16_t sum, ip_len;
 	int dchg = 0;				/* dest changed after fw */
 	struct in_addr odst;			/* original dst address */
 
 	M_ASSERTPKTHDR(m);
 
 	if (m->m_flags & M_FASTFWD_OURS) {
 		m->m_flags &= ~M_FASTFWD_OURS;
 		/* Set up some basics that will be used later. */
 		ip = mtod(m, struct ip *);
 		hlen = ip->ip_hl << 2;
 		ip_len = ntohs(ip->ip_len);
 		goto ours;
 	}
 
 	IPSTAT_INC(ips_total);
 
 	if (m->m_pkthdr.len < sizeof(struct ip))
 		goto tooshort;
 
 	if (m->m_len < sizeof (struct ip) &&
 	    (m = m_pullup(m, sizeof (struct ip))) == NULL) {
 		IPSTAT_INC(ips_toosmall);
 		return;
 	}
 	ip = mtod(m, struct ip *);
 
 	if (ip->ip_v != IPVERSION) {
 		IPSTAT_INC(ips_badvers);
 		goto bad;
 	}
 
 	hlen = ip->ip_hl << 2;
 	if (hlen < sizeof(struct ip)) {	/* minimum header length */
 		IPSTAT_INC(ips_badhlen);
 		goto bad;
 	}
 	if (hlen > m->m_len) {
 		if ((m = m_pullup(m, hlen)) == NULL) {
 			IPSTAT_INC(ips_badhlen);
 			return;
 		}
 		ip = mtod(m, struct ip *);
 	}
 
 	IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL);
 
 	/* 127/8 must not appear on wire - RFC1122 */
 	ifp = m->m_pkthdr.rcvif;
 	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
 	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
 		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
 			IPSTAT_INC(ips_badaddr);
 			goto bad;
 		}
 	}
 
 	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
 		sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
 	} else {
 		if (hlen == sizeof(struct ip)) {
 			sum = in_cksum_hdr(ip);
 		} else {
 			sum = in_cksum(m, hlen);
 		}
 	}
 	if (sum) {
 		IPSTAT_INC(ips_badsum);
 		goto bad;
 	}
 
 #ifdef ALTQ
 	if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0)
 		/* packet is dropped by traffic conditioner */
 		return;
 #endif
 
 	ip_len = ntohs(ip->ip_len);
 	if (ip_len < hlen) {
 		IPSTAT_INC(ips_badlen);
 		goto bad;
 	}
 
 	/*
 	 * Check that the amount of data in the buffers
 	 * is as at least much as the IP header would have us expect.
 	 * Trim mbufs if longer than we expect.
 	 * Drop packet if shorter than we expect.
 	 */
 	if (m->m_pkthdr.len < ip_len) {
 tooshort:
 		IPSTAT_INC(ips_tooshort);
 		goto bad;
 	}
 	if (m->m_pkthdr.len > ip_len) {
 		if (m->m_len == m->m_pkthdr.len) {
 			m->m_len = ip_len;
 			m->m_pkthdr.len = ip_len;
 		} else
 			m_adj(m, ip_len - m->m_pkthdr.len);
 	}
 
 #ifdef IPSEC
 	/*
 	 * Bypass packet filtering for packets previously handled by IPsec.
 	 */
 	if (ip_ipsec_filtertunnel(m))
 		goto passin;
 #endif /* IPSEC */
 
 	/*
 	 * Run through list of hooks for input packets.
 	 *
 	 * NB: Beware of the destination address changing (e.g.
 	 *     by NAT rewriting).  When this happens, tell
 	 *     ip_forward to do the right thing.
 	 */
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (!PFIL_HOOKED(&V_inet_pfil_hook))
 		goto passin;
 
 	odst = ip->ip_dst;
 	if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_IN, NULL) != 0)
 		return;
 	if (m == NULL)			/* consumed by filter */
 		return;
 
 	ip = mtod(m, struct ip *);
 	dchg = (odst.s_addr != ip->ip_dst.s_addr);
 	ifp = m->m_pkthdr.rcvif;
 
 	if (m->m_flags & M_FASTFWD_OURS) {
 		m->m_flags &= ~M_FASTFWD_OURS;
 		goto ours;
 	}
 	if (m->m_flags & M_IP_NEXTHOP) {
 		dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL);
 		if (dchg != 0) {
 			/*
 			 * Directly ship the packet on.  This allows
 			 * forwarding packets originally destined to us
 			 * to some other directly connected host.
 			 */
 			ip_forward(m, 1);
 			return;
 		}
 	}
 passin:
 
 	/*
 	 * Process options and, if not destined for us,
 	 * ship it on.  ip_dooptions returns 1 when an
 	 * error was detected (causing an icmp message
 	 * to be sent and the original packet to be freed).
 	 */
 	if (hlen > sizeof (struct ip) && ip_dooptions(m, 0))
 		return;
 
         /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no
          * matter if it is destined to another node, or whether it is 
          * a multicast one, RSVP wants it! and prevents it from being forwarded
          * anywhere else. Also checks if the rsvp daemon is running before
 	 * grabbing the packet.
          */
 	if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) 
 		goto ours;
 
 	/*
 	 * Check our list of addresses, to see if the packet is for us.
 	 * If we don't have any addresses, assume any unicast packet
 	 * we receive might be for us (and let the upper layers deal
 	 * with it).
 	 */
 	if (TAILQ_EMPTY(&V_in_ifaddrhead) &&
 	    (m->m_flags & (M_MCAST|M_BCAST)) == 0)
 		goto ours;
 
 	/*
 	 * Enable a consistency check between the destination address
 	 * and the arrival interface for a unicast packet (the RFC 1122
 	 * strong ES model) if IP forwarding is disabled and the packet
 	 * is not locally generated and the packet is not subject to
 	 * 'ipfw fwd'.
 	 *
 	 * XXX - Checking also should be disabled if the destination
 	 * address is ipnat'ed to a different interface.
 	 *
 	 * XXX - Checking is incompatible with IP aliases added
 	 * to the loopback interface instead of the interface where
 	 * the packets are received.
 	 *
 	 * XXX - This is the case for carp vhost IPs as well so we
 	 * insert a workaround. If the packet got here, we already
 	 * checked with carp_iamatch() and carp_forus().
 	 */
 	checkif = V_ip_checkinterface && (V_ipforwarding == 0) && 
 	    ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) &&
 	    ifp->if_carp == NULL && (dchg == 0);
 
 	/*
 	 * Check for exact addresses in the hash bucket.
 	 */
 	/* IN_IFADDR_RLOCK(); */
 	LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) {
 		/*
 		 * If the address matches, verify that the packet
 		 * arrived via the correct interface if checking is
 		 * enabled.
 		 */
 		if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && 
 		    (!checkif || ia->ia_ifp == ifp)) {
 			counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 			counter_u64_add(ia->ia_ifa.ifa_ibytes,
 			    m->m_pkthdr.len);
 			/* IN_IFADDR_RUNLOCK(); */
 			goto ours;
 		}
 	}
 	/* IN_IFADDR_RUNLOCK(); */
 
 	/*
 	 * Check for broadcast addresses.
 	 *
 	 * Only accept broadcast packets that arrive via the matching
 	 * interface.  Reception of forwarded directed broadcasts would
 	 * be handled via ip_forward() and ether_output() with the loopback
 	 * into the stack for SIMPLEX interfaces handled by ether_output().
 	 */
 	if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) {
 		IF_ADDR_RLOCK(ifp);
 	        TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			ia = ifatoia(ifa);
 			if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
 			    ip->ip_dst.s_addr) {
 				counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_ibytes,
 				    m->m_pkthdr.len);
 				IF_ADDR_RUNLOCK(ifp);
 				goto ours;
 			}
 #ifdef BOOTP_COMPAT
 			if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) {
 				counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_ibytes,
 				    m->m_pkthdr.len);
 				IF_ADDR_RUNLOCK(ifp);
 				goto ours;
 			}
 #endif
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		ia = NULL;
 	}
 	/* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */
 	if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 		return;
 	}
 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
 		if (V_ip_mrouter) {
 			/*
 			 * If we are acting as a multicast router, all
 			 * incoming multicast packets are passed to the
 			 * kernel-level multicast forwarding function.
 			 * The packet is returned (relatively) intact; if
 			 * ip_mforward() returns a non-zero value, the packet
 			 * must be discarded, else it may be accepted below.
 			 */
 			if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) {
 				IPSTAT_INC(ips_cantforward);
 				m_freem(m);
 				return;
 			}
 
 			/*
 			 * The process-level routing daemon needs to receive
 			 * all multicast IGMP packets, whether or not this
 			 * host belongs to their destination groups.
 			 */
 			if (ip->ip_p == IPPROTO_IGMP)
 				goto ours;
 			IPSTAT_INC(ips_forward);
 		}
 		/*
 		 * Assume the packet is for us, to avoid prematurely taking
 		 * a lock on the in_multi hash. Protocols must perform
 		 * their own filtering and update statistics accordingly.
 		 */
 		goto ours;
 	}
 	if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST)
 		goto ours;
 	if (ip->ip_dst.s_addr == INADDR_ANY)
 		goto ours;
 
 	/*
 	 * FAITH(Firewall Aided Internet Translator)
 	 */
 	if (ifp && ifp->if_type == IFT_FAITH) {
 		if (V_ip_keepfaith) {
 			if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP) 
 				goto ours;
 		}
 		m_freem(m);
 		return;
 	}
 
 	/*
 	 * Not for us; forward if possible and desirable.
 	 */
 	if (V_ipforwarding == 0) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 	} else {
 #ifdef IPSEC
 		if (ip_ipsec_fwd(m))
 			goto bad;
 #endif /* IPSEC */
 		ip_forward(m, dchg);
 	}
 	return;
 
 ours:
 #ifdef IPSTEALTH
 	/*
 	 * IPSTEALTH: Process non-routing options only
 	 * if the packet is destined for us.
 	 */
 	if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1))
 		return;
 #endif /* IPSTEALTH */
 
 	/*
 	 * Attempt reassembly; if it succeeds, proceed.
 	 * ip_reass() will return a different mbuf.
 	 */
 	if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) {
 		/* XXXGL: shouldn't we save & set m_flags? */
 		m = ip_reass(m);
 		if (m == NULL)
 			return;
 		ip = mtod(m, struct ip *);
 		/* Get the header length of the reassembled packet */
 		hlen = ip->ip_hl << 2;
 	}
 
 #ifdef IPSEC
 	/*
 	 * enforce IPsec policy checking if we are seeing last header.
 	 * note that we do not visit this with protocols with pcb layer
 	 * code - like udp/tcp/raw ip.
 	 */
 	if (ip_ipsec_input(m))
 		goto bad;
 #endif /* IPSEC */
 
 	/*
 	 * Switch out to protocol's input routine.
 	 */
 	IPSTAT_INC(ips_delivered);
 
 	(*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p);
 	return;
 bad:
 	m_freem(m);
 }
 
 /*
  * After maxnipq has been updated, propagate the change to UMA.  The UMA zone
  * max has slightly different semantics than the sysctl, for historical
  * reasons.
  */
 static void
 maxnipq_update(void)
 {
 
 	/*
 	 * -1 for unlimited allocation.
 	 */
 	if (V_maxnipq < 0)
 		uma_zone_set_max(V_ipq_zone, 0);
 	/*
 	 * Positive number for specific bound.
 	 */
 	if (V_maxnipq > 0)
 		uma_zone_set_max(V_ipq_zone, V_maxnipq);
 	/*
 	 * Zero specifies no further fragment queue allocation -- set the
 	 * bound very low, but rely on implementation elsewhere to actually
 	 * prevent allocation and reclaim current queues.
 	 */
 	if (V_maxnipq == 0)
 		uma_zone_set_max(V_ipq_zone, 1);
 }
 
 static void
 ipq_zone_change(void *tag)
 {
 
 	if (V_maxnipq > 0 && V_maxnipq < (nmbclusters / 32)) {
 		V_maxnipq = nmbclusters / 32;
 		maxnipq_update();
 	}
 }
 
 static int
 sysctl_maxnipq(SYSCTL_HANDLER_ARGS)
 {
 	int error, i;
 
 	i = V_maxnipq;
 	error = sysctl_handle_int(oidp, &i, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	/*
 	 * XXXRW: Might be a good idea to sanity check the argument and place
 	 * an extreme upper bound.
 	 */
 	if (i < -1)
 		return (EINVAL);
 	V_maxnipq = i;
 	maxnipq_update();
 	return (0);
 }
 
 SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLTYPE_INT|CTLFLAG_RW,
     NULL, 0, sysctl_maxnipq, "I",
     "Maximum number of IPv4 fragment reassembly queue entries");
 
 #define	M_IP_FRAG	M_PROTO9
 
 /*
  * Take incoming datagram fragment and try to reassemble it into
  * whole datagram.  If the argument is the first fragment or one
  * in between the function will return NULL and store the mbuf
  * in the fragment chain.  If the argument is the last fragment
  * the packet will be reassembled and the pointer to the new
  * mbuf returned for further processing.  Only m_tags attached
  * to the first packet/fragment are preserved.
  * The IP header is *NOT* adjusted out of iplen.
  */
 struct mbuf *
 ip_reass(struct mbuf *m)
 {
 	struct ip *ip;
 	struct mbuf *p, *q, *nq, *t;
 	struct ipq *fp = NULL;
 	struct ipqhead *head;
 	int i, hlen, next;
 	u_int8_t ecn, ecn0;
 	u_short hash;
 #ifdef	RSS
 	uint32_t rss_hash, rss_type;
 #endif
 
 	/* If maxnipq or maxfragsperpacket are 0, never accept fragments. */
 	if (V_maxnipq == 0 || V_maxfragsperpacket == 0) {
 		IPSTAT_INC(ips_fragments);
 		IPSTAT_INC(ips_fragdropped);
 		m_freem(m);
 		return (NULL);
 	}
 
 	ip = mtod(m, struct ip *);
 	hlen = ip->ip_hl << 2;
 
 	hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
 	head = &V_ipq[hash];
 	IPQ_LOCK();
 
 	/*
 	 * Look for queue of fragments
 	 * of this datagram.
 	 */
 	TAILQ_FOREACH(fp, head, ipq_list)
 		if (ip->ip_id == fp->ipq_id &&
 		    ip->ip_src.s_addr == fp->ipq_src.s_addr &&
 		    ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
 #ifdef MAC
 		    mac_ipq_match(m, fp) &&
 #endif
 		    ip->ip_p == fp->ipq_p)
 			goto found;
 
 	fp = NULL;
 
 	/*
 	 * Attempt to trim the number of allocated fragment queues if it
 	 * exceeds the administrative limit.
 	 */
 	if ((V_nipq > V_maxnipq) && (V_maxnipq > 0)) {
 		/*
 		 * drop something from the tail of the current queue
 		 * before proceeding further
 		 */
 		struct ipq *q = TAILQ_LAST(head, ipqhead);
 		if (q == NULL) {   /* gak */
 			for (i = 0; i < IPREASS_NHASH; i++) {
 				struct ipq *r = TAILQ_LAST(&V_ipq[i], ipqhead);
 				if (r) {
 					IPSTAT_ADD(ips_fragtimeout,
 					    r->ipq_nfrags);
 					ip_freef(&V_ipq[i], r);
 					break;
 				}
 			}
 		} else {
 			IPSTAT_ADD(ips_fragtimeout, q->ipq_nfrags);
 			ip_freef(head, q);
 		}
 	}
 
 found:
 	/*
 	 * Adjust ip_len to not reflect header,
 	 * convert offset of this to bytes.
 	 */
 	ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
 	if (ip->ip_off & htons(IP_MF)) {
 		/*
 		 * Make sure that fragments have a data length
 		 * that's a non-zero multiple of 8 bytes.
 		 */
 		if (ip->ip_len == htons(0) || (ntohs(ip->ip_len) & 0x7) != 0) {
 			IPSTAT_INC(ips_toosmall); /* XXX */
 			goto dropfrag;
 		}
 		m->m_flags |= M_IP_FRAG;
 	} else
 		m->m_flags &= ~M_IP_FRAG;
 	ip->ip_off = htons(ntohs(ip->ip_off) << 3);
 
 	/*
 	 * Attempt reassembly; if it succeeds, proceed.
 	 * ip_reass() will return a different mbuf.
 	 */
 	IPSTAT_INC(ips_fragments);
 	m->m_pkthdr.PH_loc.ptr = ip;
 
 	/* Previous ip_reass() started here. */
 	/*
 	 * Presence of header sizes in mbufs
 	 * would confuse code below.
 	 */
 	m->m_data += hlen;
 	m->m_len -= hlen;
 
 	/*
 	 * If first fragment to arrive, create a reassembly queue.
 	 */
 	if (fp == NULL) {
 		fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
 		if (fp == NULL)
 			goto dropfrag;
 #ifdef MAC
 		if (mac_ipq_init(fp, M_NOWAIT) != 0) {
 			uma_zfree(V_ipq_zone, fp);
 			fp = NULL;
 			goto dropfrag;
 		}
 		mac_ipq_create(m, fp);
 #endif
 		TAILQ_INSERT_HEAD(head, fp, ipq_list);
 		V_nipq++;
 		fp->ipq_nfrags = 1;
 		fp->ipq_ttl = IPFRAGTTL;
 		fp->ipq_p = ip->ip_p;
 		fp->ipq_id = ip->ip_id;
 		fp->ipq_src = ip->ip_src;
 		fp->ipq_dst = ip->ip_dst;
 		fp->ipq_frags = m;
 		m->m_nextpkt = NULL;
 		goto done;
 	} else {
 		fp->ipq_nfrags++;
 #ifdef MAC
 		mac_ipq_update(m, fp);
 #endif
 	}
 
 #define GETIP(m)	((struct ip*)((m)->m_pkthdr.PH_loc.ptr))
 
 	/*
 	 * Handle ECN by comparing this segment with the first one;
 	 * if CE is set, do not lose CE.
 	 * drop if CE and not-ECT are mixed for the same packet.
 	 */
 	ecn = ip->ip_tos & IPTOS_ECN_MASK;
 	ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
 	if (ecn == IPTOS_ECN_CE) {
 		if (ecn0 == IPTOS_ECN_NOTECT)
 			goto dropfrag;
 		if (ecn0 != IPTOS_ECN_CE)
 			GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
 	}
 	if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
 		goto dropfrag;
 
 	/*
 	 * Find a segment which begins after this one does.
 	 */
 	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
 		if (ntohs(GETIP(q)->ip_off) > ntohs(ip->ip_off))
 			break;
 
 	/*
 	 * If there is a preceding segment, it may provide some of
 	 * our data already.  If so, drop the data from the incoming
 	 * segment.  If it provides all of our data, drop us, otherwise
 	 * stick new segment in the proper place.
 	 *
 	 * If some of the data is dropped from the preceding
 	 * segment, then it's checksum is invalidated.
 	 */
 	if (p) {
 		i = ntohs(GETIP(p)->ip_off) + ntohs(GETIP(p)->ip_len) -
 		    ntohs(ip->ip_off);
 		if (i > 0) {
 			if (i >= ntohs(ip->ip_len))
 				goto dropfrag;
 			m_adj(m, i);
 			m->m_pkthdr.csum_flags = 0;
 			ip->ip_off = htons(ntohs(ip->ip_off) + i);
 			ip->ip_len = htons(ntohs(ip->ip_len) - i);
 		}
 		m->m_nextpkt = p->m_nextpkt;
 		p->m_nextpkt = m;
 	} else {
 		m->m_nextpkt = fp->ipq_frags;
 		fp->ipq_frags = m;
 	}
 
 	/*
 	 * While we overlap succeeding segments trim them or,
 	 * if they are completely covered, dequeue them.
 	 */
 	for (; q != NULL && ntohs(ip->ip_off) + ntohs(ip->ip_len) >
 	    ntohs(GETIP(q)->ip_off); q = nq) {
 		i = (ntohs(ip->ip_off) + ntohs(ip->ip_len)) -
 		    ntohs(GETIP(q)->ip_off);
 		if (i < ntohs(GETIP(q)->ip_len)) {
 			GETIP(q)->ip_len = htons(ntohs(GETIP(q)->ip_len) - i);
 			GETIP(q)->ip_off = htons(ntohs(GETIP(q)->ip_off) + i);
 			m_adj(q, i);
 			q->m_pkthdr.csum_flags = 0;
 			break;
 		}
 		nq = q->m_nextpkt;
 		m->m_nextpkt = nq;
 		IPSTAT_INC(ips_fragdropped);
 		fp->ipq_nfrags--;
 		m_freem(q);
 	}
 
 	/*
 	 * Check for complete reassembly and perform frag per packet
 	 * limiting.
 	 *
 	 * Frag limiting is performed here so that the nth frag has
 	 * a chance to complete the packet before we drop the packet.
 	 * As a result, n+1 frags are actually allowed per packet, but
 	 * only n will ever be stored. (n = maxfragsperpacket.)
 	 *
 	 */
 	next = 0;
 	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
 		if (ntohs(GETIP(q)->ip_off) != next) {
 			if (fp->ipq_nfrags > V_maxfragsperpacket) {
 				IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
 				ip_freef(head, fp);
 			}
 			goto done;
 		}
 		next += ntohs(GETIP(q)->ip_len);
 	}
 	/* Make sure the last packet didn't have the IP_MF flag */
 	if (p->m_flags & M_IP_FRAG) {
 		if (fp->ipq_nfrags > V_maxfragsperpacket) {
 			IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
 			ip_freef(head, fp);
 		}
 		goto done;
 	}
 
 	/*
 	 * Reassembly is complete.  Make sure the packet is a sane size.
 	 */
 	q = fp->ipq_frags;
 	ip = GETIP(q);
 	if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
 		IPSTAT_INC(ips_toolong);
 		IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
 		ip_freef(head, fp);
 		goto done;
 	}
 
 	/*
 	 * Concatenate fragments.
 	 */
 	m = q;
 	t = m->m_next;
 	m->m_next = NULL;
 	m_cat(m, t);
 	nq = q->m_nextpkt;
 	q->m_nextpkt = NULL;
 	for (q = nq; q != NULL; q = nq) {
 		nq = q->m_nextpkt;
 		q->m_nextpkt = NULL;
 		m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags;
 		m->m_pkthdr.csum_data += q->m_pkthdr.csum_data;
 		m_cat(m, q);
 	}
 	/*
 	 * In order to do checksumming faster we do 'end-around carry' here
 	 * (and not in for{} loop), though it implies we are not going to
 	 * reassemble more than 64k fragments.
 	 */
 	while (m->m_pkthdr.csum_data & 0xffff0000)
 		m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
 		    (m->m_pkthdr.csum_data >> 16);
 #ifdef MAC
 	mac_ipq_reassemble(fp, m);
 	mac_ipq_destroy(fp);
 #endif
 
 	/*
 	 * Create header for new ip packet by modifying header of first
 	 * packet;  dequeue and discard fragment reassembly header.
 	 * Make header visible.
 	 */
 	ip->ip_len = htons((ip->ip_hl << 2) + next);
 	ip->ip_src = fp->ipq_src;
 	ip->ip_dst = fp->ipq_dst;
 	TAILQ_REMOVE(head, fp, ipq_list);
 	V_nipq--;
 	uma_zfree(V_ipq_zone, fp);
 	m->m_len += (ip->ip_hl << 2);
 	m->m_data -= (ip->ip_hl << 2);
 	/* some debugging cruft by sklower, below, will go away soon */
 	if (m->m_flags & M_PKTHDR)	/* XXX this should be done elsewhere */
 		m_fixhdr(m);
 	IPSTAT_INC(ips_reassembled);
 	IPQ_UNLOCK();
 
 #ifdef	RSS
 	/*
 	 * Query the RSS layer for the flowid / flowtype for the
 	 * mbuf payload.
 	 *
 	 * For now, just assume we have to calculate a new one.
 	 * Later on we should check to see if the assigned flowid matches
 	 * what RSS wants for the given IP protocol and if so, just keep it.
 	 *
 	 * We then queue into the relevant netisr so it can be dispatched
 	 * to the correct CPU.
 	 *
 	 * Note - this may return 1, which means the flowid in the mbuf
 	 * is correct for the configured RSS hash types and can be used.
 	 */
 	if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) {
 		m->m_pkthdr.flowid = rss_hash;
 		M_HASHTYPE_SET(m, rss_type);
 		m->m_flags |= M_FLOWID;
 	}
 
 	/*
 	 * Queue/dispatch for reprocessing.
 	 *
 	 * Note: this is much slower than just handling the frame in the
 	 * current receive context.  It's likely worth investigating
 	 * why this is.
 	 */
 	netisr_dispatch(NETISR_IP_DIRECT, m);
 	return (NULL);
 #endif
 
 	/* Handle in-line */
 	return (m);
 
 dropfrag:
 	IPSTAT_INC(ips_fragdropped);
 	if (fp != NULL)
 		fp->ipq_nfrags--;
 	m_freem(m);
 done:
 	IPQ_UNLOCK();
 	return (NULL);
 
 #undef GETIP
 }
 
 /*
  * Free a fragment reassembly header and all
  * associated datagrams.
  */
 static void
 ip_freef(struct ipqhead *fhp, struct ipq *fp)
 {
 	struct mbuf *q;
 
 	IPQ_LOCK_ASSERT();
 
 	while (fp->ipq_frags) {
 		q = fp->ipq_frags;
 		fp->ipq_frags = q->m_nextpkt;
 		m_freem(q);
 	}
 	TAILQ_REMOVE(fhp, fp, ipq_list);
 	uma_zfree(V_ipq_zone, fp);
 	V_nipq--;
 }
 
 /*
  * IP timer processing;
  * if a timer expires on a reassembly
  * queue, discard it.
  */
 void
 ip_slowtimo(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 	struct ipq *fp;
 	int i;
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	IPQ_LOCK();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		for (i = 0; i < IPREASS_NHASH; i++) {
 			for(fp = TAILQ_FIRST(&V_ipq[i]); fp;) {
 				struct ipq *fpp;
 
 				fpp = fp;
 				fp = TAILQ_NEXT(fp, ipq_list);
 				if(--fpp->ipq_ttl == 0) {
 					IPSTAT_ADD(ips_fragtimeout,
 					    fpp->ipq_nfrags);
 					ip_freef(&V_ipq[i], fpp);
 				}
 			}
 		}
 		/*
 		 * If we are over the maximum number of fragments
 		 * (due to the limit being lowered), drain off
 		 * enough to get down to the new limit.
 		 */
 		if (V_maxnipq >= 0 && V_nipq > V_maxnipq) {
 			for (i = 0; i < IPREASS_NHASH; i++) {
 				while (V_nipq > V_maxnipq &&
 				    !TAILQ_EMPTY(&V_ipq[i])) {
 					IPSTAT_ADD(ips_fragdropped,
 					    TAILQ_FIRST(&V_ipq[i])->ipq_nfrags);
 					ip_freef(&V_ipq[i],
 					    TAILQ_FIRST(&V_ipq[i]));
 				}
 			}
 		}
 		CURVNET_RESTORE();
 	}
 	IPQ_UNLOCK();
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Drain off all datagram fragments.
  */
 static void
 ip_drain_locked(void)
 {
 	int     i;
 
 	IPQ_LOCK_ASSERT();
 
 	for (i = 0; i < IPREASS_NHASH; i++) {
 		while(!TAILQ_EMPTY(&V_ipq[i])) {
 			IPSTAT_ADD(ips_fragdropped,
 			    TAILQ_FIRST(&V_ipq[i])->ipq_nfrags);
 			ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i]));
 		}
 	}
 }
 
 void
 ip_drain(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	IPQ_LOCK();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		ip_drain_locked();
 		CURVNET_RESTORE();
 	}
 	IPQ_UNLOCK();
 	VNET_LIST_RUNLOCK_NOSLEEP();
 	in_rtqdrain();
 }
 
 /*
  * The protocol to be inserted into ip_protox[] must be already registered
  * in inetsw[], either statically or through pf_proto_register().
  */
 int
 ipproto_register(short ipproto)
 {
 	struct protosw *pr;
 
 	/* Sanity checks. */
 	if (ipproto <= 0 || ipproto >= IPPROTO_MAX)
 		return (EPROTONOSUPPORT);
 
 	/*
 	 * The protocol slot must not be occupied by another protocol
 	 * already.  An index pointing to IPPROTO_RAW is unused.
 	 */
 	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		return (EPFNOSUPPORT);
 	if (ip_protox[ipproto] != pr - inetsw)	/* IPPROTO_RAW */
 		return (EEXIST);
 
 	/* Find the protocol position in inetsw[] and set the index. */
 	for (pr = inetdomain.dom_protosw;
 	     pr < inetdomain.dom_protoswNPROTOSW; pr++) {
 		if (pr->pr_domain->dom_family == PF_INET &&
 		    pr->pr_protocol && pr->pr_protocol == ipproto) {
 			ip_protox[pr->pr_protocol] = pr - inetsw;
 			return (0);
 		}
 	}
 	return (EPROTONOSUPPORT);
 }
 
 int
 ipproto_unregister(short ipproto)
 {
 	struct protosw *pr;
 
 	/* Sanity checks. */
 	if (ipproto <= 0 || ipproto >= IPPROTO_MAX)
 		return (EPROTONOSUPPORT);
 
 	/* Check if the protocol was indeed registered. */
 	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		return (EPFNOSUPPORT);
 	if (ip_protox[ipproto] == pr - inetsw)  /* IPPROTO_RAW */
 		return (ENOENT);
 
 	/* Reset the protocol slot to IPPROTO_RAW. */
 	ip_protox[ipproto] = pr - inetsw;
 	return (0);
 }
 
 /*
  * Given address of next destination (final or next hop), return (referenced)
  * internet address info of interface to be used to get there.
  */
 struct in_ifaddr *
 ip_rtaddr(struct in_addr dst, u_int fibnum)
 {
 	struct route sro;
 	struct sockaddr_in *sin;
 	struct in_ifaddr *ia;
 
 	bzero(&sro, sizeof(sro));
 	sin = (struct sockaddr_in *)&sro.ro_dst;
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(*sin);
 	sin->sin_addr = dst;
 	in_rtalloc_ign(&sro, 0, fibnum);
 
 	if (sro.ro_rt == NULL)
 		return (NULL);
 
 	ia = ifatoia(sro.ro_rt->rt_ifa);
 	ifa_ref(&ia->ia_ifa);
 	RTFREE(sro.ro_rt);
 	return (ia);
 }
 
 u_char inetctlerrmap[PRC_NCMDS] = {
 	0,		0,		0,		0,
 	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
 	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
 	EMSGSIZE,	EHOSTUNREACH,	0,		0,
 	0,		0,		EHOSTUNREACH,	0,
 	ENOPROTOOPT,	ECONNREFUSED
 };
 
 /*
  * Forward a packet.  If some error occurs return the sender
  * an icmp packet.  Note we can't always generate a meaningful
  * icmp message because icmp doesn't have a large enough repertoire
  * of codes and types.
  *
  * If not forwarding, just drop the packet.  This could be confusing
  * if ipforwarding was zero but some routing protocol was advancing
  * us as a gateway to somewhere.  However, we must let the routing
  * protocol deal with that.
  *
  * The srcrt parameter indicates whether the packet is being forwarded
  * via a source route.
  */
 void
 ip_forward(struct mbuf *m, int srcrt)
 {
 	struct ip *ip = mtod(m, struct ip *);
 	struct in_ifaddr *ia;
 	struct mbuf *mcopy;
 	struct in_addr dest;
 	struct route ro;
 	int error, type = 0, code = 0, mtu = 0;
 
 	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 		return;
 	}
 #ifdef IPSTEALTH
 	if (!V_ipstealth) {
 #endif
 		if (ip->ip_ttl <= IPTTLDEC) {
 			icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
 			    0, 0);
 			return;
 		}
 #ifdef IPSTEALTH
 	}
 #endif
 
 	ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m));
 #ifndef IPSEC
 	/*
 	 * 'ia' may be NULL if there is no route for this destination.
 	 * In case of IPsec, Don't discard it just yet, but pass it to
 	 * ip_output in case of outgoing IPsec policy.
 	 */
 	if (!srcrt && ia == NULL) {
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
 		return;
 	}
 #endif
 
 	/*
 	 * Save the IP header and at most 8 bytes of the payload,
 	 * in case we need to generate an ICMP message to the src.
 	 *
 	 * XXX this can be optimized a lot by saving the data in a local
 	 * buffer on the stack (72 bytes at most), and only allocating the
 	 * mbuf if really necessary. The vast majority of the packets
 	 * are forwarded without having to send an ICMP back (either
 	 * because unnecessary, or because rate limited), so we are
 	 * really we are wasting a lot of work here.
 	 *
 	 * We don't use m_copy() because it might return a reference
 	 * to a shared cluster. Both this function and ip_output()
 	 * assume exclusive access to the IP header in `m', so any
 	 * data in a cluster may change before we reach icmp_error().
 	 */
 	mcopy = m_gethdr(M_NOWAIT, m->m_type);
 	if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_NOWAIT)) {
 		/*
 		 * It's probably ok if the pkthdr dup fails (because
 		 * the deep copy of the tag chain failed), but for now
 		 * be conservative and just discard the copy since
 		 * code below may some day want the tags.
 		 */
 		m_free(mcopy);
 		mcopy = NULL;
 	}
 	if (mcopy != NULL) {
 		mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy));
 		mcopy->m_pkthdr.len = mcopy->m_len;
 		m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t));
 	}
 
 #ifdef IPSTEALTH
 	if (!V_ipstealth) {
 #endif
 		ip->ip_ttl -= IPTTLDEC;
 #ifdef IPSTEALTH
 	}
 #endif
 
 	/*
 	 * If forwarding packet using same interface that it came in on,
 	 * perhaps should send a redirect to sender to shortcut a hop.
 	 * Only send redirect if source is sending directly to us,
 	 * and if packet was not source routed (or has any options).
 	 * Also, don't send redirect if forwarding using a default route
 	 * or a route modified by a redirect.
 	 */
 	dest.s_addr = 0;
 	if (!srcrt && V_ipsendredirects &&
 	    ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) {
 		struct sockaddr_in *sin;
 		struct rtentry *rt;
 
 		bzero(&ro, sizeof(ro));
 		sin = (struct sockaddr_in *)&ro.ro_dst;
 		sin->sin_family = AF_INET;
 		sin->sin_len = sizeof(*sin);
 		sin->sin_addr = ip->ip_dst;
 		in_rtalloc_ign(&ro, 0, M_GETFIB(m));
 
 		rt = ro.ro_rt;
 
 		if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
 		    satosin(rt_key(rt))->sin_addr.s_addr != 0) {
 #define	RTA(rt)	((struct in_ifaddr *)(rt->rt_ifa))
 			u_long src = ntohl(ip->ip_src.s_addr);
 
 			if (RTA(rt) &&
 			    (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) {
 				if (rt->rt_flags & RTF_GATEWAY)
 					dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr;
 				else
 					dest.s_addr = ip->ip_dst.s_addr;
 				/* Router requirements says to only send host redirects */
 				type = ICMP_REDIRECT;
 				code = ICMP_REDIRECT_HOST;
 			}
 		}
 		if (rt)
 			RTFREE(rt);
 	}
 
 	/*
 	 * Try to cache the route MTU from ip_output so we can consider it for
 	 * the ICMP_UNREACH_NEEDFRAG "Next-Hop MTU" field described in RFC1191.
 	 */
 	bzero(&ro, sizeof(ro));
 
 	error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL);
 
 	if (error == EMSGSIZE && ro.ro_rt)
 		mtu = ro.ro_rt->rt_mtu;
 	RO_RTFREE(&ro);
 
 	if (error)
 		IPSTAT_INC(ips_cantforward);
 	else {
 		IPSTAT_INC(ips_forward);
 		if (type)
 			IPSTAT_INC(ips_redirectsent);
 		else {
 			if (mcopy)
 				m_freem(mcopy);
 			if (ia != NULL)
 				ifa_free(&ia->ia_ifa);
 			return;
 		}
 	}
 	if (mcopy == NULL) {
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		return;
 	}
 
 	switch (error) {
 
 	case 0:				/* forwarded, but need redirect */
 		/* type, code set above */
 		break;
 
 	case ENETUNREACH:
 	case EHOSTUNREACH:
 	case ENETDOWN:
 	case EHOSTDOWN:
 	default:
 		type = ICMP_UNREACH;
 		code = ICMP_UNREACH_HOST;
 		break;
 
 	case EMSGSIZE:
 		type = ICMP_UNREACH;
 		code = ICMP_UNREACH_NEEDFRAG;
 
 #ifdef IPSEC
 		/* 
 		 * If IPsec is configured for this path,
 		 * override any possibly mtu value set by ip_output.
 		 */ 
 		mtu = ip_ipsec_mtu(mcopy, mtu);
 #endif /* IPSEC */
 		/*
 		 * If the MTU was set before make sure we are below the
 		 * interface MTU.
 		 * If the MTU wasn't set before use the interface mtu or
 		 * fall back to the next smaller mtu step compared to the
 		 * current packet size.
 		 */
 		if (mtu != 0) {
 			if (ia != NULL)
 				mtu = min(mtu, ia->ia_ifp->if_mtu);
 		} else {
 			if (ia != NULL)
 				mtu = ia->ia_ifp->if_mtu;
 			else
 				mtu = ip_next_mtu(ntohs(ip->ip_len), 0);
 		}
 		IPSTAT_INC(ips_cantfrag);
 		break;
 
 	case ENOBUFS:
 		/*
 		 * A router should not generate ICMP_SOURCEQUENCH as
 		 * required in RFC1812 Requirements for IP Version 4 Routers.
 		 * Source quench could be a big problem under DoS attacks,
 		 * or if the underlying interface is rate-limited.
 		 * Those who need source quench packets may re-enable them
 		 * via the net.inet.ip.sendsourcequench sysctl.
 		 */
 		if (V_ip_sendsourcequench == 0) {
 			m_freem(mcopy);
 			if (ia != NULL)
 				ifa_free(&ia->ia_ifa);
 			return;
 		} else {
 			type = ICMP_SOURCEQUENCH;
 			code = 0;
 		}
 		break;
 
 	case EACCES:			/* ipfw denied packet */
 		m_freem(mcopy);
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		return;
 	}
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 	icmp_error(mcopy, type, code, dest.s_addr, mtu);
 }
 
 void
 ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
     struct mbuf *m)
 {
 
 	if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) {
 		struct bintime bt;
 
 		bintime(&bt);
 		if (inp->inp_socket->so_options & SO_BINTIME) {
 			*mp = sbcreatecontrol((caddr_t)&bt, sizeof(bt),
 			    SCM_BINTIME, SOL_SOCKET);
 			if (*mp)
 				mp = &(*mp)->m_next;
 		}
 		if (inp->inp_socket->so_options & SO_TIMESTAMP) {
 			struct timeval tv;
 
 			bintime2timeval(&bt, &tv);
 			*mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv),
 			    SCM_TIMESTAMP, SOL_SOCKET);
 			if (*mp)
 				mp = &(*mp)->m_next;
 		}
 	}
 	if (inp->inp_flags & INP_RECVDSTADDR) {
 		*mp = sbcreatecontrol((caddr_t)&ip->ip_dst,
 		    sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 	if (inp->inp_flags & INP_RECVTTL) {
 		*mp = sbcreatecontrol((caddr_t)&ip->ip_ttl,
 		    sizeof(u_char), IP_RECVTTL, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 #ifdef notyet
 	/* XXX
 	 * Moving these out of udp_input() made them even more broken
 	 * than they already were.
 	 */
 	/* options were tossed already */
 	if (inp->inp_flags & INP_RECVOPTS) {
 		*mp = sbcreatecontrol((caddr_t)opts_deleted_above,
 		    sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 	/* ip_srcroute doesn't do what we want here, need to fix */
 	if (inp->inp_flags & INP_RECVRETOPTS) {
 		*mp = sbcreatecontrol((caddr_t)ip_srcroute(m),
 		    sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 #endif
 	if (inp->inp_flags & INP_RECVIF) {
 		struct ifnet *ifp;
 		struct sdlbuf {
 			struct sockaddr_dl sdl;
 			u_char	pad[32];
 		} sdlbuf;
 		struct sockaddr_dl *sdp;
 		struct sockaddr_dl *sdl2 = &sdlbuf.sdl;
 
 		if ((ifp = m->m_pkthdr.rcvif) &&
 		    ifp->if_index && ifp->if_index <= V_if_index) {
 			sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr;
 			/*
 			 * Change our mind and don't try copy.
 			 */
 			if (sdp->sdl_family != AF_LINK ||
 			    sdp->sdl_len > sizeof(sdlbuf)) {
 				goto makedummy;
 			}
 			bcopy(sdp, sdl2, sdp->sdl_len);
 		} else {
 makedummy:	
 			sdl2->sdl_len =
 			    offsetof(struct sockaddr_dl, sdl_data[0]);
 			sdl2->sdl_family = AF_LINK;
 			sdl2->sdl_index = 0;
 			sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0;
 		}
 		*mp = sbcreatecontrol((caddr_t)sdl2, sdl2->sdl_len,
 		    IP_RECVIF, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 	if (inp->inp_flags & INP_RECVTOS) {
 		*mp = sbcreatecontrol((caddr_t)&ip->ip_tos,
 		    sizeof(u_char), IP_RECVTOS, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 	if (inp->inp_flags2 & INP_RECVFLOWID) {
 		uint32_t flowid, flow_type;
 
 		flowid = m->m_pkthdr.flowid;
 		flow_type = M_HASHTYPE_GET(m);
 
 		/*
 		 * XXX should handle the failure of one or the
 		 * other - don't populate both?
 		 */
 		*mp = sbcreatecontrol((caddr_t) &flowid,
 		    sizeof(uint32_t), IP_FLOWID, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 		*mp = sbcreatecontrol((caddr_t) &flow_type,
 		    sizeof(uint32_t), IP_FLOWTYPE, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 #ifdef	RSS
 	if (inp->inp_flags2 & INP_RECVRSSBUCKETID) {
 		uint32_t flowid, flow_type;
 		uint32_t rss_bucketid;
 
 		flowid = m->m_pkthdr.flowid;
 		flow_type = M_HASHTYPE_GET(m);
 
 		if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) {
 			*mp = sbcreatecontrol((caddr_t) &rss_bucketid,
 			   sizeof(uint32_t), IP_RSSBUCKETID, IPPROTO_IP);
 			if (*mp)
 				mp = &(*mp)->m_next;
 		}
 	}
 #endif
 }
 
 /*
  * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the
  * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on
  * locking.  This code remains in ip_input.c as ip_mroute.c is optionally
  * compiled.
  */
 static VNET_DEFINE(int, ip_rsvp_on);
 VNET_DEFINE(struct socket *, ip_rsvpd);
 
 #define	V_ip_rsvp_on		VNET(ip_rsvp_on)
 
 int
 ip_rsvp_init(struct socket *so)
 {
 
 	if (so->so_type != SOCK_RAW ||
 	    so->so_proto->pr_protocol != IPPROTO_RSVP)
 		return EOPNOTSUPP;
 
 	if (V_ip_rsvpd != NULL)
 		return EADDRINUSE;
 
 	V_ip_rsvpd = so;
 	/*
 	 * This may seem silly, but we need to be sure we don't over-increment
 	 * the RSVP counter, in case something slips up.
 	 */
 	if (!V_ip_rsvp_on) {
 		V_ip_rsvp_on = 1;
 		V_rsvp_on++;
 	}
 
 	return 0;
 }
 
 int
 ip_rsvp_done(void)
 {
 
 	V_ip_rsvpd = NULL;
 	/*
 	 * This may seem silly, but we need to be sure we don't over-decrement
 	 * the RSVP counter, in case something slips up.
 	 */
 	if (V_ip_rsvp_on) {
 		V_ip_rsvp_on = 0;
 		V_rsvp_on--;
 	}
 	return 0;
 }
 
 int
 rsvp_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m;
 
 	m = *mp;
 	*mp = NULL;
 
 	if (rsvp_input_p) { /* call the real one if loaded */
 		*mp = m;
 		rsvp_input_p(mp, offp, proto);
 		return (IPPROTO_DONE);
 	}
 
 	/* Can still get packets with rsvp_on = 0 if there is a local member
 	 * of the group to which the RSVP packet is addressed.  But in this
 	 * case we want to throw the packet away.
 	 */
 	
 	if (!V_rsvp_on) {
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	if (V_ip_rsvpd != NULL) { 
 		*mp = m;
 		rip_input(mp, offp, proto);
 		return (IPPROTO_DONE);
 	}
 	/* Drop the packet */
 	m_freem(m);
 	return (IPPROTO_DONE);
 }
Index: user/dchagin/lemul/sys/netinet/ip_ipsec.c
===================================================================
--- user/dchagin/lemul/sys/netinet/ip_ipsec.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/ip_ipsec.c	(revision 274241)
@@ -1,380 +1,380 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *      The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ipsec.h"
 #include "opt_sctp.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/ip_ipsec.h>
 #ifdef SCTP
 #include <netinet/sctp_crc32.h>
 #endif
 
 #include <machine/in_cksum.h>
 
 #include <netipsec/ipsec.h>
 #include <netipsec/xform.h>
 #include <netipsec/key.h>
 
 extern	struct protosw inetsw[];
 
 #ifdef IPSEC_FILTERTUNNEL
 static VNET_DEFINE(int, ip4_ipsec_filtertunnel) = 1;
 #else
 static VNET_DEFINE(int, ip4_ipsec_filtertunnel) = 0;
 #endif
 #define	V_ip4_ipsec_filtertunnel VNET(ip4_ipsec_filtertunnel)
 
 SYSCTL_DECL(_net_inet_ipsec);
-SYSCTL_VNET_INT(_net_inet_ipsec, OID_AUTO, filtertunnel,
-	CTLFLAG_RW, &VNET_NAME(ip4_ipsec_filtertunnel), 0,
+SYSCTL_INT(_net_inet_ipsec, OID_AUTO, filtertunnel,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_ipsec_filtertunnel), 0,
 	"If set filter packets from an IPsec tunnel.");
 
 /*
  * Check if we have to jump over firewall processing for this packet.
  * Called from ip_input().
  * 1 = jump over firewall, 0 = packet goes through firewall.
  */
 int
 ip_ipsec_filtertunnel(struct mbuf *m)
 {
 
 	/*
 	 * Bypass packet filtering for packets previously handled by IPsec.
 	 */
 	if (!V_ip4_ipsec_filtertunnel &&
 	    m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL)
 		return 1;
 	return 0;
 }
 
 /*
  * Check if this packet has an active SA and needs to be dropped instead
  * of forwarded.
  * Called from ip_input().
  * 1 = drop packet, 0 = forward packet.
  */
 int
 ip_ipsec_fwd(struct mbuf *m)
 {
 	struct m_tag *mtag;
 	struct tdb_ident *tdbi;
 	struct secpolicy *sp;
 	int error;
 
 	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
 	if (mtag != NULL) {
 		tdbi = (struct tdb_ident *)(mtag + 1);
 		sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND);
 	} else {
 		sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND,
 					   IP_FORWARDING, &error);   
 	}
 	if (sp == NULL) {	/* NB: can happen if error */
 		/*XXX error stat???*/
 		DPRINTF(("ip_input: no SP for forwarding\n"));	/*XXX*/
 		return 1;
 	}
 
 	/*
 	 * Check security policy against packet attributes.
 	 */
 	error = ipsec_in_reject(sp, m);
 	KEY_FREESP(&sp);
 	if (error) {
 		IPSTAT_INC(ips_cantforward);
 		return 1;
 	}
 	return 0;
 }
 
 /*
  * Check if protocol type doesn't have a further header and do IPSEC
  * decryption or reject right now.  Protocols with further headers get
  * their IPSEC treatment within the protocol specific processing.
  * Called from ip_input().
  * 1 = drop packet, 0 = continue processing packet.
  */
 int
 ip_ipsec_input(struct mbuf *m)
 {
 	struct ip *ip = mtod(m, struct ip *);
 	struct m_tag *mtag;
 	struct tdb_ident *tdbi;
 	struct secpolicy *sp;
 	int error;
 	/*
 	 * enforce IPsec policy checking if we are seeing last header.
 	 * note that we do not visit this with protocols with pcb layer
 	 * code - like udp/tcp/raw ip.
 	 */
 	if ((inetsw[ip_protox[ip->ip_p]].pr_flags & PR_LASTHDR) != 0) {
 		/*
 		 * Check if the packet has already had IPsec processing
 		 * done.  If so, then just pass it along.  This tag gets
 		 * set during AH, ESP, etc. input handling, before the
 		 * packet is returned to the ip input queue for delivery.
 		 */ 
 		mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
 		if (mtag != NULL) {
 			tdbi = (struct tdb_ident *)(mtag + 1);
 			sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND);
 		} else {
 			sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND,
 						   IP_FORWARDING, &error);   
 		}
 		if (sp != NULL) {
 			/*
 			 * Check security policy against packet attributes.
 			 */
 			error = ipsec_in_reject(sp, m);
 			KEY_FREESP(&sp);
 		} else {
 			/* XXX error stat??? */
 			error = EINVAL;
 			DPRINTF(("ip_input: no SP, packet discarded\n"));/*XXX*/
 			return 1;
 		}
 		if (error)
 			return 1;
 	}
 	return 0;
 }
 
 /*
  * Compute the MTU for a forwarded packet that gets IPSEC encapsulated.
  * Called from ip_forward().
  * Returns MTU suggestion for ICMP needfrag reply.
  */
 int
 ip_ipsec_mtu(struct mbuf *m, int mtu)
 {
 	/*
 	 * If the packet is routed over IPsec tunnel, tell the
 	 * originator the tunnel MTU.
 	 *	tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz
 	 * XXX quickhack!!!
 	 */
 	struct secpolicy *sp = NULL;
 	int ipsecerror;
 	int ipsechdr;
 	struct route *ro;
 	sp = ipsec_getpolicybyaddr(m,
 				   IPSEC_DIR_OUTBOUND,
 				   IP_FORWARDING,
 				   &ipsecerror);
 	if (sp != NULL) {
 		/* count IPsec header size */
 		ipsechdr = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, NULL);
 
 		/*
 		 * find the correct route for outer IPv4
 		 * header, compute tunnel MTU.
 		 */
 		if (sp->req != NULL &&
 		    sp->req->sav != NULL &&
 		    sp->req->sav->sah != NULL) {
 			ro = &sp->req->sav->sah->route_cache.sa_route;
 			if (ro->ro_rt && ro->ro_rt->rt_ifp) {
 				mtu = ro->ro_rt->rt_mtu ? ro->ro_rt->rt_mtu :
 				    ro->ro_rt->rt_ifp->if_mtu;
 				mtu -= ipsechdr;
 			}
 		}
 		KEY_FREESP(&sp);
 	}
 	return mtu;
 }
 
 /*
  * 
  * Called from ip_output().
  * 1 = drop packet, 0 = continue processing packet,
  * -1 = packet was reinjected and stop processing packet
  */
 int
 ip_ipsec_output(struct mbuf **m, struct inpcb *inp, int *flags, int *error)
 {
 	struct secpolicy *sp = NULL;
 	struct tdb_ident *tdbi;
 	struct m_tag *mtag;
 	/*
 	 * Check the security policy (SP) for the packet and, if
 	 * required, do IPsec-related processing.  There are two
 	 * cases here; the first time a packet is sent through
 	 * it will be untagged and handled by ipsec4_checkpolicy.
 	 * If the packet is resubmitted to ip_output (e.g. after
 	 * AH, ESP, etc. processing), there will be a tag to bypass
 	 * the lookup and related policy checking.
 	 */
 	mtag = m_tag_find(*m, PACKET_TAG_IPSEC_PENDING_TDB, NULL);
 	if (mtag != NULL) {
 		tdbi = (struct tdb_ident *)(mtag + 1);
 		sp = ipsec_getpolicy(tdbi, IPSEC_DIR_OUTBOUND);
 		if (sp == NULL)
 			*error = -EINVAL;	/* force silent drop */
 		m_tag_delete(*m, mtag);
 	} else {
 		sp = ipsec4_checkpolicy(*m, IPSEC_DIR_OUTBOUND, *flags,
 					error, inp);
 	}
 	/*
 	 * There are four return cases:
 	 *    sp != NULL	 	    apply IPsec policy
 	 *    sp == NULL, error == 0	    no IPsec handling needed
 	 *    sp == NULL, error == -EINVAL  discard packet w/o error
 	 *    sp == NULL, error != 0	    discard packet, report error
 	 */
 	if (sp != NULL) {
 		/* Loop detection, check if ipsec processing already done */
 		KASSERT(sp->req != NULL, ("ip_output: no ipsec request"));
 		for (mtag = m_tag_first(*m); mtag != NULL;
 		     mtag = m_tag_next(*m, mtag)) {
 			if (mtag->m_tag_cookie != MTAG_ABI_COMPAT)
 				continue;
 			if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE &&
 			    mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED)
 				continue;
 			/*
 			 * Check if policy has an SA associated with it.
 			 * This can happen when an SP has yet to acquire
 			 * an SA; e.g. on first reference.  If it occurs,
 			 * then we let ipsec4_process_packet do its thing.
 			 */
 			if (sp->req->sav == NULL)
 				break;
 			tdbi = (struct tdb_ident *)(mtag + 1);
 			if (tdbi->spi == sp->req->sav->spi &&
 			    tdbi->proto == sp->req->sav->sah->saidx.proto &&
 			    bcmp(&tdbi->dst, &sp->req->sav->sah->saidx.dst,
 				 sizeof (union sockaddr_union)) == 0) {
 				/*
 				 * No IPsec processing is needed, free
 				 * reference to SP.
 				 *
 				 * NB: null pointer to avoid free at
 				 *     done: below.
 				 */
 				KEY_FREESP(&sp), sp = NULL;
 				goto done;
 			}
 		}
 
 		/*
 		 * Do delayed checksums now because we send before
 		 * this is done in the normal processing path.
 		 */
 		if ((*m)->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 			in_delayed_cksum(*m);
 			(*m)->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 		}
 #ifdef SCTP
 		if ((*m)->m_pkthdr.csum_flags & CSUM_SCTP) {
 			struct ip *ip = mtod(*m, struct ip *);
 
 			sctp_delayed_cksum(*m, (uint32_t)(ip->ip_hl << 2));
 			(*m)->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 		}
 #endif
 
 		/* NB: callee frees mbuf */
 		*error = ipsec4_process_packet(*m, sp->req, *flags, 0);
 		if (*error == EJUSTRETURN) {
 			/*
 			 * We had a SP with a level of 'use' and no SA. We
 			 * will just continue to process the packet without
 			 * IPsec processing and return without error.
 			 */
 			*error = 0;
 			goto done;
 		}
 		/*
 		 * Preserve KAME behaviour: ENOENT can be returned
 		 * when an SA acquire is in progress.  Don't propagate
 		 * this to user-level; it confuses applications.
 		 *
 		 * XXX this will go away when the SADB is redone.
 		 */
 		if (*error == ENOENT)
 			*error = 0;
 		goto reinjected;
 	} else {	/* sp == NULL */
 
 		if (*error != 0) {
 			/*
 			 * Hack: -EINVAL is used to signal that a packet
 			 * should be silently discarded.  This is typically
 			 * because we asked key management for an SA and
 			 * it was delayed (e.g. kicked up to IKE).
 			 */
 			if (*error == -EINVAL)
 				*error = 0;
 			goto bad;
 		} else {
 			/* No IPsec processing for this packet. */
 		}
 	}
 done:
 	if (sp != NULL)
 		KEY_FREESP(&sp);
 	return 0;
 reinjected:
 	if (sp != NULL)
 		KEY_FREESP(&sp);
 	return -1;
 bad:
 	if (sp != NULL)
 		KEY_FREESP(&sp);
 	return 1;
 }
Index: user/dchagin/lemul/sys/netinet/ip_mroute.c
===================================================================
--- user/dchagin/lemul/sys/netinet/ip_mroute.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/ip_mroute.c	(revision 274241)
@@ -1,2948 +1,2948 @@
 /*-
  * Copyright (c) 1989 Stephen Deering
  * Copyright (c) 1992, 1993
  *      The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Stephen Deering of Stanford University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *      @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
  */
 
 /*
  * IP multicast forwarding procedures
  *
  * Written by David Waitzman, BBN Labs, August 1988.
  * Modified by Steve Deering, Stanford, February 1989.
  * Modified by Mark J. Steiglitz, Stanford, May, 1991
  * Modified by Van Jacobson, LBL, January 1993
  * Modified by Ajit Thyagarajan, PARC, August 1993
  * Modified by Bill Fenner, PARC, April 1995
  * Modified by Ahmed Helmy, SGI, June 1996
  * Modified by George Edmond Eddy (Rusty), ISI, February 1998
  * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000
  * Modified by Hitoshi Asaeda, WIDE, August 2000
  * Modified by Pavlin Radoslavov, ICSI, October 2002
  *
  * MROUTING Revision: 3.5
  * and PIM-SMv2 and PIM-DM support, advanced API support,
  * bandwidth metering and signaling
  */
 
 /*
  * TODO: Prefix functions with ipmf_.
  * TODO: Maintain a refcount on if_allmulti() in ifnet or in the protocol
  * domain attachment (if_afdata) so we can track consumers of that service.
  * TODO: Deprecate routing socket path for SIOCGETSGCNT and SIOCGETVIFCNT,
  * move it to socket options.
  * TODO: Cleanup LSRR removal further.
  * TODO: Push RSVP stubs into raw_ip.c.
  * TODO: Use bitstring.h for vif set.
  * TODO: Fix mrt6_ioctl dangling ref when dynamically loaded.
  * TODO: Sync ip6_mroute.c with this file.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_mrouting.h"
 
 #define _PIM_VT 1
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/stddef.h>
 #include <sys/lock.h>
 #include <sys/ktr.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/protosw.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 #include <sys/time.h>
 #include <sys/counter.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/igmp.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_encap.h>
 #include <netinet/ip_mroute.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/pim.h>
 #include <netinet/pim_var.h>
 #include <netinet/udp.h>
 
 #include <machine/in_cksum.h>
 
 #ifndef KTR_IPMF
 #define KTR_IPMF KTR_INET
 #endif
 
 #define		VIFI_INVALID	((vifi_t) -1)
 
 static VNET_DEFINE(uint32_t, last_tv_sec); /* last time we processed this */
 #define	V_last_tv_sec	VNET(last_tv_sec)
 
 static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast forwarding cache");
 
 /*
  * Locking.  We use two locks: one for the virtual interface table and
  * one for the forwarding table.  These locks may be nested in which case
  * the VIF lock must always be taken first.  Note that each lock is used
  * to cover not only the specific data structure but also related data
  * structures.
  */
 
 static struct mtx mrouter_mtx;
 #define	MROUTER_LOCK()		mtx_lock(&mrouter_mtx)
 #define	MROUTER_UNLOCK()	mtx_unlock(&mrouter_mtx)
 #define	MROUTER_LOCK_ASSERT()	mtx_assert(&mrouter_mtx, MA_OWNED)
 #define	MROUTER_LOCK_INIT()						\
 	mtx_init(&mrouter_mtx, "IPv4 multicast forwarding", NULL, MTX_DEF)
 #define	MROUTER_LOCK_DESTROY()	mtx_destroy(&mrouter_mtx)
 
 static int ip_mrouter_cnt;	/* # of vnets with active mrouters */
 static int ip_mrouter_unloading; /* Allow no more V_ip_mrouter sockets */
 
 static VNET_PCPUSTAT_DEFINE(struct mrtstat, mrtstat);
 VNET_PCPUSTAT_SYSINIT(mrtstat);
 VNET_PCPUSTAT_SYSUNINIT(mrtstat);
 SYSCTL_VNET_PCPUSTAT(_net_inet_ip, OID_AUTO, mrtstat, struct mrtstat,
     mrtstat, "IPv4 Multicast Forwarding Statistics (struct mrtstat, "
     "netinet/ip_mroute.h)");
 
 static VNET_DEFINE(u_long, mfchash);
 #define	V_mfchash		VNET(mfchash)
 #define	MFCHASH(a, g)							\
 	((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^ \
 	  ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & V_mfchash)
 #define	MFCHASHSIZE	256
 
 static u_long mfchashsize;			/* Hash size */
 static VNET_DEFINE(u_char *, nexpire);		/* 0..mfchashsize-1 */
 #define	V_nexpire		VNET(nexpire)
 static VNET_DEFINE(LIST_HEAD(mfchashhdr, mfc)*, mfchashtbl);
 #define	V_mfchashtbl		VNET(mfchashtbl)
 
 static struct mtx mfc_mtx;
 #define	MFC_LOCK()		mtx_lock(&mfc_mtx)
 #define	MFC_UNLOCK()		mtx_unlock(&mfc_mtx)
 #define	MFC_LOCK_ASSERT()	mtx_assert(&mfc_mtx, MA_OWNED)
 #define	MFC_LOCK_INIT()							\
 	mtx_init(&mfc_mtx, "IPv4 multicast forwarding cache", NULL, MTX_DEF)
 #define	MFC_LOCK_DESTROY()	mtx_destroy(&mfc_mtx)
 
 static VNET_DEFINE(vifi_t, numvifs);
 #define	V_numvifs		VNET(numvifs)
 static VNET_DEFINE(struct vif, viftable[MAXVIFS]);
 #define	V_viftable		VNET(viftable)
-SYSCTL_VNET_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_RD,
+SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_VNET | CTLFLAG_RD,
     &VNET_NAME(viftable), sizeof(V_viftable), "S,vif[MAXVIFS]",
     "IPv4 Multicast Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)");
 
 static struct mtx vif_mtx;
 #define	VIF_LOCK()		mtx_lock(&vif_mtx)
 #define	VIF_UNLOCK()		mtx_unlock(&vif_mtx)
 #define	VIF_LOCK_ASSERT()	mtx_assert(&vif_mtx, MA_OWNED)
 #define	VIF_LOCK_INIT()							\
 	mtx_init(&vif_mtx, "IPv4 multicast interfaces", NULL, MTX_DEF)
 #define	VIF_LOCK_DESTROY()	mtx_destroy(&vif_mtx)
 
 static eventhandler_tag if_detach_event_tag = NULL;
 
 static VNET_DEFINE(struct callout, expire_upcalls_ch);
 #define	V_expire_upcalls_ch	VNET(expire_upcalls_ch)
 
 #define		EXPIRE_TIMEOUT	(hz / 4)	/* 4x / second		*/
 #define		UPCALL_EXPIRE	6		/* number of timeouts	*/
 
 /*
  * Bandwidth meter variables and constants
  */
 static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters");
 /*
  * Pending timeouts are stored in a hash table, the key being the
  * expiration time. Periodically, the entries are analysed and processed.
  */
 #define	BW_METER_BUCKETS	1024
 static VNET_DEFINE(struct bw_meter*, bw_meter_timers[BW_METER_BUCKETS]);
 #define	V_bw_meter_timers	VNET(bw_meter_timers)
 static VNET_DEFINE(struct callout, bw_meter_ch);
 #define	V_bw_meter_ch		VNET(bw_meter_ch)
 #define	BW_METER_PERIOD (hz)		/* periodical handling of bw meters */
 
 /*
  * Pending upcalls are stored in a vector which is flushed when
  * full, or periodically
  */
 static VNET_DEFINE(struct bw_upcall, bw_upcalls[BW_UPCALLS_MAX]);
 #define	V_bw_upcalls		VNET(bw_upcalls)
 static VNET_DEFINE(u_int, bw_upcalls_n); /* # of pending upcalls */
 #define	V_bw_upcalls_n    	VNET(bw_upcalls_n)
 static VNET_DEFINE(struct callout, bw_upcalls_ch);
 #define	V_bw_upcalls_ch		VNET(bw_upcalls_ch)
 
 #define BW_UPCALLS_PERIOD (hz)		/* periodical flush of bw upcalls */
 
 static VNET_PCPUSTAT_DEFINE(struct pimstat, pimstat);
 VNET_PCPUSTAT_SYSINIT(pimstat);
 VNET_PCPUSTAT_SYSUNINIT(pimstat);
 
 SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM");
 SYSCTL_VNET_PCPUSTAT(_net_inet_pim, PIMCTL_STATS, stats, struct pimstat,
     pimstat, "PIM Statistics (struct pimstat, netinet/pim_var.h)");
 
 static u_long	pim_squelch_wholepkt = 0;
 SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RW,
     &pim_squelch_wholepkt, 0,
     "Disable IGMP_WHOLEPKT notifications if rendezvous point is unspecified");
 
 extern  struct domain inetdomain;
 static const struct protosw in_pim_protosw = {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inetdomain,
 	.pr_protocol =		IPPROTO_PIM,
 	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_LASTHDR,
 	.pr_input =		pim_input,
 	.pr_output =		rip_output,
 	.pr_ctloutput =		rip_ctloutput,
 	.pr_usrreqs =		&rip_usrreqs
 };
 static const struct encaptab *pim_encap_cookie;
 
 static int pim_encapcheck(const struct mbuf *, int, int, void *);
 
 /*
  * Note: the PIM Register encapsulation adds the following in front of a
  * data packet:
  *
  * struct pim_encap_hdr {
  *    struct ip ip;
  *    struct pim_encap_pimhdr  pim;
  * }
  *
  */
 
 struct pim_encap_pimhdr {
 	struct pim pim;
 	uint32_t   flags;
 };
 #define		PIM_ENCAP_TTL	64
 
 static struct ip pim_encap_iphdr = {
 #if BYTE_ORDER == LITTLE_ENDIAN
 	sizeof(struct ip) >> 2,
 	IPVERSION,
 #else
 	IPVERSION,
 	sizeof(struct ip) >> 2,
 #endif
 	0,			/* tos */
 	sizeof(struct ip),	/* total length */
 	0,			/* id */
 	0,			/* frag offset */
 	PIM_ENCAP_TTL,
 	IPPROTO_PIM,
 	0,			/* checksum */
 };
 
 static struct pim_encap_pimhdr pim_encap_pimhdr = {
     {
 	PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */
 	0,			/* reserved */
 	0,			/* checksum */
     },
     0				/* flags */
 };
 
 static VNET_DEFINE(vifi_t, reg_vif_num) = VIFI_INVALID;
 #define	V_reg_vif_num		VNET(reg_vif_num)
 static VNET_DEFINE(struct ifnet, multicast_register_if);
 #define	V_multicast_register_if	VNET(multicast_register_if)
 
 /*
  * Private variables.
  */
 
 static u_long	X_ip_mcast_src(int);
 static int	X_ip_mforward(struct ip *, struct ifnet *, struct mbuf *,
 		    struct ip_moptions *);
 static int	X_ip_mrouter_done(void);
 static int	X_ip_mrouter_get(struct socket *, struct sockopt *);
 static int	X_ip_mrouter_set(struct socket *, struct sockopt *);
 static int	X_legal_vif_num(int);
 static int	X_mrt_ioctl(u_long, caddr_t, int);
 
 static int	add_bw_upcall(struct bw_upcall *);
 static int	add_mfc(struct mfcctl2 *);
 static int	add_vif(struct vifctl *);
 static void	bw_meter_prepare_upcall(struct bw_meter *, struct timeval *);
 static void	bw_meter_process(void);
 static void	bw_meter_receive_packet(struct bw_meter *, int,
 		    struct timeval *);
 static void	bw_upcalls_send(void);
 static int	del_bw_upcall(struct bw_upcall *);
 static int	del_mfc(struct mfcctl2 *);
 static int	del_vif(vifi_t);
 static int	del_vif_locked(vifi_t);
 static void	expire_bw_meter_process(void *);
 static void	expire_bw_upcalls_send(void *);
 static void	expire_mfc(struct mfc *);
 static void	expire_upcalls(void *);
 static void	free_bw_list(struct bw_meter *);
 static int	get_sg_cnt(struct sioc_sg_req *);
 static int	get_vif_cnt(struct sioc_vif_req *);
 static void	if_detached_event(void *, struct ifnet *);
 static int	ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t);
 static int	ip_mrouter_init(struct socket *, int);
 static __inline struct mfc *
 		mfc_find(struct in_addr *, struct in_addr *);
 static void	phyint_send(struct ip *, struct vif *, struct mbuf *);
 static struct mbuf *
 		pim_register_prepare(struct ip *, struct mbuf *);
 static int	pim_register_send(struct ip *, struct vif *,
 		    struct mbuf *, struct mfc *);
 static int	pim_register_send_rp(struct ip *, struct vif *,
 		    struct mbuf *, struct mfc *);
 static int	pim_register_send_upcall(struct ip *, struct vif *,
 		    struct mbuf *, struct mfc *);
 static void	schedule_bw_meter(struct bw_meter *, struct timeval *);
 static void	send_packet(struct vif *, struct mbuf *);
 static int	set_api_config(uint32_t *);
 static int	set_assert(int);
 static int	socket_send(struct socket *, struct mbuf *,
 		    struct sockaddr_in *);
 static void	unschedule_bw_meter(struct bw_meter *);
 
 /*
  * Kernel multicast forwarding API capabilities and setup.
  * If more API capabilities are added to the kernel, they should be
  * recorded in `mrt_api_support'.
  */
 #define MRT_API_VERSION		0x0305
 
 static const int mrt_api_version = MRT_API_VERSION;
 static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF |
 					 MRT_MFC_FLAGS_BORDER_VIF |
 					 MRT_MFC_RP |
 					 MRT_MFC_BW_UPCALL);
 static VNET_DEFINE(uint32_t, mrt_api_config);
 #define	V_mrt_api_config	VNET(mrt_api_config)
 static VNET_DEFINE(int, pim_assert_enabled);
 #define	V_pim_assert_enabled	VNET(pim_assert_enabled)
 static struct timeval pim_assert_interval = { 3, 0 };	/* Rate limit */
 
 /*
  * Find a route for a given origin IP address and multicast group address.
  * Statistics must be updated by the caller.
  */
 static __inline struct mfc *
 mfc_find(struct in_addr *o, struct in_addr *g)
 {
 	struct mfc *rt;
 
 	MFC_LOCK_ASSERT();
 
 	LIST_FOREACH(rt, &V_mfchashtbl[MFCHASH(*o, *g)], mfc_hash) {
 		if (in_hosteq(rt->mfc_origin, *o) &&
 		    in_hosteq(rt->mfc_mcastgrp, *g) &&
 		    TAILQ_EMPTY(&rt->mfc_stall))
 			break;
 	}
 
 	return (rt);
 }
 
 /*
  * Handle MRT setsockopt commands to modify the multicast forwarding tables.
  */
 static int
 X_ip_mrouter_set(struct socket *so, struct sockopt *sopt)
 {
     int	error, optval;
     vifi_t	vifi;
     struct	vifctl vifc;
     struct	mfcctl2 mfc;
     struct	bw_upcall bw_upcall;
     uint32_t	i;
 
     if (so != V_ip_mrouter && sopt->sopt_name != MRT_INIT)
 	return EPERM;
 
     error = 0;
     switch (sopt->sopt_name) {
     case MRT_INIT:
 	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
 	if (error)
 	    break;
 	error = ip_mrouter_init(so, optval);
 	break;
 
     case MRT_DONE:
 	error = ip_mrouter_done();
 	break;
 
     case MRT_ADD_VIF:
 	error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc);
 	if (error)
 	    break;
 	error = add_vif(&vifc);
 	break;
 
     case MRT_DEL_VIF:
 	error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi);
 	if (error)
 	    break;
 	error = del_vif(vifi);
 	break;
 
     case MRT_ADD_MFC:
     case MRT_DEL_MFC:
 	/*
 	 * select data size depending on API version.
 	 */
 	if (sopt->sopt_name == MRT_ADD_MFC &&
 		V_mrt_api_config & MRT_API_FLAGS_ALL) {
 	    error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2),
 				sizeof(struct mfcctl2));
 	} else {
 	    error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl),
 				sizeof(struct mfcctl));
 	    bzero((caddr_t)&mfc + sizeof(struct mfcctl),
 			sizeof(mfc) - sizeof(struct mfcctl));
 	}
 	if (error)
 	    break;
 	if (sopt->sopt_name == MRT_ADD_MFC)
 	    error = add_mfc(&mfc);
 	else
 	    error = del_mfc(&mfc);
 	break;
 
     case MRT_ASSERT:
 	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
 	if (error)
 	    break;
 	set_assert(optval);
 	break;
 
     case MRT_API_CONFIG:
 	error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
 	if (!error)
 	    error = set_api_config(&i);
 	if (!error)
 	    error = sooptcopyout(sopt, &i, sizeof i);
 	break;
 
     case MRT_ADD_BW_UPCALL:
     case MRT_DEL_BW_UPCALL:
 	error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall,
 				sizeof bw_upcall);
 	if (error)
 	    break;
 	if (sopt->sopt_name == MRT_ADD_BW_UPCALL)
 	    error = add_bw_upcall(&bw_upcall);
 	else
 	    error = del_bw_upcall(&bw_upcall);
 	break;
 
     default:
 	error = EOPNOTSUPP;
 	break;
     }
     return error;
 }
 
 /*
  * Handle MRT getsockopt commands
  */
 static int
 X_ip_mrouter_get(struct socket *so, struct sockopt *sopt)
 {
     int error;
 
     switch (sopt->sopt_name) {
     case MRT_VERSION:
 	error = sooptcopyout(sopt, &mrt_api_version, sizeof mrt_api_version);
 	break;
 
     case MRT_ASSERT:
 	error = sooptcopyout(sopt, &V_pim_assert_enabled,
 	    sizeof V_pim_assert_enabled);
 	break;
 
     case MRT_API_SUPPORT:
 	error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support);
 	break;
 
     case MRT_API_CONFIG:
 	error = sooptcopyout(sopt, &V_mrt_api_config, sizeof V_mrt_api_config);
 	break;
 
     default:
 	error = EOPNOTSUPP;
 	break;
     }
     return error;
 }
 
 /*
  * Handle ioctl commands to obtain information from the cache
  */
 static int
 X_mrt_ioctl(u_long cmd, caddr_t data, int fibnum __unused)
 {
     int error = 0;
 
     /*
      * Currently the only function calling this ioctl routine is rtioctl().
      * Typically, only root can create the raw socket in order to execute
      * this ioctl method, however the request might be coming from a prison
      */
     error = priv_check(curthread, PRIV_NETINET_MROUTE);
     if (error)
 	return (error);
     switch (cmd) {
     case (SIOCGETVIFCNT):
 	error = get_vif_cnt((struct sioc_vif_req *)data);
 	break;
 
     case (SIOCGETSGCNT):
 	error = get_sg_cnt((struct sioc_sg_req *)data);
 	break;
 
     default:
 	error = EINVAL;
 	break;
     }
     return error;
 }
 
 /*
  * returns the packet, byte, rpf-failure count for the source group provided
  */
 static int
 get_sg_cnt(struct sioc_sg_req *req)
 {
     struct mfc *rt;
 
     MFC_LOCK();
     rt = mfc_find(&req->src, &req->grp);
     if (rt == NULL) {
 	MFC_UNLOCK();
 	req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
 	return EADDRNOTAVAIL;
     }
     req->pktcnt = rt->mfc_pkt_cnt;
     req->bytecnt = rt->mfc_byte_cnt;
     req->wrong_if = rt->mfc_wrong_if;
     MFC_UNLOCK();
     return 0;
 }
 
 /*
  * returns the input and output packet and byte counts on the vif provided
  */
 static int
 get_vif_cnt(struct sioc_vif_req *req)
 {
     vifi_t vifi = req->vifi;
 
     VIF_LOCK();
     if (vifi >= V_numvifs) {
 	VIF_UNLOCK();
 	return EINVAL;
     }
 
     req->icount = V_viftable[vifi].v_pkt_in;
     req->ocount = V_viftable[vifi].v_pkt_out;
     req->ibytes = V_viftable[vifi].v_bytes_in;
     req->obytes = V_viftable[vifi].v_bytes_out;
     VIF_UNLOCK();
 
     return 0;
 }
 
 static void
 if_detached_event(void *arg __unused, struct ifnet *ifp)
 {
     vifi_t vifi;
     u_long i;
 
     MROUTER_LOCK();
 
     if (V_ip_mrouter == NULL) {
 	MROUTER_UNLOCK();
 	return;
     }
 
     VIF_LOCK();
     MFC_LOCK();
 
     /*
      * Tear down multicast forwarder state associated with this ifnet.
      * 1. Walk the vif list, matching vifs against this ifnet.
      * 2. Walk the multicast forwarding cache (mfc) looking for
      *    inner matches with this vif's index.
      * 3. Expire any matching multicast forwarding cache entries.
      * 4. Free vif state. This should disable ALLMULTI on the interface.
      */
     for (vifi = 0; vifi < V_numvifs; vifi++) {
 	if (V_viftable[vifi].v_ifp != ifp)
 		continue;
 	for (i = 0; i < mfchashsize; i++) {
 		struct mfc *rt, *nrt;
 
 		LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) {
 			if (rt->mfc_parent == vifi) {
 				expire_mfc(rt);
 			}
 		}
 	}
 	del_vif_locked(vifi);
     }
 
     MFC_UNLOCK();
     VIF_UNLOCK();
 
     MROUTER_UNLOCK();
 }
                         
 /*
  * Enable multicast forwarding.
  */
 static int
 ip_mrouter_init(struct socket *so, int version)
 {
 
     CTR3(KTR_IPMF, "%s: so_type %d, pr_protocol %d", __func__,
         so->so_type, so->so_proto->pr_protocol);
 
     if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP)
 	return EOPNOTSUPP;
 
     if (version != 1)
 	return ENOPROTOOPT;
 
     MROUTER_LOCK();
 
     if (ip_mrouter_unloading) {
 	MROUTER_UNLOCK();
 	return ENOPROTOOPT;
     }
 
     if (V_ip_mrouter != NULL) {
 	MROUTER_UNLOCK();
 	return EADDRINUSE;
     }
 
     V_mfchashtbl = hashinit_flags(mfchashsize, M_MRTABLE, &V_mfchash,
 	HASH_NOWAIT);
 
     callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls,
 	curvnet);
     callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send,
 	curvnet);
     callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process,
 	curvnet);
 
     V_ip_mrouter = so;
     ip_mrouter_cnt++;
 
     MROUTER_UNLOCK();
 
     CTR1(KTR_IPMF, "%s: done", __func__);
 
     return 0;
 }
 
 /*
  * Disable multicast forwarding.
  */
 static int
 X_ip_mrouter_done(void)
 {
     struct ifnet *ifp;
     u_long i;
     vifi_t vifi;
 
     MROUTER_LOCK();
 
     if (V_ip_mrouter == NULL) {
 	MROUTER_UNLOCK();
 	return EINVAL;
     }
 
     /*
      * Detach/disable hooks to the reset of the system.
      */
     V_ip_mrouter = NULL;
     ip_mrouter_cnt--;
     V_mrt_api_config = 0;
 
     VIF_LOCK();
 
     /*
      * For each phyint in use, disable promiscuous reception of all IP
      * multicasts.
      */
     for (vifi = 0; vifi < V_numvifs; vifi++) {
 	if (!in_nullhost(V_viftable[vifi].v_lcl_addr) &&
 		!(V_viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
 	    ifp = V_viftable[vifi].v_ifp;
 	    if_allmulti(ifp, 0);
 	}
     }
     bzero((caddr_t)V_viftable, sizeof(V_viftable));
     V_numvifs = 0;
     V_pim_assert_enabled = 0;
     
     VIF_UNLOCK();
 
     callout_stop(&V_expire_upcalls_ch);
     callout_stop(&V_bw_upcalls_ch);
     callout_stop(&V_bw_meter_ch);
 
     MFC_LOCK();
 
     /*
      * Free all multicast forwarding cache entries.
      * Do not use hashdestroy(), as we must perform other cleanup.
      */
     for (i = 0; i < mfchashsize; i++) {
 	struct mfc *rt, *nrt;
 
 	LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) {
 		expire_mfc(rt);
 	}
     }
     free(V_mfchashtbl, M_MRTABLE);
     V_mfchashtbl = NULL;
 
     bzero(V_nexpire, sizeof(V_nexpire[0]) * mfchashsize);
 
     V_bw_upcalls_n = 0;
     bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers));
 
     MFC_UNLOCK();
 
     V_reg_vif_num = VIFI_INVALID;
 
     MROUTER_UNLOCK();
 
     CTR1(KTR_IPMF, "%s: done", __func__);
 
     return 0;
 }
 
 /*
  * Set PIM assert processing global
  */
 static int
 set_assert(int i)
 {
     if ((i != 1) && (i != 0))
 	return EINVAL;
 
     V_pim_assert_enabled = i;
 
     return 0;
 }
 
 /*
  * Configure API capabilities
  */
 int
 set_api_config(uint32_t *apival)
 {
     u_long i;
 
     /*
      * We can set the API capabilities only if it is the first operation
      * after MRT_INIT. I.e.:
      *  - there are no vifs installed
      *  - pim_assert is not enabled
      *  - the MFC table is empty
      */
     if (V_numvifs > 0) {
 	*apival = 0;
 	return EPERM;
     }
     if (V_pim_assert_enabled) {
 	*apival = 0;
 	return EPERM;
     }
 
     MFC_LOCK();
 
     for (i = 0; i < mfchashsize; i++) {
 	if (LIST_FIRST(&V_mfchashtbl[i]) != NULL) {
 	    MFC_UNLOCK();
 	    *apival = 0;
 	    return EPERM;
 	}
     }
 
     MFC_UNLOCK();
 
     V_mrt_api_config = *apival & mrt_api_support;
     *apival = V_mrt_api_config;
 
     return 0;
 }
 
 /*
  * Add a vif to the vif table
  */
 static int
 add_vif(struct vifctl *vifcp)
 {
     struct vif *vifp = V_viftable + vifcp->vifc_vifi;
     struct sockaddr_in sin = {sizeof sin, AF_INET};
     struct ifaddr *ifa;
     struct ifnet *ifp;
     int error;
 
     VIF_LOCK();
     if (vifcp->vifc_vifi >= MAXVIFS) {
 	VIF_UNLOCK();
 	return EINVAL;
     }
     /* rate limiting is no longer supported by this code */
     if (vifcp->vifc_rate_limit != 0) {
 	log(LOG_ERR, "rate limiting is no longer supported\n");
 	VIF_UNLOCK();
 	return EINVAL;
     }
     if (!in_nullhost(vifp->v_lcl_addr)) {
 	VIF_UNLOCK();
 	return EADDRINUSE;
     }
     if (in_nullhost(vifcp->vifc_lcl_addr)) {
 	VIF_UNLOCK();
 	return EADDRNOTAVAIL;
     }
 
     /* Find the interface with an address in AF_INET family */
     if (vifcp->vifc_flags & VIFF_REGISTER) {
 	/*
 	 * XXX: Because VIFF_REGISTER does not really need a valid
 	 * local interface (e.g. it could be 127.0.0.2), we don't
 	 * check its address.
 	 */
 	ifp = NULL;
     } else {
 	sin.sin_addr = vifcp->vifc_lcl_addr;
 	ifa = ifa_ifwithaddr((struct sockaddr *)&sin);
 	if (ifa == NULL) {
 	    VIF_UNLOCK();
 	    return EADDRNOTAVAIL;
 	}
 	ifp = ifa->ifa_ifp;
 	ifa_free(ifa);
     }
 
     if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) {
 	CTR1(KTR_IPMF, "%s: tunnels are no longer supported", __func__);
 	VIF_UNLOCK();
 	return EOPNOTSUPP;
     } else if (vifcp->vifc_flags & VIFF_REGISTER) {
 	ifp = &V_multicast_register_if;
 	CTR2(KTR_IPMF, "%s: add register vif for ifp %p", __func__, ifp);
 	if (V_reg_vif_num == VIFI_INVALID) {
 	    if_initname(&V_multicast_register_if, "register_vif", 0);
 	    V_multicast_register_if.if_flags = IFF_LOOPBACK;
 	    V_reg_vif_num = vifcp->vifc_vifi;
 	}
     } else {		/* Make sure the interface supports multicast */
 	if ((ifp->if_flags & IFF_MULTICAST) == 0) {
 	    VIF_UNLOCK();
 	    return EOPNOTSUPP;
 	}
 
 	/* Enable promiscuous reception of all IP multicasts from the if */
 	error = if_allmulti(ifp, 1);
 	if (error) {
 	    VIF_UNLOCK();
 	    return error;
 	}
     }
 
     vifp->v_flags     = vifcp->vifc_flags;
     vifp->v_threshold = vifcp->vifc_threshold;
     vifp->v_lcl_addr  = vifcp->vifc_lcl_addr;
     vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;
     vifp->v_ifp       = ifp;
     /* initialize per vif pkt counters */
     vifp->v_pkt_in    = 0;
     vifp->v_pkt_out   = 0;
     vifp->v_bytes_in  = 0;
     vifp->v_bytes_out = 0;
 
     /* Adjust numvifs up if the vifi is higher than numvifs */
     if (V_numvifs <= vifcp->vifc_vifi)
 	V_numvifs = vifcp->vifc_vifi + 1;
 
     VIF_UNLOCK();
 
     CTR4(KTR_IPMF, "%s: add vif %d laddr %s thresh %x", __func__,
 	(int)vifcp->vifc_vifi, inet_ntoa(vifcp->vifc_lcl_addr),
 	(int)vifcp->vifc_threshold);
 
     return 0;
 }
 
 /*
  * Delete a vif from the vif table
  */
 static int
 del_vif_locked(vifi_t vifi)
 {
     struct vif *vifp;
 
     VIF_LOCK_ASSERT();
 
     if (vifi >= V_numvifs) {
 	return EINVAL;
     }
     vifp = &V_viftable[vifi];
     if (in_nullhost(vifp->v_lcl_addr)) {
 	return EADDRNOTAVAIL;
     }
 
     if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER)))
 	if_allmulti(vifp->v_ifp, 0);
 
     if (vifp->v_flags & VIFF_REGISTER)
 	V_reg_vif_num = VIFI_INVALID;
 
     bzero((caddr_t)vifp, sizeof (*vifp));
 
     CTR2(KTR_IPMF, "%s: delete vif %d", __func__, (int)vifi);
 
     /* Adjust numvifs down */
     for (vifi = V_numvifs; vifi > 0; vifi--)
 	if (!in_nullhost(V_viftable[vifi-1].v_lcl_addr))
 	    break;
     V_numvifs = vifi;
 
     return 0;
 }
 
 static int
 del_vif(vifi_t vifi)
 {
     int cc;
 
     VIF_LOCK();
     cc = del_vif_locked(vifi);
     VIF_UNLOCK();
 
     return cc;
 }
 
 /*
  * update an mfc entry without resetting counters and S,G addresses.
  */
 static void
 update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
 {
     int i;
 
     rt->mfc_parent = mfccp->mfcc_parent;
     for (i = 0; i < V_numvifs; i++) {
 	rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
 	rt->mfc_flags[i] = mfccp->mfcc_flags[i] & V_mrt_api_config &
 	    MRT_MFC_FLAGS_ALL;
     }
     /* set the RP address */
     if (V_mrt_api_config & MRT_MFC_RP)
 	rt->mfc_rp = mfccp->mfcc_rp;
     else
 	rt->mfc_rp.s_addr = INADDR_ANY;
 }
 
 /*
  * fully initialize an mfc entry from the parameter.
  */
 static void
 init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
 {
     rt->mfc_origin     = mfccp->mfcc_origin;
     rt->mfc_mcastgrp   = mfccp->mfcc_mcastgrp;
 
     update_mfc_params(rt, mfccp);
 
     /* initialize pkt counters per src-grp */
     rt->mfc_pkt_cnt    = 0;
     rt->mfc_byte_cnt   = 0;
     rt->mfc_wrong_if   = 0;
     timevalclear(&rt->mfc_last_assert);
 }
 
 static void
 expire_mfc(struct mfc *rt)
 {
 	struct rtdetq *rte, *nrte;
 
 	MFC_LOCK_ASSERT();
 
 	free_bw_list(rt->mfc_bw_meter);
 
 	TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) {
 		m_freem(rte->m);
 		TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link);
 		free(rte, M_MRTABLE);
 	}
 
 	LIST_REMOVE(rt, mfc_hash);
 	free(rt, M_MRTABLE);
 }
 
 /*
  * Add an mfc entry
  */
 static int
 add_mfc(struct mfcctl2 *mfccp)
 {
     struct mfc *rt;
     struct rtdetq *rte, *nrte;
     u_long hash = 0;
     u_short nstl;
 
     VIF_LOCK();
     MFC_LOCK();
 
     rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);
 
     /* If an entry already exists, just update the fields */
     if (rt) {
 	CTR4(KTR_IPMF, "%s: update mfc orig %s group %lx parent %x",
 	    __func__, inet_ntoa(mfccp->mfcc_origin),
 	    (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
 	    mfccp->mfcc_parent);
 	update_mfc_params(rt, mfccp);
 	MFC_UNLOCK();
 	VIF_UNLOCK();
 	return (0);
     }
 
     /*
      * Find the entry for which the upcall was made and update
      */
     nstl = 0;
     hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp);
     LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) {
 	if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
 	    in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) &&
 	    !TAILQ_EMPTY(&rt->mfc_stall)) {
 		CTR5(KTR_IPMF,
 		    "%s: add mfc orig %s group %lx parent %x qh %p",
 		    __func__, inet_ntoa(mfccp->mfcc_origin),
 		    (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
 		    mfccp->mfcc_parent,
 		    TAILQ_FIRST(&rt->mfc_stall));
 		if (nstl++)
 			CTR1(KTR_IPMF, "%s: multiple matches", __func__);
 
 		init_mfc_params(rt, mfccp);
 		rt->mfc_expire = 0;	/* Don't clean this guy up */
 		V_nexpire[hash]--;
 
 		/* Free queued packets, but attempt to forward them first. */
 		TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) {
 			if (rte->ifp != NULL)
 				ip_mdq(rte->m, rte->ifp, rt, -1);
 			m_freem(rte->m);
 			TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link);
 			rt->mfc_nstall--;
 			free(rte, M_MRTABLE);
 		}
 	}
     }
 
     /*
      * It is possible that an entry is being inserted without an upcall
      */
     if (nstl == 0) {
 	CTR1(KTR_IPMF, "%s: adding mfc w/o upcall", __func__);
 	LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) {
 		if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
 		    in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) {
 			init_mfc_params(rt, mfccp);
 			if (rt->mfc_expire)
 			    V_nexpire[hash]--;
 			rt->mfc_expire = 0;
 			break; /* XXX */
 		}
 	}
 
 	if (rt == NULL) {		/* no upcall, so make a new entry */
 	    rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
 	    if (rt == NULL) {
 		MFC_UNLOCK();
 		VIF_UNLOCK();
 		return (ENOBUFS);
 	    }
 
 	    init_mfc_params(rt, mfccp);
 	    TAILQ_INIT(&rt->mfc_stall);
 	    rt->mfc_nstall = 0;
 
 	    rt->mfc_expire     = 0;
 	    rt->mfc_bw_meter = NULL;
 
 	    /* insert new entry at head of hash chain */
 	    LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash);
 	}
     }
 
     MFC_UNLOCK();
     VIF_UNLOCK();
 
     return (0);
 }
 
 /*
  * Delete an mfc entry
  */
 static int
 del_mfc(struct mfcctl2 *mfccp)
 {
     struct in_addr	origin;
     struct in_addr	mcastgrp;
     struct mfc		*rt;
 
     origin = mfccp->mfcc_origin;
     mcastgrp = mfccp->mfcc_mcastgrp;
 
     CTR3(KTR_IPMF, "%s: delete mfc orig %s group %lx", __func__,
 	inet_ntoa(origin), (u_long)ntohl(mcastgrp.s_addr));
 
     MFC_LOCK();
 
     rt = mfc_find(&origin, &mcastgrp);
     if (rt == NULL) {
 	MFC_UNLOCK();
 	return EADDRNOTAVAIL;
     }
 
     /*
      * free the bw_meter entries
      */
     free_bw_list(rt->mfc_bw_meter);
     rt->mfc_bw_meter = NULL;
 
     LIST_REMOVE(rt, mfc_hash);
     free(rt, M_MRTABLE);
 
     MFC_UNLOCK();
 
     return (0);
 }
 
 /*
  * Send a message to the routing daemon on the multicast routing socket.
  */
 static int
 socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src)
 {
     if (s) {
 	SOCKBUF_LOCK(&s->so_rcv);
 	if (sbappendaddr_locked(&s->so_rcv, (struct sockaddr *)src, mm,
 	    NULL) != 0) {
 	    sorwakeup_locked(s);
 	    return 0;
 	}
 	SOCKBUF_UNLOCK(&s->so_rcv);
     }
     m_freem(mm);
     return -1;
 }
 
 /*
  * IP multicast forwarding function. This function assumes that the packet
  * pointed to by "ip" has arrived on (or is about to be sent to) the interface
  * pointed to by "ifp", and the packet is to be relayed to other networks
  * that have members of the packet's destination IP multicast group.
  *
  * The packet is returned unscathed to the caller, unless it is
  * erroneous, in which case a non-zero return value tells the caller to
  * discard it.
  */
 
 #define TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
 
 static int
 X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m,
     struct ip_moptions *imo)
 {
     struct mfc *rt;
     int error;
     vifi_t vifi;
 
     CTR3(KTR_IPMF, "ip_mforward: delete mfc orig %s group %lx ifp %p",
 	inet_ntoa(ip->ip_src), (u_long)ntohl(ip->ip_dst.s_addr), ifp);
 
     if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 ||
 		((u_char *)(ip + 1))[1] != IPOPT_LSRR ) {
 	/*
 	 * Packet arrived via a physical interface or
 	 * an encapsulated tunnel or a register_vif.
 	 */
     } else {
 	/*
 	 * Packet arrived through a source-route tunnel.
 	 * Source-route tunnels are no longer supported.
 	 */
 	return (1);
     }
 
     VIF_LOCK();
     MFC_LOCK();
     if (imo && ((vifi = imo->imo_multicast_vif) < V_numvifs)) {
 	if (ip->ip_ttl < MAXTTL)
 	    ip->ip_ttl++;	/* compensate for -1 in *_send routines */
 	error = ip_mdq(m, ifp, NULL, vifi);
 	MFC_UNLOCK();
 	VIF_UNLOCK();
 	return error;
     }
 
     /*
      * Don't forward a packet with time-to-live of zero or one,
      * or a packet destined to a local-only group.
      */
     if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ntohl(ip->ip_dst.s_addr))) {
 	MFC_UNLOCK();
 	VIF_UNLOCK();
 	return 0;
     }
 
     /*
      * Determine forwarding vifs from the forwarding cache table
      */
     MRTSTAT_INC(mrts_mfc_lookups);
     rt = mfc_find(&ip->ip_src, &ip->ip_dst);
 
     /* Entry exists, so forward if necessary */
     if (rt != NULL) {
 	error = ip_mdq(m, ifp, rt, -1);
 	MFC_UNLOCK();
 	VIF_UNLOCK();
 	return error;
     } else {
 	/*
 	 * If we don't have a route for packet's origin,
 	 * Make a copy of the packet & send message to routing daemon
 	 */
 
 	struct mbuf *mb0;
 	struct rtdetq *rte;
 	u_long hash;
 	int hlen = ip->ip_hl << 2;
 
 	MRTSTAT_INC(mrts_mfc_misses);
 	MRTSTAT_INC(mrts_no_route);
 	CTR2(KTR_IPMF, "ip_mforward: no mfc for (%s,%lx)",
 	    inet_ntoa(ip->ip_src), (u_long)ntohl(ip->ip_dst.s_addr));
 
 	/*
 	 * Allocate mbufs early so that we don't do extra work if we are
 	 * just going to fail anyway.  Make sure to pullup the header so
 	 * that other people can't step on it.
 	 */
 	rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE,
 	    M_NOWAIT|M_ZERO);
 	if (rte == NULL) {
 	    MFC_UNLOCK();
 	    VIF_UNLOCK();
 	    return ENOBUFS;
 	}
 
 	mb0 = m_copypacket(m, M_NOWAIT);
 	if (mb0 && (!M_WRITABLE(mb0) || mb0->m_len < hlen))
 	    mb0 = m_pullup(mb0, hlen);
 	if (mb0 == NULL) {
 	    free(rte, M_MRTABLE);
 	    MFC_UNLOCK();
 	    VIF_UNLOCK();
 	    return ENOBUFS;
 	}
 
 	/* is there an upcall waiting for this flow ? */
 	hash = MFCHASH(ip->ip_src, ip->ip_dst);
 	LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) {
 		if (in_hosteq(ip->ip_src, rt->mfc_origin) &&
 		    in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) &&
 		    !TAILQ_EMPTY(&rt->mfc_stall))
 			break;
 	}
 
 	if (rt == NULL) {
 	    int i;
 	    struct igmpmsg *im;
 	    struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
 	    struct mbuf *mm;
 
 	    /*
 	     * Locate the vifi for the incoming interface for this packet.
 	     * If none found, drop packet.
 	     */
 	    for (vifi = 0; vifi < V_numvifs &&
 		    V_viftable[vifi].v_ifp != ifp; vifi++)
 		;
 	    if (vifi >= V_numvifs)	/* vif not found, drop packet */
 		goto non_fatal;
 
 	    /* no upcall, so make a new entry */
 	    rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
 	    if (rt == NULL)
 		goto fail;
 
 	    /* Make a copy of the header to send to the user level process */
 	    mm = m_copy(mb0, 0, hlen);
 	    if (mm == NULL)
 		goto fail1;
 
 	    /*
 	     * Send message to routing daemon to install
 	     * a route into the kernel table
 	     */
 
 	    im = mtod(mm, struct igmpmsg *);
 	    im->im_msgtype = IGMPMSG_NOCACHE;
 	    im->im_mbz = 0;
 	    im->im_vif = vifi;
 
 	    MRTSTAT_INC(mrts_upcalls);
 
 	    k_igmpsrc.sin_addr = ip->ip_src;
 	    if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) {
 		CTR0(KTR_IPMF, "ip_mforward: socket queue full");
 		MRTSTAT_INC(mrts_upq_sockfull);
 fail1:
 		free(rt, M_MRTABLE);
 fail:
 		free(rte, M_MRTABLE);
 		m_freem(mb0);
 		MFC_UNLOCK();
 		VIF_UNLOCK();
 		return ENOBUFS;
 	    }
 
 	    /* insert new entry at head of hash chain */
 	    rt->mfc_origin.s_addr     = ip->ip_src.s_addr;
 	    rt->mfc_mcastgrp.s_addr   = ip->ip_dst.s_addr;
 	    rt->mfc_expire	      = UPCALL_EXPIRE;
 	    V_nexpire[hash]++;
 	    for (i = 0; i < V_numvifs; i++) {
 		rt->mfc_ttls[i] = 0;
 		rt->mfc_flags[i] = 0;
 	    }
 	    rt->mfc_parent = -1;
 
 	    /* clear the RP address */
 	    rt->mfc_rp.s_addr = INADDR_ANY;
 	    rt->mfc_bw_meter = NULL;
 
 	    /* initialize pkt counters per src-grp */
 	    rt->mfc_pkt_cnt = 0;
 	    rt->mfc_byte_cnt = 0;
 	    rt->mfc_wrong_if = 0;
 	    timevalclear(&rt->mfc_last_assert);
 
 	    TAILQ_INIT(&rt->mfc_stall);
 	    rt->mfc_nstall = 0;
 
 	    /* link into table */
 	    LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash);
 	    TAILQ_INSERT_HEAD(&rt->mfc_stall, rte, rte_link);
 	    rt->mfc_nstall++;
 
 	} else {
 	    /* determine if queue has overflowed */
 	    if (rt->mfc_nstall > MAX_UPQ) {
 		MRTSTAT_INC(mrts_upq_ovflw);
 non_fatal:
 		free(rte, M_MRTABLE);
 		m_freem(mb0);
 		MFC_UNLOCK();
 		VIF_UNLOCK();
 		return (0);
 	    }
 	    TAILQ_INSERT_TAIL(&rt->mfc_stall, rte, rte_link);
 	    rt->mfc_nstall++;
 	}
 
 	rte->m			= mb0;
 	rte->ifp		= ifp;
 
 	MFC_UNLOCK();
 	VIF_UNLOCK();
 
 	return 0;
     }
 }
 
 /*
  * Clean up the cache entry if upcall is not serviced
  */
 static void
 expire_upcalls(void *arg)
 {
     u_long i;
 
     CURVNET_SET((struct vnet *) arg);
 
     MFC_LOCK();
 
     for (i = 0; i < mfchashsize; i++) {
 	struct mfc *rt, *nrt;
 
 	if (V_nexpire[i] == 0)
 	    continue;
 
 	LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) {
 		if (TAILQ_EMPTY(&rt->mfc_stall))
 			continue;
 
 		if (rt->mfc_expire == 0 || --rt->mfc_expire > 0)
 			continue;
 
 		/*
 		 * free the bw_meter entries
 		 */
 		while (rt->mfc_bw_meter != NULL) {
 		    struct bw_meter *x = rt->mfc_bw_meter;
 
 		    rt->mfc_bw_meter = x->bm_mfc_next;
 		    free(x, M_BWMETER);
 		}
 
 		MRTSTAT_INC(mrts_cache_cleanups);
 		CTR3(KTR_IPMF, "%s: expire (%lx, %lx)", __func__,
 		    (u_long)ntohl(rt->mfc_origin.s_addr),
 		    (u_long)ntohl(rt->mfc_mcastgrp.s_addr));
 
 		expire_mfc(rt);
 	    }
     }
 
     MFC_UNLOCK();
 
     callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls,
 	curvnet);
 
     CURVNET_RESTORE();
 }
 
 /*
  * Packet forwarding routine once entry in the cache is made
  */
 static int
 ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif)
 {
     struct ip  *ip = mtod(m, struct ip *);
     vifi_t vifi;
     int plen = ntohs(ip->ip_len);
 
     VIF_LOCK_ASSERT();
 
     /*
      * If xmt_vif is not -1, send on only the requested vif.
      *
      * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.)
      */
     if (xmt_vif < V_numvifs) {
 	if (V_viftable[xmt_vif].v_flags & VIFF_REGISTER)
 		pim_register_send(ip, V_viftable + xmt_vif, m, rt);
 	else
 		phyint_send(ip, V_viftable + xmt_vif, m);
 	return 1;
     }
 
     /*
      * Don't forward if it didn't arrive from the parent vif for its origin.
      */
     vifi = rt->mfc_parent;
     if ((vifi >= V_numvifs) || (V_viftable[vifi].v_ifp != ifp)) {
 	CTR4(KTR_IPMF, "%s: rx on wrong ifp %p (vifi %d, v_ifp %p)",
 	    __func__, ifp, (int)vifi, V_viftable[vifi].v_ifp);
 	MRTSTAT_INC(mrts_wrong_if);
 	++rt->mfc_wrong_if;
 	/*
 	 * If we are doing PIM assert processing, send a message
 	 * to the routing daemon.
 	 *
 	 * XXX: A PIM-SM router needs the WRONGVIF detection so it
 	 * can complete the SPT switch, regardless of the type
 	 * of the iif (broadcast media, GRE tunnel, etc).
 	 */
 	if (V_pim_assert_enabled && (vifi < V_numvifs) &&
 	    V_viftable[vifi].v_ifp) {
 
 	    if (ifp == &V_multicast_register_if)
 		PIMSTAT_INC(pims_rcv_registers_wrongiif);
 
 	    /* Get vifi for the incoming packet */
 	    for (vifi = 0; vifi < V_numvifs && V_viftable[vifi].v_ifp != ifp;
 		vifi++)
 		;
 	    if (vifi >= V_numvifs)
 		return 0;	/* The iif is not found: ignore the packet. */
 
 	    if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF)
 		return 0;	/* WRONGVIF disabled: ignore the packet */
 
 	    if (ratecheck(&rt->mfc_last_assert, &pim_assert_interval)) {
 		struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
 		struct igmpmsg *im;
 		int hlen = ip->ip_hl << 2;
 		struct mbuf *mm = m_copy(m, 0, hlen);
 
 		if (mm && (!M_WRITABLE(mm) || mm->m_len < hlen))
 		    mm = m_pullup(mm, hlen);
 		if (mm == NULL)
 		    return ENOBUFS;
 
 		im = mtod(mm, struct igmpmsg *);
 		im->im_msgtype	= IGMPMSG_WRONGVIF;
 		im->im_mbz		= 0;
 		im->im_vif		= vifi;
 
 		MRTSTAT_INC(mrts_upcalls);
 
 		k_igmpsrc.sin_addr = im->im_src;
 		if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) {
 		    CTR1(KTR_IPMF, "%s: socket queue full", __func__);
 		    MRTSTAT_INC(mrts_upq_sockfull);
 		    return ENOBUFS;
 		}
 	    }
 	}
 	return 0;
     }
 
 
     /* If I sourced this packet, it counts as output, else it was input. */
     if (in_hosteq(ip->ip_src, V_viftable[vifi].v_lcl_addr)) {
 	V_viftable[vifi].v_pkt_out++;
 	V_viftable[vifi].v_bytes_out += plen;
     } else {
 	V_viftable[vifi].v_pkt_in++;
 	V_viftable[vifi].v_bytes_in += plen;
     }
     rt->mfc_pkt_cnt++;
     rt->mfc_byte_cnt += plen;
 
     /*
      * For each vif, decide if a copy of the packet should be forwarded.
      * Forward if:
      *		- the ttl exceeds the vif's threshold
      *		- there are group members downstream on interface
      */
     for (vifi = 0; vifi < V_numvifs; vifi++)
 	if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) {
 	    V_viftable[vifi].v_pkt_out++;
 	    V_viftable[vifi].v_bytes_out += plen;
 	    if (V_viftable[vifi].v_flags & VIFF_REGISTER)
 		pim_register_send(ip, V_viftable + vifi, m, rt);
 	    else
 		phyint_send(ip, V_viftable + vifi, m);
 	}
 
     /*
      * Perform upcall-related bw measuring.
      */
     if (rt->mfc_bw_meter != NULL) {
 	struct bw_meter *x;
 	struct timeval now;
 
 	microtime(&now);
 	MFC_LOCK_ASSERT();
 	for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next)
 	    bw_meter_receive_packet(x, plen, &now);
     }
 
     return 0;
 }
 
 /*
  * Check if a vif number is legal/ok. This is used by in_mcast.c.
  */
 static int
 X_legal_vif_num(int vif)
 {
 	int ret;
 
 	ret = 0;
 	if (vif < 0)
 		return (ret);
 
 	VIF_LOCK();
 	if (vif < V_numvifs)
 		ret = 1;
 	VIF_UNLOCK();
 
 	return (ret);
 }
 
 /*
  * Return the local address used by this vif
  */
 static u_long
 X_ip_mcast_src(int vifi)
 {
 	in_addr_t addr;
 
 	addr = INADDR_ANY;
 	if (vifi < 0)
 		return (addr);
 
 	VIF_LOCK();
 	if (vifi < V_numvifs)
 		addr = V_viftable[vifi].v_lcl_addr.s_addr;
 	VIF_UNLOCK();
 
 	return (addr);
 }
 
 static void
 phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
 {
     struct mbuf *mb_copy;
     int hlen = ip->ip_hl << 2;
 
     VIF_LOCK_ASSERT();
 
     /*
      * Make a new reference to the packet; make sure that
      * the IP header is actually copied, not just referenced,
      * so that ip_output() only scribbles on the copy.
      */
     mb_copy = m_copypacket(m, M_NOWAIT);
     if (mb_copy && (!M_WRITABLE(mb_copy) || mb_copy->m_len < hlen))
 	mb_copy = m_pullup(mb_copy, hlen);
     if (mb_copy == NULL)
 	return;
 
     send_packet(vifp, mb_copy);
 }
 
 static void
 send_packet(struct vif *vifp, struct mbuf *m)
 {
 	struct ip_moptions imo;
 	struct in_multi *imm[2];
 	int error;
 
 	VIF_LOCK_ASSERT();
 
 	imo.imo_multicast_ifp  = vifp->v_ifp;
 	imo.imo_multicast_ttl  = mtod(m, struct ip *)->ip_ttl - 1;
 	imo.imo_multicast_loop = 1;
 	imo.imo_multicast_vif  = -1;
 	imo.imo_num_memberships = 0;
 	imo.imo_max_memberships = 2;
 	imo.imo_membership  = &imm[0];
 
 	/*
 	 * Re-entrancy should not be a problem here, because
 	 * the packets that we send out and are looped back at us
 	 * should get rejected because they appear to come from
 	 * the loopback interface, thus preventing looping.
 	 */
 	error = ip_output(m, NULL, NULL, IP_FORWARDING, &imo, NULL);
 	CTR3(KTR_IPMF, "%s: vif %td err %d", __func__,
 	    (ptrdiff_t)(vifp - V_viftable), error);
 }
 
 /*
  * Stubs for old RSVP socket shim implementation.
  */
 
 static int
 X_ip_rsvp_vif(struct socket *so __unused, struct sockopt *sopt __unused)
 {
 
 	return (EOPNOTSUPP);
 }
 
 static void
 X_ip_rsvp_force_done(struct socket *so __unused)
 {
 
 }
 
 static int
 X_rsvp_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m;
 
 	m = *mp;
 	*mp = NULL;
 	if (!V_rsvp_on)
 		m_freem(m);
 	return (IPPROTO_DONE);
 }
 
 /*
  * Code for bandwidth monitors
  */
 
 /*
  * Define common interface for timeval-related methods
  */
 #define	BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp)
 #define	BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp))
 #define	BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp))
 
 static uint32_t
 compute_bw_meter_flags(struct bw_upcall *req)
 {
     uint32_t flags = 0;
 
     if (req->bu_flags & BW_UPCALL_UNIT_PACKETS)
 	flags |= BW_METER_UNIT_PACKETS;
     if (req->bu_flags & BW_UPCALL_UNIT_BYTES)
 	flags |= BW_METER_UNIT_BYTES;
     if (req->bu_flags & BW_UPCALL_GEQ)
 	flags |= BW_METER_GEQ;
     if (req->bu_flags & BW_UPCALL_LEQ)
 	flags |= BW_METER_LEQ;
 
     return flags;
 }
 
 /*
  * Add a bw_meter entry
  */
 static int
 add_bw_upcall(struct bw_upcall *req)
 {
     struct mfc *mfc;
     struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
 		BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
     struct timeval now;
     struct bw_meter *x;
     uint32_t flags;
 
     if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL))
 	return EOPNOTSUPP;
 
     /* Test if the flags are valid */
     if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
 	return EINVAL;
     if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
 	return EINVAL;
     if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
 	    == (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
 	return EINVAL;
 
     /* Test if the threshold time interval is valid */
     if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
 	return EINVAL;
 
     flags = compute_bw_meter_flags(req);
 
     /*
      * Find if we have already same bw_meter entry
      */
     MFC_LOCK();
     mfc = mfc_find(&req->bu_src, &req->bu_dst);
     if (mfc == NULL) {
 	MFC_UNLOCK();
 	return EADDRNOTAVAIL;
     }
     for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) {
 	if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
 			   &req->bu_threshold.b_time, ==)) &&
 	    (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
 	    (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
 	    (x->bm_flags & BW_METER_USER_FLAGS) == flags)  {
 	    MFC_UNLOCK();
 	    return 0;		/* XXX Already installed */
 	}
     }
 
     /* Allocate the new bw_meter entry */
     x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT);
     if (x == NULL) {
 	MFC_UNLOCK();
 	return ENOBUFS;
     }
 
     /* Set the new bw_meter entry */
     x->bm_threshold.b_time = req->bu_threshold.b_time;
     microtime(&now);
     x->bm_start_time = now;
     x->bm_threshold.b_packets = req->bu_threshold.b_packets;
     x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
     x->bm_measured.b_packets = 0;
     x->bm_measured.b_bytes = 0;
     x->bm_flags = flags;
     x->bm_time_next = NULL;
     x->bm_time_hash = BW_METER_BUCKETS;
 
     /* Add the new bw_meter entry to the front of entries for this MFC */
     x->bm_mfc = mfc;
     x->bm_mfc_next = mfc->mfc_bw_meter;
     mfc->mfc_bw_meter = x;
     schedule_bw_meter(x, &now);
     MFC_UNLOCK();
 
     return 0;
 }
 
 static void
 free_bw_list(struct bw_meter *list)
 {
     while (list != NULL) {
 	struct bw_meter *x = list;
 
 	list = list->bm_mfc_next;
 	unschedule_bw_meter(x);
 	free(x, M_BWMETER);
     }
 }
 
 /*
  * Delete one or multiple bw_meter entries
  */
 static int
 del_bw_upcall(struct bw_upcall *req)
 {
     struct mfc *mfc;
     struct bw_meter *x;
 
     if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL))
 	return EOPNOTSUPP;
 
     MFC_LOCK();
 
     /* Find the corresponding MFC entry */
     mfc = mfc_find(&req->bu_src, &req->bu_dst);
     if (mfc == NULL) {
 	MFC_UNLOCK();
 	return EADDRNOTAVAIL;
     } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) {
 	/*
 	 * Delete all bw_meter entries for this mfc
 	 */
 	struct bw_meter *list;
 
 	list = mfc->mfc_bw_meter;
 	mfc->mfc_bw_meter = NULL;
 	free_bw_list(list);
 	MFC_UNLOCK();
 	return 0;
     } else {			/* Delete a single bw_meter entry */
 	struct bw_meter *prev;
 	uint32_t flags = 0;
 
 	flags = compute_bw_meter_flags(req);
 
 	/* Find the bw_meter entry to delete */
 	for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL;
 	     prev = x, x = x->bm_mfc_next) {
 	    if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
 			       &req->bu_threshold.b_time, ==)) &&
 		(x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
 		(x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
 		(x->bm_flags & BW_METER_USER_FLAGS) == flags)
 		break;
 	}
 	if (x != NULL) { /* Delete entry from the list for this MFC */
 	    if (prev != NULL)
 		prev->bm_mfc_next = x->bm_mfc_next;	/* remove from middle*/
 	    else
 		x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */
 
 	    unschedule_bw_meter(x);
 	    MFC_UNLOCK();
 	    /* Free the bw_meter entry */
 	    free(x, M_BWMETER);
 	    return 0;
 	} else {
 	    MFC_UNLOCK();
 	    return EINVAL;
 	}
     }
     /* NOTREACHED */
 }
 
 /*
  * Perform bandwidth measurement processing that may result in an upcall
  */
 static void
 bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
 {
     struct timeval delta;
 
     MFC_LOCK_ASSERT();
 
     delta = *nowp;
     BW_TIMEVALDECR(&delta, &x->bm_start_time);
 
     if (x->bm_flags & BW_METER_GEQ) {
 	/*
 	 * Processing for ">=" type of bw_meter entry
 	 */
 	if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
 	    /* Reset the bw_meter entry */
 	    x->bm_start_time = *nowp;
 	    x->bm_measured.b_packets = 0;
 	    x->bm_measured.b_bytes = 0;
 	    x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 	}
 
 	/* Record that a packet is received */
 	x->bm_measured.b_packets++;
 	x->bm_measured.b_bytes += plen;
 
 	/*
 	 * Test if we should deliver an upcall
 	 */
 	if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) {
 	    if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 		 (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) ||
 		((x->bm_flags & BW_METER_UNIT_BYTES) &&
 		 (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) {
 		/* Prepare an upcall for delivery */
 		bw_meter_prepare_upcall(x, nowp);
 		x->bm_flags |= BW_METER_UPCALL_DELIVERED;
 	    }
 	}
     } else if (x->bm_flags & BW_METER_LEQ) {
 	/*
 	 * Processing for "<=" type of bw_meter entry
 	 */
 	if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
 	    /*
 	     * We are behind time with the multicast forwarding table
 	     * scanning for "<=" type of bw_meter entries, so test now
 	     * if we should deliver an upcall.
 	     */
 	    if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 		 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
 		((x->bm_flags & BW_METER_UNIT_BYTES) &&
 		 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
 		/* Prepare an upcall for delivery */
 		bw_meter_prepare_upcall(x, nowp);
 	    }
 	    /* Reschedule the bw_meter entry */
 	    unschedule_bw_meter(x);
 	    schedule_bw_meter(x, nowp);
 	}
 
 	/* Record that a packet is received */
 	x->bm_measured.b_packets++;
 	x->bm_measured.b_bytes += plen;
 
 	/*
 	 * Test if we should restart the measuring interval
 	 */
 	if ((x->bm_flags & BW_METER_UNIT_PACKETS &&
 	     x->bm_measured.b_packets <= x->bm_threshold.b_packets) ||
 	    (x->bm_flags & BW_METER_UNIT_BYTES &&
 	     x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) {
 	    /* Don't restart the measuring interval */
 	} else {
 	    /* Do restart the measuring interval */
 	    /*
 	     * XXX: note that we don't unschedule and schedule, because this
 	     * might be too much overhead per packet. Instead, when we process
 	     * all entries for a given timer hash bin, we check whether it is
 	     * really a timeout. If not, we reschedule at that time.
 	     */
 	    x->bm_start_time = *nowp;
 	    x->bm_measured.b_packets = 0;
 	    x->bm_measured.b_bytes = 0;
 	    x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 	}
     }
 }
 
 /*
  * Prepare a bandwidth-related upcall
  */
 static void
 bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
 {
     struct timeval delta;
     struct bw_upcall *u;
 
     MFC_LOCK_ASSERT();
 
     /*
      * Compute the measured time interval
      */
     delta = *nowp;
     BW_TIMEVALDECR(&delta, &x->bm_start_time);
 
     /*
      * If there are too many pending upcalls, deliver them now
      */
     if (V_bw_upcalls_n >= BW_UPCALLS_MAX)
 	bw_upcalls_send();
 
     /*
      * Set the bw_upcall entry
      */
     u = &V_bw_upcalls[V_bw_upcalls_n++];
     u->bu_src = x->bm_mfc->mfc_origin;
     u->bu_dst = x->bm_mfc->mfc_mcastgrp;
     u->bu_threshold.b_time = x->bm_threshold.b_time;
     u->bu_threshold.b_packets = x->bm_threshold.b_packets;
     u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
     u->bu_measured.b_time = delta;
     u->bu_measured.b_packets = x->bm_measured.b_packets;
     u->bu_measured.b_bytes = x->bm_measured.b_bytes;
     u->bu_flags = 0;
     if (x->bm_flags & BW_METER_UNIT_PACKETS)
 	u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
     if (x->bm_flags & BW_METER_UNIT_BYTES)
 	u->bu_flags |= BW_UPCALL_UNIT_BYTES;
     if (x->bm_flags & BW_METER_GEQ)
 	u->bu_flags |= BW_UPCALL_GEQ;
     if (x->bm_flags & BW_METER_LEQ)
 	u->bu_flags |= BW_UPCALL_LEQ;
 }
 
 /*
  * Send the pending bandwidth-related upcalls
  */
 static void
 bw_upcalls_send(void)
 {
     struct mbuf *m;
     int len = V_bw_upcalls_n * sizeof(V_bw_upcalls[0]);
     struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
     static struct igmpmsg igmpmsg = { 0,		/* unused1 */
 				      0,		/* unused2 */
 				      IGMPMSG_BW_UPCALL,/* im_msgtype */
 				      0,		/* im_mbz  */
 				      0,		/* im_vif  */
 				      0,		/* unused3 */
 				      { 0 },		/* im_src  */
 				      { 0 } };		/* im_dst  */
 
     MFC_LOCK_ASSERT();
 
     if (V_bw_upcalls_n == 0)
 	return;			/* No pending upcalls */
 
     V_bw_upcalls_n = 0;
 
     /*
      * Allocate a new mbuf, initialize it with the header and
      * the payload for the pending calls.
      */
     m = m_gethdr(M_NOWAIT, MT_DATA);
     if (m == NULL) {
 	log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n");
 	return;
     }
 
     m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg);
     m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&V_bw_upcalls[0]);
 
     /*
      * Send the upcalls
      * XXX do we need to set the address in k_igmpsrc ?
      */
     MRTSTAT_INC(mrts_upcalls);
     if (socket_send(V_ip_mrouter, m, &k_igmpsrc) < 0) {
 	log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n");
 	MRTSTAT_INC(mrts_upq_sockfull);
     }
 }
 
 /*
  * Compute the timeout hash value for the bw_meter entries
  */
 #define	BW_METER_TIMEHASH(bw_meter, hash)				\
     do {								\
 	struct timeval next_timeval = (bw_meter)->bm_start_time;	\
 									\
 	BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \
 	(hash) = next_timeval.tv_sec;					\
 	if (next_timeval.tv_usec)					\
 	    (hash)++; /* XXX: make sure we don't timeout early */	\
 	(hash) %= BW_METER_BUCKETS;					\
     } while (0)
 
 /*
  * Schedule a timer to process periodically bw_meter entry of type "<="
  * by linking the entry in the proper hash bucket.
  */
 static void
 schedule_bw_meter(struct bw_meter *x, struct timeval *nowp)
 {
     int time_hash;
 
     MFC_LOCK_ASSERT();
 
     if (!(x->bm_flags & BW_METER_LEQ))
 	return;		/* XXX: we schedule timers only for "<=" entries */
 
     /*
      * Reset the bw_meter entry
      */
     x->bm_start_time = *nowp;
     x->bm_measured.b_packets = 0;
     x->bm_measured.b_bytes = 0;
     x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 
     /*
      * Compute the timeout hash value and insert the entry
      */
     BW_METER_TIMEHASH(x, time_hash);
     x->bm_time_next = V_bw_meter_timers[time_hash];
     V_bw_meter_timers[time_hash] = x;
     x->bm_time_hash = time_hash;
 }
 
 /*
  * Unschedule the periodic timer that processes bw_meter entry of type "<="
  * by removing the entry from the proper hash bucket.
  */
 static void
 unschedule_bw_meter(struct bw_meter *x)
 {
     int time_hash;
     struct bw_meter *prev, *tmp;
 
     MFC_LOCK_ASSERT();
 
     if (!(x->bm_flags & BW_METER_LEQ))
 	return;		/* XXX: we schedule timers only for "<=" entries */
 
     /*
      * Compute the timeout hash value and delete the entry
      */
     time_hash = x->bm_time_hash;
     if (time_hash >= BW_METER_BUCKETS)
 	return;		/* Entry was not scheduled */
 
     for (prev = NULL, tmp = V_bw_meter_timers[time_hash];
 	     tmp != NULL; prev = tmp, tmp = tmp->bm_time_next)
 	if (tmp == x)
 	    break;
 
     if (tmp == NULL)
 	panic("unschedule_bw_meter: bw_meter entry not found");
 
     if (prev != NULL)
 	prev->bm_time_next = x->bm_time_next;
     else
 	V_bw_meter_timers[time_hash] = x->bm_time_next;
 
     x->bm_time_next = NULL;
     x->bm_time_hash = BW_METER_BUCKETS;
 }
 
 
 /*
  * Process all "<=" type of bw_meter that should be processed now,
  * and for each entry prepare an upcall if necessary. Each processed
  * entry is rescheduled again for the (periodic) processing.
  *
  * This is run periodically (once per second normally). On each round,
  * all the potentially matching entries are in the hash slot that we are
  * looking at.
  */
 static void
 bw_meter_process()
 {
     uint32_t loops;
     int i;
     struct timeval now, process_endtime;
 
     microtime(&now);
     if (V_last_tv_sec == now.tv_sec)
 	return;		/* nothing to do */
 
     loops = now.tv_sec - V_last_tv_sec;
     V_last_tv_sec = now.tv_sec;
     if (loops > BW_METER_BUCKETS)
 	loops = BW_METER_BUCKETS;
 
     MFC_LOCK();
     /*
      * Process all bins of bw_meter entries from the one after the last
      * processed to the current one. On entry, i points to the last bucket
      * visited, so we need to increment i at the beginning of the loop.
      */
     for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) {
 	struct bw_meter *x, *tmp_list;
 
 	if (++i >= BW_METER_BUCKETS)
 	    i = 0;
 
 	/* Disconnect the list of bw_meter entries from the bin */
 	tmp_list = V_bw_meter_timers[i];
 	V_bw_meter_timers[i] = NULL;
 
 	/* Process the list of bw_meter entries */
 	while (tmp_list != NULL) {
 	    x = tmp_list;
 	    tmp_list = tmp_list->bm_time_next;
 
 	    /* Test if the time interval is over */
 	    process_endtime = x->bm_start_time;
 	    BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time);
 	    if (BW_TIMEVALCMP(&process_endtime, &now, >)) {
 		/* Not yet: reschedule, but don't reset */
 		int time_hash;
 
 		BW_METER_TIMEHASH(x, time_hash);
 		if (time_hash == i && process_endtime.tv_sec == now.tv_sec) {
 		    /*
 		     * XXX: somehow the bin processing is a bit ahead of time.
 		     * Put the entry in the next bin.
 		     */
 		    if (++time_hash >= BW_METER_BUCKETS)
 			time_hash = 0;
 		}
 		x->bm_time_next = V_bw_meter_timers[time_hash];
 		V_bw_meter_timers[time_hash] = x;
 		x->bm_time_hash = time_hash;
 
 		continue;
 	    }
 
 	    /*
 	     * Test if we should deliver an upcall
 	     */
 	    if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 		 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
 		((x->bm_flags & BW_METER_UNIT_BYTES) &&
 		 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
 		/* Prepare an upcall for delivery */
 		bw_meter_prepare_upcall(x, &now);
 	    }
 
 	    /*
 	     * Reschedule for next processing
 	     */
 	    schedule_bw_meter(x, &now);
 	}
     }
 
     /* Send all upcalls that are pending delivery */
     bw_upcalls_send();
 
     MFC_UNLOCK();
 }
 
 /*
  * A periodic function for sending all upcalls that are pending delivery
  */
 static void
 expire_bw_upcalls_send(void *arg)
 {
     CURVNET_SET((struct vnet *) arg);
 
     MFC_LOCK();
     bw_upcalls_send();
     MFC_UNLOCK();
 
     callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send,
 	curvnet);
     CURVNET_RESTORE();
 }
 
 /*
  * A periodic function for periodic scanning of the multicast forwarding
  * table for processing all "<=" bw_meter entries.
  */
 static void
 expire_bw_meter_process(void *arg)
 {
     CURVNET_SET((struct vnet *) arg);
 
     if (V_mrt_api_config & MRT_MFC_BW_UPCALL)
 	bw_meter_process();
 
     callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process,
 	curvnet);
     CURVNET_RESTORE();
 }
 
 /*
  * End of bandwidth monitoring code
  */
 
 /*
  * Send the packet up to the user daemon, or eventually do kernel encapsulation
  *
  */
 static int
 pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m,
     struct mfc *rt)
 {
     struct mbuf *mb_copy, *mm;
 
     /*
      * Do not send IGMP_WHOLEPKT notifications to userland, if the
      * rendezvous point was unspecified, and we were told not to.
      */
     if (pim_squelch_wholepkt != 0 && (V_mrt_api_config & MRT_MFC_RP) &&
 	in_nullhost(rt->mfc_rp))
 	return 0;
 
     mb_copy = pim_register_prepare(ip, m);
     if (mb_copy == NULL)
 	return ENOBUFS;
 
     /*
      * Send all the fragments. Note that the mbuf for each fragment
      * is freed by the sending machinery.
      */
     for (mm = mb_copy; mm; mm = mb_copy) {
 	mb_copy = mm->m_nextpkt;
 	mm->m_nextpkt = 0;
 	mm = m_pullup(mm, sizeof(struct ip));
 	if (mm != NULL) {
 	    ip = mtod(mm, struct ip *);
 	    if ((V_mrt_api_config & MRT_MFC_RP) && !in_nullhost(rt->mfc_rp)) {
 		pim_register_send_rp(ip, vifp, mm, rt);
 	    } else {
 		pim_register_send_upcall(ip, vifp, mm, rt);
 	    }
 	}
     }
 
     return 0;
 }
 
 /*
  * Return a copy of the data packet that is ready for PIM Register
  * encapsulation.
  * XXX: Note that in the returned copy the IP header is a valid one.
  */
 static struct mbuf *
 pim_register_prepare(struct ip *ip, struct mbuf *m)
 {
     struct mbuf *mb_copy = NULL;
     int mtu;
 
     /* Take care of delayed checksums */
     if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 	in_delayed_cksum(m);
 	m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
     }
 
     /*
      * Copy the old packet & pullup its IP header into the
      * new mbuf so we can modify it.
      */
     mb_copy = m_copypacket(m, M_NOWAIT);
     if (mb_copy == NULL)
 	return NULL;
     mb_copy = m_pullup(mb_copy, ip->ip_hl << 2);
     if (mb_copy == NULL)
 	return NULL;
 
     /* take care of the TTL */
     ip = mtod(mb_copy, struct ip *);
     --ip->ip_ttl;
 
     /* Compute the MTU after the PIM Register encapsulation */
     mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr);
 
     if (ntohs(ip->ip_len) <= mtu) {
 	/* Turn the IP header into a valid one */
 	ip->ip_sum = 0;
 	ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
     } else {
 	/* Fragment the packet */
 	mb_copy->m_pkthdr.csum_flags |= CSUM_IP;
 	if (ip_fragment(ip, &mb_copy, mtu, 0) != 0) {
 	    m_freem(mb_copy);
 	    return NULL;
 	}
     }
     return mb_copy;
 }
 
 /*
  * Send an upcall with the data packet to the user-level process.
  */
 static int
 pim_register_send_upcall(struct ip *ip, struct vif *vifp,
     struct mbuf *mb_copy, struct mfc *rt)
 {
     struct mbuf *mb_first;
     int len = ntohs(ip->ip_len);
     struct igmpmsg *im;
     struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
 
     VIF_LOCK_ASSERT();
 
     /*
      * Add a new mbuf with an upcall header
      */
     mb_first = m_gethdr(M_NOWAIT, MT_DATA);
     if (mb_first == NULL) {
 	m_freem(mb_copy);
 	return ENOBUFS;
     }
     mb_first->m_data += max_linkhdr;
     mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg);
     mb_first->m_len = sizeof(struct igmpmsg);
     mb_first->m_next = mb_copy;
 
     /* Send message to routing daemon */
     im = mtod(mb_first, struct igmpmsg *);
     im->im_msgtype	= IGMPMSG_WHOLEPKT;
     im->im_mbz		= 0;
     im->im_vif		= vifp - V_viftable;
     im->im_src		= ip->ip_src;
     im->im_dst		= ip->ip_dst;
 
     k_igmpsrc.sin_addr	= ip->ip_src;
 
     MRTSTAT_INC(mrts_upcalls);
 
     if (socket_send(V_ip_mrouter, mb_first, &k_igmpsrc) < 0) {
 	CTR1(KTR_IPMF, "%s: socket queue full", __func__);
 	MRTSTAT_INC(mrts_upq_sockfull);
 	return ENOBUFS;
     }
 
     /* Keep statistics */
     PIMSTAT_INC(pims_snd_registers_msgs);
     PIMSTAT_ADD(pims_snd_registers_bytes, len);
 
     return 0;
 }
 
 /*
  * Encapsulate the data packet in PIM Register message and send it to the RP.
  */
 static int
 pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy,
     struct mfc *rt)
 {
     struct mbuf *mb_first;
     struct ip *ip_outer;
     struct pim_encap_pimhdr *pimhdr;
     int len = ntohs(ip->ip_len);
     vifi_t vifi = rt->mfc_parent;
 
     VIF_LOCK_ASSERT();
 
     if ((vifi >= V_numvifs) || in_nullhost(V_viftable[vifi].v_lcl_addr)) {
 	m_freem(mb_copy);
 	return EADDRNOTAVAIL;		/* The iif vif is invalid */
     }
 
     /*
      * Add a new mbuf with the encapsulating header
      */
     mb_first = m_gethdr(M_NOWAIT, MT_DATA);
     if (mb_first == NULL) {
 	m_freem(mb_copy);
 	return ENOBUFS;
     }
     mb_first->m_data += max_linkhdr;
     mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
     mb_first->m_next = mb_copy;
 
     mb_first->m_pkthdr.len = len + mb_first->m_len;
 
     /*
      * Fill in the encapsulating IP and PIM header
      */
     ip_outer = mtod(mb_first, struct ip *);
     *ip_outer = pim_encap_iphdr;
     ip_outer->ip_id = ip_newid();
     ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) +
 	sizeof(pim_encap_pimhdr));
     ip_outer->ip_src = V_viftable[vifi].v_lcl_addr;
     ip_outer->ip_dst = rt->mfc_rp;
     /*
      * Copy the inner header TOS to the outer header, and take care of the
      * IP_DF bit.
      */
     ip_outer->ip_tos = ip->ip_tos;
     if (ip->ip_off & htons(IP_DF))
 	ip_outer->ip_off |= htons(IP_DF);
     pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer
 					 + sizeof(pim_encap_iphdr));
     *pimhdr = pim_encap_pimhdr;
     /* If the iif crosses a border, set the Border-bit */
     if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & V_mrt_api_config)
 	pimhdr->flags |= htonl(PIM_BORDER_REGISTER);
 
     mb_first->m_data += sizeof(pim_encap_iphdr);
     pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr));
     mb_first->m_data -= sizeof(pim_encap_iphdr);
 
     send_packet(vifp, mb_first);
 
     /* Keep statistics */
     PIMSTAT_INC(pims_snd_registers_msgs);
     PIMSTAT_ADD(pims_snd_registers_bytes, len);
 
     return 0;
 }
 
 /*
  * pim_encapcheck() is called by the encap4_input() path at runtime to
  * determine if a packet is for PIM; allowing PIM to be dynamically loaded
  * into the kernel.
  */
 static int
 pim_encapcheck(const struct mbuf *m, int off, int proto, void *arg)
 {
 
 #ifdef DIAGNOSTIC
     KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM"));
 #endif
     if (proto != IPPROTO_PIM)
 	return 0;	/* not for us; reject the datagram. */
 
     return 64;		/* claim the datagram. */
 }
 
 /*
  * PIM-SMv2 and PIM-DM messages processing.
  * Receives and verifies the PIM control messages, and passes them
  * up to the listening socket, using rip_input().
  * The only message with special processing is the PIM_REGISTER message
  * (used by PIM-SM): the PIM header is stripped off, and the inner packet
  * is passed to if_simloop().
  */
 int
 pim_input(struct mbuf **mp, int *offp, int proto)
 {
     struct mbuf *m = *mp;
     struct ip *ip = mtod(m, struct ip *);
     struct pim *pim;
     int iphlen = *offp;
     int minlen;
     int datalen = ntohs(ip->ip_len) - iphlen;
     int ip_tos;
 
     *mp = NULL;
 
     /* Keep statistics */
     PIMSTAT_INC(pims_rcv_total_msgs);
     PIMSTAT_ADD(pims_rcv_total_bytes, datalen);
 
     /*
      * Validate lengths
      */
     if (datalen < PIM_MINLEN) {
 	PIMSTAT_INC(pims_rcv_tooshort);
 	CTR3(KTR_IPMF, "%s: short packet (%d) from %s",
 	    __func__, datalen, inet_ntoa(ip->ip_src));
 	m_freem(m);
 	return (IPPROTO_DONE);
     }
 
     /*
      * If the packet is at least as big as a REGISTER, go agead
      * and grab the PIM REGISTER header size, to avoid another
      * possible m_pullup() later.
      *
      * PIM_MINLEN       == pimhdr + u_int32_t == 4 + 4 = 8
      * PIM_REG_MINLEN   == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28
      */
     minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN);
     /*
      * Get the IP and PIM headers in contiguous memory, and
      * possibly the PIM REGISTER header.
      */
     if (m->m_len < minlen && (m = m_pullup(m, minlen)) == 0) {
 	CTR1(KTR_IPMF, "%s: m_pullup() failed", __func__);
 	return (IPPROTO_DONE);
     }
 
     /* m_pullup() may have given us a new mbuf so reset ip. */
     ip = mtod(m, struct ip *);
     ip_tos = ip->ip_tos;
 
     /* adjust mbuf to point to the PIM header */
     m->m_data += iphlen;
     m->m_len  -= iphlen;
     pim = mtod(m, struct pim *);
 
     /*
      * Validate checksum. If PIM REGISTER, exclude the data packet.
      *
      * XXX: some older PIMv2 implementations don't make this distinction,
      * so for compatibility reason perform the checksum over part of the
      * message, and if error, then over the whole message.
      */
     if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) {
 	/* do nothing, checksum okay */
     } else if (in_cksum(m, datalen)) {
 	PIMSTAT_INC(pims_rcv_badsum);
 	CTR1(KTR_IPMF, "%s: invalid checksum", __func__);
 	m_freem(m);
 	return (IPPROTO_DONE);
     }
 
     /* PIM version check */
     if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) {
 	PIMSTAT_INC(pims_rcv_badversion);
 	CTR3(KTR_IPMF, "%s: bad version %d expect %d", __func__,
 	    (int)PIM_VT_V(pim->pim_vt), PIM_VERSION);
 	m_freem(m);
 	return (IPPROTO_DONE);
     }
 
     /* restore mbuf back to the outer IP */
     m->m_data -= iphlen;
     m->m_len  += iphlen;
 
     if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) {
 	/*
 	 * Since this is a REGISTER, we'll make a copy of the register
 	 * headers ip + pim + u_int32 + encap_ip, to be passed up to the
 	 * routing daemon.
 	 */
 	struct sockaddr_in dst = { sizeof(dst), AF_INET };
 	struct mbuf *mcp;
 	struct ip *encap_ip;
 	u_int32_t *reghdr;
 	struct ifnet *vifp;
 
 	VIF_LOCK();
 	if ((V_reg_vif_num >= V_numvifs) || (V_reg_vif_num == VIFI_INVALID)) {
 	    VIF_UNLOCK();
 	    CTR2(KTR_IPMF, "%s: register vif not set: %d", __func__,
 		(int)V_reg_vif_num);
 	    m_freem(m);
 	    return (IPPROTO_DONE);
 	}
 	/* XXX need refcnt? */
 	vifp = V_viftable[V_reg_vif_num].v_ifp;
 	VIF_UNLOCK();
 
 	/*
 	 * Validate length
 	 */
 	if (datalen < PIM_REG_MINLEN) {
 	    PIMSTAT_INC(pims_rcv_tooshort);
 	    PIMSTAT_INC(pims_rcv_badregisters);
 	    CTR1(KTR_IPMF, "%s: register packet size too small", __func__);
 	    m_freem(m);
 	    return (IPPROTO_DONE);
 	}
 
 	reghdr = (u_int32_t *)(pim + 1);
 	encap_ip = (struct ip *)(reghdr + 1);
 
 	CTR3(KTR_IPMF, "%s: register: encap ip src %s len %d",
 	    __func__, inet_ntoa(encap_ip->ip_src), ntohs(encap_ip->ip_len));
 
 	/* verify the version number of the inner packet */
 	if (encap_ip->ip_v != IPVERSION) {
 	    PIMSTAT_INC(pims_rcv_badregisters);
 	    CTR1(KTR_IPMF, "%s: bad encap ip version", __func__);
 	    m_freem(m);
 	    return (IPPROTO_DONE);
 	}
 
 	/* verify the inner packet is destined to a mcast group */
 	if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) {
 	    PIMSTAT_INC(pims_rcv_badregisters);
 	    CTR2(KTR_IPMF, "%s: bad encap ip dest %s", __func__,
 		inet_ntoa(encap_ip->ip_dst));
 	    m_freem(m);
 	    return (IPPROTO_DONE);
 	}
 
 	/* If a NULL_REGISTER, pass it to the daemon */
 	if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
 	    goto pim_input_to_daemon;
 
 	/*
 	 * Copy the TOS from the outer IP header to the inner IP header.
 	 */
 	if (encap_ip->ip_tos != ip_tos) {
 	    /* Outer TOS -> inner TOS */
 	    encap_ip->ip_tos = ip_tos;
 	    /* Recompute the inner header checksum. Sigh... */
 
 	    /* adjust mbuf to point to the inner IP header */
 	    m->m_data += (iphlen + PIM_MINLEN);
 	    m->m_len  -= (iphlen + PIM_MINLEN);
 
 	    encap_ip->ip_sum = 0;
 	    encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2);
 
 	    /* restore mbuf to point back to the outer IP header */
 	    m->m_data -= (iphlen + PIM_MINLEN);
 	    m->m_len  += (iphlen + PIM_MINLEN);
 	}
 
 	/*
 	 * Decapsulate the inner IP packet and loopback to forward it
 	 * as a normal multicast packet. Also, make a copy of the
 	 *     outer_iphdr + pimhdr + reghdr + encap_iphdr
 	 * to pass to the daemon later, so it can take the appropriate
 	 * actions (e.g., send back PIM_REGISTER_STOP).
 	 * XXX: here m->m_data points to the outer IP header.
 	 */
 	mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN);
 	if (mcp == NULL) {
 	    CTR1(KTR_IPMF, "%s: m_copy() failed", __func__);
 	    m_freem(m);
 	    return (IPPROTO_DONE);
 	}
 
 	/* Keep statistics */
 	/* XXX: registers_bytes include only the encap. mcast pkt */
 	PIMSTAT_INC(pims_rcv_registers_msgs);
 	PIMSTAT_ADD(pims_rcv_registers_bytes, ntohs(encap_ip->ip_len));
 
 	/*
 	 * forward the inner ip packet; point m_data at the inner ip.
 	 */
 	m_adj(m, iphlen + PIM_MINLEN);
 
 	CTR4(KTR_IPMF,
 	    "%s: forward decap'd REGISTER: src %lx dst %lx vif %d",
 	    __func__,
 	    (u_long)ntohl(encap_ip->ip_src.s_addr),
 	    (u_long)ntohl(encap_ip->ip_dst.s_addr),
 	    (int)V_reg_vif_num);
 
 	/* NB: vifp was collected above; can it change on us? */
 	if_simloop(vifp, m, dst.sin_family, 0);
 
 	/* prepare the register head to send to the mrouting daemon */
 	m = mcp;
     }
 
 pim_input_to_daemon:
     /*
      * Pass the PIM message up to the daemon; if it is a Register message,
      * pass the 'head' only up to the daemon. This includes the
      * outer IP header, PIM header, PIM-Register header and the
      * inner IP header.
      * XXX: the outer IP header pkt size of a Register is not adjust to
      * reflect the fact that the inner multicast data is truncated.
      */
     *mp = m;
     rip_input(mp, offp, proto);
 
     return (IPPROTO_DONE);
 }
 
 static int
 sysctl_mfctable(SYSCTL_HANDLER_ARGS)
 {
 	struct mfc	*rt;
 	int		 error, i;
 
 	if (req->newptr)
 		return (EPERM);
 	if (V_mfchashtbl == NULL)	/* XXX unlocked */
 		return (0);
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error)
 		return (error);
 
 	MFC_LOCK();
 	for (i = 0; i < mfchashsize; i++) {
 		LIST_FOREACH(rt, &V_mfchashtbl[i], mfc_hash) {
 			error = SYSCTL_OUT(req, rt, sizeof(struct mfc));
 			if (error)
 				goto out_locked;
 		}
 	}
 out_locked:
 	MFC_UNLOCK();
 	return (error);
 }
 
 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD,
     sysctl_mfctable, "IPv4 Multicast Forwarding Table "
     "(struct *mfc[mfchashsize], netinet/ip_mroute.h)");
 
 static void
 vnet_mroute_init(const void *unused __unused)
 {
 
 	MALLOC(V_nexpire, u_char *, mfchashsize, M_MRTABLE, M_WAITOK|M_ZERO);
 	bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers));
 	callout_init(&V_expire_upcalls_ch, CALLOUT_MPSAFE);
 	callout_init(&V_bw_upcalls_ch, CALLOUT_MPSAFE);
 	callout_init(&V_bw_meter_ch, CALLOUT_MPSAFE);
 }
 
 VNET_SYSINIT(vnet_mroute_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_mroute_init,
 	NULL);
 
 static void
 vnet_mroute_uninit(const void *unused __unused)
 {
 
 	FREE(V_nexpire, M_MRTABLE);
 	V_nexpire = NULL;
 }
 
 VNET_SYSUNINIT(vnet_mroute_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, 
 	vnet_mroute_uninit, NULL);
 
 static int
 ip_mroute_modevent(module_t mod, int type, void *unused)
 {
 
     switch (type) {
     case MOD_LOAD:
 	MROUTER_LOCK_INIT();
 
 	if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, 
 	    if_detached_event, NULL, EVENTHANDLER_PRI_ANY);
 	if (if_detach_event_tag == NULL) {
 		printf("ip_mroute: unable to register "
 		    "ifnet_departure_event handler\n");
 		MROUTER_LOCK_DESTROY();
 		return (EINVAL);
 	}
 
 	MFC_LOCK_INIT();
 	VIF_LOCK_INIT();
 
 	mfchashsize = MFCHASHSIZE;
 	if (TUNABLE_ULONG_FETCH("net.inet.ip.mfchashsize", &mfchashsize) &&
 	    !powerof2(mfchashsize)) {
 		printf("WARNING: %s not a power of 2; using default\n",
 		    "net.inet.ip.mfchashsize");
 		mfchashsize = MFCHASHSIZE;
 	}
 
 	pim_squelch_wholepkt = 0;
 	TUNABLE_ULONG_FETCH("net.inet.pim.squelch_wholepkt",
 	    &pim_squelch_wholepkt);
 
 	pim_encap_cookie = encap_attach_func(AF_INET, IPPROTO_PIM,
 	    pim_encapcheck, &in_pim_protosw, NULL);
 	if (pim_encap_cookie == NULL) {
 		printf("ip_mroute: unable to attach pim encap\n");
 		VIF_LOCK_DESTROY();
 		MFC_LOCK_DESTROY();
 		MROUTER_LOCK_DESTROY();
 		return (EINVAL);
 	}
 
 	ip_mcast_src = X_ip_mcast_src;
 	ip_mforward = X_ip_mforward;
 	ip_mrouter_done = X_ip_mrouter_done;
 	ip_mrouter_get = X_ip_mrouter_get;
 	ip_mrouter_set = X_ip_mrouter_set;
 
 	ip_rsvp_force_done = X_ip_rsvp_force_done;
 	ip_rsvp_vif = X_ip_rsvp_vif;
 
 	legal_vif_num = X_legal_vif_num;
 	mrt_ioctl = X_mrt_ioctl;
 	rsvp_input_p = X_rsvp_input;
 	break;
 
     case MOD_UNLOAD:
 	/*
 	 * Typically module unload happens after the user-level
 	 * process has shutdown the kernel services (the check
 	 * below insures someone can't just yank the module out
 	 * from under a running process).  But if the module is
 	 * just loaded and then unloaded w/o starting up a user
 	 * process we still need to cleanup.
 	 */
 	MROUTER_LOCK();
 	if (ip_mrouter_cnt != 0) {
 	    MROUTER_UNLOCK();
 	    return (EINVAL);
 	}
 	ip_mrouter_unloading = 1;
 	MROUTER_UNLOCK();
 
 	EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag);
 
 	if (pim_encap_cookie) {
 	    encap_detach(pim_encap_cookie);
 	    pim_encap_cookie = NULL;
 	}
 
 	ip_mcast_src = NULL;
 	ip_mforward = NULL;
 	ip_mrouter_done = NULL;
 	ip_mrouter_get = NULL;
 	ip_mrouter_set = NULL;
 
 	ip_rsvp_force_done = NULL;
 	ip_rsvp_vif = NULL;
 
 	legal_vif_num = NULL;
 	mrt_ioctl = NULL;
 	rsvp_input_p = NULL;
 
 	VIF_LOCK_DESTROY();
 	MFC_LOCK_DESTROY();
 	MROUTER_LOCK_DESTROY();
 	break;
 
     default:
 	return EOPNOTSUPP;
     }
     return 0;
 }
 
 static moduledata_t ip_mroutemod = {
     "ip_mroute",
     ip_mroute_modevent,
     0
 };
 
 DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_MIDDLE);
Index: user/dchagin/lemul/sys/netinet/raw_ip.c
===================================================================
--- user/dchagin/lemul/sys/netinet/raw_ip.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/raw_ip.c	(revision 274241)
@@ -1,1125 +1,1125 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)raw_ip.c	8.7 (Berkeley) 5/15/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 
 #include <sys/param.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/rwlock.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_mroute.h>
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #endif /*IPSEC*/
 
 #include <machine/stdarg.h>
 #include <security/mac/mac_framework.h>
 
 VNET_DEFINE(int, ip_defttl) = IPDEFTTL;
-SYSCTL_VNET_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ip_defttl), 0,
     "Maximum TTL on IP packets");
 
 VNET_DEFINE(struct inpcbhead, ripcb);
 VNET_DEFINE(struct inpcbinfo, ripcbinfo);
 
 #define	V_ripcb			VNET(ripcb)
 #define	V_ripcbinfo		VNET(ripcbinfo)
 
 /*
  * Control and data hooks for ipfw, dummynet, divert and so on.
  * The data hooks are not used here but it is convenient
  * to keep them all in one place.
  */
 VNET_DEFINE(ip_fw_chk_ptr_t, ip_fw_chk_ptr) = NULL;
 VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL;
 
 int	(*ip_dn_ctl_ptr)(struct sockopt *);
 int	(*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *);
 void	(*ip_divert_ptr)(struct mbuf *, int);
 int	(*ng_ipfw_input_p)(struct mbuf **, int,
 			struct ip_fw_args *, int);
 
 #ifdef INET
 /*
  * Hooks for multicast routing. They all default to NULL, so leave them not
  * initialized and rely on BSS being set to 0.
  */
 
 /*
  * The socket used to communicate with the multicast routing daemon.
  */
 VNET_DEFINE(struct socket *, ip_mrouter);
 
 /*
  * The various mrouter and rsvp functions.
  */
 int (*ip_mrouter_set)(struct socket *, struct sockopt *);
 int (*ip_mrouter_get)(struct socket *, struct sockopt *);
 int (*ip_mrouter_done)(void);
 int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
 		   struct ip_moptions *);
 int (*mrt_ioctl)(u_long, caddr_t, int);
 int (*legal_vif_num)(int);
 u_long (*ip_mcast_src)(int);
 
 int (*rsvp_input_p)(struct mbuf **, int *, int);
 int (*ip_rsvp_vif)(struct socket *, struct sockopt *);
 void (*ip_rsvp_force_done)(struct socket *);
 #endif /* INET */
 
 u_long	rip_sendspace = 9216;
 SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
     &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
 
 u_long	rip_recvspace = 9216;
 SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
     &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams");
 
 /*
  * Hash functions
  */
 
 #define INP_PCBHASH_RAW_SIZE	256
 #define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \
         (((proto) + (laddr) + (faddr)) % (mask) + 1)
 
 #ifdef INET
 static void
 rip_inshash(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct inpcbhead *pcbhash;
 	int hash;
 
 	INP_INFO_WLOCK_ASSERT(pcbinfo);
 	INP_WLOCK_ASSERT(inp);
 	
 	if (inp->inp_ip_p != 0 &&
 	    inp->inp_laddr.s_addr != INADDR_ANY &&
 	    inp->inp_faddr.s_addr != INADDR_ANY) {
 		hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr,
 		    inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask);
 	} else
 		hash = 0;
 	pcbhash = &pcbinfo->ipi_hashbase[hash];
 	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
 }
 
 static void
 rip_delhash(struct inpcb *inp)
 {
 
 	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	LIST_REMOVE(inp, inp_hash);
 }
 #endif /* INET */
 
 /*
  * Raw interface to IP protocol.
  */
 
 /*
  * Initialize raw connection block q.
  */
 static void
 rip_zone_change(void *tag)
 {
 
 	uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
 }
 
 static int
 rip_inpcb_init(void *mem, int size, int flags)
 {
 	struct inpcb *inp = mem;
 
 	INP_LOCK_INIT(inp, "inp", "rawinp");
 	return (0);
 }
 
 void
 rip_init(void)
 {
 
 	in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE,
 	    1, "ripcb", rip_inpcb_init, NULL, UMA_ZONE_NOFREE,
 	    IPI_HASHFIELDS_NONE);
 	EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL,
 	    EVENTHANDLER_PRI_ANY);
 }
 
 #ifdef VIMAGE
 void
 rip_destroy(void)
 {
 
 	in_pcbinfo_destroy(&V_ripcbinfo);
 }
 #endif
 
 #ifdef INET
 static int
 rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n,
     struct sockaddr_in *ripsrc)
 {
 	int policyfail = 0;
 
 	INP_LOCK_ASSERT(last);
 
 #ifdef IPSEC
 	/* check AH/ESP integrity. */
 	if (ipsec4_in_reject(n, last)) {
 		policyfail = 1;
 	}
 #endif /* IPSEC */
 #ifdef MAC
 	if (!policyfail && mac_inpcb_check_deliver(last, n) != 0)
 		policyfail = 1;
 #endif
 	/* Check the minimum TTL for socket. */
 	if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl)
 		policyfail = 1;
 	if (!policyfail) {
 		struct mbuf *opts = NULL;
 		struct socket *so;
 
 		so = last->inp_socket;
 		if ((last->inp_flags & INP_CONTROLOPTS) ||
 		    (so->so_options & (SO_TIMESTAMP | SO_BINTIME)))
 			ip_savecontrol(last, &opts, ip, n);
 		SOCKBUF_LOCK(&so->so_rcv);
 		if (sbappendaddr_locked(&so->so_rcv,
 		    (struct sockaddr *)ripsrc, n, opts) == 0) {
 			/* should notify about lost packet */
 			m_freem(n);
 			if (opts)
 				m_freem(opts);
 			SOCKBUF_UNLOCK(&so->so_rcv);
 		} else
 			sorwakeup_locked(so);
 	} else
 		m_freem(n);
 	return (policyfail);
 }
 
 /*
  * Setup generic address and protocol structures for raw_input routine, then
  * pass them along with mbuf chain.
  */
 int
 rip_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct ifnet *ifp;
 	struct mbuf *m = *mp;
 	struct ip *ip = mtod(m, struct ip *);
 	struct inpcb *inp, *last;
 	struct sockaddr_in ripsrc;
 	int hash;
 
 	*mp = NULL;
 
 	bzero(&ripsrc, sizeof(ripsrc));
 	ripsrc.sin_len = sizeof(ripsrc);
 	ripsrc.sin_family = AF_INET;
 	ripsrc.sin_addr = ip->ip_src;
 	last = NULL;
 
 	ifp = m->m_pkthdr.rcvif;
 
 	hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr,
 	    ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask);
 	INP_INFO_RLOCK(&V_ripcbinfo);
 	LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) {
 		if (inp->inp_ip_p != proto)
 			continue;
 #ifdef INET6
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV4) == 0)
 			continue;
 #endif
 		if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
 			continue;
 		if (inp->inp_faddr.s_addr != ip->ip_src.s_addr)
 			continue;
 		if (jailed_without_vnet(inp->inp_cred)) {
 			/*
 			 * XXX: If faddr was bound to multicast group,
 			 * jailed raw socket will drop datagram.
 			 */
 			if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0)
 				continue;
 		}
 		if (last != NULL) {
 			struct mbuf *n;
 
 			n = m_copy(m, 0, (int)M_COPYALL);
 			if (n != NULL)
 		    	    (void) rip_append(last, ip, n, &ripsrc);
 			/* XXX count dropped packet */
 			INP_RUNLOCK(last);
 		}
 		INP_RLOCK(inp);
 		last = inp;
 	}
 	LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) {
 		if (inp->inp_ip_p && inp->inp_ip_p != proto)
 			continue;
 #ifdef INET6
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV4) == 0)
 			continue;
 #endif
 		if (!in_nullhost(inp->inp_laddr) &&
 		    !in_hosteq(inp->inp_laddr, ip->ip_dst))
 			continue;
 		if (!in_nullhost(inp->inp_faddr) &&
 		    !in_hosteq(inp->inp_faddr, ip->ip_src))
 			continue;
 		if (jailed_without_vnet(inp->inp_cred)) {
 			/*
 			 * Allow raw socket in jail to receive multicast;
 			 * assume process had PRIV_NETINET_RAW at attach,
 			 * and fall through into normal filter path if so.
 			 */
 			if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
 			    prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0)
 				continue;
 		}
 		/*
 		 * If this raw socket has multicast state, and we
 		 * have received a multicast, check if this socket
 		 * should receive it, as multicast filtering is now
 		 * the responsibility of the transport layer.
 		 */
 		if (inp->inp_moptions != NULL &&
 		    IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
 			/*
 			 * If the incoming datagram is for IGMP, allow it
 			 * through unconditionally to the raw socket.
 			 *
 			 * In the case of IGMPv2, we may not have explicitly
 			 * joined the group, and may have set IFF_ALLMULTI
 			 * on the interface. imo_multi_filter() may discard
 			 * control traffic we actually need to see.
 			 *
 			 * Userland multicast routing daemons should continue
 			 * filter the control traffic appropriately.
 			 */
 			int blocked;
 
 			blocked = MCAST_PASS;
 			if (proto != IPPROTO_IGMP) {
 				struct sockaddr_in group;
 
 				bzero(&group, sizeof(struct sockaddr_in));
 				group.sin_len = sizeof(struct sockaddr_in);
 				group.sin_family = AF_INET;
 				group.sin_addr = ip->ip_dst;
 
 				blocked = imo_multi_filter(inp->inp_moptions,
 				    ifp,
 				    (struct sockaddr *)&group,
 				    (struct sockaddr *)&ripsrc);
 			}
 
 			if (blocked != MCAST_PASS) {
 				IPSTAT_INC(ips_notmember);
 				continue;
 			}
 		}
 		if (last != NULL) {
 			struct mbuf *n;
 
 			n = m_copy(m, 0, (int)M_COPYALL);
 			if (n != NULL)
 				(void) rip_append(last, ip, n, &ripsrc);
 			/* XXX count dropped packet */
 			INP_RUNLOCK(last);
 		}
 		INP_RLOCK(inp);
 		last = inp;
 	}
 	INP_INFO_RUNLOCK(&V_ripcbinfo);
 	if (last != NULL) {
 		if (rip_append(last, ip, m, &ripsrc) != 0)
 			IPSTAT_INC(ips_delivered);
 		INP_RUNLOCK(last);
 	} else {
 		m_freem(m);
 		IPSTAT_INC(ips_noproto);
 		IPSTAT_DEC(ips_delivered);
 	}
 	return (IPPROTO_DONE);
 }
 
 /*
  * Generate IP header and pass packet to ip_output.  Tack on options user may
  * have setup with control call.
  */
 int
 rip_output(struct mbuf *m, struct socket *so, ...)
 {
 	struct ip *ip;
 	int error;
 	struct inpcb *inp = sotoinpcb(so);
 	va_list ap;
 	u_long dst;
 	int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) |
 	    IP_ALLOWBROADCAST;
 
 	va_start(ap, so);
 	dst = va_arg(ap, u_long);
 	va_end(ap);
 
 	/*
 	 * If the user handed us a complete IP packet, use it.  Otherwise,
 	 * allocate an mbuf for a header and fill it in.
 	 */
 	if ((inp->inp_flags & INP_HDRINCL) == 0) {
 		if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
 			m_freem(m);
 			return(EMSGSIZE);
 		}
 		M_PREPEND(m, sizeof(struct ip), M_NOWAIT);
 		if (m == NULL)
 			return(ENOBUFS);
 
 		INP_RLOCK(inp);
 		ip = mtod(m, struct ip *);
 		ip->ip_tos = inp->inp_ip_tos;
 		if (inp->inp_flags & INP_DONTFRAG)
 			ip->ip_off = htons(IP_DF);
 		else
 			ip->ip_off = htons(0);
 		ip->ip_p = inp->inp_ip_p;
 		ip->ip_len = htons(m->m_pkthdr.len);
 		ip->ip_src = inp->inp_laddr;
 		ip->ip_dst.s_addr = dst;
 		if (jailed(inp->inp_cred)) {
 			/*
 			 * prison_local_ip4() would be good enough but would
 			 * let a source of INADDR_ANY pass, which we do not
 			 * want to see from jails.
 			 */
 			if (ip->ip_src.s_addr == INADDR_ANY) {
 				error = in_pcbladdr(inp, &ip->ip_dst, &ip->ip_src,
 				    inp->inp_cred);
 			} else {
 				error = prison_local_ip4(inp->inp_cred,
 				    &ip->ip_src);
 			}
 			if (error != 0) {
 				INP_RUNLOCK(inp);
 				m_freem(m);
 				return (error);
 			}
 		}
 		ip->ip_ttl = inp->inp_ip_ttl;
 	} else {
 		if (m->m_pkthdr.len > IP_MAXPACKET) {
 			m_freem(m);
 			return(EMSGSIZE);
 		}
 		INP_RLOCK(inp);
 		ip = mtod(m, struct ip *);
 		error = prison_check_ip4(inp->inp_cred, &ip->ip_src);
 		if (error != 0) {
 			INP_RUNLOCK(inp);
 			m_freem(m);
 			return (error);
 		}
 
 		/*
 		 * Don't allow both user specified and setsockopt options,
 		 * and don't allow packet length sizes that will crash.
 		 */
 		if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options)
 		    || (ntohs(ip->ip_len) > m->m_pkthdr.len)
 		    || (ntohs(ip->ip_len) < (ip->ip_hl << 2))) {
 			INP_RUNLOCK(inp);
 			m_freem(m);
 			return (EINVAL);
 		}
 		if (ip->ip_id == 0)
 			ip->ip_id = ip_newid();
 
 		/*
 		 * XXX prevent ip_output from overwriting header fields.
 		 */
 		flags |= IP_RAWOUTPUT;
 		IPSTAT_INC(ips_rawout);
 	}
 
 	if (inp->inp_flags & INP_ONESBCAST)
 		flags |= IP_SENDONES;
 
 #ifdef MAC
 	mac_inpcb_create_mbuf(inp, m);
 #endif
 
 	error = ip_output(m, inp->inp_options, NULL, flags,
 	    inp->inp_moptions, inp);
 	INP_RUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Raw IP socket option processing.
  *
  * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could
  * only be created by a privileged process, and as such, socket option
  * operations to manage system properties on any raw socket were allowed to
  * take place without explicit additional access control checks.  However,
  * raw sockets can now also be created in jail(), and therefore explicit
  * checks are now required.  Likewise, raw sockets can be used by a process
  * after it gives up privilege, so some caution is required.  For options
  * passed down to the IP layer via ip_ctloutput(), checks are assumed to be
  * performed in ip_ctloutput() and therefore no check occurs here.
  * Unilaterally checking priv_check() here breaks normal IP socket option
  * operations on raw sockets.
  *
  * When adding new socket options here, make sure to add access control
  * checks here as necessary.
  *
  * XXX-BZ inp locking?
  */
 int
 rip_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct	inpcb *inp = sotoinpcb(so);
 	int	error, optval;
 
 	if (sopt->sopt_level != IPPROTO_IP) {
 		if ((sopt->sopt_level == SOL_SOCKET) &&
 		    (sopt->sopt_name == SO_SETFIB)) {
 			inp->inp_inc.inc_fibnum = so->so_fibnum;
 			return (0);
 		}
 		return (EINVAL);
 	}
 
 	error = 0;
 	switch (sopt->sopt_dir) {
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case IP_HDRINCL:
 			optval = inp->inp_flags & INP_HDRINCL;
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 
 		case IP_FW3:	/* generic ipfw v.3 functions */
 		case IP_FW_ADD:	/* ADD actually returns the body... */
 		case IP_FW_GET:
 		case IP_FW_TABLE_GETSIZE:
 		case IP_FW_TABLE_LIST:
 		case IP_FW_NAT_GET_CONFIG:
 		case IP_FW_NAT_GET_LOG:
 			if (V_ip_fw_ctl_ptr != NULL)
 				error = V_ip_fw_ctl_ptr(sopt);
 			else
 				error = ENOPROTOOPT;
 			break;
 
 		case IP_DUMMYNET3:	/* generic dummynet v.3 functions */
 		case IP_DUMMYNET_GET:
 			if (ip_dn_ctl_ptr != NULL)
 				error = ip_dn_ctl_ptr(sopt);
 			else
 				error = ENOPROTOOPT;
 			break ;
 
 		case MRT_INIT:
 		case MRT_DONE:
 		case MRT_ADD_VIF:
 		case MRT_DEL_VIF:
 		case MRT_ADD_MFC:
 		case MRT_DEL_MFC:
 		case MRT_VERSION:
 		case MRT_ASSERT:
 		case MRT_API_SUPPORT:
 		case MRT_API_CONFIG:
 		case MRT_ADD_BW_UPCALL:
 		case MRT_DEL_BW_UPCALL:
 			error = priv_check(curthread, PRIV_NETINET_MROUTE);
 			if (error != 0)
 				return (error);
 			error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
 				EOPNOTSUPP;
 			break;
 
 		default:
 			error = ip_ctloutput(so, sopt);
 			break;
 		}
 		break;
 
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case IP_HDRINCL:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 			if (optval)
 				inp->inp_flags |= INP_HDRINCL;
 			else
 				inp->inp_flags &= ~INP_HDRINCL;
 			break;
 
 		case IP_FW3:	/* generic ipfw v.3 functions */
 		case IP_FW_ADD:
 		case IP_FW_DEL:
 		case IP_FW_FLUSH:
 		case IP_FW_ZERO:
 		case IP_FW_RESETLOG:
 		case IP_FW_TABLE_ADD:
 		case IP_FW_TABLE_DEL:
 		case IP_FW_TABLE_FLUSH:
 		case IP_FW_NAT_CFG:
 		case IP_FW_NAT_DEL:
 			if (V_ip_fw_ctl_ptr != NULL)
 				error = V_ip_fw_ctl_ptr(sopt);
 			else
 				error = ENOPROTOOPT;
 			break;
 
 		case IP_DUMMYNET3:	/* generic dummynet v.3 functions */
 		case IP_DUMMYNET_CONFIGURE:
 		case IP_DUMMYNET_DEL:
 		case IP_DUMMYNET_FLUSH:
 			if (ip_dn_ctl_ptr != NULL)
 				error = ip_dn_ctl_ptr(sopt);
 			else
 				error = ENOPROTOOPT ;
 			break ;
 
 		case IP_RSVP_ON:
 			error = priv_check(curthread, PRIV_NETINET_MROUTE);
 			if (error != 0)
 				return (error);
 			error = ip_rsvp_init(so);
 			break;
 
 		case IP_RSVP_OFF:
 			error = priv_check(curthread, PRIV_NETINET_MROUTE);
 			if (error != 0)
 				return (error);
 			error = ip_rsvp_done();
 			break;
 
 		case IP_RSVP_VIF_ON:
 		case IP_RSVP_VIF_OFF:
 			error = priv_check(curthread, PRIV_NETINET_MROUTE);
 			if (error != 0)
 				return (error);
 			error = ip_rsvp_vif ?
 				ip_rsvp_vif(so, sopt) : EINVAL;
 			break;
 
 		case MRT_INIT:
 		case MRT_DONE:
 		case MRT_ADD_VIF:
 		case MRT_DEL_VIF:
 		case MRT_ADD_MFC:
 		case MRT_DEL_MFC:
 		case MRT_VERSION:
 		case MRT_ASSERT:
 		case MRT_API_SUPPORT:
 		case MRT_API_CONFIG:
 		case MRT_ADD_BW_UPCALL:
 		case MRT_DEL_BW_UPCALL:
 			error = priv_check(curthread, PRIV_NETINET_MROUTE);
 			if (error != 0)
 				return (error);
 			error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
 					EOPNOTSUPP;
 			break;
 
 		default:
 			error = ip_ctloutput(so, sopt);
 			break;
 		}
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * This function exists solely to receive the PRC_IFDOWN messages which are
  * sent by if_down().  It looks for an ifaddr whose ifa_addr is sa, and calls
  * in_ifadown() to remove all routes corresponding to that address.  It also
  * receives the PRC_IFUP messages from if_up() and reinstalls the interface
  * routes.
  */
 void
 rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 	struct in_ifaddr *ia;
 	struct ifnet *ifp;
 	int err;
 	int flags;
 
 	switch (cmd) {
 	case PRC_IFDOWN:
 		IN_IFADDR_RLOCK();
 		TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
 			if (ia->ia_ifa.ifa_addr == sa
 			    && (ia->ia_flags & IFA_ROUTE)) {
 				ifa_ref(&ia->ia_ifa);
 				IN_IFADDR_RUNLOCK();
 				/*
 				 * in_scrubprefix() kills the interface route.
 				 */
 				in_scrubprefix(ia, 0);
 				/*
 				 * in_ifadown gets rid of all the rest of the
 				 * routes.  This is not quite the right thing
 				 * to do, but at least if we are running a
 				 * routing process they will come back.
 				 */
 				in_ifadown(&ia->ia_ifa, 0);
 				ifa_free(&ia->ia_ifa);
 				break;
 			}
 		}
 		if (ia == NULL)		/* If ia matched, already unlocked. */
 			IN_IFADDR_RUNLOCK();
 		break;
 
 	case PRC_IFUP:
 		IN_IFADDR_RLOCK();
 		TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
 			if (ia->ia_ifa.ifa_addr == sa)
 				break;
 		}
 		if (ia == NULL || (ia->ia_flags & IFA_ROUTE)) {
 			IN_IFADDR_RUNLOCK();
 			return;
 		}
 		ifa_ref(&ia->ia_ifa);
 		IN_IFADDR_RUNLOCK();
 		flags = RTF_UP;
 		ifp = ia->ia_ifa.ifa_ifp;
 
 		if ((ifp->if_flags & IFF_LOOPBACK)
 		    || (ifp->if_flags & IFF_POINTOPOINT))
 			flags |= RTF_HOST;
 
 		err = ifa_del_loopback_route((struct ifaddr *)ia, sa);
 
 		err = rtinit(&ia->ia_ifa, RTM_ADD, flags);
 		if (err == 0)
 			ia->ia_flags |= IFA_ROUTE;
 
 		err = ifa_add_loopback_route((struct ifaddr *)ia, sa);
 
 		ifa_free(&ia->ia_ifa);
 		break;
 	}
 }
 
 static int
 rip_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct inpcb *inp;
 	int error;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp == NULL, ("rip_attach: inp != NULL"));
 
 	error = priv_check(td, PRIV_NETINET_RAW);
 	if (error)
 		return (error);
 	if (proto >= IPPROTO_MAX || proto < 0)
 		return EPROTONOSUPPORT;
 	error = soreserve(so, rip_sendspace, rip_recvspace);
 	if (error)
 		return (error);
 	INP_INFO_WLOCK(&V_ripcbinfo);
 	error = in_pcballoc(so, &V_ripcbinfo);
 	if (error) {
 		INP_INFO_WUNLOCK(&V_ripcbinfo);
 		return (error);
 	}
 	inp = (struct inpcb *)so->so_pcb;
 	inp->inp_vflag |= INP_IPV4;
 	inp->inp_ip_p = proto;
 	inp->inp_ip_ttl = V_ip_defttl;
 	rip_inshash(inp);
 	INP_INFO_WUNLOCK(&V_ripcbinfo);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 static void
 rip_detach(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip_detach: inp == NULL"));
 	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, 
 	    ("rip_detach: not closed"));
 
 	INP_INFO_WLOCK(&V_ripcbinfo);
 	INP_WLOCK(inp);
 	rip_delhash(inp);
 	if (so == V_ip_mrouter && ip_mrouter_done)
 		ip_mrouter_done();
 	if (ip_rsvp_force_done)
 		ip_rsvp_force_done(so);
 	if (so == V_ip_rsvpd)
 		ip_rsvp_done();
 	in_pcbdetach(inp);
 	in_pcbfree(inp);
 	INP_INFO_WUNLOCK(&V_ripcbinfo);
 }
 
 static void
 rip_dodisconnect(struct socket *so, struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo;
 
 	pcbinfo = inp->inp_pcbinfo;
 	INP_INFO_WLOCK(pcbinfo);
 	INP_WLOCK(inp);
 	rip_delhash(inp);
 	inp->inp_faddr.s_addr = INADDR_ANY;
 	rip_inshash(inp);
 	SOCK_LOCK(so);
 	so->so_state &= ~SS_ISCONNECTED;
 	SOCK_UNLOCK(so);
 	INP_WUNLOCK(inp);
 	INP_INFO_WUNLOCK(pcbinfo);
 }
 
 static void
 rip_abort(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip_abort: inp == NULL"));
 
 	rip_dodisconnect(so, inp);
 }
 
 static void
 rip_close(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip_close: inp == NULL"));
 
 	rip_dodisconnect(so, inp);
 }
 
 static int
 rip_disconnect(struct socket *so)
 {
 	struct inpcb *inp;
 
 	if ((so->so_state & SS_ISCONNECTED) == 0)
 		return (ENOTCONN);
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip_disconnect: inp == NULL"));
 
 	rip_dodisconnect(so, inp);
 	return (0);
 }
 
 static int
 rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
 	struct inpcb *inp;
 	int error;
 
 	if (nam->sa_len != sizeof(*addr))
 		return (EINVAL);
 
 	error = prison_check_ip4(td->td_ucred, &addr->sin_addr);
 	if (error != 0)
 		return (error);
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip_bind: inp == NULL"));
 
 	if (TAILQ_EMPTY(&V_ifnet) ||
 	    (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) ||
 	    (addr->sin_addr.s_addr &&
 	     (inp->inp_flags & INP_BINDANY) == 0 &&
 	     ifa_ifwithaddr_check((struct sockaddr *)addr) == 0))
 		return (EADDRNOTAVAIL);
 
 	INP_INFO_WLOCK(&V_ripcbinfo);
 	INP_WLOCK(inp);
 	rip_delhash(inp);
 	inp->inp_laddr = addr->sin_addr;
 	rip_inshash(inp);
 	INP_WUNLOCK(inp);
 	INP_INFO_WUNLOCK(&V_ripcbinfo);
 	return (0);
 }
 
 static int
 rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
 	struct inpcb *inp;
 
 	if (nam->sa_len != sizeof(*addr))
 		return (EINVAL);
 	if (TAILQ_EMPTY(&V_ifnet))
 		return (EADDRNOTAVAIL);
 	if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK)
 		return (EAFNOSUPPORT);
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip_connect: inp == NULL"));
 
 	INP_INFO_WLOCK(&V_ripcbinfo);
 	INP_WLOCK(inp);
 	rip_delhash(inp);
 	inp->inp_faddr = addr->sin_addr;
 	rip_inshash(inp);
 	soisconnected(so);
 	INP_WUNLOCK(inp);
 	INP_INFO_WUNLOCK(&V_ripcbinfo);
 	return (0);
 }
 
 static int
 rip_shutdown(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip_shutdown: inp == NULL"));
 
 	INP_WLOCK(inp);
 	socantsendmore(so);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 static int
 rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
     struct mbuf *control, struct thread *td)
 {
 	struct inpcb *inp;
 	u_long dst;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip_send: inp == NULL"));
 
 	/*
 	 * Note: 'dst' reads below are unlocked.
 	 */
 	if (so->so_state & SS_ISCONNECTED) {
 		if (nam) {
 			m_freem(m);
 			return (EISCONN);
 		}
 		dst = inp->inp_faddr.s_addr;	/* Unlocked read. */
 	} else {
 		if (nam == NULL) {
 			m_freem(m);
 			return (ENOTCONN);
 		}
 		dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr;
 	}
 	return (rip_output(m, so, dst));
 }
 #endif /* INET */
 
 static int
 rip_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	int error, i, n;
 	struct inpcb *inp, **inp_list;
 	inp_gen_t gencnt;
 	struct xinpgen xig;
 
 	/*
 	 * The process of preparing the TCB list is too time-consuming and
 	 * resource-intensive to repeat twice on every request.
 	 */
 	if (req->oldptr == 0) {
 		n = V_ripcbinfo.ipi_count;
 		n += imax(n / 8, 10);
 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
 		return (0);
 	}
 
 	if (req->newptr != 0)
 		return (EPERM);
 
 	/*
 	 * OK, now we're committed to doing something.
 	 */
 	INP_INFO_RLOCK(&V_ripcbinfo);
 	gencnt = V_ripcbinfo.ipi_gencnt;
 	n = V_ripcbinfo.ipi_count;
 	INP_INFO_RUNLOCK(&V_ripcbinfo);
 
 	xig.xig_len = sizeof xig;
 	xig.xig_count = n;
 	xig.xig_gen = gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return (error);
 
 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
 	if (inp_list == 0)
 		return (ENOMEM);
 
 	INP_INFO_RLOCK(&V_ripcbinfo);
 	for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n;
 	     inp = LIST_NEXT(inp, inp_list)) {
 		INP_WLOCK(inp);
 		if (inp->inp_gencnt <= gencnt &&
 		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
 			in_pcbref(inp);
 			inp_list[i++] = inp;
 		}
 		INP_WUNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(&V_ripcbinfo);
 	n = i;
 
 	error = 0;
 	for (i = 0; i < n; i++) {
 		inp = inp_list[i];
 		INP_RLOCK(inp);
 		if (inp->inp_gencnt <= gencnt) {
 			struct xinpcb xi;
 
 			bzero(&xi, sizeof(xi));
 			xi.xi_len = sizeof xi;
 			/* XXX should avoid extra copy */
 			bcopy(inp, &xi.xi_inp, sizeof *inp);
 			if (inp->inp_socket)
 				sotoxsocket(inp->inp_socket, &xi.xi_socket);
 			INP_RUNLOCK(inp);
 			error = SYSCTL_OUT(req, &xi, sizeof xi);
 		} else
 			INP_RUNLOCK(inp);
 	}
 	INP_INFO_WLOCK(&V_ripcbinfo);
 	for (i = 0; i < n; i++) {
 		inp = inp_list[i];
 		INP_RLOCK(inp);
 		if (!in_pcbrele_rlocked(inp))
 			INP_RUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(&V_ripcbinfo);
 
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.  If the
 		 * generation differs from what we told her before, she knows
 		 * that something happened while we were processing this
 		 * request, and it might be necessary to retry.
 		 */
 		INP_INFO_RLOCK(&V_ripcbinfo);
 		xig.xig_gen = V_ripcbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = V_ripcbinfo.ipi_count;
 		INP_INFO_RUNLOCK(&V_ripcbinfo);
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 	free(inp_list, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist,
     CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0,
     rip_pcblist, "S,xinpcb", "List of active raw IP sockets");
 
 #ifdef INET
 struct pr_usrreqs rip_usrreqs = {
 	.pru_abort =		rip_abort,
 	.pru_attach =		rip_attach,
 	.pru_bind =		rip_bind,
 	.pru_connect =		rip_connect,
 	.pru_control =		in_control,
 	.pru_detach =		rip_detach,
 	.pru_disconnect =	rip_disconnect,
 	.pru_peeraddr =		in_getpeeraddr,
 	.pru_send =		rip_send,
 	.pru_shutdown =		rip_shutdown,
 	.pru_sockaddr =		in_getsockaddr,
 	.pru_sosetlabel =	in_pcbsosetlabel,
 	.pru_close =		rip_close,
 };
 #endif /* INET */
Index: user/dchagin/lemul/sys/netinet/tcp_hostcache.c
===================================================================
--- user/dchagin/lemul/sys/netinet/tcp_hostcache.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/tcp_hostcache.c	(revision 274241)
@@ -1,699 +1,699 @@
 /*-
  * Copyright (c) 2002 Andre Oppermann, Internet Business Solutions AG
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * The tcp_hostcache moves the tcp-specific cached metrics from the routing
  * table to a dedicated structure indexed by the remote IP address.  It keeps
  * information on the measured TCP parameters of past TCP sessions to allow
  * better initial start values to be used with later connections to/from the
  * same source.  Depending on the network parameters (delay, bandwidth, max
  * MTU, congestion window) between local and remote sites, this can lead to
  * significant speed-ups for new TCP connections after the first one.
  *
  * Due to the tcp_hostcache, all TCP-specific metrics information in the
  * routing table have been removed.  The inpcb no longer keeps a pointer to
  * the routing entry, and protocol-initiated route cloning has been removed
  * as well.  With these changes, the routing table has gone back to being
  * more lightwight and only carries information related to packet forwarding.
  *
  * tcp_hostcache is designed for multiple concurrent access in SMP
  * environments and high contention.  All bucket rows have their own lock and
  * thus multiple lookups and modifies can be done at the same time as long as
  * they are in different bucket rows.  If a request for insertion of a new
  * record can't be satisfied, it simply returns an empty structure.  Nobody
  * and nothing outside of tcp_hostcache.c will ever point directly to any
  * entry in the tcp_hostcache.  All communication is done in an
  * object-oriented way and only functions of tcp_hostcache will manipulate
  * hostcache entries.  Otherwise, we are unable to achieve good behaviour in
  * concurrent access situations.  Since tcp_hostcache is only caching
  * information, there are no fatal consequences if we either can't satisfy
  * any particular request or have to drop/overwrite an existing entry because
  * of bucket limit memory constrains.
  */
 
 /*
  * Many thanks to jlemon for basic structure of tcp_syncache which is being
  * followed here.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #endif
 #include <netinet/tcp.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_hostcache.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 
 #include <vm/uma.h>
 
 /* Arbitrary values */
 #define TCP_HOSTCACHE_HASHSIZE		512
 #define TCP_HOSTCACHE_BUCKETLIMIT	30
 #define TCP_HOSTCACHE_EXPIRE		60*60	/* one hour */
 #define TCP_HOSTCACHE_PRUNE		5*60	/* every 5 minutes */
 
 static VNET_DEFINE(struct tcp_hostcache, tcp_hostcache);
 #define	V_tcp_hostcache		VNET(tcp_hostcache)
 
 static VNET_DEFINE(struct callout, tcp_hc_callout);
 #define	V_tcp_hc_callout	VNET(tcp_hc_callout)
 
 static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *);
 static struct hc_metrics *tcp_hc_insert(struct in_conninfo *);
 static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS);
 static void tcp_hc_purge_internal(int);
 static void tcp_hc_purge(void *);
 
 static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0,
     "TCP Host cache");
 
-SYSCTL_VNET_UINT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_RDTUN,
+SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_VNET | CTLFLAG_RDTUN,
     &VNET_NAME(tcp_hostcache.cache_limit), 0,
     "Overall entry limit for hostcache");
 
-SYSCTL_VNET_UINT(_net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_RDTUN,
+SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN,
     &VNET_NAME(tcp_hostcache.hashsize), 0,
     "Size of TCP hostcache hashtable");
 
-SYSCTL_VNET_UINT(_net_inet_tcp_hostcache, OID_AUTO, bucketlimit,
-    CTLFLAG_RDTUN, &VNET_NAME(tcp_hostcache.bucket_limit), 0,
+SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, bucketlimit,
+    CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_hostcache.bucket_limit), 0,
     "Per-bucket hash limit for hostcache");
 
-SYSCTL_VNET_UINT(_net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_RD,
+SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_VNET | CTLFLAG_RD,
      &VNET_NAME(tcp_hostcache.cache_count), 0,
     "Current number of entries in hostcache");
 
-SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_hostcache.expire), 0,
     "Expire time of TCP hostcache entries");
 
-SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, prune, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, prune, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_hostcache.prune), 0,
     "Time between purge runs");
 
-SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, purge, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, purge, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_hostcache.purgeall), 0,
     "Expire all entires on next purge run");
 
 SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, list,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP, 0, 0,
     sysctl_tcp_hc_list, "A", "List of all hostcache entries");
 
 
 static MALLOC_DEFINE(M_HOSTCACHE, "hostcache", "TCP hostcache");
 
 #define HOSTCACHE_HASH(ip) \
 	(((ip)->s_addr ^ ((ip)->s_addr >> 7) ^ ((ip)->s_addr >> 17)) &	\
 	  V_tcp_hostcache.hashmask)
 
 /* XXX: What is the recommended hash to get good entropy for IPv6 addresses? */
 #define HOSTCACHE_HASH6(ip6)				\
 	(((ip6)->s6_addr32[0] ^				\
 	  (ip6)->s6_addr32[1] ^				\
 	  (ip6)->s6_addr32[2] ^				\
 	  (ip6)->s6_addr32[3]) &			\
 	 V_tcp_hostcache.hashmask)
 
 #define THC_LOCK(lp)		mtx_lock(lp)
 #define THC_UNLOCK(lp)		mtx_unlock(lp)
 
 void
 tcp_hc_init(void)
 {
 	u_int cache_limit;
 	int i;
 
 	/*
 	 * Initialize hostcache structures.
 	 */
 	V_tcp_hostcache.cache_count = 0;
 	V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE;
 	V_tcp_hostcache.bucket_limit = TCP_HOSTCACHE_BUCKETLIMIT;
 	V_tcp_hostcache.expire = TCP_HOSTCACHE_EXPIRE;
 	V_tcp_hostcache.prune = TCP_HOSTCACHE_PRUNE;
 
 	TUNABLE_INT_FETCH("net.inet.tcp.hostcache.hashsize",
 	    &V_tcp_hostcache.hashsize);
 	if (!powerof2(V_tcp_hostcache.hashsize)) {
 		printf("WARNING: hostcache hash size is not a power of 2.\n");
 		V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; /* default */
 	}
 	V_tcp_hostcache.hashmask = V_tcp_hostcache.hashsize - 1;
 
 	TUNABLE_INT_FETCH("net.inet.tcp.hostcache.bucketlimit",
 	    &V_tcp_hostcache.bucket_limit);
 
 	cache_limit = V_tcp_hostcache.hashsize * V_tcp_hostcache.bucket_limit;
 	V_tcp_hostcache.cache_limit = cache_limit;
 	TUNABLE_INT_FETCH("net.inet.tcp.hostcache.cachelimit",
 	    &V_tcp_hostcache.cache_limit);
 	if (V_tcp_hostcache.cache_limit > cache_limit)
 		V_tcp_hostcache.cache_limit = cache_limit;
 
 	/*
 	 * Allocate the hash table.
 	 */
 	V_tcp_hostcache.hashbase = (struct hc_head *)
 	    malloc(V_tcp_hostcache.hashsize * sizeof(struct hc_head),
 		   M_HOSTCACHE, M_WAITOK | M_ZERO);
 
 	/*
 	 * Initialize the hash buckets.
 	 */
 	for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
 		TAILQ_INIT(&V_tcp_hostcache.hashbase[i].hch_bucket);
 		V_tcp_hostcache.hashbase[i].hch_length = 0;
 		mtx_init(&V_tcp_hostcache.hashbase[i].hch_mtx, "tcp_hc_entry",
 			  NULL, MTX_DEF);
 	}
 
 	/*
 	 * Allocate the hostcache entries.
 	 */
 	V_tcp_hostcache.zone =
 	    uma_zcreate("hostcache", sizeof(struct hc_metrics),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	uma_zone_set_max(V_tcp_hostcache.zone, V_tcp_hostcache.cache_limit);
 
 	/*
 	 * Set up periodic cache cleanup.
 	 */
 	callout_init(&V_tcp_hc_callout, CALLOUT_MPSAFE);
 	callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz,
 	    tcp_hc_purge, curvnet);
 }
 
 #ifdef VIMAGE
 void
 tcp_hc_destroy(void)
 {
 	int i;
 
 	callout_drain(&V_tcp_hc_callout);
 
 	/* Purge all hc entries. */
 	tcp_hc_purge_internal(1);
 
 	/* Free the uma zone and the allocated hash table. */
 	uma_zdestroy(V_tcp_hostcache.zone);
 
 	for (i = 0; i < V_tcp_hostcache.hashsize; i++)
 		mtx_destroy(&V_tcp_hostcache.hashbase[i].hch_mtx);
 	free(V_tcp_hostcache.hashbase, M_HOSTCACHE);
 }
 #endif
 
 /*
  * Internal function: look up an entry in the hostcache or return NULL.
  *
  * If an entry has been returned, the caller becomes responsible for
  * unlocking the bucket row after he is done reading/modifying the entry.
  */
 static struct hc_metrics *
 tcp_hc_lookup(struct in_conninfo *inc)
 {
 	int hash;
 	struct hc_head *hc_head;
 	struct hc_metrics *hc_entry;
 
 	KASSERT(inc != NULL, ("tcp_hc_lookup with NULL in_conninfo pointer"));
 
 	/*
 	 * Hash the foreign ip address.
 	 */
 	if (inc->inc_flags & INC_ISIPV6)
 		hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
 	else
 		hash = HOSTCACHE_HASH(&inc->inc_faddr);
 
 	hc_head = &V_tcp_hostcache.hashbase[hash];
 
 	/*
 	 * Acquire lock for this bucket row; we release the lock if we don't
 	 * find an entry, otherwise the caller has to unlock after he is
 	 * done.
 	 */
 	THC_LOCK(&hc_head->hch_mtx);
 
 	/*
 	 * Iterate through entries in bucket row looking for a match.
 	 */
 	TAILQ_FOREACH(hc_entry, &hc_head->hch_bucket, rmx_q) {
 		if (inc->inc_flags & INC_ISIPV6) {
 			/* XXX: check ip6_zoneid */
 			if (memcmp(&inc->inc6_faddr, &hc_entry->ip6,
 			    sizeof(inc->inc6_faddr)) == 0)
 				return hc_entry;
 		} else {
 			if (memcmp(&inc->inc_faddr, &hc_entry->ip4,
 			    sizeof(inc->inc_faddr)) == 0)
 				return hc_entry;
 		}
 	}
 
 	/*
 	 * We were unsuccessful and didn't find anything.
 	 */
 	THC_UNLOCK(&hc_head->hch_mtx);
 	return NULL;
 }
 
 /*
  * Internal function: insert an entry into the hostcache or return NULL if
  * unable to allocate a new one.
  *
  * If an entry has been returned, the caller becomes responsible for
  * unlocking the bucket row after he is done reading/modifying the entry.
  */
 static struct hc_metrics *
 tcp_hc_insert(struct in_conninfo *inc)
 {
 	int hash;
 	struct hc_head *hc_head;
 	struct hc_metrics *hc_entry;
 
 	KASSERT(inc != NULL, ("tcp_hc_insert with NULL in_conninfo pointer"));
 
 	/*
 	 * Hash the foreign ip address.
 	 */
 	if (inc->inc_flags & INC_ISIPV6)
 		hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
 	else
 		hash = HOSTCACHE_HASH(&inc->inc_faddr);
 
 	hc_head = &V_tcp_hostcache.hashbase[hash];
 
 	/*
 	 * Acquire lock for this bucket row; we release the lock if we don't
 	 * find an entry, otherwise the caller has to unlock after he is
 	 * done.
 	 */
 	THC_LOCK(&hc_head->hch_mtx);
 
 	/*
 	 * If the bucket limit is reached, reuse the least-used element.
 	 */
 	if (hc_head->hch_length >= V_tcp_hostcache.bucket_limit ||
 	    V_tcp_hostcache.cache_count >= V_tcp_hostcache.cache_limit) {
 		hc_entry = TAILQ_LAST(&hc_head->hch_bucket, hc_qhead);
 		/*
 		 * At first we were dropping the last element, just to
 		 * reacquire it in the next two lines again, which isn't very
 		 * efficient.  Instead just reuse the least used element.
 		 * We may drop something that is still "in-use" but we can be
 		 * "lossy".
 		 * Just give up if this bucket row is empty and we don't have
 		 * anything to replace.
 		 */
 		if (hc_entry == NULL) {
 			THC_UNLOCK(&hc_head->hch_mtx);
 			return NULL;
 		}
 		TAILQ_REMOVE(&hc_head->hch_bucket, hc_entry, rmx_q);
 		V_tcp_hostcache.hashbase[hash].hch_length--;
 		V_tcp_hostcache.cache_count--;
 		TCPSTAT_INC(tcps_hc_bucketoverflow);
 #if 0
 		uma_zfree(V_tcp_hostcache.zone, hc_entry);
 #endif
 	} else {
 		/*
 		 * Allocate a new entry, or balk if not possible.
 		 */
 		hc_entry = uma_zalloc(V_tcp_hostcache.zone, M_NOWAIT);
 		if (hc_entry == NULL) {
 			THC_UNLOCK(&hc_head->hch_mtx);
 			return NULL;
 		}
 	}
 
 	/*
 	 * Initialize basic information of hostcache entry.
 	 */
 	bzero(hc_entry, sizeof(*hc_entry));
 	if (inc->inc_flags & INC_ISIPV6) {
 		hc_entry->ip6 = inc->inc6_faddr;
 		hc_entry->ip6_zoneid = inc->inc6_zoneid;
 	} else
 		hc_entry->ip4 = inc->inc_faddr;
 	hc_entry->rmx_head = hc_head;
 	hc_entry->rmx_expire = V_tcp_hostcache.expire;
 
 	/*
 	 * Put it upfront.
 	 */
 	TAILQ_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q);
 	V_tcp_hostcache.hashbase[hash].hch_length++;
 	V_tcp_hostcache.cache_count++;
 	TCPSTAT_INC(tcps_hc_added);
 
 	return hc_entry;
 }
 
 /*
  * External function: look up an entry in the hostcache and fill out the
  * supplied TCP metrics structure.  Fills in NULL when no entry was found or
  * a value is not set.
  */
 void
 tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite)
 {
 	struct hc_metrics *hc_entry;
 
 	/*
 	 * Find the right bucket.
 	 */
 	hc_entry = tcp_hc_lookup(inc);
 
 	/*
 	 * If we don't have an existing object.
 	 */
 	if (hc_entry == NULL) {
 		bzero(hc_metrics_lite, sizeof(*hc_metrics_lite));
 		return;
 	}
 	hc_entry->rmx_hits++;
 	hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */
 
 	hc_metrics_lite->rmx_mtu = hc_entry->rmx_mtu;
 	hc_metrics_lite->rmx_ssthresh = hc_entry->rmx_ssthresh;
 	hc_metrics_lite->rmx_rtt = hc_entry->rmx_rtt;
 	hc_metrics_lite->rmx_rttvar = hc_entry->rmx_rttvar;
 	hc_metrics_lite->rmx_bandwidth = hc_entry->rmx_bandwidth;
 	hc_metrics_lite->rmx_cwnd = hc_entry->rmx_cwnd;
 	hc_metrics_lite->rmx_sendpipe = hc_entry->rmx_sendpipe;
 	hc_metrics_lite->rmx_recvpipe = hc_entry->rmx_recvpipe;
 
 	/*
 	 * Unlock bucket row.
 	 */
 	THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
 }
 
 /*
  * External function: look up an entry in the hostcache and return the
  * discovered path MTU.  Returns NULL if no entry is found or value is not
  * set.
  */
 u_long
 tcp_hc_getmtu(struct in_conninfo *inc)
 {
 	struct hc_metrics *hc_entry;
 	u_long mtu;
 
 	hc_entry = tcp_hc_lookup(inc);
 	if (hc_entry == NULL) {
 		return 0;
 	}
 	hc_entry->rmx_hits++;
 	hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */
 
 	mtu = hc_entry->rmx_mtu;
 	THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
 	return mtu;
 }
 
 /*
  * External function: update the MTU value of an entry in the hostcache.
  * Creates a new entry if none was found.
  */
 void
 tcp_hc_updatemtu(struct in_conninfo *inc, u_long mtu)
 {
 	struct hc_metrics *hc_entry;
 
 	/*
 	 * Find the right bucket.
 	 */
 	hc_entry = tcp_hc_lookup(inc);
 
 	/*
 	 * If we don't have an existing object, try to insert a new one.
 	 */
 	if (hc_entry == NULL) {
 		hc_entry = tcp_hc_insert(inc);
 		if (hc_entry == NULL)
 			return;
 	}
 	hc_entry->rmx_updates++;
 	hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */
 
 	hc_entry->rmx_mtu = mtu;
 
 	/*
 	 * Put it upfront so we find it faster next time.
 	 */
 	TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
 	TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
 
 	/*
 	 * Unlock bucket row.
 	 */
 	THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
 }
 
 /*
  * External function: update the TCP metrics of an entry in the hostcache.
  * Creates a new entry if none was found.
  */
 void
 tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml)
 {
 	struct hc_metrics *hc_entry;
 
 	hc_entry = tcp_hc_lookup(inc);
 	if (hc_entry == NULL) {
 		hc_entry = tcp_hc_insert(inc);
 		if (hc_entry == NULL)
 			return;
 	}
 	hc_entry->rmx_updates++;
 	hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */
 
 	if (hcml->rmx_rtt != 0) {
 		if (hc_entry->rmx_rtt == 0)
 			hc_entry->rmx_rtt = hcml->rmx_rtt;
 		else
 			hc_entry->rmx_rtt =
 			    (hc_entry->rmx_rtt + hcml->rmx_rtt) / 2;
 		TCPSTAT_INC(tcps_cachedrtt);
 	}
 	if (hcml->rmx_rttvar != 0) {
 	        if (hc_entry->rmx_rttvar == 0)
 			hc_entry->rmx_rttvar = hcml->rmx_rttvar;
 		else
 			hc_entry->rmx_rttvar =
 			    (hc_entry->rmx_rttvar + hcml->rmx_rttvar) / 2;
 		TCPSTAT_INC(tcps_cachedrttvar);
 	}
 	if (hcml->rmx_ssthresh != 0) {
 		if (hc_entry->rmx_ssthresh == 0)
 			hc_entry->rmx_ssthresh = hcml->rmx_ssthresh;
 		else
 			hc_entry->rmx_ssthresh =
 			    (hc_entry->rmx_ssthresh + hcml->rmx_ssthresh) / 2;
 		TCPSTAT_INC(tcps_cachedssthresh);
 	}
 	if (hcml->rmx_bandwidth != 0) {
 		if (hc_entry->rmx_bandwidth == 0)
 			hc_entry->rmx_bandwidth = hcml->rmx_bandwidth;
 		else
 			hc_entry->rmx_bandwidth =
 			    (hc_entry->rmx_bandwidth + hcml->rmx_bandwidth) / 2;
 		/* TCPSTAT_INC(tcps_cachedbandwidth); */
 	}
 	if (hcml->rmx_cwnd != 0) {
 		if (hc_entry->rmx_cwnd == 0)
 			hc_entry->rmx_cwnd = hcml->rmx_cwnd;
 		else
 			hc_entry->rmx_cwnd =
 			    (hc_entry->rmx_cwnd + hcml->rmx_cwnd) / 2;
 		/* TCPSTAT_INC(tcps_cachedcwnd); */
 	}
 	if (hcml->rmx_sendpipe != 0) {
 		if (hc_entry->rmx_sendpipe == 0)
 			hc_entry->rmx_sendpipe = hcml->rmx_sendpipe;
 		else
 			hc_entry->rmx_sendpipe =
 			    (hc_entry->rmx_sendpipe + hcml->rmx_sendpipe) /2;
 		/* TCPSTAT_INC(tcps_cachedsendpipe); */
 	}
 	if (hcml->rmx_recvpipe != 0) {
 		if (hc_entry->rmx_recvpipe == 0)
 			hc_entry->rmx_recvpipe = hcml->rmx_recvpipe;
 		else
 			hc_entry->rmx_recvpipe =
 			    (hc_entry->rmx_recvpipe + hcml->rmx_recvpipe) /2;
 		/* TCPSTAT_INC(tcps_cachedrecvpipe); */
 	}
 
 	TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
 	TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
 	THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
 }
 
 /*
  * Sysctl function: prints the list and values of all hostcache entries in
  * unsorted order.
  */
 static int
 sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS)
 {
 	int bufsize;
 	int linesize = 128;
 	char *p, *buf;
 	int len, i, error;
 	struct hc_metrics *hc_entry;
 #ifdef INET6
 	char ip6buf[INET6_ADDRSTRLEN];
 #endif
 
 	bufsize = linesize * (V_tcp_hostcache.cache_count + 1);
 
 	p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO);
 
 	len = snprintf(p, linesize,
 		"\nIP address        MTU  SSTRESH      RTT   RTTVAR BANDWIDTH "
 		"    CWND SENDPIPE RECVPIPE HITS  UPD  EXP\n");
 	p += len;
 
 #define msec(u) (((u) + 500) / 1000)
 	for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
 		THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
 		TAILQ_FOREACH(hc_entry, &V_tcp_hostcache.hashbase[i].hch_bucket,
 			      rmx_q) {
 			len = snprintf(p, linesize,
 			    "%-15s %5lu %8lu %6lums %6lums %9lu %8lu %8lu %8lu "
 			    "%4lu %4lu %4i\n",
 			    hc_entry->ip4.s_addr ? inet_ntoa(hc_entry->ip4) :
 #ifdef INET6
 				ip6_sprintf(ip6buf, &hc_entry->ip6),
 #else
 				"IPv6?",
 #endif
 			    hc_entry->rmx_mtu,
 			    hc_entry->rmx_ssthresh,
 			    msec(hc_entry->rmx_rtt *
 				(RTM_RTTUNIT / (hz * TCP_RTT_SCALE))),
 			    msec(hc_entry->rmx_rttvar *
 				(RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE))),
 			    hc_entry->rmx_bandwidth * 8,
 			    hc_entry->rmx_cwnd,
 			    hc_entry->rmx_sendpipe,
 			    hc_entry->rmx_recvpipe,
 			    hc_entry->rmx_hits,
 			    hc_entry->rmx_updates,
 			    hc_entry->rmx_expire);
 			p += len;
 		}
 		THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
 	}
 #undef msec
 	error = SYSCTL_OUT(req, buf, p - buf);
 	free(buf, M_TEMP);
 	return(error);
 }
 
 /*
  * Caller has to make sure the curvnet is set properly.
  */
 static void
 tcp_hc_purge_internal(int all)
 {
 	struct hc_metrics *hc_entry, *hc_next;
 	int i;
 
 	for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
 		THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
 		TAILQ_FOREACH_SAFE(hc_entry,
 		    &V_tcp_hostcache.hashbase[i].hch_bucket, rmx_q, hc_next) {
 			if (all || hc_entry->rmx_expire <= 0) {
 				TAILQ_REMOVE(&V_tcp_hostcache.hashbase[i].hch_bucket,
 					      hc_entry, rmx_q);
 				uma_zfree(V_tcp_hostcache.zone, hc_entry);
 				V_tcp_hostcache.hashbase[i].hch_length--;
 				V_tcp_hostcache.cache_count--;
 			} else
 				hc_entry->rmx_expire -= V_tcp_hostcache.prune;
 		}
 		THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
 	}
 }
 
 /*
  * Expire and purge (old|all) entries in the tcp_hostcache.  Runs
  * periodically from the callout.
  */
 static void
 tcp_hc_purge(void *arg)
 {
 	CURVNET_SET((struct vnet *) arg);
 	int all = 0;
 
 	if (V_tcp_hostcache.purgeall) {
 		all = 1;
 		V_tcp_hostcache.purgeall = 0;
 	}
 
 	tcp_hc_purge_internal(all);
 
 	callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz,
 	    tcp_hc_purge, arg);
 	CURVNET_RESTORE();
 }
Index: user/dchagin/lemul/sys/netinet/tcp_input.c
===================================================================
--- user/dchagin/lemul/sys/netinet/tcp_input.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/tcp_input.c	(revision 274241)
@@ -1,3678 +1,3678 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
  *	The Regents of the University of California.  All rights reserved.
  * Copyright (c) 2007-2008,2010
  *	Swinburne University of Technology, Melbourne, Australia.
  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
  * Copyright (c) 2010 The FreeBSD Foundation
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed at the Centre for Advanced Internet
  * Architectures, Swinburne University of Technology, by Lawrence Stewart,
  * James Healy and David Hayes, made possible in part by a grant from the Cisco
  * University Research Program Fund at Community Foundation Silicon Valley.
  *
  * Portions of this software were developed at the Centre for Advanced
  * Internet Architectures, Swinburne University of Technology, Melbourne,
  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ipfw.h"		/* for ipfw_fwd	*/
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/hhook.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>		/* for proc0 declaration */
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 
 #include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #define TCPSTATES		/* for logging */
 
 #include <netinet/cc.h>
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>	/* required for icmp_var.h */
 #include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet6/tcp6_var.h>
 #include <netinet/tcpip.h>
 #include <netinet/tcp_syncache.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif /* TCPDEBUG */
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/ipsec6.h>
 #endif /*IPSEC*/
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 const int tcprexmtthresh = 3;
 
 int tcp_log_in_vain = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
     &tcp_log_in_vain, 0,
     "Log all incoming TCP segments to closed ports");
 
 VNET_DEFINE(int, blackhole) = 0;
 #define	V_blackhole		VNET(blackhole)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(blackhole), 0,
     "Do not send RST on segments to closed ports");
 
 VNET_DEFINE(int, tcp_delack_enabled) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_delack_enabled), 0,
     "Delay ACK to try and piggyback it onto a data packet");
 
 VNET_DEFINE(int, drop_synfin) = 0;
 #define	V_drop_synfin		VNET(drop_synfin)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(drop_synfin), 0,
     "Drop TCP packets with SYN+FIN set");
 
 VNET_DEFINE(int, tcp_do_rfc3042) = 1;
 #define	V_tcp_do_rfc3042	VNET(tcp_do_rfc3042)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc3042), 0,
     "Enable RFC 3042 (Limited Transmit)");
 
 VNET_DEFINE(int, tcp_do_rfc3390) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc3390), 0,
     "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
 
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, experimental, CTLFLAG_RW, 0,
     "Experimental TCP extensions");
 
 VNET_DEFINE(int, tcp_do_initcwnd10) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp_experimental, OID_AUTO, initcwnd10, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp_experimental, OID_AUTO, initcwnd10, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_initcwnd10), 0,
     "Enable RFC 6928 (Increasing initial CWND to 10)");
 
 VNET_DEFINE(int, tcp_do_rfc3465) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc3465), 0,
     "Enable RFC 3465 (Appropriate Byte Counting)");
 
 VNET_DEFINE(int, tcp_abc_l_var) = 2;
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_abc_l_var), 2,
     "Cap the max cwnd increment during slow-start to this number of segments");
 
 static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN");
 
 VNET_DEFINE(int, tcp_do_ecn) = 0;
-SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_ecn), 0,
     "TCP ECN support");
 
 VNET_DEFINE(int, tcp_ecn_maxretries) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_ecn_maxretries), 0,
     "Max retries before giving up on ECN");
 
 VNET_DEFINE(int, tcp_insecure_syn) = 0;
 #define	V_tcp_insecure_syn	VNET(tcp_insecure_syn)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, insecure_syn, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_syn, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_insecure_syn), 0,
     "Follow RFC793 instead of RFC5961 criteria for accepting SYN packets");
 
 VNET_DEFINE(int, tcp_insecure_rst) = 0;
 #define	V_tcp_insecure_rst	VNET(tcp_insecure_rst)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_insecure_rst), 0,
     "Follow RFC793 instead of RFC5961 criteria for accepting RST packets");
 
 VNET_DEFINE(int, tcp_recvspace) = 1024*64;
 #define	V_tcp_recvspace	VNET(tcp_recvspace)
-SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size");
 
 VNET_DEFINE(int, tcp_do_autorcvbuf) = 1;
 #define	V_tcp_do_autorcvbuf	VNET(tcp_do_autorcvbuf)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_autorcvbuf), 0,
     "Enable automatic receive buffer sizing");
 
 VNET_DEFINE(int, tcp_autorcvbuf_inc) = 16*1024;
 #define	V_tcp_autorcvbuf_inc	VNET(tcp_autorcvbuf_inc)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_autorcvbuf_inc), 0,
     "Incrementor step size of automatic receive buffer");
 
 VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024;
 #define	V_tcp_autorcvbuf_max	VNET(tcp_autorcvbuf_max)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_autorcvbuf_max), 0,
     "Max size of automatic receive buffer");
 
 VNET_DEFINE(struct inpcbhead, tcb);
 #define	tcb6	tcb  /* for KAME src sync over BSD*'s */
 VNET_DEFINE(struct inpcbinfo, tcbinfo);
 
 static void	 tcp_dooptions(struct tcpopt *, u_char *, int, int);
 static void	 tcp_do_segment(struct mbuf *, struct tcphdr *,
 		     struct socket *, struct tcpcb *, int, int, uint8_t,
 		     int);
 static void	 tcp_dropwithreset(struct mbuf *, struct tcphdr *,
 		     struct tcpcb *, int, int);
 static void	 tcp_pulloutofband(struct socket *,
 		     struct tcphdr *, struct mbuf *, int);
 static void	 tcp_xmit_timer(struct tcpcb *, int);
 static void	 tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
 static void inline	cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
 			    uint16_t type);
 static void inline	cc_conn_init(struct tcpcb *tp);
 static void inline	cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
 static void inline	hhook_run_tcp_est_in(struct tcpcb *tp,
 			    struct tcphdr *th, struct tcpopt *to);
 
 /*
  * TCP statistics are stored in an "array" of counter(9)s.
  */
 VNET_PCPUSTAT_DEFINE(struct tcpstat, tcpstat);
 VNET_PCPUSTAT_SYSINIT(tcpstat);
 SYSCTL_VNET_PCPUSTAT(_net_inet_tcp, TCPCTL_STATS, stats, struct tcpstat,
     tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(tcpstat);
 #endif /* VIMAGE */
 /*
  * Kernel module interface for updating tcpstat.  The argument is an index
  * into tcpstat treated as an array.
  */
 void
 kmod_tcpstat_inc(int statnum)
 {
 
 	counter_u64_add(VNET(tcpstat)[statnum], 1);
 }
 
 /*
  * Wrapper for the TCP established input helper hook.
  */
 static void inline
 hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
 {
 	struct tcp_hhook_data hhook_data;
 
 	if (V_tcp_hhh[HHOOK_TCP_EST_IN]->hhh_nhooks > 0) {
 		hhook_data.tp = tp;
 		hhook_data.th = th;
 		hhook_data.to = to;
 
 		hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_IN], &hhook_data,
 		    tp->osd);
 	}
 }
 
 /*
  * CC wrapper hook functions
  */
 static void inline
 cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
 	if (tp->snd_cwnd <= tp->snd_wnd)
 		tp->ccv->flags |= CCF_CWND_LIMITED;
 	else
 		tp->ccv->flags &= ~CCF_CWND_LIMITED;
 
 	if (type == CC_ACK) {
 		if (tp->snd_cwnd > tp->snd_ssthresh) {
 			tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
 			     V_tcp_abc_l_var * tp->t_maxseg);
 			if (tp->t_bytes_acked >= tp->snd_cwnd) {
 				tp->t_bytes_acked -= tp->snd_cwnd;
 				tp->ccv->flags |= CCF_ABC_SENTAWND;
 			}
 		} else {
 				tp->ccv->flags &= ~CCF_ABC_SENTAWND;
 				tp->t_bytes_acked = 0;
 		}
 	}
 
 	if (CC_ALGO(tp)->ack_received != NULL) {
 		/* XXXLAS: Find a way to live without this */
 		tp->ccv->curack = th->th_ack;
 		CC_ALGO(tp)->ack_received(tp->ccv, type);
 	}
 }
 
 static void inline
 cc_conn_init(struct tcpcb *tp)
 {
 	struct hc_metrics_lite metrics;
 	struct inpcb *inp = tp->t_inpcb;
 	int rtt;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	tcp_hc_get(&inp->inp_inc, &metrics);
 
 	if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
 		tp->t_srtt = rtt;
 		tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
 		TCPSTAT_INC(tcps_usedrtt);
 		if (metrics.rmx_rttvar) {
 			tp->t_rttvar = metrics.rmx_rttvar;
 			TCPSTAT_INC(tcps_usedrttvar);
 		} else {
 			/* default variation is +- 1 rtt */
 			tp->t_rttvar =
 			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
 		}
 		TCPT_RANGESET(tp->t_rxtcur,
 		    ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
 		    tp->t_rttmin, TCPTV_REXMTMAX);
 	}
 	if (metrics.rmx_ssthresh) {
 		/*
 		 * There's some sort of gateway or interface
 		 * buffer limit on the path.  Use this to set
 		 * the slow start threshhold, but set the
 		 * threshold to no less than 2*mss.
 		 */
 		tp->snd_ssthresh = max(2 * tp->t_maxseg, metrics.rmx_ssthresh);
 		TCPSTAT_INC(tcps_usedssthresh);
 	}
 
 	/*
 	 * Set the initial slow-start flight size.
 	 *
 	 * RFC5681 Section 3.1 specifies the default conservative values.
 	 * RFC3390 specifies slightly more aggressive values.
 	 * RFC6928 increases it to ten segments.
 	 *
 	 * If a SYN or SYN/ACK was lost and retransmitted, we have to
 	 * reduce the initial CWND to one segment as congestion is likely
 	 * requiring us to be cautious.
 	 */
 	if (tp->snd_cwnd == 1)
 		tp->snd_cwnd = tp->t_maxseg;		/* SYN(-ACK) lost */
 	else if (V_tcp_do_initcwnd10)
 		tp->snd_cwnd = min(10 * tp->t_maxseg,
 		    max(2 * tp->t_maxseg, 14600));
 	else if (V_tcp_do_rfc3390)
 		tp->snd_cwnd = min(4 * tp->t_maxseg,
 		    max(2 * tp->t_maxseg, 4380));
 	else {
 		/* Per RFC5681 Section 3.1 */
 		if (tp->t_maxseg > 2190)
 			tp->snd_cwnd = 2 * tp->t_maxseg;
 		else if (tp->t_maxseg > 1095)
 			tp->snd_cwnd = 3 * tp->t_maxseg;
 		else
 			tp->snd_cwnd = 4 * tp->t_maxseg;
 	}
 
 	if (CC_ALGO(tp)->conn_init != NULL)
 		CC_ALGO(tp)->conn_init(tp->ccv);
 }
 
 void inline
 cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	switch(type) {
 	case CC_NDUPACK:
 		if (!IN_FASTRECOVERY(tp->t_flags)) {
 			tp->snd_recover = tp->snd_max;
 			if (tp->t_flags & TF_ECN_PERMIT)
 				tp->t_flags |= TF_ECN_SND_CWR;
 		}
 		break;
 	case CC_ECN:
 		if (!IN_CONGRECOVERY(tp->t_flags)) {
 			TCPSTAT_INC(tcps_ecn_rcwnd);
 			tp->snd_recover = tp->snd_max;
 			if (tp->t_flags & TF_ECN_PERMIT)
 				tp->t_flags |= TF_ECN_SND_CWR;
 		}
 		break;
 	case CC_RTO:
 		tp->t_dupacks = 0;
 		tp->t_bytes_acked = 0;
 		EXIT_RECOVERY(tp->t_flags);
 		tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
 		    tp->t_maxseg) * tp->t_maxseg;
 		tp->snd_cwnd = tp->t_maxseg;
 		break;
 	case CC_RTO_ERR:
 		TCPSTAT_INC(tcps_sndrexmitbad);
 		/* RTO was unnecessary, so reset everything. */
 		tp->snd_cwnd = tp->snd_cwnd_prev;
 		tp->snd_ssthresh = tp->snd_ssthresh_prev;
 		tp->snd_recover = tp->snd_recover_prev;
 		if (tp->t_flags & TF_WASFRECOVERY)
 			ENTER_FASTRECOVERY(tp->t_flags);
 		if (tp->t_flags & TF_WASCRECOVERY)
 			ENTER_CONGRECOVERY(tp->t_flags);
 		tp->snd_nxt = tp->snd_max;
 		tp->t_flags &= ~TF_PREVVALID;
 		tp->t_badrxtwin = 0;
 		break;
 	}
 
 	if (CC_ALGO(tp)->cong_signal != NULL) {
 		if (th != NULL)
 			tp->ccv->curack = th->th_ack;
 		CC_ALGO(tp)->cong_signal(tp->ccv, type);
 	}
 }
 
 static void inline
 cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/* XXXLAS: KASSERT that we're in recovery? */
 
 	if (CC_ALGO(tp)->post_recovery != NULL) {
 		tp->ccv->curack = th->th_ack;
 		CC_ALGO(tp)->post_recovery(tp->ccv);
 	}
 	/* XXXLAS: EXIT_RECOVERY ? */
 	tp->t_bytes_acked = 0;
 }
 
 #ifdef TCP_SIGNATURE
 static inline int
 tcp_signature_verify_input(struct mbuf *m, int off0, int tlen, int optlen,
     struct tcpopt *to, struct tcphdr *th, u_int tcpbflag)
 {
 	int ret;
 
 	tcp_fields_to_net(th);
 	ret = tcp_signature_verify(m, off0, tlen, optlen, to, th, tcpbflag);
 	tcp_fields_to_host(th);
 	return (ret);
 }
 #endif
 
 /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
 #ifdef INET6
 #define ND6_HINT(tp) \
 do { \
 	if ((tp) && (tp)->t_inpcb && \
 	    ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \
 		nd6_nud_hint(NULL, NULL, 0); \
 } while (0)
 #else
 #define ND6_HINT(tp)
 #endif
 
 /*
  * Indicate whether this ack should be delayed.  We can delay the ack if
  * following conditions are met:
  *	- There is no delayed ack timer in progress.
  *	- Our last ack wasn't a 0-sized window. We never want to delay
  *	  the ack that opens up a 0-sized window.
  *	- LRO wasn't used for this segment. We make sure by checking that the
  *	  segment size is not larger than the MSS.
  *	- Delayed acks are enabled or this is a half-synchronized T/TCP
  *	  connection.
  */
 #define DELAY_ACK(tp, tlen)						\
 	((!tcp_timer_active(tp, TT_DELACK) &&				\
 	    (tp->t_flags & TF_RXWIN0SENT) == 0) &&			\
 	    (tlen <= tp->t_maxopd) &&					\
 	    (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
 
 /*
  * TCP input handling is split into multiple parts:
  *   tcp6_input is a thin wrapper around tcp_input for the extended
  *	ip6_protox[] call format in ip6_input
  *   tcp_input handles primary segment validation, inpcb lookup and
  *	SYN processing on listen sockets
  *   tcp_do_segment processes the ACK and text of the segment for
  *	establishing, established and closing connections
  */
 #ifdef INET6
 int
 tcp6_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m = *mp;
 	struct in6_ifaddr *ia6;
 
 	IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);
 
 	/*
 	 * draft-itojun-ipv6-tcp-to-anycast
 	 * better place to put this in?
 	 */
 	ia6 = ip6_getdstifaddr(m);
 	if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) {
 		struct ip6_hdr *ip6;
 
 		ifa_free(&ia6->ia_ifa);
 		ip6 = mtod(m, struct ip6_hdr *);
 		icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
 			    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
 		return (IPPROTO_DONE);
 	}
 	if (ia6)
 		ifa_free(&ia6->ia_ifa);
 
 	return (tcp_input(mp, offp, proto));
 }
 #endif /* INET6 */
 
 int
 tcp_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m = *mp;
 	struct tcphdr *th = NULL;
 	struct ip *ip = NULL;
 	struct inpcb *inp = NULL;
 	struct tcpcb *tp = NULL;
 	struct socket *so = NULL;
 	u_char *optp = NULL;
 	int off0;
 	int optlen = 0;
 #ifdef INET
 	int len;
 #endif
 	int tlen = 0, off;
 	int drop_hdrlen;
 	int thflags;
 	int rstreason = 0;	/* For badport_bandlim accounting purposes */
 #ifdef TCP_SIGNATURE
 	uint8_t sig_checked = 0;
 #endif
 	uint8_t iptos = 0;
 	struct m_tag *fwd_tag = NULL;
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 	int isipv6;
 #else
 	const void *ip6 = NULL;
 #endif /* INET6 */
 	struct tcpopt to;		/* options in this segment */
 	char *s = NULL;			/* address and port logging */
 	int ti_locked;
 #define	TI_UNLOCKED	1
 #define	TI_WLOCKED	2
 
 #ifdef TCPDEBUG
 	/*
 	 * The size of tcp_saveipgen must be the size of the max ip header,
 	 * now IPv6.
 	 */
 	u_char tcp_saveipgen[IP6_HDR_LEN];
 	struct tcphdr tcp_savetcp;
 	short ostate = 0;
 #endif
 
 #ifdef INET6
 	isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
 #endif
 
 	off0 = *offp;
 	m = *mp;
 	*mp = NULL;
 	to.to_flags = 0;
 	TCPSTAT_INC(tcps_rcvtotal);
 
 #ifdef INET6
 	if (isipv6) {
 		/* IP6_EXTHDR_CHECK() is already done at tcp6_input(). */
 
 		if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
 			m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
 			if (m == NULL) {
 				TCPSTAT_INC(tcps_rcvshort);
 				return (IPPROTO_DONE);
 			}
 		}
 
 		ip6 = mtod(m, struct ip6_hdr *);
 		th = (struct tcphdr *)((caddr_t)ip6 + off0);
 		tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
 		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 				th->th_sum = m->m_pkthdr.csum_data;
 			else
 				th->th_sum = in6_cksum_pseudo(ip6, tlen,
 				    IPPROTO_TCP, m->m_pkthdr.csum_data);
 			th->th_sum ^= 0xffff;
 		} else
 			th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen);
 		if (th->th_sum) {
 			TCPSTAT_INC(tcps_rcvbadsum);
 			goto drop;
 		}
 
 		/*
 		 * Be proactive about unspecified IPv6 address in source.
 		 * As we use all-zero to indicate unbounded/unconnected pcb,
 		 * unspecified IPv6 address can be used to confuse us.
 		 *
 		 * Note that packets with unspecified IPv6 destination is
 		 * already dropped in ip6_input.
 		 */
 		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
 			/* XXX stat */
 			goto drop;
 		}
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		/*
 		 * Get IP and TCP header together in first mbuf.
 		 * Note: IP leaves IP header in first mbuf.
 		 */
 		if (off0 > sizeof (struct ip)) {
 			ip_stripoptions(m);
 			off0 = sizeof(struct ip);
 		}
 		if (m->m_len < sizeof (struct tcpiphdr)) {
 			if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
 			    == NULL) {
 				TCPSTAT_INC(tcps_rcvshort);
 				return (IPPROTO_DONE);
 			}
 		}
 		ip = mtod(m, struct ip *);
 		th = (struct tcphdr *)((caddr_t)ip + off0);
 		tlen = ntohs(ip->ip_len) - off0;
 
 		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 				th->th_sum = m->m_pkthdr.csum_data;
 			else
 				th->th_sum = in_pseudo(ip->ip_src.s_addr,
 				    ip->ip_dst.s_addr,
 				    htonl(m->m_pkthdr.csum_data + tlen +
 				    IPPROTO_TCP));
 			th->th_sum ^= 0xffff;
 		} else {
 			struct ipovly *ipov = (struct ipovly *)ip;
 
 			/*
 			 * Checksum extended TCP header and data.
 			 */
 			len = off0 + tlen;
 			bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
 			ipov->ih_len = htons(tlen);
 			th->th_sum = in_cksum(m, len);
 			/* Reset length for SDT probes. */
 			ip->ip_len = htons(tlen + off0);
 		}
 
 		if (th->th_sum) {
 			TCPSTAT_INC(tcps_rcvbadsum);
 			goto drop;
 		}
 		/* Re-initialization for later version check */
 		ip->ip_v = IPVERSION;
 	}
 #endif /* INET */
 
 #ifdef INET6
 	if (isipv6)
 		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 		iptos = ip->ip_tos;
 #endif
 
 	/*
 	 * Check that TCP offset makes sense,
 	 * pull out TCP options and adjust length.		XXX
 	 */
 	off = th->th_off << 2;
 	if (off < sizeof (struct tcphdr) || off > tlen) {
 		TCPSTAT_INC(tcps_rcvbadoff);
 		goto drop;
 	}
 	tlen -= off;	/* tlen is used instead of ti->ti_len */
 	if (off > sizeof (struct tcphdr)) {
 #ifdef INET6
 		if (isipv6) {
 			IP6_EXTHDR_CHECK(m, off0, off, IPPROTO_DONE);
 			ip6 = mtod(m, struct ip6_hdr *);
 			th = (struct tcphdr *)((caddr_t)ip6 + off0);
 		}
 #endif
 #if defined(INET) && defined(INET6)
 		else
 #endif
 #ifdef INET
 		{
 			if (m->m_len < sizeof(struct ip) + off) {
 				if ((m = m_pullup(m, sizeof (struct ip) + off))
 				    == NULL) {
 					TCPSTAT_INC(tcps_rcvshort);
 					return (IPPROTO_DONE);
 				}
 				ip = mtod(m, struct ip *);
 				th = (struct tcphdr *)((caddr_t)ip + off0);
 			}
 		}
 #endif
 		optlen = off - sizeof (struct tcphdr);
 		optp = (u_char *)(th + 1);
 	}
 	thflags = th->th_flags;
 
 	/*
 	 * Convert TCP protocol specific fields to host format.
 	 */
 	tcp_fields_to_host(th);
 
 	/*
 	 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options.
 	 */
 	drop_hdrlen = off0 + off;
 
 	/*
 	 * Locate pcb for segment; if we're likely to add or remove a
 	 * connection then first acquire pcbinfo lock.  There are three cases
 	 * where we might discover later we need a write lock despite the
 	 * flags: ACKs moving a connection out of the syncache, ACKs for a
 	 * connection in TIMEWAIT and SYNs not targeting a listening socket.
 	 */
 	if ((thflags & (TH_FIN | TH_RST)) != 0) {
 		INP_INFO_WLOCK(&V_tcbinfo);
 		ti_locked = TI_WLOCKED;
 	} else
 		ti_locked = TI_UNLOCKED;
 
 	/*
 	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
 	 */
         if (
 #ifdef INET6
 	    (isipv6 && (m->m_flags & M_IP6_NEXTHOP))
 #ifdef INET
 	    || (!isipv6 && (m->m_flags & M_IP_NEXTHOP))
 #endif
 #endif
 #if defined(INET) && !defined(INET6)
 	    (m->m_flags & M_IP_NEXTHOP)
 #endif
 	    )
 		fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
 
 findpcb:
 #ifdef INVARIANTS
 	if (ti_locked == TI_WLOCKED) {
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	} else {
 		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	}
 #endif
 #ifdef INET6
 	if (isipv6 && fwd_tag != NULL) {
 		struct sockaddr_in6 *next_hop6;
 
 		next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1);
 		/*
 		 * Transparently forwarded. Pretend to be the destination.
 		 * Already got one like this?
 		 */
 		inp = in6_pcblookup_mbuf(&V_tcbinfo,
 		    &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport,
 		    INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m);
 		if (!inp) {
 			/*
 			 * It's new.  Try to find the ambushing socket.
 			 * Because we've rewritten the destination address,
 			 * any hardware-generated hash is ignored.
 			 */
 			inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src,
 			    th->th_sport, &next_hop6->sin6_addr,
 			    next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) :
 			    th->th_dport, INPLOOKUP_WILDCARD |
 			    INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif);
 		}
 	} else if (isipv6) {
 		inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src,
 		    th->th_sport, &ip6->ip6_dst, th->th_dport,
 		    INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB,
 		    m->m_pkthdr.rcvif, m);
 	}
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	if (fwd_tag != NULL) {
 		struct sockaddr_in *next_hop;
 
 		next_hop = (struct sockaddr_in *)(fwd_tag+1);
 		/*
 		 * Transparently forwarded. Pretend to be the destination.
 		 * already got one like this?
 		 */
 		inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport,
 		    ip->ip_dst, th->th_dport, INPLOOKUP_WLOCKPCB,
 		    m->m_pkthdr.rcvif, m);
 		if (!inp) {
 			/*
 			 * It's new.  Try to find the ambushing socket.
 			 * Because we've rewritten the destination address,
 			 * any hardware-generated hash is ignored.
 			 */
 			inp = in_pcblookup(&V_tcbinfo, ip->ip_src,
 			    th->th_sport, next_hop->sin_addr,
 			    next_hop->sin_port ? ntohs(next_hop->sin_port) :
 			    th->th_dport, INPLOOKUP_WILDCARD |
 			    INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif);
 		}
 	} else
 		inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src,
 		    th->th_sport, ip->ip_dst, th->th_dport,
 		    INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB,
 		    m->m_pkthdr.rcvif, m);
 #endif /* INET */
 
 	/*
 	 * If the INPCB does not exist then all data in the incoming
 	 * segment is discarded and an appropriate RST is sent back.
 	 * XXX MRT Send RST using which routing table?
 	 */
 	if (inp == NULL) {
 		/*
 		 * Log communication attempts to ports that are not
 		 * in use.
 		 */
 		if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) ||
 		    tcp_log_in_vain == 2) {
 			if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6)))
 				log(LOG_INFO, "%s; %s: Connection attempt "
 				    "to closed port\n", s, __func__);
 		}
 		/*
 		 * When blackholing do not respond with a RST but
 		 * completely ignore the segment and drop it.
 		 */
 		if ((V_blackhole == 1 && (thflags & TH_SYN)) ||
 		    V_blackhole == 2)
 			goto dropunlock;
 
 		rstreason = BANDLIM_RST_CLOSEDPORT;
 		goto dropwithreset;
 	}
 	INP_WLOCK_ASSERT(inp);
 	if (!(inp->inp_flags & INP_HW_FLOWID)
 	    && (m->m_flags & M_FLOWID)
 	    && ((inp->inp_socket == NULL)
 		|| !(inp->inp_socket->so_options & SO_ACCEPTCONN))) {
 		inp->inp_flags |= INP_HW_FLOWID;
 		inp->inp_flags &= ~INP_SW_FLOWID;
 		inp->inp_flowid = m->m_pkthdr.flowid;
 		inp->inp_flowtype = M_HASHTYPE_GET(m);
 	}
 #ifdef IPSEC
 #ifdef INET6
 	if (isipv6 && ipsec6_in_reject(m, inp)) {
 		IPSEC6STAT_INC(ips_in_polvio);
 		goto dropunlock;
 	} else
 #endif /* INET6 */
 	if (ipsec4_in_reject(m, inp) != 0) {
 		IPSECSTAT_INC(ips_in_polvio);
 		goto dropunlock;
 	}
 #endif /* IPSEC */
 
 	/*
 	 * Check the minimum TTL for socket.
 	 */
 	if (inp->inp_ip_minttl != 0) {
 #ifdef INET6
 		if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim)
 			goto dropunlock;
 		else
 #endif
 		if (inp->inp_ip_minttl > ip->ip_ttl)
 			goto dropunlock;
 	}
 
 	/*
 	 * A previous connection in TIMEWAIT state is supposed to catch stray
 	 * or duplicate segments arriving late.  If this segment was a
 	 * legitimate new connection attempt, the old INPCB gets removed and
 	 * we can try again to find a listening socket.
 	 *
 	 * At this point, due to earlier optimism, we may hold only an inpcb
 	 * lock, and not the inpcbinfo write lock.  If so, we need to try to
 	 * acquire it, or if that fails, acquire a reference on the inpcb,
 	 * drop all locks, acquire a global write lock, and then re-acquire
 	 * the inpcb lock.  We may at that point discover that another thread
 	 * has tried to free the inpcb, in which case we need to loop back
 	 * and try to find a new inpcb to deliver to.
 	 *
 	 * XXXRW: It may be time to rethink timewait locking.
 	 */
 relocked:
 	if (inp->inp_flags & INP_TIMEWAIT) {
 		if (ti_locked == TI_UNLOCKED) {
 			if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) {
 				in_pcbref(inp);
 				INP_WUNLOCK(inp);
 				INP_INFO_WLOCK(&V_tcbinfo);
 				ti_locked = TI_WLOCKED;
 				INP_WLOCK(inp);
 				if (in_pcbrele_wlocked(inp)) {
 					inp = NULL;
 					goto findpcb;
 				}
 			} else
 				ti_locked = TI_WLOCKED;
 		}
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 
 		if (thflags & TH_SYN)
 			tcp_dooptions(&to, optp, optlen, TO_SYN);
 		/*
 		 * NB: tcp_twcheck unlocks the INP and frees the mbuf.
 		 */
 		if (tcp_twcheck(inp, &to, th, m, tlen))
 			goto findpcb;
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 		return (IPPROTO_DONE);
 	}
 	/*
 	 * The TCPCB may no longer exist if the connection is winding
 	 * down or it is in the CLOSED state.  Either way we drop the
 	 * segment and send an appropriate response.
 	 */
 	tp = intotcpcb(inp);
 	if (tp == NULL || tp->t_state == TCPS_CLOSED) {
 		rstreason = BANDLIM_RST_CLOSEDPORT;
 		goto dropwithreset;
 	}
 
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE) {
 		tcp_offload_input(tp, m);
 		m = NULL;	/* consumed by the TOE driver */
 		goto dropunlock;
 	}
 #endif
 
 	/*
 	 * We've identified a valid inpcb, but it could be that we need an
 	 * inpcbinfo write lock but don't hold it.  In this case, attempt to
 	 * acquire using the same strategy as the TIMEWAIT case above.  If we
 	 * relock, we have to jump back to 'relocked' as the connection might
 	 * now be in TIMEWAIT.
 	 */
 #ifdef INVARIANTS
 	if ((thflags & (TH_FIN | TH_RST)) != 0)
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 #endif
 	if (!((tp->t_state == TCPS_ESTABLISHED && (thflags & TH_SYN) == 0) ||
 	    (tp->t_state == TCPS_LISTEN && (thflags & TH_SYN)))) {
 		if (ti_locked == TI_UNLOCKED) {
 			if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) {
 				in_pcbref(inp);
 				INP_WUNLOCK(inp);
 				INP_INFO_WLOCK(&V_tcbinfo);
 				ti_locked = TI_WLOCKED;
 				INP_WLOCK(inp);
 				if (in_pcbrele_wlocked(inp)) {
 					inp = NULL;
 					goto findpcb;
 				}
 				goto relocked;
 			} else
 				ti_locked = TI_WLOCKED;
 		}
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	}
 
 #ifdef MAC
 	INP_WLOCK_ASSERT(inp);
 	if (mac_inpcb_check_deliver(inp, m))
 		goto dropunlock;
 #endif
 	so = inp->inp_socket;
 	KASSERT(so != NULL, ("%s: so == NULL", __func__));
 #ifdef TCPDEBUG
 	if (so->so_options & SO_DEBUG) {
 		ostate = tp->t_state;
 #ifdef INET6
 		if (isipv6) {
 			bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6));
 		} else
 #endif
 			bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
 		tcp_savetcp = *th;
 	}
 #endif /* TCPDEBUG */
 	/*
 	 * When the socket is accepting connections (the INPCB is in LISTEN
 	 * state) we look into the SYN cache if this is a new connection
 	 * attempt or the completion of a previous one.
 	 */
 	if (so->so_options & SO_ACCEPTCONN) {
 		struct in_conninfo inc;
 
 		KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but "
 		    "tp not listening", __func__));
 		bzero(&inc, sizeof(inc));
 #ifdef INET6
 		if (isipv6) {
 			inc.inc_flags |= INC_ISIPV6;
 			inc.inc6_faddr = ip6->ip6_src;
 			inc.inc6_laddr = ip6->ip6_dst;
 		} else
 #endif
 		{
 			inc.inc_faddr = ip->ip_src;
 			inc.inc_laddr = ip->ip_dst;
 		}
 		inc.inc_fport = th->th_sport;
 		inc.inc_lport = th->th_dport;
 		inc.inc_fibnum = so->so_fibnum;
 
 		/*
 		 * Check for an existing connection attempt in syncache if
 		 * the flag is only ACK.  A successful lookup creates a new
 		 * socket appended to the listen queue in SYN_RECEIVED state.
 		 */
 		if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
 
 			INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 			/*
 			 * Parse the TCP options here because
 			 * syncookies need access to the reflected
 			 * timestamp.
 			 */
 			tcp_dooptions(&to, optp, optlen, 0);
 			/*
 			 * NB: syncache_expand() doesn't unlock
 			 * inp and tcpinfo locks.
 			 */
 			if (!syncache_expand(&inc, &to, th, &so, m)) {
 				/*
 				 * No syncache entry or ACK was not
 				 * for our SYN/ACK.  Send a RST.
 				 * NB: syncache did its own logging
 				 * of the failure cause.
 				 */
 				rstreason = BANDLIM_RST_OPENPORT;
 				goto dropwithreset;
 			}
 			if (so == NULL) {
 				/*
 				 * We completed the 3-way handshake
 				 * but could not allocate a socket
 				 * either due to memory shortage,
 				 * listen queue length limits or
 				 * global socket limits.  Send RST
 				 * or wait and have the remote end
 				 * retransmit the ACK for another
 				 * try.
 				 */
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 					log(LOG_DEBUG, "%s; %s: Listen socket: "
 					    "Socket allocation failed due to "
 					    "limits or memory shortage, %s\n",
 					    s, __func__,
 					    V_tcp_sc_rst_sock_fail ?
 					    "sending RST" : "try again");
 				if (V_tcp_sc_rst_sock_fail) {
 					rstreason = BANDLIM_UNLIMITED;
 					goto dropwithreset;
 				} else
 					goto dropunlock;
 			}
 			/*
 			 * Socket is created in state SYN_RECEIVED.
 			 * Unlock the listen socket, lock the newly
 			 * created socket and update the tp variable.
 			 */
 			INP_WUNLOCK(inp);	/* listen socket */
 			inp = sotoinpcb(so);
 			INP_WLOCK(inp);		/* new connection */
 			tp = intotcpcb(inp);
 			KASSERT(tp->t_state == TCPS_SYN_RECEIVED,
 			    ("%s: ", __func__));
 #ifdef TCP_SIGNATURE
 			if (sig_checked == 0)  {
 				tcp_dooptions(&to, optp, optlen,
 				    (thflags & TH_SYN) ? TO_SYN : 0);
 				if (!tcp_signature_verify_input(m, off0, tlen,
 				    optlen, &to, th, tp->t_flags)) {
 
 					/*
 					 * In SYN_SENT state if it receives an
 					 * RST, it is allowed for further
 					 * processing.
 					 */
 					if ((thflags & TH_RST) == 0 ||
 					    (tp->t_state == TCPS_SYN_SENT) == 0)
 						goto dropunlock;
 				}
 				sig_checked = 1;
 			}
 #endif
 
 			/*
 			 * Process the segment and the data it
 			 * contains.  tcp_do_segment() consumes
 			 * the mbuf chain and unlocks the inpcb.
 			 */
 			tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
 			    iptos, ti_locked);
 			INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 			return (IPPROTO_DONE);
 		}
 		/*
 		 * Segment flag validation for new connection attempts:
 		 *
 		 * Our (SYN|ACK) response was rejected.
 		 * Check with syncache and remove entry to prevent
 		 * retransmits.
 		 *
 		 * NB: syncache_chkrst does its own logging of failure
 		 * causes.
 		 */
 		if (thflags & TH_RST) {
 			syncache_chkrst(&inc, th);
 			goto dropunlock;
 		}
 		/*
 		 * We can't do anything without SYN.
 		 */
 		if ((thflags & TH_SYN) == 0) {
 			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: Listen socket: "
 				    "SYN is missing, segment ignored\n",
 				    s, __func__);
 			TCPSTAT_INC(tcps_badsyn);
 			goto dropunlock;
 		}
 		/*
 		 * (SYN|ACK) is bogus on a listen socket.
 		 */
 		if (thflags & TH_ACK) {
 			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: Listen socket: "
 				    "SYN|ACK invalid, segment rejected\n",
 				    s, __func__);
 			syncache_badack(&inc);	/* XXX: Not needed! */
 			TCPSTAT_INC(tcps_badsyn);
 			rstreason = BANDLIM_RST_OPENPORT;
 			goto dropwithreset;
 		}
 		/*
 		 * If the drop_synfin option is enabled, drop all
 		 * segments with both the SYN and FIN bits set.
 		 * This prevents e.g. nmap from identifying the
 		 * TCP/IP stack.
 		 * XXX: Poor reasoning.  nmap has other methods
 		 * and is constantly refining its stack detection
 		 * strategies.
 		 * XXX: This is a violation of the TCP specification
 		 * and was used by RFC1644.
 		 */
 		if ((thflags & TH_FIN) && V_drop_synfin) {
 			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: Listen socket: "
 				    "SYN|FIN segment ignored (based on "
 				    "sysctl setting)\n", s, __func__);
 			TCPSTAT_INC(tcps_badsyn);
 			goto dropunlock;
 		}
 		/*
 		 * Segment's flags are (SYN) or (SYN|FIN).
 		 *
 		 * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored
 		 * as they do not affect the state of the TCP FSM.
 		 * The data pointed to by TH_URG and th_urp is ignored.
 		 */
 		KASSERT((thflags & (TH_RST|TH_ACK)) == 0,
 		    ("%s: Listen socket: TH_RST or TH_ACK set", __func__));
 		KASSERT(thflags & (TH_SYN),
 		    ("%s: Listen socket: TH_SYN not set", __func__));
 #ifdef INET6
 		/*
 		 * If deprecated address is forbidden,
 		 * we do not accept SYN to deprecated interface
 		 * address to prevent any new inbound connection from
 		 * getting established.
 		 * When we do not accept SYN, we send a TCP RST,
 		 * with deprecated source address (instead of dropping
 		 * it).  We compromise it as it is much better for peer
 		 * to send a RST, and RST will be the final packet
 		 * for the exchange.
 		 *
 		 * If we do not forbid deprecated addresses, we accept
 		 * the SYN packet.  RFC2462 does not suggest dropping
 		 * SYN in this case.
 		 * If we decipher RFC2462 5.5.4, it says like this:
 		 * 1. use of deprecated addr with existing
 		 *    communication is okay - "SHOULD continue to be
 		 *    used"
 		 * 2. use of it with new communication:
 		 *   (2a) "SHOULD NOT be used if alternate address
 		 *        with sufficient scope is available"
 		 *   (2b) nothing mentioned otherwise.
 		 * Here we fall into (2b) case as we have no choice in
 		 * our source address selection - we must obey the peer.
 		 *
 		 * The wording in RFC2462 is confusing, and there are
 		 * multiple description text for deprecated address
 		 * handling - worse, they are not exactly the same.
 		 * I believe 5.5.4 is the best one, so we follow 5.5.4.
 		 */
 		if (isipv6 && !V_ip6_use_deprecated) {
 			struct in6_ifaddr *ia6;
 
 			ia6 = ip6_getdstifaddr(m);
 			if (ia6 != NULL &&
 			    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
 				ifa_free(&ia6->ia_ifa);
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt to deprecated "
 					"IPv6 address rejected\n",
 					s, __func__);
 				rstreason = BANDLIM_RST_OPENPORT;
 				goto dropwithreset;
 			}
 			if (ia6)
 				ifa_free(&ia6->ia_ifa);
 		}
 #endif /* INET6 */
 		/*
 		 * Basic sanity checks on incoming SYN requests:
 		 *   Don't respond if the destination is a link layer
 		 *	broadcast according to RFC1122 4.2.3.10, p. 104.
 		 *   If it is from this socket it must be forged.
 		 *   Don't respond if the source or destination is a
 		 *	global or subnet broad- or multicast address.
 		 *   Note that it is quite possible to receive unicast
 		 *	link-layer packets with a broadcast IP address. Use
 		 *	in_broadcast() to find them.
 		 */
 		if (m->m_flags & (M_BCAST|M_MCAST)) {
 			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 			    log(LOG_DEBUG, "%s; %s: Listen socket: "
 				"Connection attempt from broad- or multicast "
 				"link layer address ignored\n", s, __func__);
 			goto dropunlock;
 		}
 #ifdef INET6
 		if (isipv6) {
 			if (th->th_dport == th->th_sport &&
 			    IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) {
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt to/from self "
 					"ignored\n", s, __func__);
 				goto dropunlock;
 			}
 			if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
 			    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt from/to multicast "
 					"address ignored\n", s, __func__);
 				goto dropunlock;
 			}
 		}
 #endif
 #if defined(INET) && defined(INET6)
 		else
 #endif
 #ifdef INET
 		{
 			if (th->th_dport == th->th_sport &&
 			    ip->ip_dst.s_addr == ip->ip_src.s_addr) {
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt from/to self "
 					"ignored\n", s, __func__);
 				goto dropunlock;
 			}
 			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
 			    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
 			    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
 			    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt from/to broad- "
 					"or multicast address ignored\n",
 					s, __func__);
 				goto dropunlock;
 			}
 		}
 #endif
 		/*
 		 * SYN appears to be valid.  Create compressed TCP state
 		 * for syncache.
 		 */
 #ifdef TCPDEBUG
 		if (so->so_options & SO_DEBUG)
 			tcp_trace(TA_INPUT, ostate, tp,
 			    (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
 		tcp_dooptions(&to, optp, optlen, TO_SYN);
 		syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL);
 		/*
 		 * Entry added to syncache and mbuf consumed.
 		 * Only the listen socket is unlocked by syncache_add().
 		 */
 		if (ti_locked == TI_WLOCKED) {
 			INP_INFO_WUNLOCK(&V_tcbinfo);
 			ti_locked = TI_UNLOCKED;
 		}
 		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 		return (IPPROTO_DONE);
 	} else if (tp->t_state == TCPS_LISTEN) {
 		/*
 		 * When a listen socket is torn down the SO_ACCEPTCONN
 		 * flag is removed first while connections are drained
 		 * from the accept queue in a unlock/lock cycle of the
 		 * ACCEPT_LOCK, opening a race condition allowing a SYN
 		 * attempt go through unhandled.
 		 */
 		goto dropunlock;
 	}
 
 #ifdef TCP_SIGNATURE
 	if (sig_checked == 0)  {
 		tcp_dooptions(&to, optp, optlen,
 		    (thflags & TH_SYN) ? TO_SYN : 0);
 		if (!tcp_signature_verify_input(m, off0, tlen, optlen, &to,
 		    th, tp->t_flags)) {
 
 			/*
 			 * In SYN_SENT state if it receives an RST, it is
 			 * allowed for further processing.
 			 */
 			if ((thflags & TH_RST) == 0 ||
 			    (tp->t_state == TCPS_SYN_SENT) == 0)
 				goto dropunlock;
 		}
 		sig_checked = 1;
 	}
 #endif
 
 	TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th);
 
 	/*
 	 * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later
 	 * state.  tcp_do_segment() always consumes the mbuf chain, unlocks
 	 * the inpcb, and unlocks pcbinfo.
 	 */
 	tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked);
 	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	return (IPPROTO_DONE);
 
 dropwithreset:
 	TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th);
 
 	if (ti_locked == TI_WLOCKED) {
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 		ti_locked = TI_UNLOCKED;
 	}
 #ifdef INVARIANTS
 	else {
 		KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropwithreset "
 		    "ti_locked: %d", __func__, ti_locked));
 		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	}
 #endif
 
 	if (inp != NULL) {
 		tcp_dropwithreset(m, th, tp, tlen, rstreason);
 		INP_WUNLOCK(inp);
 	} else
 		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
 	m = NULL;	/* mbuf chain got consumed. */
 	goto drop;
 
 dropunlock:
 	if (m != NULL)
 		TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th);
 
 	if (ti_locked == TI_WLOCKED) {
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 		ti_locked = TI_UNLOCKED;
 	}
 #ifdef INVARIANTS
 	else {
 		KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropunlock "
 		    "ti_locked: %d", __func__, ti_locked));
 		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	}
 #endif
 
 	if (inp != NULL)
 		INP_WUNLOCK(inp);
 
 drop:
 	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	if (s != NULL)
 		free(s, M_TCPLOG);
 	if (m != NULL)
 		m_freem(m);
 	return (IPPROTO_DONE);
 }
 
 static void
 tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
     int ti_locked)
 {
 	int thflags, acked, ourfinisacked, needoutput = 0;
 	int rstreason, todrop, win;
 	u_long tiwin;
 	char *s;
 	struct in_conninfo *inc;
 	struct mbuf *mfree;
 	struct tcpopt to;
 
 #ifdef TCPDEBUG
 	/*
 	 * The size of tcp_saveipgen must be the size of the max ip header,
 	 * now IPv6.
 	 */
 	u_char tcp_saveipgen[IP6_HDR_LEN];
 	struct tcphdr tcp_savetcp;
 	short ostate = 0;
 #endif
 	thflags = th->th_flags;
 	inc = &tp->t_inpcb->inp_inc;
 	tp->sackhint.last_sack_ack = 0;
 
 	/*
 	 * If this is either a state-changing packet or current state isn't
 	 * established, we require a write lock on tcbinfo.  Otherwise, we
 	 * allow the tcbinfo to be in either alocked or unlocked, as the
 	 * caller may have unnecessarily acquired a write lock due to a race.
 	 */
 	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
 	    tp->t_state != TCPS_ESTABLISHED) {
 		KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for "
 		    "SYN/FIN/RST/!EST", __func__, ti_locked));
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	} else {
 #ifdef INVARIANTS
 		if (ti_locked == TI_WLOCKED)
 			INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 		else {
 			KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
 			    "ti_locked: %d", __func__, ti_locked));
 			INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 		}
 #endif
 	}
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
 	    __func__));
 	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
 	    __func__));
 
 	/*
 	 * Segment received on connection.
 	 * Reset idle time and keep-alive timer.
 	 * XXX: This should be done after segment
 	 * validation to ignore broken/spoofed segs.
 	 */
 	tp->t_rcvtime = ticks;
 	if (TCPS_HAVEESTABLISHED(tp->t_state))
 		tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
 
 	/*
 	 * Unscale the window into a 32-bit value.
 	 * For the SYN_SENT state the scale is zero.
 	 */
 	tiwin = th->th_win << tp->snd_scale;
 
 	/*
 	 * TCP ECN processing.
 	 */
 	if (tp->t_flags & TF_ECN_PERMIT) {
 		if (thflags & TH_CWR)
 			tp->t_flags &= ~TF_ECN_SND_ECE;
 		switch (iptos & IPTOS_ECN_MASK) {
 		case IPTOS_ECN_CE:
 			tp->t_flags |= TF_ECN_SND_ECE;
 			TCPSTAT_INC(tcps_ecn_ce);
 			break;
 		case IPTOS_ECN_ECT0:
 			TCPSTAT_INC(tcps_ecn_ect0);
 			break;
 		case IPTOS_ECN_ECT1:
 			TCPSTAT_INC(tcps_ecn_ect1);
 			break;
 		}
 		/* Congestion experienced. */
 		if (thflags & TH_ECE) {
 			cc_cong_signal(tp, th, CC_ECN);
 		}
 	}
 
 	/*
 	 * Parse options on any incoming segment.
 	 */
 	tcp_dooptions(&to, (u_char *)(th + 1),
 	    (th->th_off << 2) - sizeof(struct tcphdr),
 	    (thflags & TH_SYN) ? TO_SYN : 0);
 
 	/*
 	 * If echoed timestamp is later than the current time,
 	 * fall back to non RFC1323 RTT calculation.  Normalize
 	 * timestamp if syncookies were used when this connection
 	 * was established.
 	 */
 	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
 		to.to_tsecr -= tp->ts_offset;
 		if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
 			to.to_tsecr = 0;
 	}
 	/*
 	 * If timestamps were negotiated during SYN/ACK they should
 	 * appear on every segment during this session and vice versa.
 	 */
 	if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: Timestamp missing, "
 			    "no action\n", s, __func__);
 			free(s, M_TCPLOG);
 		}
 	}
 	if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
 			    "no action\n", s, __func__);
 			free(s, M_TCPLOG);
 		}
 	}
 
 	/*
 	 * Process options only when we get SYN/ACK back. The SYN case
 	 * for incoming connections is handled in tcp_syncache.
 	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
 	 * or <SYN,ACK>) segment itself is never scaled.
 	 * XXX this is traditional behavior, may need to be cleaned up.
 	 */
 	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
 		if ((to.to_flags & TOF_SCALE) &&
 		    (tp->t_flags & TF_REQ_SCALE)) {
 			tp->t_flags |= TF_RCVD_SCALE;
 			tp->snd_scale = to.to_wscale;
 		}
 		/*
 		 * Initial send window.  It will be updated with
 		 * the next incoming segment to the scaled value.
 		 */
 		tp->snd_wnd = th->th_win;
 		if (to.to_flags & TOF_TS) {
 			tp->t_flags |= TF_RCVD_TSTMP;
 			tp->ts_recent = to.to_tsval;
 			tp->ts_recent_age = tcp_ts_getticks();
 		}
 		if (to.to_flags & TOF_MSS)
 			tcp_mss(tp, to.to_mss);
 		if ((tp->t_flags & TF_SACK_PERMIT) &&
 		    (to.to_flags & TOF_SACKPERM) == 0)
 			tp->t_flags &= ~TF_SACK_PERMIT;
 	}
 
 	/*
 	 * Header prediction: check for the two common cases
 	 * of a uni-directional data xfer.  If the packet has
 	 * no control flags, is in-sequence, the window didn't
 	 * change and we're not retransmitting, it's a
 	 * candidate.  If the length is zero and the ack moved
 	 * forward, we're the sender side of the xfer.  Just
 	 * free the data acked & wake any higher level process
 	 * that was blocked waiting for space.  If the length
 	 * is non-zero and the ack didn't move, we're the
 	 * receiver side.  If we're getting packets in-order
 	 * (the reassembly queue is empty), add the data to
 	 * the socket buffer and note that we need a delayed ack.
 	 * Make sure that the hidden state-flags are also off.
 	 * Since we check for TCPS_ESTABLISHED first, it can only
 	 * be TH_NEEDSYN.
 	 */
 	if (tp->t_state == TCPS_ESTABLISHED &&
 	    th->th_seq == tp->rcv_nxt &&
 	    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
 	    tp->snd_nxt == tp->snd_max &&
 	    tiwin && tiwin == tp->snd_wnd && 
 	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
 	    tp->t_segq == NULL && ((to.to_flags & TOF_TS) == 0 ||
 	     TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
 
 		/*
 		 * If last ACK falls within this segment's sequence numbers,
 		 * record the timestamp.
 		 * NOTE that the test is modified according to the latest
 		 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
 		 */
 		if ((to.to_flags & TOF_TS) != 0 &&
 		    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
 			tp->ts_recent_age = tcp_ts_getticks();
 			tp->ts_recent = to.to_tsval;
 		}
 
 		if (tlen == 0) {
 			if (SEQ_GT(th->th_ack, tp->snd_una) &&
 			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
 			    !IN_RECOVERY(tp->t_flags) &&
 			    (to.to_flags & TOF_SACK) == 0 &&
 			    TAILQ_EMPTY(&tp->snd_holes)) {
 				/*
 				 * This is a pure ack for outstanding data.
 				 */
 				if (ti_locked == TI_WLOCKED)
 					INP_INFO_WUNLOCK(&V_tcbinfo);
 				ti_locked = TI_UNLOCKED;
 
 				TCPSTAT_INC(tcps_predack);
 
 				/*
 				 * "bad retransmit" recovery.
 				 */
 				if (tp->t_rxtshift == 1 &&
 				    tp->t_flags & TF_PREVVALID &&
 				    (int)(ticks - tp->t_badrxtwin) < 0) {
 					cc_cong_signal(tp, th, CC_RTO_ERR);
 				}
 
 				/*
 				 * Recalculate the transmit timer / rtt.
 				 *
 				 * Some boxes send broken timestamp replies
 				 * during the SYN+ACK phase, ignore
 				 * timestamps of 0 or we could calculate a
 				 * huge RTT and blow up the retransmit timer.
 				 */
 				if ((to.to_flags & TOF_TS) != 0 &&
 				    to.to_tsecr) {
 					u_int t;
 
 					t = tcp_ts_getticks() - to.to_tsecr;
 					if (!tp->t_rttlow || tp->t_rttlow > t)
 						tp->t_rttlow = t;
 					tcp_xmit_timer(tp,
 					    TCP_TS_TO_TICKS(t) + 1);
 				} else if (tp->t_rtttime &&
 				    SEQ_GT(th->th_ack, tp->t_rtseq)) {
 					if (!tp->t_rttlow ||
 					    tp->t_rttlow > ticks - tp->t_rtttime)
 						tp->t_rttlow = ticks - tp->t_rtttime;
 					tcp_xmit_timer(tp,
 							ticks - tp->t_rtttime);
 				}
 				acked = BYTES_THIS_ACK(tp, th);
 
 				/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
 				hhook_run_tcp_est_in(tp, th, &to);
 
 				TCPSTAT_INC(tcps_rcvackpack);
 				TCPSTAT_ADD(tcps_rcvackbyte, acked);
 				sbdrop(&so->so_snd, acked);
 				if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
 				    SEQ_LEQ(th->th_ack, tp->snd_recover))
 					tp->snd_recover = th->th_ack - 1;
 				
 				/*
 				 * Let the congestion control algorithm update
 				 * congestion control related information. This
 				 * typically means increasing the congestion
 				 * window.
 				 */
 				cc_ack_received(tp, th, CC_ACK);
 
 				tp->snd_una = th->th_ack;
 				/*
 				 * Pull snd_wl2 up to prevent seq wrap relative
 				 * to th_ack.
 				 */
 				tp->snd_wl2 = th->th_ack;
 				tp->t_dupacks = 0;
 				m_freem(m);
 				ND6_HINT(tp); /* Some progress has been made. */
 
 				/*
 				 * If all outstanding data are acked, stop
 				 * retransmit timer, otherwise restart timer
 				 * using current (possibly backed-off) value.
 				 * If process is waiting for space,
 				 * wakeup/selwakeup/signal.  If data
 				 * are ready to send, let tcp_output
 				 * decide between more output or persist.
 				 */
 #ifdef TCPDEBUG
 				if (so->so_options & SO_DEBUG)
 					tcp_trace(TA_INPUT, ostate, tp,
 					    (void *)tcp_saveipgen,
 					    &tcp_savetcp, 0);
 #endif
 				if (tp->snd_una == tp->snd_max)
 					tcp_timer_activate(tp, TT_REXMT, 0);
 				else if (!tcp_timer_active(tp, TT_PERSIST))
 					tcp_timer_activate(tp, TT_REXMT,
 						      tp->t_rxtcur);
 				sowwakeup(so);
 				if (so->so_snd.sb_cc)
 					(void) tcp_output(tp);
 				goto check_delack;
 			}
 		} else if (th->th_ack == tp->snd_una &&
 		    tlen <= sbspace(&so->so_rcv)) {
 			int newsize = 0;	/* automatic sockbuf scaling */
 
 			/*
 			 * This is a pure, in-sequence data packet with
 			 * nothing on the reassembly queue and we have enough
 			 * buffer space to take it.
 			 */
 			if (ti_locked == TI_WLOCKED)
 				INP_INFO_WUNLOCK(&V_tcbinfo);
 			ti_locked = TI_UNLOCKED;
 
 			/* Clean receiver SACK report if present */
 			if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
 				tcp_clean_sackreport(tp);
 			TCPSTAT_INC(tcps_preddat);
 			tp->rcv_nxt += tlen;
 			/*
 			 * Pull snd_wl1 up to prevent seq wrap relative to
 			 * th_seq.
 			 */
 			tp->snd_wl1 = th->th_seq;
 			/*
 			 * Pull rcv_up up to prevent seq wrap relative to
 			 * rcv_nxt.
 			 */
 			tp->rcv_up = tp->rcv_nxt;
 			TCPSTAT_INC(tcps_rcvpack);
 			TCPSTAT_ADD(tcps_rcvbyte, tlen);
 			ND6_HINT(tp);	/* Some progress has been made */
 #ifdef TCPDEBUG
 			if (so->so_options & SO_DEBUG)
 				tcp_trace(TA_INPUT, ostate, tp,
 				    (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
 		/*
 		 * Automatic sizing of receive socket buffer.  Often the send
 		 * buffer size is not optimally adjusted to the actual network
 		 * conditions at hand (delay bandwidth product).  Setting the
 		 * buffer size too small limits throughput on links with high
 		 * bandwidth and high delay (eg. trans-continental/oceanic links).
 		 *
 		 * On the receive side the socket buffer memory is only rarely
 		 * used to any significant extent.  This allows us to be much
 		 * more aggressive in scaling the receive socket buffer.  For
 		 * the case that the buffer space is actually used to a large
 		 * extent and we run out of kernel memory we can simply drop
 		 * the new segments; TCP on the sender will just retransmit it
 		 * later.  Setting the buffer size too big may only consume too
 		 * much kernel memory if the application doesn't read() from
 		 * the socket or packet loss or reordering makes use of the
 		 * reassembly queue.
 		 *
 		 * The criteria to step up the receive buffer one notch are:
 		 *  1. Application has not set receive buffer size with
 		 *     SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
 		 *  2. the number of bytes received during the time it takes
 		 *     one timestamp to be reflected back to us (the RTT);
 		 *  3. received bytes per RTT is within seven eighth of the
 		 *     current socket buffer size;
 		 *  4. receive buffer size has not hit maximal automatic size;
 		 *
 		 * This algorithm does one step per RTT at most and only if
 		 * we receive a bulk stream w/o packet losses or reorderings.
 		 * Shrinking the buffer during idle times is not necessary as
 		 * it doesn't consume any memory when idle.
 		 *
 		 * TODO: Only step up if the application is actually serving
 		 * the buffer to better manage the socket buffer resources.
 		 */
 			if (V_tcp_do_autorcvbuf &&
 			    to.to_tsecr &&
 			    (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
 				if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) &&
 				    to.to_tsecr - tp->rfbuf_ts < hz) {
 					if (tp->rfbuf_cnt >
 					    (so->so_rcv.sb_hiwat / 8 * 7) &&
 					    so->so_rcv.sb_hiwat <
 					    V_tcp_autorcvbuf_max) {
 						newsize =
 						    min(so->so_rcv.sb_hiwat +
 						    V_tcp_autorcvbuf_inc,
 						    V_tcp_autorcvbuf_max);
 					}
 					/* Start over with next RTT. */
 					tp->rfbuf_ts = 0;
 					tp->rfbuf_cnt = 0;
 				} else
 					tp->rfbuf_cnt += tlen;	/* add up */
 			}
 
 			/* Add data to socket buffer. */
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 				m_freem(m);
 			} else {
 				/*
 				 * Set new socket buffer size.
 				 * Give up when limit is reached.
 				 */
 				if (newsize)
 					if (!sbreserve_locked(&so->so_rcv,
 					    newsize, so, NULL))
 						so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
 				m_adj(m, drop_hdrlen);	/* delayed header drop */
 				sbappendstream_locked(&so->so_rcv, m);
 			}
 			/* NB: sorwakeup_locked() does an implicit unlock. */
 			sorwakeup_locked(so);
 			if (DELAY_ACK(tp, tlen)) {
 				tp->t_flags |= TF_DELACK;
 			} else {
 				tp->t_flags |= TF_ACKNOW;
 				tcp_output(tp);
 			}
 			goto check_delack;
 		}
 	}
 
 	/*
 	 * Calculate amount of space in receive window,
 	 * and then do TCP input processing.
 	 * Receive window is amount of space in rcv queue,
 	 * but not less than advertised window.
 	 */
 	win = sbspace(&so->so_rcv);
 	if (win < 0)
 		win = 0;
 	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 
 	/* Reset receive buffer auto scaling when not in bulk receive mode. */
 	tp->rfbuf_ts = 0;
 	tp->rfbuf_cnt = 0;
 
 	switch (tp->t_state) {
 
 	/*
 	 * If the state is SYN_RECEIVED:
 	 *	if seg contains an ACK, but not for our SYN/ACK, send a RST.
 	 */
 	case TCPS_SYN_RECEIVED:
 		if ((thflags & TH_ACK) &&
 		    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
 		     SEQ_GT(th->th_ack, tp->snd_max))) {
 				rstreason = BANDLIM_RST_OPENPORT;
 				goto dropwithreset;
 		}
 		break;
 
 	/*
 	 * If the state is SYN_SENT:
 	 *	if seg contains an ACK, but not for our SYN, drop the input.
 	 *	if seg contains a RST, then drop the connection.
 	 *	if seg does not contain SYN, then drop it.
 	 * Otherwise this is an acceptable SYN segment
 	 *	initialize tp->rcv_nxt and tp->irs
 	 *	if seg contains ack then advance tp->snd_una
 	 *	if seg contains an ECE and ECN support is enabled, the stream
 	 *	    is ECN capable.
 	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
 	 *	arrange for segment to be acked (eventually)
 	 *	continue processing rest of data/controls, beginning with URG
 	 */
 	case TCPS_SYN_SENT:
 		if ((thflags & TH_ACK) &&
 		    (SEQ_LEQ(th->th_ack, tp->iss) ||
 		     SEQ_GT(th->th_ack, tp->snd_max))) {
 			rstreason = BANDLIM_UNLIMITED;
 			goto dropwithreset;
 		}
 		if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) {
 			TCP_PROBE5(connect__refused, NULL, tp,
 			    mtod(m, const char *), tp, th);
 			tp = tcp_drop(tp, ECONNREFUSED);
 		}
 		if (thflags & TH_RST)
 			goto drop;
 		if (!(thflags & TH_SYN))
 			goto drop;
 
 		tp->irs = th->th_seq;
 		tcp_rcvseqinit(tp);
 		if (thflags & TH_ACK) {
 			TCPSTAT_INC(tcps_connects);
 			soisconnected(so);
 #ifdef MAC
 			mac_socketpeer_set_from_mbuf(m, so);
 #endif
 			/* Do window scaling on this connection? */
 			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
 				tp->rcv_scale = tp->request_r_scale;
 			}
 			tp->rcv_adv += imin(tp->rcv_wnd,
 			    TCP_MAXWIN << tp->rcv_scale);
 			tp->snd_una++;		/* SYN is acked */
 			/*
 			 * If there's data, delay ACK; if there's also a FIN
 			 * ACKNOW will be turned on later.
 			 */
 			if (DELAY_ACK(tp, tlen) && tlen != 0)
 				tcp_timer_activate(tp, TT_DELACK,
 				    tcp_delacktime);
 			else
 				tp->t_flags |= TF_ACKNOW;
 
 			if ((thflags & TH_ECE) && V_tcp_do_ecn) {
 				tp->t_flags |= TF_ECN_PERMIT;
 				TCPSTAT_INC(tcps_ecn_shs);
 			}
 			
 			/*
 			 * Received <SYN,ACK> in SYN_SENT[*] state.
 			 * Transitions:
 			 *	SYN_SENT  --> ESTABLISHED
 			 *	SYN_SENT* --> FIN_WAIT_1
 			 */
 			tp->t_starttime = ticks;
 			if (tp->t_flags & TF_NEEDFIN) {
 				tcp_state_change(tp, TCPS_FIN_WAIT_1);
 				tp->t_flags &= ~TF_NEEDFIN;
 				thflags &= ~TH_SYN;
 			} else {
 				tcp_state_change(tp, TCPS_ESTABLISHED);
 				TCP_PROBE5(connect__established, NULL, tp,
 				    mtod(m, const char *), tp, th);
 				cc_conn_init(tp);
 				tcp_timer_activate(tp, TT_KEEP,
 				    TP_KEEPIDLE(tp));
 			}
 		} else {
 			/*
 			 * Received initial SYN in SYN-SENT[*] state =>
 			 * simultaneous open.
 			 * If it succeeds, connection is * half-synchronized.
 			 * Otherwise, do 3-way handshake:
 			 *        SYN-SENT -> SYN-RECEIVED
 			 *        SYN-SENT* -> SYN-RECEIVED*
 			 */
 			tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
 			tcp_timer_activate(tp, TT_REXMT, 0);
 			tcp_state_change(tp, TCPS_SYN_RECEIVED);
 		}
 
 		KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: "
 		    "ti_locked %d", __func__, ti_locked));
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 		INP_WLOCK_ASSERT(tp->t_inpcb);
 
 		/*
 		 * Advance th->th_seq to correspond to first data byte.
 		 * If data, trim to stay within window,
 		 * dropping FIN if necessary.
 		 */
 		th->th_seq++;
 		if (tlen > tp->rcv_wnd) {
 			todrop = tlen - tp->rcv_wnd;
 			m_adj(m, -todrop);
 			tlen = tp->rcv_wnd;
 			thflags &= ~TH_FIN;
 			TCPSTAT_INC(tcps_rcvpackafterwin);
 			TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
 		}
 		tp->snd_wl1 = th->th_seq - 1;
 		tp->rcv_up = th->th_seq;
 		/*
 		 * Client side of transaction: already sent SYN and data.
 		 * If the remote host used T/TCP to validate the SYN,
 		 * our data will be ACK'd; if so, enter normal data segment
 		 * processing in the middle of step 5, ack processing.
 		 * Otherwise, goto step 6.
 		 */
 		if (thflags & TH_ACK)
 			goto process_ACK;
 
 		goto step6;
 
 	/*
 	 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
 	 *      do normal processing.
 	 *
 	 * NB: Leftover from RFC1644 T/TCP.  Cases to be reused later.
 	 */
 	case TCPS_LAST_ACK:
 	case TCPS_CLOSING:
 		break;  /* continue normal processing */
 	}
 
 	/*
 	 * States other than LISTEN or SYN_SENT.
 	 * First check the RST flag and sequence number since reset segments
 	 * are exempt from the timestamp and connection count tests.  This
 	 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
 	 * below which allowed reset segments in half the sequence space
 	 * to fall though and be processed (which gives forged reset
 	 * segments with a random sequence number a 50 percent chance of
 	 * killing a connection).
 	 * Then check timestamp, if present.
 	 * Then check the connection count, if present.
 	 * Then check that at least some bytes of segment are within
 	 * receive window.  If segment begins before rcv_nxt,
 	 * drop leading data (and SYN); if nothing left, just ack.
 	 */
 	if (thflags & TH_RST) {
 		/*
 		 * RFC5961 Section 3.2
 		 *
 		 * - RST drops connection only if SEG.SEQ == RCV.NXT.
 		 * - If RST is in window, we send challenge ACK.
 		 *
 		 * Note: to take into account delayed ACKs, we should
 		 *   test against last_ack_sent instead of rcv_nxt.
 		 * Note 2: we handle special case of closed window, not
 		 *   covered by the RFC.
 		 */
 		if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
 		    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
 		    (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
 
 			INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 			KASSERT(ti_locked == TI_WLOCKED,
 			    ("%s: TH_RST ti_locked %d, th %p tp %p",
 			    __func__, ti_locked, th, tp));
 			KASSERT(tp->t_state != TCPS_SYN_SENT,
 			    ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
 			    __func__, th, tp));
 
 			if (V_tcp_insecure_rst ||
 			    tp->last_ack_sent == th->th_seq) {
 				TCPSTAT_INC(tcps_drops);
 				/* Drop the connection. */
 				switch (tp->t_state) {
 				case TCPS_SYN_RECEIVED:
 					so->so_error = ECONNREFUSED;
 					goto close;
 				case TCPS_ESTABLISHED:
 				case TCPS_FIN_WAIT_1:
 				case TCPS_FIN_WAIT_2:
 				case TCPS_CLOSE_WAIT:
 					so->so_error = ECONNRESET;
 				close:
 					tcp_state_change(tp, TCPS_CLOSED);
 					/* FALLTHROUGH */
 				default:
 					tp = tcp_close(tp);
 				}
 			} else {
 				TCPSTAT_INC(tcps_badrst);
 				/* Send challenge ACK. */
 				tcp_respond(tp, mtod(m, void *), th, m,
 				    tp->rcv_nxt, tp->snd_nxt, TH_ACK);
 				tp->last_ack_sent = tp->rcv_nxt;
 				m = NULL;
 			}
 		}
 		goto drop;
 	}
 
 	/*
 	 * RFC5961 Section 4.2
 	 * Send challenge ACK for any SYN in synchronized state.
 	 */
 	if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT) {
 		KASSERT(ti_locked == TI_WLOCKED,
 		    ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked));
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 
 		TCPSTAT_INC(tcps_badsyn);
 		if (V_tcp_insecure_syn &&
 		    SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
 		    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
 			tp = tcp_drop(tp, ECONNRESET);
 			rstreason = BANDLIM_UNLIMITED;
 		} else {
 			/* Send challenge ACK. */
 			tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
 			    tp->snd_nxt, TH_ACK);
 			tp->last_ack_sent = tp->rcv_nxt;
 			m = NULL;
 		}
 		goto drop;
 	}
 
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
 	 * and it's less than ts_recent, drop it.
 	 */
 	if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to.to_tsval, tp->ts_recent)) {
 
 		/* Check to see if ts_recent is over 24 days old.  */
 		if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
 			/*
 			 * Invalidate ts_recent.  If this segment updates
 			 * ts_recent, the age will be reset later and ts_recent
 			 * will get a valid value.  If it does not, setting
 			 * ts_recent to zero will at least satisfy the
 			 * requirement that zero be placed in the timestamp
 			 * echo reply when ts_recent isn't valid.  The
 			 * age isn't reset until we get a valid ts_recent
 			 * because we don't want out-of-order segments to be
 			 * dropped when ts_recent is old.
 			 */
 			tp->ts_recent = 0;
 		} else {
 			TCPSTAT_INC(tcps_rcvduppack);
 			TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
 			TCPSTAT_INC(tcps_pawsdrop);
 			if (tlen)
 				goto dropafterack;
 			goto drop;
 		}
 	}
 
 	/*
 	 * In the SYN-RECEIVED state, validate that the packet belongs to
 	 * this connection before trimming the data to fit the receive
 	 * window.  Check the sequence number versus IRS since we know
 	 * the sequence numbers haven't wrapped.  This is a partial fix
 	 * for the "LAND" DoS attack.
 	 */
 	if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
 		rstreason = BANDLIM_RST_OPENPORT;
 		goto dropwithreset;
 	}
 
 	todrop = tp->rcv_nxt - th->th_seq;
 	if (todrop > 0) {
 		if (thflags & TH_SYN) {
 			thflags &= ~TH_SYN;
 			th->th_seq++;
 			if (th->th_urp > 1)
 				th->th_urp--;
 			else
 				thflags &= ~TH_URG;
 			todrop--;
 		}
 		/*
 		 * Following if statement from Stevens, vol. 2, p. 960.
 		 */
 		if (todrop > tlen
 		    || (todrop == tlen && (thflags & TH_FIN) == 0)) {
 			/*
 			 * Any valid FIN must be to the left of the window.
 			 * At this point the FIN must be a duplicate or out
 			 * of sequence; drop it.
 			 */
 			thflags &= ~TH_FIN;
 
 			/*
 			 * Send an ACK to resynchronize and drop any data.
 			 * But keep on processing for RST or ACK.
 			 */
 			tp->t_flags |= TF_ACKNOW;
 			todrop = tlen;
 			TCPSTAT_INC(tcps_rcvduppack);
 			TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
 		} else {
 			TCPSTAT_INC(tcps_rcvpartduppack);
 			TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
 		}
 		drop_hdrlen += todrop;	/* drop from the top afterwards */
 		th->th_seq += todrop;
 		tlen -= todrop;
 		if (th->th_urp > todrop)
 			th->th_urp -= todrop;
 		else {
 			thflags &= ~TH_URG;
 			th->th_urp = 0;
 		}
 	}
 
 	/*
 	 * If new data are received on a connection after the
 	 * user processes are gone, then RST the other end.
 	 */
 	if ((so->so_state & SS_NOFDREF) &&
 	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
 		KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && "
 		    "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked));
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data "
 			    "after socket was closed, "
 			    "sending RST and removing tcpcb\n",
 			    s, __func__, tcpstates[tp->t_state], tlen);
 			free(s, M_TCPLOG);
 		}
 		tp = tcp_close(tp);
 		TCPSTAT_INC(tcps_rcvafterclose);
 		rstreason = BANDLIM_UNLIMITED;
 		goto dropwithreset;
 	}
 
 	/*
 	 * If segment ends after window, drop trailing data
 	 * (and PUSH and FIN); if nothing left, just ACK.
 	 */
 	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
 	if (todrop > 0) {
 		TCPSTAT_INC(tcps_rcvpackafterwin);
 		if (todrop >= tlen) {
 			TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
 			/*
 			 * If window is closed can only take segments at
 			 * window edge, and have to drop data and PUSH from
 			 * incoming segments.  Continue processing, but
 			 * remember to ack.  Otherwise, drop segment
 			 * and ack.
 			 */
 			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
 				tp->t_flags |= TF_ACKNOW;
 				TCPSTAT_INC(tcps_rcvwinprobe);
 			} else
 				goto dropafterack;
 		} else
 			TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
 		m_adj(m, -todrop);
 		tlen -= todrop;
 		thflags &= ~(TH_PUSH|TH_FIN);
 	}
 
 	/*
 	 * If last ACK falls within this segment's sequence numbers,
 	 * record its timestamp.
 	 * NOTE: 
 	 * 1) That the test incorporates suggestions from the latest
 	 *    proposal of the tcplw@cray.com list (Braden 1993/04/26).
 	 * 2) That updating only on newer timestamps interferes with
 	 *    our earlier PAWS tests, so this check should be solely
 	 *    predicated on the sequence space of this segment.
 	 * 3) That we modify the segment boundary check to be 
 	 *        Last.ACK.Sent <= SEG.SEQ + SEG.Len  
 	 *    instead of RFC1323's
 	 *        Last.ACK.Sent < SEG.SEQ + SEG.Len,
 	 *    This modified check allows us to overcome RFC1323's
 	 *    limitations as described in Stevens TCP/IP Illustrated
 	 *    Vol. 2 p.869. In such cases, we can still calculate the
 	 *    RTT correctly when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to.to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 		((thflags & (TH_SYN|TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to.to_tsval;
 	}
 
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN
 	 * flag is on (half-synchronized state), then queue data for
 	 * later processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (tp->t_state == TCPS_SYN_RECEIVED ||
 		    (tp->t_flags & TF_NEEDSYN))
 			goto step6;
 		else if (tp->t_flags & TF_ACKNOW)
 			goto dropafterack;
 		else
 			goto drop;
 	}
 
 	/*
 	 * Ack processing.
 	 */
 	switch (tp->t_state) {
 
 	/*
 	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
 	 * ESTABLISHED state and continue processing.
 	 * The ACK was checked above.
 	 */
 	case TCPS_SYN_RECEIVED:
 
 		TCPSTAT_INC(tcps_connects);
 		soisconnected(so);
 		/* Do window scaling? */
 		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 			(TF_RCVD_SCALE|TF_REQ_SCALE)) {
 			tp->rcv_scale = tp->request_r_scale;
 			tp->snd_wnd = tiwin;
 		}
 		/*
 		 * Make transitions:
 		 *      SYN-RECEIVED  -> ESTABLISHED
 		 *      SYN-RECEIVED* -> FIN-WAIT-1
 		 */
 		tp->t_starttime = ticks;
 		if (tp->t_flags & TF_NEEDFIN) {
 			tcp_state_change(tp, TCPS_FIN_WAIT_1);
 			tp->t_flags &= ~TF_NEEDFIN;
 		} else {
 			tcp_state_change(tp, TCPS_ESTABLISHED);
 			TCP_PROBE5(accept__established, NULL, tp,
 			    mtod(m, const char *), tp, th);
 			cc_conn_init(tp);
 			tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
 		}
 		/*
 		 * If segment contains data or ACK, will call tcp_reass()
 		 * later; if not, do so now to pass queued data to user.
 		 */
 		if (tlen == 0 && (thflags & TH_FIN) == 0)
 			(void) tcp_reass(tp, (struct tcphdr *)0, 0,
 			    (struct mbuf *)0);
 		tp->snd_wl1 = th->th_seq - 1;
 		/* FALLTHROUGH */
 
 	/*
 	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
 	 * ACKs.  If the ack is in the range
 	 *	tp->snd_una < th->th_ack <= tp->snd_max
 	 * then advance tp->snd_una to th->th_ack and drop
 	 * data from the retransmission queue.  If this ACK reflects
 	 * more up to date window information we update our window information.
 	 */
 	case TCPS_ESTABLISHED:
 	case TCPS_FIN_WAIT_1:
 	case TCPS_FIN_WAIT_2:
 	case TCPS_CLOSE_WAIT:
 	case TCPS_CLOSING:
 	case TCPS_LAST_ACK:
 		if (SEQ_GT(th->th_ack, tp->snd_max)) {
 			TCPSTAT_INC(tcps_rcvacktoomuch);
 			goto dropafterack;
 		}
 		if ((tp->t_flags & TF_SACK_PERMIT) &&
 		    ((to.to_flags & TOF_SACK) ||
 		     !TAILQ_EMPTY(&tp->snd_holes)))
 			tcp_sack_doack(tp, &to, th->th_ack);
 
 		/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
 		hhook_run_tcp_est_in(tp, th, &to);
 
 		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
 			if (tlen == 0 && tiwin == tp->snd_wnd) {
 				/*
 				 * If this is the first time we've seen a
 				 * FIN from the remote, this is not a
 				 * duplicate and it needs to be processed
 				 * normally.  This happens during a
 				 * simultaneous close.
 				 */
 				if ((thflags & TH_FIN) &&
 				    (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
 					tp->t_dupacks = 0;
 					break;
 				}
 				TCPSTAT_INC(tcps_rcvdupack);
 				/*
 				 * If we have outstanding data (other than
 				 * a window probe), this is a completely
 				 * duplicate ack (ie, window info didn't
 				 * change and FIN isn't set),
 				 * the ack is the biggest we've
 				 * seen and we've seen exactly our rexmt
 				 * threshhold of them, assume a packet
 				 * has been dropped and retransmit it.
 				 * Kludge snd_nxt & the congestion
 				 * window so we send only this one
 				 * packet.
 				 *
 				 * We know we're losing at the current
 				 * window size so do congestion avoidance
 				 * (set ssthresh to half the current window
 				 * and pull our congestion window back to
 				 * the new ssthresh).
 				 *
 				 * Dup acks mean that packets have left the
 				 * network (they're now cached at the receiver)
 				 * so bump cwnd by the amount in the receiver
 				 * to keep a constant cwnd packets in the
 				 * network.
 				 *
 				 * When using TCP ECN, notify the peer that
 				 * we reduced the cwnd.
 				 */
 				if (!tcp_timer_active(tp, TT_REXMT) ||
 				    th->th_ack != tp->snd_una)
 					tp->t_dupacks = 0;
 				else if (++tp->t_dupacks > tcprexmtthresh ||
 				     IN_FASTRECOVERY(tp->t_flags)) {
 					cc_ack_received(tp, th, CC_DUPACK);
 					if ((tp->t_flags & TF_SACK_PERMIT) &&
 					    IN_FASTRECOVERY(tp->t_flags)) {
 						int awnd;
 						
 						/*
 						 * Compute the amount of data in flight first.
 						 * We can inject new data into the pipe iff 
 						 * we have less than 1/2 the original window's
 						 * worth of data in flight.
 						 */
 						awnd = (tp->snd_nxt - tp->snd_fack) +
 							tp->sackhint.sack_bytes_rexmit;
 						if (awnd < tp->snd_ssthresh) {
 							tp->snd_cwnd += tp->t_maxseg;
 							if (tp->snd_cwnd > tp->snd_ssthresh)
 								tp->snd_cwnd = tp->snd_ssthresh;
 						}
 					} else
 						tp->snd_cwnd += tp->t_maxseg;
 					(void) tcp_output(tp);
 					goto drop;
 				} else if (tp->t_dupacks == tcprexmtthresh) {
 					tcp_seq onxt = tp->snd_nxt;
 
 					/*
 					 * If we're doing sack, check to
 					 * see if we're already in sack
 					 * recovery. If we're not doing sack,
 					 * check to see if we're in newreno
 					 * recovery.
 					 */
 					if (tp->t_flags & TF_SACK_PERMIT) {
 						if (IN_FASTRECOVERY(tp->t_flags)) {
 							tp->t_dupacks = 0;
 							break;
 						}
 					} else {
 						if (SEQ_LEQ(th->th_ack,
 						    tp->snd_recover)) {
 							tp->t_dupacks = 0;
 							break;
 						}
 					}
 					/* Congestion signal before ack. */
 					cc_cong_signal(tp, th, CC_NDUPACK);
 					cc_ack_received(tp, th, CC_DUPACK);
 					tcp_timer_activate(tp, TT_REXMT, 0);
 					tp->t_rtttime = 0;
 					if (tp->t_flags & TF_SACK_PERMIT) {
 						TCPSTAT_INC(
 						    tcps_sack_recovery_episode);
 						tp->sack_newdata = tp->snd_nxt;
 						tp->snd_cwnd = tp->t_maxseg;
 						(void) tcp_output(tp);
 						goto drop;
 					}
 					tp->snd_nxt = th->th_ack;
 					tp->snd_cwnd = tp->t_maxseg;
 					(void) tcp_output(tp);
 					KASSERT(tp->snd_limited <= 2,
 					    ("%s: tp->snd_limited too big",
 					    __func__));
 					tp->snd_cwnd = tp->snd_ssthresh +
 					     tp->t_maxseg *
 					     (tp->t_dupacks - tp->snd_limited);
 					if (SEQ_GT(onxt, tp->snd_nxt))
 						tp->snd_nxt = onxt;
 					goto drop;
 				} else if (V_tcp_do_rfc3042) {
 					cc_ack_received(tp, th, CC_DUPACK);
 					u_long oldcwnd = tp->snd_cwnd;
 					tcp_seq oldsndmax = tp->snd_max;
 					u_int sent;
 					int avail;
 
 					KASSERT(tp->t_dupacks == 1 ||
 					    tp->t_dupacks == 2,
 					    ("%s: dupacks not 1 or 2",
 					    __func__));
 					if (tp->t_dupacks == 1)
 						tp->snd_limited = 0;
 					tp->snd_cwnd =
 					    (tp->snd_nxt - tp->snd_una) +
 					    (tp->t_dupacks - tp->snd_limited) *
 					    tp->t_maxseg;
 					/*
 					 * Only call tcp_output when there
 					 * is new data available to be sent.
 					 * Otherwise we would send pure ACKs.
 					 */
 					SOCKBUF_LOCK(&so->so_snd);
 					avail = so->so_snd.sb_cc -
 					    (tp->snd_nxt - tp->snd_una);
 					SOCKBUF_UNLOCK(&so->so_snd);
 					if (avail > 0)
 						(void) tcp_output(tp);
 					sent = tp->snd_max - oldsndmax;
 					if (sent > tp->t_maxseg) {
 						KASSERT((tp->t_dupacks == 2 &&
 						    tp->snd_limited == 0) ||
 						   (sent == tp->t_maxseg + 1 &&
 						    tp->t_flags & TF_SENTFIN),
 						    ("%s: sent too much",
 						    __func__));
 						tp->snd_limited = 2;
 					} else if (sent > 0)
 						++tp->snd_limited;
 					tp->snd_cwnd = oldcwnd;
 					goto drop;
 				}
 			} else
 				tp->t_dupacks = 0;
 			break;
 		}
 
 		KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
 		    ("%s: th_ack <= snd_una", __func__));
 
 		/*
 		 * If the congestion window was inflated to account
 		 * for the other side's cached packets, retract it.
 		 */
 		if (IN_FASTRECOVERY(tp->t_flags)) {
 			if (SEQ_LT(th->th_ack, tp->snd_recover)) {
 				if (tp->t_flags & TF_SACK_PERMIT)
 					tcp_sack_partialack(tp, th);
 				else
 					tcp_newreno_partial_ack(tp, th);
 			} else
 				cc_post_recovery(tp, th);
 		}
 		tp->t_dupacks = 0;
 		/*
 		 * If we reach this point, ACK is not a duplicate,
 		 *     i.e., it ACKs something we sent.
 		 */
 		if (tp->t_flags & TF_NEEDSYN) {
 			/*
 			 * T/TCP: Connection was half-synchronized, and our
 			 * SYN has been ACK'd (so connection is now fully
 			 * synchronized).  Go to non-starred state,
 			 * increment snd_una for ACK of SYN, and check if
 			 * we can do window scaling.
 			 */
 			tp->t_flags &= ~TF_NEEDSYN;
 			tp->snd_una++;
 			/* Do window scaling? */
 			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
 				tp->rcv_scale = tp->request_r_scale;
 				/* Send window already scaled. */
 			}
 		}
 
 process_ACK:
 		INP_WLOCK_ASSERT(tp->t_inpcb);
 
 		acked = BYTES_THIS_ACK(tp, th);
 		TCPSTAT_INC(tcps_rcvackpack);
 		TCPSTAT_ADD(tcps_rcvackbyte, acked);
 
 		/*
 		 * If we just performed our first retransmit, and the ACK
 		 * arrives within our recovery window, then it was a mistake
 		 * to do the retransmit in the first place.  Recover our
 		 * original cwnd and ssthresh, and proceed to transmit where
 		 * we left off.
 		 */
 		if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID &&
 		    (int)(ticks - tp->t_badrxtwin) < 0)
 			cc_cong_signal(tp, th, CC_RTO_ERR);
 
 		/*
 		 * If we have a timestamp reply, update smoothed
 		 * round trip time.  If no timestamp is present but
 		 * transmit timer is running and timed sequence
 		 * number was acked, update smoothed round trip time.
 		 * Since we now have an rtt measurement, cancel the
 		 * timer backoff (cf., Phil Karn's retransmit alg.).
 		 * Recompute the initial retransmit timer.
 		 *
 		 * Some boxes send broken timestamp replies
 		 * during the SYN+ACK phase, ignore
 		 * timestamps of 0 or we could calculate a
 		 * huge RTT and blow up the retransmit timer.
 		 */
 		if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) {
 			u_int t;
 
 			t = tcp_ts_getticks() - to.to_tsecr;
 			if (!tp->t_rttlow || tp->t_rttlow > t)
 				tp->t_rttlow = t;
 			tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1);
 		} else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
 			if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
 				tp->t_rttlow = ticks - tp->t_rtttime;
 			tcp_xmit_timer(tp, ticks - tp->t_rtttime);
 		}
 
 		/*
 		 * If all outstanding data is acked, stop retransmit
 		 * timer and remember to restart (more output or persist).
 		 * If there is more data to be acked, restart retransmit
 		 * timer, using current (possibly backed-off) value.
 		 */
 		if (th->th_ack == tp->snd_max) {
 			tcp_timer_activate(tp, TT_REXMT, 0);
 			needoutput = 1;
 		} else if (!tcp_timer_active(tp, TT_PERSIST))
 			tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 
 		/*
 		 * If no data (only SYN) was ACK'd,
 		 *    skip rest of ACK processing.
 		 */
 		if (acked == 0)
 			goto step6;
 
 		/*
 		 * Let the congestion control algorithm update congestion
 		 * control related information. This typically means increasing
 		 * the congestion window.
 		 */
 		cc_ack_received(tp, th, CC_ACK);
 
 		SOCKBUF_LOCK(&so->so_snd);
 		if (acked > so->so_snd.sb_cc) {
 			tp->snd_wnd -= so->so_snd.sb_cc;
 			mfree = sbcut_locked(&so->so_snd,
 			    (int)so->so_snd.sb_cc);
 			ourfinisacked = 1;
 		} else {
 			mfree = sbcut_locked(&so->so_snd, acked);
 			tp->snd_wnd -= acked;
 			ourfinisacked = 0;
 		}
 		/* NB: sowwakeup_locked() does an implicit unlock. */
 		sowwakeup_locked(so);
 		m_freem(mfree);
 		/* Detect una wraparound. */
 		if (!IN_RECOVERY(tp->t_flags) &&
 		    SEQ_GT(tp->snd_una, tp->snd_recover) &&
 		    SEQ_LEQ(th->th_ack, tp->snd_recover))
 			tp->snd_recover = th->th_ack - 1;
 		/* XXXLAS: Can this be moved up into cc_post_recovery? */
 		if (IN_RECOVERY(tp->t_flags) &&
 		    SEQ_GEQ(th->th_ack, tp->snd_recover)) {
 			EXIT_RECOVERY(tp->t_flags);
 		}
 		tp->snd_una = th->th_ack;
 		if (tp->t_flags & TF_SACK_PERMIT) {
 			if (SEQ_GT(tp->snd_una, tp->snd_recover))
 				tp->snd_recover = tp->snd_una;
 		}
 		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 			tp->snd_nxt = tp->snd_una;
 
 		switch (tp->t_state) {
 
 		/*
 		 * In FIN_WAIT_1 STATE in addition to the processing
 		 * for the ESTABLISHED state if our FIN is now acknowledged
 		 * then enter FIN_WAIT_2.
 		 */
 		case TCPS_FIN_WAIT_1:
 			if (ourfinisacked) {
 				/*
 				 * If we can't receive any more
 				 * data, then closing user can proceed.
 				 * Starting the timer is contrary to the
 				 * specification, but if we don't get a FIN
 				 * we'll hang forever.
 				 *
 				 * XXXjl:
 				 * we should release the tp also, and use a
 				 * compressed state.
 				 */
 				if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 					soisdisconnected(so);
 					tcp_timer_activate(tp, TT_2MSL,
 					    (tcp_fast_finwait2_recycle ?
 					    tcp_finwait2_timeout :
 					    TP_MAXIDLE(tp)));
 				}
 				tcp_state_change(tp, TCPS_FIN_WAIT_2);
 			}
 			break;
 
 		/*
 		 * In CLOSING STATE in addition to the processing for
 		 * the ESTABLISHED state if the ACK acknowledges our FIN
 		 * then enter the TIME-WAIT state, otherwise ignore
 		 * the segment.
 		 */
 		case TCPS_CLOSING:
 			if (ourfinisacked) {
 				INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 				tcp_twstart(tp);
 				INP_INFO_WUNLOCK(&V_tcbinfo);
 				m_freem(m);
 				return;
 			}
 			break;
 
 		/*
 		 * In LAST_ACK, we may still be waiting for data to drain
 		 * and/or to be acked, as well as for the ack of our FIN.
 		 * If our FIN is now acknowledged, delete the TCB,
 		 * enter the closed state and return.
 		 */
 		case TCPS_LAST_ACK:
 			if (ourfinisacked) {
 				INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 				tp = tcp_close(tp);
 				goto drop;
 			}
 			break;
 		}
 	}
 
 step6:
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/*
 	 * Update window information.
 	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
 	 */
 	if ((thflags & TH_ACK) &&
 	    (SEQ_LT(tp->snd_wl1, th->th_seq) ||
 	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
 	     (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
 		/* keep track of pure window updates */
 		if (tlen == 0 &&
 		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
 			TCPSTAT_INC(tcps_rcvwinupd);
 		tp->snd_wnd = tiwin;
 		tp->snd_wl1 = th->th_seq;
 		tp->snd_wl2 = th->th_ack;
 		if (tp->snd_wnd > tp->max_sndwnd)
 			tp->max_sndwnd = tp->snd_wnd;
 		needoutput = 1;
 	}
 
 	/*
 	 * Process segments with URG.
 	 */
 	if ((thflags & TH_URG) && th->th_urp &&
 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 		/*
 		 * This is a kludge, but if we receive and accept
 		 * random urgent pointers, we'll crash in
 		 * soreceive.  It's hard to imagine someone
 		 * actually wanting to send this much urgent data.
 		 */
 		SOCKBUF_LOCK(&so->so_rcv);
 		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
 			th->th_urp = 0;			/* XXX */
 			thflags &= ~TH_URG;		/* XXX */
 			SOCKBUF_UNLOCK(&so->so_rcv);	/* XXX */
 			goto dodata;			/* XXX */
 		}
 		/*
 		 * If this segment advances the known urgent pointer,
 		 * then mark the data stream.  This should not happen
 		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
 		 * a FIN has been received from the remote side.
 		 * In these states we ignore the URG.
 		 *
 		 * According to RFC961 (Assigned Protocols),
 		 * the urgent pointer points to the last octet
 		 * of urgent data.  We continue, however,
 		 * to consider it to indicate the first octet
 		 * of data past the urgent section as the original
 		 * spec states (in one of two places).
 		 */
 		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
 			tp->rcv_up = th->th_seq + th->th_urp;
 			so->so_oobmark = so->so_rcv.sb_cc +
 			    (tp->rcv_up - tp->rcv_nxt) - 1;
 			if (so->so_oobmark == 0)
 				so->so_rcv.sb_state |= SBS_RCVATMARK;
 			sohasoutofband(so);
 			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
 		}
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		/*
 		 * Remove out of band data so doesn't get presented to user.
 		 * This can happen independent of advancing the URG pointer,
 		 * but if two URG's are pending at once, some out-of-band
 		 * data may creep in... ick.
 		 */
 		if (th->th_urp <= (u_long)tlen &&
 		    !(so->so_options & SO_OOBINLINE)) {
 			/* hdr drop is delayed */
 			tcp_pulloutofband(so, th, m, drop_hdrlen);
 		}
 	} else {
 		/*
 		 * If no out of band data is expected,
 		 * pull receive urgent pointer along
 		 * with the receive window.
 		 */
 		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
 			tp->rcv_up = tp->rcv_nxt;
 	}
 dodata:							/* XXX */
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/*
 	 * Process the segment text, merging it into the TCP sequencing queue,
 	 * and arranging for acknowledgment of receipt if necessary.
 	 * This process logically involves adjusting tp->rcv_wnd as data
 	 * is presented to the user (this happens in tcp_usrreq.c,
 	 * case PRU_RCVD).  If a FIN has already been received on this
 	 * connection then we just ignore the text.
 	 */
 	if ((tlen || (thflags & TH_FIN)) &&
 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 		tcp_seq save_start = th->th_seq;
 		m_adj(m, drop_hdrlen);	/* delayed header drop */
 		/*
 		 * Insert segment which includes th into TCP reassembly queue
 		 * with control block tp.  Set thflags to whether reassembly now
 		 * includes a segment with FIN.  This handles the common case
 		 * inline (segment is the next to be received on an established
 		 * connection, and the queue is empty), avoiding linkage into
 		 * and removal from the queue and repetition of various
 		 * conversions.
 		 * Set DELACK for segments received in order, but ack
 		 * immediately when segments are out of order (so
 		 * fast retransmit can work).
 		 */
 		if (th->th_seq == tp->rcv_nxt && tp->t_segq == NULL &&
 		    TCPS_HAVEESTABLISHED(tp->t_state)) {
 			if (DELAY_ACK(tp, tlen))
 				tp->t_flags |= TF_DELACK;
 			else
 				tp->t_flags |= TF_ACKNOW;
 			tp->rcv_nxt += tlen;
 			thflags = th->th_flags & TH_FIN;
 			TCPSTAT_INC(tcps_rcvpack);
 			TCPSTAT_ADD(tcps_rcvbyte, tlen);
 			ND6_HINT(tp);
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 				m_freem(m);
 			else
 				sbappendstream_locked(&so->so_rcv, m);
 			/* NB: sorwakeup_locked() does an implicit unlock. */
 			sorwakeup_locked(so);
 		} else {
 			/*
 			 * XXX: Due to the header drop above "th" is
 			 * theoretically invalid by now.  Fortunately
 			 * m_adj() doesn't actually frees any mbufs
 			 * when trimming from the head.
 			 */
 			thflags = tcp_reass(tp, th, &tlen, m);
 			tp->t_flags |= TF_ACKNOW;
 		}
 		if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
 			tcp_update_sack_list(tp, save_start, save_start + tlen);
 #if 0
 		/*
 		 * Note the amount of data that peer has sent into
 		 * our window, in order to estimate the sender's
 		 * buffer size.
 		 * XXX: Unused.
 		 */
 		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
 			len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
 		else
 			len = so->so_rcv.sb_hiwat;
 #endif
 	} else {
 		m_freem(m);
 		thflags &= ~TH_FIN;
 	}
 
 	/*
 	 * If FIN is received ACK the FIN and let the user know
 	 * that the connection is closing.
 	 */
 	if (thflags & TH_FIN) {
 		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 			socantrcvmore(so);
 			/*
 			 * If connection is half-synchronized
 			 * (ie NEEDSYN flag on) then delay ACK,
 			 * so it may be piggybacked when SYN is sent.
 			 * Otherwise, since we received a FIN then no
 			 * more input can be expected, send ACK now.
 			 */
 			if (tp->t_flags & TF_NEEDSYN)
 				tp->t_flags |= TF_DELACK;
 			else
 				tp->t_flags |= TF_ACKNOW;
 			tp->rcv_nxt++;
 		}
 		switch (tp->t_state) {
 
 		/*
 		 * In SYN_RECEIVED and ESTABLISHED STATES
 		 * enter the CLOSE_WAIT state.
 		 */
 		case TCPS_SYN_RECEIVED:
 			tp->t_starttime = ticks;
 			/* FALLTHROUGH */
 		case TCPS_ESTABLISHED:
 			tcp_state_change(tp, TCPS_CLOSE_WAIT);
 			break;
 
 		/*
 		 * If still in FIN_WAIT_1 STATE FIN has not been acked so
 		 * enter the CLOSING state.
 		 */
 		case TCPS_FIN_WAIT_1:
 			tcp_state_change(tp, TCPS_CLOSING);
 			break;
 
 		/*
 		 * In FIN_WAIT_2 state enter the TIME_WAIT state,
 		 * starting the time-wait timer, turning off the other
 		 * standard timers.
 		 */
 		case TCPS_FIN_WAIT_2:
 			INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 			KASSERT(ti_locked == TI_WLOCKED, ("%s: dodata "
 			    "TCP_FIN_WAIT_2 ti_locked: %d", __func__,
 			    ti_locked));
 
 			tcp_twstart(tp);
 			INP_INFO_WUNLOCK(&V_tcbinfo);
 			return;
 		}
 	}
 	if (ti_locked == TI_WLOCKED)
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 	ti_locked = TI_UNLOCKED;
 
 #ifdef TCPDEBUG
 	if (so->so_options & SO_DEBUG)
 		tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
 			  &tcp_savetcp, 0);
 #endif
 
 	/*
 	 * Return any desired output.
 	 */
 	if (needoutput || (tp->t_flags & TF_ACKNOW))
 		(void) tcp_output(tp);
 
 check_delack:
 	KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
 	    __func__, ti_locked));
 	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	if (tp->t_flags & TF_DELACK) {
 		tp->t_flags &= ~TF_DELACK;
 		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
 	}
 	INP_WUNLOCK(tp->t_inpcb);
 	return;
 
 dropafterack:
 	/*
 	 * Generate an ACK dropping incoming segment if it occupies
 	 * sequence space, where the ACK reflects our state.
 	 *
 	 * We can now skip the test for the RST flag since all
 	 * paths to this code happen after packets containing
 	 * RST have been dropped.
 	 *
 	 * In the SYN-RECEIVED state, don't send an ACK unless the
 	 * segment we received passes the SYN-RECEIVED ACK test.
 	 * If it fails send a RST.  This breaks the loop in the
 	 * "LAND" DoS attack, and also prevents an ACK storm
 	 * between two listening ports that have been sent forged
 	 * SYN segments, each with the source address of the other.
 	 */
 	if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
 	    (SEQ_GT(tp->snd_una, th->th_ack) ||
 	     SEQ_GT(th->th_ack, tp->snd_max)) ) {
 		rstreason = BANDLIM_RST_OPENPORT;
 		goto dropwithreset;
 	}
 #ifdef TCPDEBUG
 	if (so->so_options & SO_DEBUG)
 		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
 			  &tcp_savetcp, 0);
 #endif
 	if (ti_locked == TI_WLOCKED)
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 	ti_locked = TI_UNLOCKED;
 
 	tp->t_flags |= TF_ACKNOW;
 	(void) tcp_output(tp);
 	INP_WUNLOCK(tp->t_inpcb);
 	m_freem(m);
 	return;
 
 dropwithreset:
 	if (ti_locked == TI_WLOCKED)
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 	ti_locked = TI_UNLOCKED;
 
 	if (tp != NULL) {
 		tcp_dropwithreset(m, th, tp, tlen, rstreason);
 		INP_WUNLOCK(tp->t_inpcb);
 	} else
 		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
 	return;
 
 drop:
 	if (ti_locked == TI_WLOCKED) {
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 		ti_locked = TI_UNLOCKED;
 	}
 #ifdef INVARIANTS
 	else
 		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 #endif
 
 	/*
 	 * Drop space held by incoming segment and return.
 	 */
 #ifdef TCPDEBUG
 	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
 			  &tcp_savetcp, 0);
 #endif
 	if (tp != NULL)
 		INP_WUNLOCK(tp->t_inpcb);
 	m_freem(m);
 }
 
 /*
  * Issue RST and make ACK acceptable to originator of segment.
  * The mbuf must still include the original packet header.
  * tp may be NULL.
  */
 static void
 tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
     int tlen, int rstreason)
 {
 #ifdef INET
 	struct ip *ip;
 #endif
 #ifdef INET6
 	struct ip6_hdr *ip6;
 #endif
 
 	if (tp != NULL) {
 		INP_WLOCK_ASSERT(tp->t_inpcb);
 	}
 
 	/* Don't bother if destination was broadcast/multicast. */
 	if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
 		goto drop;
 #ifdef INET6
 	if (mtod(m, struct ip *)->ip_v == 6) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
 		    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
 			goto drop;
 		/* IPv6 anycast check is done at tcp6_input() */
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		ip = mtod(m, struct ip *);
 		if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
 		    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
 		    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
 		    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
 			goto drop;
 	}
 #endif
 
 	/* Perform bandwidth limiting. */
 	if (badport_bandlim(rstreason) < 0)
 		goto drop;
 
 	/* tcp_respond consumes the mbuf chain. */
 	if (th->th_flags & TH_ACK) {
 		tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0,
 		    th->th_ack, TH_RST);
 	} else {
 		if (th->th_flags & TH_SYN)
 			tlen++;
 		tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
 		    (tcp_seq)0, TH_RST|TH_ACK);
 	}
 	return;
 drop:
 	m_freem(m);
 }
 
 /*
  * Parse TCP options and place in tcpopt.
  */
 static void
 tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
 {
 	int opt, optlen;
 
 	to->to_flags = 0;
 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
 		opt = cp[0];
 		if (opt == TCPOPT_EOL)
 			break;
 		if (opt == TCPOPT_NOP)
 			optlen = 1;
 		else {
 			if (cnt < 2)
 				break;
 			optlen = cp[1];
 			if (optlen < 2 || optlen > cnt)
 				break;
 		}
 		switch (opt) {
 		case TCPOPT_MAXSEG:
 			if (optlen != TCPOLEN_MAXSEG)
 				continue;
 			if (!(flags & TO_SYN))
 				continue;
 			to->to_flags |= TOF_MSS;
 			bcopy((char *)cp + 2,
 			    (char *)&to->to_mss, sizeof(to->to_mss));
 			to->to_mss = ntohs(to->to_mss);
 			break;
 		case TCPOPT_WINDOW:
 			if (optlen != TCPOLEN_WINDOW)
 				continue;
 			if (!(flags & TO_SYN))
 				continue;
 			to->to_flags |= TOF_SCALE;
 			to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT);
 			break;
 		case TCPOPT_TIMESTAMP:
 			if (optlen != TCPOLEN_TIMESTAMP)
 				continue;
 			to->to_flags |= TOF_TS;
 			bcopy((char *)cp + 2,
 			    (char *)&to->to_tsval, sizeof(to->to_tsval));
 			to->to_tsval = ntohl(to->to_tsval);
 			bcopy((char *)cp + 6,
 			    (char *)&to->to_tsecr, sizeof(to->to_tsecr));
 			to->to_tsecr = ntohl(to->to_tsecr);
 			break;
 #ifdef TCP_SIGNATURE
 		/*
 		 * XXX In order to reply to a host which has set the
 		 * TCP_SIGNATURE option in its initial SYN, we have to
 		 * record the fact that the option was observed here
 		 * for the syncache code to perform the correct response.
 		 */
 		case TCPOPT_SIGNATURE:
 			if (optlen != TCPOLEN_SIGNATURE)
 				continue;
 			to->to_flags |= TOF_SIGNATURE;
 			to->to_signature = cp + 2;
 			break;
 #endif
 		case TCPOPT_SACK_PERMITTED:
 			if (optlen != TCPOLEN_SACK_PERMITTED)
 				continue;
 			if (!(flags & TO_SYN))
 				continue;
 			if (!V_tcp_do_sack)
 				continue;
 			to->to_flags |= TOF_SACKPERM;
 			break;
 		case TCPOPT_SACK:
 			if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
 				continue;
 			if (flags & TO_SYN)
 				continue;
 			to->to_flags |= TOF_SACK;
 			to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
 			to->to_sacks = cp + 2;
 			TCPSTAT_INC(tcps_sack_rcv_blocks);
 			break;
 		default:
 			continue;
 		}
 	}
 }
 
 /*
  * Pull out of band byte out of a segment so
  * it doesn't appear in the user's data queue.
  * It is still reflected in the segment length for
  * sequencing purposes.
  */
 static void
 tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
     int off)
 {
 	int cnt = off + th->th_urp - 1;
 
 	while (cnt >= 0) {
 		if (m->m_len > cnt) {
 			char *cp = mtod(m, caddr_t) + cnt;
 			struct tcpcb *tp = sototcpcb(so);
 
 			INP_WLOCK_ASSERT(tp->t_inpcb);
 
 			tp->t_iobc = *cp;
 			tp->t_oobflags |= TCPOOB_HAVEDATA;
 			bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
 			m->m_len--;
 			if (m->m_flags & M_PKTHDR)
 				m->m_pkthdr.len--;
 			return;
 		}
 		cnt -= m->m_len;
 		m = m->m_next;
 		if (m == NULL)
 			break;
 	}
 	panic("tcp_pulloutofband");
 }
 
 /*
  * Collect new round-trip time estimate
  * and update averages and current timeout.
  */
 static void
 tcp_xmit_timer(struct tcpcb *tp, int rtt)
 {
 	int delta;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	TCPSTAT_INC(tcps_rttupdated);
 	tp->t_rttupdated++;
 	if (tp->t_srtt != 0) {
 		/*
 		 * srtt is stored as fixed point with 5 bits after the
 		 * binary point (i.e., scaled by 8).  The following magic
 		 * is equivalent to the smoothing algorithm in rfc793 with
 		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
 		 * point).  Adjust rtt to origin 0.
 		 */
 		delta = ((rtt - 1) << TCP_DELTA_SHIFT)
 			- (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
 
 		if ((tp->t_srtt += delta) <= 0)
 			tp->t_srtt = 1;
 
 		/*
 		 * We accumulate a smoothed rtt variance (actually, a
 		 * smoothed mean difference), then set the retransmit
 		 * timer to smoothed rtt + 4 times the smoothed variance.
 		 * rttvar is stored as fixed point with 4 bits after the
 		 * binary point (scaled by 16).  The following is
 		 * equivalent to rfc793 smoothing with an alpha of .75
 		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
 		 * rfc793's wired-in beta.
 		 */
 		if (delta < 0)
 			delta = -delta;
 		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
 		if ((tp->t_rttvar += delta) <= 0)
 			tp->t_rttvar = 1;
 		if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
 		    tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	} else {
 		/*
 		 * No rtt measurement yet - use the unsmoothed rtt.
 		 * Set the variance to half the rtt (so our first
 		 * retransmit happens at 3*rtt).
 		 */
 		tp->t_srtt = rtt << TCP_RTT_SHIFT;
 		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
 		tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	}
 	tp->t_rtttime = 0;
 	tp->t_rxtshift = 0;
 
 	/*
 	 * the retransmit should happen at rtt + 4 * rttvar.
 	 * Because of the way we do the smoothing, srtt and rttvar
 	 * will each average +1/2 tick of bias.  When we compute
 	 * the retransmit timer, we want 1/2 tick of rounding and
 	 * 1 extra tick because of +-1/2 tick uncertainty in the
 	 * firing of the timer.  The bias will give us exactly the
 	 * 1.5 tick we need.  But, because the bias is
 	 * statistical, we have to test that we don't drop below
 	 * the minimum feasible timer (which is 2 ticks).
 	 */
 	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
 		      max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
 
 	/*
 	 * We received an ack for a packet that wasn't retransmitted;
 	 * it is probably safe to discard any error indications we've
 	 * received recently.  This isn't quite right, but close enough
 	 * for now (a route might have failed after we sent a segment,
 	 * and the return path might not be symmetrical).
 	 */
 	tp->t_softerror = 0;
 }
 
 /*
  * Determine a reasonable value for maxseg size.
  * If the route is known, check route for mtu.
  * If none, use an mss that can be handled on the outgoing interface
  * without forcing IP to fragment.  If no route is found, route has no mtu,
  * or the destination isn't local, use a default, hopefully conservative
  * size (usually 512 or the default IP max size, but no more than the mtu
  * of the interface), as we can't discover anything about intervening
  * gateways or networks.  We also initialize the congestion/slow start
  * window to be a single segment if the destination isn't local.
  * While looking at the routing entry, we also initialize other path-dependent
  * parameters from pre-set or cached values in the routing entry.
  *
  * Also take into account the space needed for options that we
  * send regularly.  Make maxseg shorter by that amount to assure
  * that we can send maxseg amount of data even when the options
  * are present.  Store the upper limit of the length of options plus
  * data in maxopd.
  *
  * NOTE that this routine is only called when we process an incoming
  * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS
  * settings are handled in tcp_mssopt().
  */
 void
 tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
     struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap)
 {
 	int mss = 0;
 	u_long maxmtu = 0;
 	struct inpcb *inp = tp->t_inpcb;
 	struct hc_metrics_lite metrics;
 	int origoffer;
 #ifdef INET6
 	int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
 	size_t min_protoh = isipv6 ?
 			    sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
 			    sizeof (struct tcpiphdr);
 #else
 	const size_t min_protoh = sizeof(struct tcpiphdr);
 #endif
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	if (mtuoffer != -1) {
 		KASSERT(offer == -1, ("%s: conflict", __func__));
 		offer = mtuoffer - min_protoh;
 	}
 	origoffer = offer;
 
 	/* Initialize. */
 #ifdef INET6
 	if (isipv6) {
 		maxmtu = tcp_maxmtu6(&inp->inp_inc, cap);
 		tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt;
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		maxmtu = tcp_maxmtu(&inp->inp_inc, cap);
 		tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt;
 	}
 #endif
 
 	/*
 	 * No route to sender, stay with default mss and return.
 	 */
 	if (maxmtu == 0) {
 		/*
 		 * In case we return early we need to initialize metrics
 		 * to a defined state as tcp_hc_get() would do for us
 		 * if there was no cache hit.
 		 */
 		if (metricptr != NULL)
 			bzero(metricptr, sizeof(struct hc_metrics_lite));
 		return;
 	}
 
 	/* What have we got? */
 	switch (offer) {
 		case 0:
 			/*
 			 * Offer == 0 means that there was no MSS on the SYN
 			 * segment, in this case we use tcp_mssdflt as
 			 * already assigned to t_maxopd above.
 			 */
 			offer = tp->t_maxopd;
 			break;
 
 		case -1:
 			/*
 			 * Offer == -1 means that we didn't receive SYN yet.
 			 */
 			/* FALLTHROUGH */
 
 		default:
 			/*
 			 * Prevent DoS attack with too small MSS. Round up
 			 * to at least minmss.
 			 */
 			offer = max(offer, V_tcp_minmss);
 	}
 
 	/*
 	 * rmx information is now retrieved from tcp_hostcache.
 	 */
 	tcp_hc_get(&inp->inp_inc, &metrics);
 	if (metricptr != NULL)
 		bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite));
 
 	/*
 	 * If there's a discovered mtu in tcp hostcache, use it.
 	 * Else, use the link mtu.
 	 */
 	if (metrics.rmx_mtu)
 		mss = min(metrics.rmx_mtu, maxmtu) - min_protoh;
 	else {
 #ifdef INET6
 		if (isipv6) {
 			mss = maxmtu - min_protoh;
 			if (!V_path_mtu_discovery &&
 			    !in6_localaddr(&inp->in6p_faddr))
 				mss = min(mss, V_tcp_v6mssdflt);
 		}
 #endif
 #if defined(INET) && defined(INET6)
 		else
 #endif
 #ifdef INET
 		{
 			mss = maxmtu - min_protoh;
 			if (!V_path_mtu_discovery &&
 			    !in_localaddr(inp->inp_faddr))
 				mss = min(mss, V_tcp_mssdflt);
 		}
 #endif
 		/*
 		 * XXX - The above conditional (mss = maxmtu - min_protoh)
 		 * probably violates the TCP spec.
 		 * The problem is that, since we don't know the
 		 * other end's MSS, we are supposed to use a conservative
 		 * default.  But, if we do that, then MTU discovery will
 		 * never actually take place, because the conservative
 		 * default is much less than the MTUs typically seen
 		 * on the Internet today.  For the moment, we'll sweep
 		 * this under the carpet.
 		 *
 		 * The conservative default might not actually be a problem
 		 * if the only case this occurs is when sending an initial
 		 * SYN with options and data to a host we've never talked
 		 * to before.  Then, they will reply with an MSS value which
 		 * will get recorded and the new parameters should get
 		 * recomputed.  For Further Study.
 		 */
 	}
 	mss = min(mss, offer);
 
 	/*
 	 * Sanity check: make sure that maxopd will be large
 	 * enough to allow some data on segments even if the
 	 * all the option space is used (40bytes).  Otherwise
 	 * funny things may happen in tcp_output.
 	 */
 	mss = max(mss, 64);
 
 	/*
 	 * maxopd stores the maximum length of data AND options
 	 * in a segment; maxseg is the amount of data in a normal
 	 * segment.  We need to store this value (maxopd) apart
 	 * from maxseg, because now every segment carries options
 	 * and thus we normally have somewhat less data in segments.
 	 */
 	tp->t_maxopd = mss;
 
 	/*
 	 * origoffer==-1 indicates that no segments were received yet.
 	 * In this case we just guess.
 	 */
 	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
 	    (origoffer == -1 ||
 	     (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
 		mss -= TCPOLEN_TSTAMP_APPA;
 
 	tp->t_maxseg = mss;
 }
 
 void
 tcp_mss(struct tcpcb *tp, int offer)
 {
 	int mss;
 	u_long bufsize;
 	struct inpcb *inp;
 	struct socket *so;
 	struct hc_metrics_lite metrics;
 	struct tcp_ifcap cap;
 
 	KASSERT(tp != NULL, ("%s: tp == NULL", __func__));
 
 	bzero(&cap, sizeof(cap));
 	tcp_mss_update(tp, offer, -1, &metrics, &cap);
 
 	mss = tp->t_maxseg;
 	inp = tp->t_inpcb;
 
 	/*
 	 * If there's a pipesize, change the socket buffer to that size,
 	 * don't change if sb_hiwat is different than default (then it
 	 * has been changed on purpose with setsockopt).
 	 * Make the socket buffers an integral number of mss units;
 	 * if the mss is larger than the socket buffer, decrease the mss.
 	 */
 	so = inp->inp_socket;
 	SOCKBUF_LOCK(&so->so_snd);
 	if ((so->so_snd.sb_hiwat == V_tcp_sendspace) && metrics.rmx_sendpipe)
 		bufsize = metrics.rmx_sendpipe;
 	else
 		bufsize = so->so_snd.sb_hiwat;
 	if (bufsize < mss)
 		mss = bufsize;
 	else {
 		bufsize = roundup(bufsize, mss);
 		if (bufsize > sb_max)
 			bufsize = sb_max;
 		if (bufsize > so->so_snd.sb_hiwat)
 			(void)sbreserve_locked(&so->so_snd, bufsize, so, NULL);
 	}
 	SOCKBUF_UNLOCK(&so->so_snd);
 	tp->t_maxseg = mss;
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	if ((so->so_rcv.sb_hiwat == V_tcp_recvspace) && metrics.rmx_recvpipe)
 		bufsize = metrics.rmx_recvpipe;
 	else
 		bufsize = so->so_rcv.sb_hiwat;
 	if (bufsize > mss) {
 		bufsize = roundup(bufsize, mss);
 		if (bufsize > sb_max)
 			bufsize = sb_max;
 		if (bufsize > so->so_rcv.sb_hiwat)
 			(void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL);
 	}
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	/* Check the interface for TSO capabilities. */
 	if (cap.ifcap & CSUM_TSO) {
 		tp->t_flags |= TF_TSO;
 		tp->t_tsomax = cap.tsomax;
 		tp->t_tsomaxsegcount = cap.tsomaxsegcount;
 		tp->t_tsomaxsegsize = cap.tsomaxsegsize;
 	}
 }
 
 /*
  * Determine the MSS option to send on an outgoing SYN.
  */
 int
 tcp_mssopt(struct in_conninfo *inc)
 {
 	int mss = 0;
 	u_long maxmtu = 0;
 	u_long thcmtu = 0;
 	size_t min_protoh;
 
 	KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));
 
 #ifdef INET6
 	if (inc->inc_flags & INC_ISIPV6) {
 		mss = V_tcp_v6mssdflt;
 		maxmtu = tcp_maxmtu6(inc, NULL);
 		min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		mss = V_tcp_mssdflt;
 		maxmtu = tcp_maxmtu(inc, NULL);
 		min_protoh = sizeof(struct tcpiphdr);
 	}
 #endif
 #if defined(INET6) || defined(INET)
 	thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
 #endif
 
 	if (maxmtu && thcmtu)
 		mss = min(maxmtu, thcmtu) - min_protoh;
 	else if (maxmtu || thcmtu)
 		mss = max(maxmtu, thcmtu) - min_protoh;
 
 	return (mss);
 }
 
 
 /*
  * On a partial ack arrives, force the retransmission of the
  * next unacknowledged segment.  Do not clear tp->t_dupacks.
  * By setting snd_nxt to ti_ack, this forces retransmission timer to
  * be started again.
  */
 static void
 tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
 {
 	tcp_seq onxt = tp->snd_nxt;
 	u_long  ocwnd = tp->snd_cwnd;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	tcp_timer_activate(tp, TT_REXMT, 0);
 	tp->t_rtttime = 0;
 	tp->snd_nxt = th->th_ack;
 	/*
 	 * Set snd_cwnd to one segment beyond acknowledged offset.
 	 * (tp->snd_una has not yet been updated when this function is called.)
 	 */
 	tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th);
 	tp->t_flags |= TF_ACKNOW;
 	(void) tcp_output(tp);
 	tp->snd_cwnd = ocwnd;
 	if (SEQ_GT(onxt, tp->snd_nxt))
 		tp->snd_nxt = onxt;
 	/*
 	 * Partial window deflation.  Relies on fact that tp->snd_una
 	 * not updated yet.
 	 */
 	if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th))
 		tp->snd_cwnd -= BYTES_THIS_ACK(tp, th);
 	else
 		tp->snd_cwnd = 0;
 	tp->snd_cwnd += tp->t_maxseg;
 }
Index: user/dchagin/lemul/sys/netinet/tcp_output.c
===================================================================
--- user/dchagin/lemul/sys/netinet/tcp_output.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/tcp_output.c	(revision 274241)
@@ -1,1679 +1,1679 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_output.c	8.4 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/domain.h>
 #include <sys/hhook.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/cc.h>
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #ifdef INET6
 #include <netinet6/in6_pcb.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #endif
 #define	TCPOUTFLAGS
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #endif /*IPSEC*/
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 VNET_DEFINE(int, path_mtu_discovery) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(path_mtu_discovery), 1,
 	"Enable Path MTU Discovery");
 
 VNET_DEFINE(int, tcp_do_tso) = 1;
 #define	V_tcp_do_tso		VNET(tcp_do_tso)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(tcp_do_tso), 0,
 	"Enable TCP Segmentation Offload");
 
 VNET_DEFINE(int, tcp_sendspace) = 1024*32;
 #define	V_tcp_sendspace	VNET(tcp_sendspace)
-SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size");
 
 VNET_DEFINE(int, tcp_do_autosndbuf) = 1;
 #define	V_tcp_do_autosndbuf	VNET(tcp_do_autosndbuf)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(tcp_do_autosndbuf), 0,
 	"Enable automatic send buffer sizing");
 
 VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024;
 #define	V_tcp_autosndbuf_inc	VNET(tcp_autosndbuf_inc)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(tcp_autosndbuf_inc), 0,
 	"Incrementor step size of automatic send buffer");
 
 VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024;
 #define	V_tcp_autosndbuf_max	VNET(tcp_autosndbuf_max)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(tcp_autosndbuf_max), 0,
 	"Max size of automatic send buffer");
 
 static void inline	hhook_run_tcp_est_out(struct tcpcb *tp,
 			    struct tcphdr *th, struct tcpopt *to,
 			    long len, int tso);
 static void inline	cc_after_idle(struct tcpcb *tp);
 
 /*
  * Wrapper for the TCP established output helper hook.
  */
 static void inline
 hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th,
     struct tcpopt *to, long len, int tso)
 {
 	struct tcp_hhook_data hhook_data;
 
 	if (V_tcp_hhh[HHOOK_TCP_EST_OUT]->hhh_nhooks > 0) {
 		hhook_data.tp = tp;
 		hhook_data.th = th;
 		hhook_data.to = to;
 		hhook_data.len = len;
 		hhook_data.tso = tso;
 
 		hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_OUT], &hhook_data,
 		    tp->osd);
 	}
 }
 
 /*
  * CC wrapper hook functions
  */
 static void inline
 cc_after_idle(struct tcpcb *tp)
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	if (CC_ALGO(tp)->after_idle != NULL)
 		CC_ALGO(tp)->after_idle(tp->ccv);
 }
 
 /*
  * Tcp output routine: figure out what should be sent and send it.
  */
 int
 tcp_output(struct tcpcb *tp)
 {
 	struct socket *so = tp->t_inpcb->inp_socket;
 	long len, recwin, sendwin;
 	int off, flags, error = 0;	/* Keep compiler happy */
 	struct mbuf *m;
 	struct ip *ip = NULL;
 	struct ipovly *ipov = NULL;
 	struct tcphdr *th;
 	u_char opt[TCP_MAXOLEN];
 	unsigned ipoptlen, optlen, hdrlen;
 #ifdef IPSEC
 	unsigned ipsec_optlen = 0;
 #endif
 	int idle, sendalot;
 	int sack_rxmit, sack_bytes_rxmt;
 	struct sackhole *p;
 	int tso, mtu;
 	struct tcpopt to;
 #if 0
 	int maxburst = TCP_MAXBURST;
 #endif
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 	int isipv6;
 
 	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
 #endif
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE)
 		return (tcp_offload_output(tp));
 #endif
 
 	/*
 	 * Determine length of data that should be transmitted,
 	 * and flags that will be used.
 	 * If there is some data or critical controls (SYN, RST)
 	 * to send, then transmit; otherwise, investigate further.
 	 */
 	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
 	if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur)
 		cc_after_idle(tp);
 	tp->t_flags &= ~TF_LASTIDLE;
 	if (idle) {
 		if (tp->t_flags & TF_MORETOCOME) {
 			tp->t_flags |= TF_LASTIDLE;
 			idle = 0;
 		}
 	}
 again:
 	/*
 	 * If we've recently taken a timeout, snd_max will be greater than
 	 * snd_nxt.  There may be SACK information that allows us to avoid
 	 * resending already delivered data.  Adjust snd_nxt accordingly.
 	 */
 	if ((tp->t_flags & TF_SACK_PERMIT) &&
 	    SEQ_LT(tp->snd_nxt, tp->snd_max))
 		tcp_sack_adjust(tp);
 	sendalot = 0;
 	tso = 0;
 	mtu = 0;
 	off = tp->snd_nxt - tp->snd_una;
 	sendwin = min(tp->snd_wnd, tp->snd_cwnd);
 
 	flags = tcp_outflags[tp->t_state];
 	/*
 	 * Send any SACK-generated retransmissions.  If we're explicitly trying
 	 * to send out new data (when sendalot is 1), bypass this function.
 	 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
 	 * we're replacing a (future) new transmission with a retransmission
 	 * now, and we previously incremented snd_cwnd in tcp_input().
 	 */
 	/*
 	 * Still in sack recovery , reset rxmit flag to zero.
 	 */
 	sack_rxmit = 0;
 	sack_bytes_rxmt = 0;
 	len = 0;
 	p = NULL;
 	if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) &&
 	    (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
 		long cwin;
 		
 		cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
 		if (cwin < 0)
 			cwin = 0;
 		/* Do not retransmit SACK segments beyond snd_recover */
 		if (SEQ_GT(p->end, tp->snd_recover)) {
 			/*
 			 * (At least) part of sack hole extends beyond
 			 * snd_recover. Check to see if we can rexmit data
 			 * for this hole.
 			 */
 			if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
 				/*
 				 * Can't rexmit any more data for this hole.
 				 * That data will be rexmitted in the next
 				 * sack recovery episode, when snd_recover
 				 * moves past p->rxmit.
 				 */
 				p = NULL;
 				goto after_sack_rexmit;
 			} else
 				/* Can rexmit part of the current hole */
 				len = ((long)ulmin(cwin,
 						   tp->snd_recover - p->rxmit));
 		} else
 			len = ((long)ulmin(cwin, p->end - p->rxmit));
 		off = p->rxmit - tp->snd_una;
 		KASSERT(off >= 0,("%s: sack block to the left of una : %d",
 		    __func__, off));
 		if (len > 0) {
 			sack_rxmit = 1;
 			sendalot = 1;
 			TCPSTAT_INC(tcps_sack_rexmits);
 			TCPSTAT_ADD(tcps_sack_rexmit_bytes,
 			    min(len, tp->t_maxseg));
 		}
 	}
 after_sack_rexmit:
 	/*
 	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
 	 * state flags.
 	 */
 	if (tp->t_flags & TF_NEEDFIN)
 		flags |= TH_FIN;
 	if (tp->t_flags & TF_NEEDSYN)
 		flags |= TH_SYN;
 
 	SOCKBUF_LOCK(&so->so_snd);
 	/*
 	 * If in persist timeout with window of 0, send 1 byte.
 	 * Otherwise, if window is small but nonzero
 	 * and timer expired, we will send what we can
 	 * and go to transmit state.
 	 */
 	if (tp->t_flags & TF_FORCEDATA) {
 		if (sendwin == 0) {
 			/*
 			 * If we still have some data to send, then
 			 * clear the FIN bit.  Usually this would
 			 * happen below when it realizes that we
 			 * aren't sending all the data.  However,
 			 * if we have exactly 1 byte of unsent data,
 			 * then it won't clear the FIN bit below,
 			 * and if we are in persist state, we wind
 			 * up sending the packet without recording
 			 * that we sent the FIN bit.
 			 *
 			 * We can't just blindly clear the FIN bit,
 			 * because if we don't have any more data
 			 * to send then the probe will be the FIN
 			 * itself.
 			 */
 			if (off < so->so_snd.sb_cc)
 				flags &= ~TH_FIN;
 			sendwin = 1;
 		} else {
 			tcp_timer_activate(tp, TT_PERSIST, 0);
 			tp->t_rxtshift = 0;
 		}
 	}
 
 	/*
 	 * If snd_nxt == snd_max and we have transmitted a FIN, the
 	 * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
 	 * a negative length.  This can also occur when TCP opens up
 	 * its congestion window while receiving additional duplicate
 	 * acks after fast-retransmit because TCP will reset snd_nxt
 	 * to snd_max after the fast-retransmit.
 	 *
 	 * In the normal retransmit-FIN-only case, however, snd_nxt will
 	 * be set to snd_una, the offset will be 0, and the length may
 	 * wind up 0.
 	 *
 	 * If sack_rxmit is true we are retransmitting from the scoreboard
 	 * in which case len is already set.
 	 */
 	if (sack_rxmit == 0) {
 		if (sack_bytes_rxmt == 0)
 			len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off);
 		else {
 			long cwin;
 
                         /*
 			 * We are inside of a SACK recovery episode and are
 			 * sending new data, having retransmitted all the
 			 * data possible in the scoreboard.
 			 */
 			len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd) 
 			       - off);
 			/*
 			 * Don't remove this (len > 0) check !
 			 * We explicitly check for len > 0 here (although it 
 			 * isn't really necessary), to work around a gcc 
 			 * optimization issue - to force gcc to compute
 			 * len above. Without this check, the computation
 			 * of len is bungled by the optimizer.
 			 */
 			if (len > 0) {
 				cwin = tp->snd_cwnd - 
 					(tp->snd_nxt - tp->sack_newdata) -
 					sack_bytes_rxmt;
 				if (cwin < 0)
 					cwin = 0;
 				len = lmin(len, cwin);
 			}
 		}
 	}
 
 	/*
 	 * Lop off SYN bit if it has already been sent.  However, if this
 	 * is SYN-SENT state and if segment contains data and if we don't
 	 * know that foreign host supports TAO, suppress sending segment.
 	 */
 	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
 		if (tp->t_state != TCPS_SYN_RECEIVED)
 			flags &= ~TH_SYN;
 		off--, len++;
 	}
 
 	/*
 	 * Be careful not to send data and/or FIN on SYN segments.
 	 * This measure is needed to prevent interoperability problems
 	 * with not fully conformant TCP implementations.
 	 */
 	if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
 		len = 0;
 		flags &= ~TH_FIN;
 	}
 
 	if (len < 0) {
 		/*
 		 * If FIN has been sent but not acked,
 		 * but we haven't been called to retransmit,
 		 * len will be < 0.  Otherwise, window shrank
 		 * after we sent into it.  If window shrank to 0,
 		 * cancel pending retransmit, pull snd_nxt back
 		 * to (closed) window, and set the persist timer
 		 * if it isn't already going.  If the window didn't
 		 * close completely, just wait for an ACK.
 		 */
 		len = 0;
 		if (sendwin == 0) {
 			tcp_timer_activate(tp, TT_REXMT, 0);
 			tp->t_rxtshift = 0;
 			tp->snd_nxt = tp->snd_una;
 			if (!tcp_timer_active(tp, TT_PERSIST))
 				tcp_setpersist(tp);
 		}
 	}
 
 	/* len will be >= 0 after this point. */
 	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
 
 	/*
 	 * Automatic sizing of send socket buffer.  Often the send buffer
 	 * size is not optimally adjusted to the actual network conditions
 	 * at hand (delay bandwidth product).  Setting the buffer size too
 	 * small limits throughput on links with high bandwidth and high
 	 * delay (eg. trans-continental/oceanic links).  Setting the
 	 * buffer size too big consumes too much real kernel memory,
 	 * especially with many connections on busy servers.
 	 *
 	 * The criteria to step up the send buffer one notch are:
 	 *  1. receive window of remote host is larger than send buffer
 	 *     (with a fudge factor of 5/4th);
 	 *  2. send buffer is filled to 7/8th with data (so we actually
 	 *     have data to make use of it);
 	 *  3. send buffer fill has not hit maximal automatic size;
 	 *  4. our send window (slow start and cogestion controlled) is
 	 *     larger than sent but unacknowledged data in send buffer.
 	 *
 	 * The remote host receive window scaling factor may limit the
 	 * growing of the send buffer before it reaches its allowed
 	 * maximum.
 	 *
 	 * It scales directly with slow start or congestion window
 	 * and does at most one step per received ACK.  This fast
 	 * scaling has the drawback of growing the send buffer beyond
 	 * what is strictly necessary to make full use of a given
 	 * delay*bandwith product.  However testing has shown this not
 	 * to be much of an problem.  At worst we are trading wasting
 	 * of available bandwith (the non-use of it) for wasting some
 	 * socket buffer memory.
 	 *
 	 * TODO: Shrink send buffer during idle periods together
 	 * with congestion window.  Requires another timer.  Has to
 	 * wait for upcoming tcp timer rewrite.
 	 */
 	if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
 		if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
 		    so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
 		    so->so_snd.sb_cc < V_tcp_autosndbuf_max &&
 		    sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
 			if (!sbreserve_locked(&so->so_snd,
 			    min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc,
 			     V_tcp_autosndbuf_max), so, curthread))
 				so->so_snd.sb_flags &= ~SB_AUTOSIZE;
 		}
 	}
 
 	/*
 	 * Decide if we can use TCP Segmentation Offloading (if supported by
 	 * hardware).
 	 *
 	 * TSO may only be used if we are in a pure bulk sending state.  The
 	 * presence of TCP-MD5, SACK retransmits, SACK advertizements and
 	 * IP options prevent using TSO.  With TSO the TCP header is the same
 	 * (except for the sequence number) for all generated packets.  This
 	 * makes it impossible to transmit any options which vary per generated
 	 * segment or packet.
 	 */
 #ifdef IPSEC
 	/*
 	 * Pre-calculate here as we save another lookup into the darknesses
 	 * of IPsec that way and can actually decide if TSO is ok.
 	 */
 	ipsec_optlen = ipsec_hdrsiz_tcp(tp);
 #endif
 	if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
 	    ((tp->t_flags & TF_SIGNATURE) == 0) &&
 	    tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
 #ifdef IPSEC
 	    ipsec_optlen == 0 &&
 #endif
 	    tp->t_inpcb->inp_options == NULL &&
 	    tp->t_inpcb->in6p_options == NULL)
 		tso = 1;
 
 	if (sack_rxmit) {
 		if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
 			flags &= ~TH_FIN;
 	} else {
 		if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
 			flags &= ~TH_FIN;
 	}
 
 	recwin = sbspace(&so->so_rcv);
 
 	/*
 	 * Sender silly window avoidance.   We transmit under the following
 	 * conditions when len is non-zero:
 	 *
 	 *	- We have a full segment (or more with TSO)
 	 *	- This is the last buffer in a write()/send() and we are
 	 *	  either idle or running NODELAY
 	 *	- we've timed out (e.g. persist timer)
 	 *	- we have more then 1/2 the maximum send window's worth of
 	 *	  data (receiver may be limited the window size)
 	 *	- we need to retransmit
 	 */
 	if (len) {
 		if (len >= tp->t_maxseg)
 			goto send;
 		/*
 		 * NOTE! on localhost connections an 'ack' from the remote
 		 * end may occur synchronously with the output and cause
 		 * us to flush a buffer queued with moretocome.  XXX
 		 *
 		 * note: the len + off check is almost certainly unnecessary.
 		 */
 		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
 		    (idle || (tp->t_flags & TF_NODELAY)) &&
 		    len + off >= so->so_snd.sb_cc &&
 		    (tp->t_flags & TF_NOPUSH) == 0) {
 			goto send;
 		}
 		if (tp->t_flags & TF_FORCEDATA)		/* typ. timeout case */
 			goto send;
 		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
 			goto send;
 		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
 			goto send;
 		if (sack_rxmit)
 			goto send;
 	}
 
 	/*
 	 * Sending of standalone window updates.
 	 *
 	 * Window updates are important when we close our window due to a
 	 * full socket buffer and are opening it again after the application
 	 * reads data from it.  Once the window has opened again and the
 	 * remote end starts to send again the ACK clock takes over and
 	 * provides the most current window information.
 	 *
 	 * We must avoid the silly window syndrome whereas every read
 	 * from the receive buffer, no matter how small, causes a window
 	 * update to be sent.  We also should avoid sending a flurry of
 	 * window updates when the socket buffer had queued a lot of data
 	 * and the application is doing small reads.
 	 *
 	 * Prevent a flurry of pointless window updates by only sending
 	 * an update when we can increase the advertized window by more
 	 * than 1/4th of the socket buffer capacity.  When the buffer is
 	 * getting full or is very small be more aggressive and send an
 	 * update whenever we can increase by two mss sized segments.
 	 * In all other situations the ACK's to new incoming data will
 	 * carry further window increases.
 	 *
 	 * Don't send an independent window update if a delayed
 	 * ACK is pending (it will get piggy-backed on it) or the
 	 * remote side already has done a half-close and won't send
 	 * more data.  Skip this if the connection is in T/TCP
 	 * half-open state.
 	 */
 	if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
 	    !(tp->t_flags & TF_DELACK) &&
 	    !TCPS_HAVERCVDFIN(tp->t_state)) {
 		/*
 		 * "adv" is the amount we could increase the window,
 		 * taking into account that we are limited by
 		 * TCP_MAXWIN << tp->rcv_scale.
 		 */
 		long adv;
 		int oldwin;
 
 		adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale);
 		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
 			oldwin = (tp->rcv_adv - tp->rcv_nxt);
 			adv -= oldwin;
 		} else
 			oldwin = 0;
 
 		/* 
 		 * If the new window size ends up being the same as the old
 		 * size when it is scaled, then don't force a window update.
 		 */
 		if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
 			goto dontupdate;
 
 		if (adv >= (long)(2 * tp->t_maxseg) &&
 		    (adv >= (long)(so->so_rcv.sb_hiwat / 4) ||
 		     recwin <= (long)(so->so_rcv.sb_hiwat / 8) ||
 		     so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg))
 			goto send;
 	}
 dontupdate:
 
 	/*
 	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
 	 * is also a catch-all for the retransmit timer timeout case.
 	 */
 	if (tp->t_flags & TF_ACKNOW)
 		goto send;
 	if ((flags & TH_RST) ||
 	    ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
 		goto send;
 	if (SEQ_GT(tp->snd_up, tp->snd_una))
 		goto send;
 	/*
 	 * If our state indicates that FIN should be sent
 	 * and we have not yet done so, then we need to send.
 	 */
 	if (flags & TH_FIN &&
 	    ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
 		goto send;
 	/*
 	 * In SACK, it is possible for tcp_output to fail to send a segment
 	 * after the retransmission timer has been turned off.  Make sure
 	 * that the retransmission timer is set.
 	 */
 	if ((tp->t_flags & TF_SACK_PERMIT) &&
 	    SEQ_GT(tp->snd_max, tp->snd_una) &&
 	    !tcp_timer_active(tp, TT_REXMT) &&
 	    !tcp_timer_active(tp, TT_PERSIST)) {
 		tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 		goto just_return;
 	} 
 	/*
 	 * TCP window updates are not reliable, rather a polling protocol
 	 * using ``persist'' packets is used to insure receipt of window
 	 * updates.  The three ``states'' for the output side are:
 	 *	idle			not doing retransmits or persists
 	 *	persisting		to move a small or zero window
 	 *	(re)transmitting	and thereby not persisting
 	 *
 	 * tcp_timer_active(tp, TT_PERSIST)
 	 *	is true when we are in persist state.
 	 * (tp->t_flags & TF_FORCEDATA)
 	 *	is set when we are called to send a persist packet.
 	 * tcp_timer_active(tp, TT_REXMT)
 	 *	is set when we are retransmitting
 	 * The output side is idle when both timers are zero.
 	 *
 	 * If send window is too small, there is data to transmit, and no
 	 * retransmit or persist is pending, then go to persist state.
 	 * If nothing happens soon, send when timer expires:
 	 * if window is nonzero, transmit what we can,
 	 * otherwise force out a byte.
 	 */
 	if (so->so_snd.sb_cc && !tcp_timer_active(tp, TT_REXMT) &&
 	    !tcp_timer_active(tp, TT_PERSIST)) {
 		tp->t_rxtshift = 0;
 		tcp_setpersist(tp);
 	}
 
 	/*
 	 * No reason to send a segment, just return.
 	 */
 just_return:
 	SOCKBUF_UNLOCK(&so->so_snd);
 	return (0);
 
 send:
 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
 	if (len > 0) {
 		if (len >= tp->t_maxseg)
 			tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
 		else
 			tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
 	}
 	/*
 	 * Before ESTABLISHED, force sending of initial options
 	 * unless TCP set not to do any options.
 	 * NOTE: we assume that the IP/TCP header plus TCP options
 	 * always fit in a single mbuf, leaving room for a maximum
 	 * link header, i.e.
 	 *	max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
 	 */
 	optlen = 0;
 #ifdef INET6
 	if (isipv6)
 		hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
 	else
 #endif
 		hdrlen = sizeof (struct tcpiphdr);
 
 	/*
 	 * Compute options for segment.
 	 * We only have to care about SYN and established connection
 	 * segments.  Options for SYN-ACK segments are handled in TCP
 	 * syncache.
 	 */
 	if ((tp->t_flags & TF_NOOPT) == 0) {
 		to.to_flags = 0;
 		/* Maximum segment size. */
 		if (flags & TH_SYN) {
 			tp->snd_nxt = tp->iss;
 			to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc);
 			to.to_flags |= TOF_MSS;
 		}
 		/* Window scaling. */
 		if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
 			to.to_wscale = tp->request_r_scale;
 			to.to_flags |= TOF_SCALE;
 		}
 		/* Timestamps. */
 		if ((tp->t_flags & TF_RCVD_TSTMP) ||
 		    ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
 			to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
 			to.to_tsecr = tp->ts_recent;
 			to.to_flags |= TOF_TS;
 			/* Set receive buffer autosizing timestamp. */
 			if (tp->rfbuf_ts == 0 &&
 			    (so->so_rcv.sb_flags & SB_AUTOSIZE))
 				tp->rfbuf_ts = tcp_ts_getticks();
 		}
 		/* Selective ACK's. */
 		if (tp->t_flags & TF_SACK_PERMIT) {
 			if (flags & TH_SYN)
 				to.to_flags |= TOF_SACKPERM;
 			else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 			    (tp->t_flags & TF_SACK_PERMIT) &&
 			    tp->rcv_numsacks > 0) {
 				to.to_flags |= TOF_SACK;
 				to.to_nsacks = tp->rcv_numsacks;
 				to.to_sacks = (u_char *)tp->sackblks;
 			}
 		}
 #ifdef TCP_SIGNATURE
 		/* TCP-MD5 (RFC2385). */
 		if (tp->t_flags & TF_SIGNATURE)
 			to.to_flags |= TOF_SIGNATURE;
 #endif /* TCP_SIGNATURE */
 
 		/* Processing the options. */
 		hdrlen += optlen = tcp_addoptions(&to, opt);
 	}
 
 #ifdef INET6
 	if (isipv6)
 		ipoptlen = ip6_optlen(tp->t_inpcb);
 	else
 #endif
 	if (tp->t_inpcb->inp_options)
 		ipoptlen = tp->t_inpcb->inp_options->m_len -
 				offsetof(struct ipoption, ipopt_list);
 	else
 		ipoptlen = 0;
 #ifdef IPSEC
 	ipoptlen += ipsec_optlen;
 #endif
 
 	/*
 	 * Adjust data length if insertion of options will
 	 * bump the packet length beyond the t_maxopd length.
 	 * Clear the FIN bit because we cut off the tail of
 	 * the segment.
 	 */
 	if (len + optlen + ipoptlen > tp->t_maxopd) {
 		flags &= ~TH_FIN;
 
 		if (tso) {
 			u_int if_hw_tsomax;
 			u_int if_hw_tsomaxsegcount;
 			u_int if_hw_tsomaxsegsize;
 			struct mbuf *mb;
 			u_int moff;
 			int max_len;
 
 			/* extract TSO information */
 			if_hw_tsomax = tp->t_tsomax;
 			if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
 			if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
 
 			/*
 			 * Limit a TSO burst to prevent it from
 			 * overflowing or exceeding the maximum length
 			 * allowed by the network interface:
 			 */
 			KASSERT(ipoptlen == 0,
 			    ("%s: TSO can't do IP options", __func__));
 
 			/*
 			 * Check if we should limit by maximum payload
 			 * length:
 			 */
 			if (if_hw_tsomax != 0) {
 				/* compute maximum TSO length */
 				max_len = (if_hw_tsomax - hdrlen);
 				if (max_len <= 0) {
 					len = 0;
 				} else if (len > (u_int)max_len) {
 					sendalot = 1;
 					len = (u_int)max_len;
 				}
 			}
 
 			/*
 			 * Check if we should limit by maximum segment
 			 * size and count:
 			 */
 			if (if_hw_tsomaxsegcount != 0 &&
 			    if_hw_tsomaxsegsize != 0) {
 				max_len = 0;
 				mb = sbsndmbuf(&so->so_snd, off, &moff);
 
 				while (mb != NULL && (u_int)max_len < len) {
 					u_int mlen;
 					u_int frags;
 
 					/*
 					 * Get length of mbuf fragment
 					 * and how many hardware frags,
 					 * rounded up, it would use:
 					 */
 					mlen = (mb->m_len - moff);
 					frags = howmany(mlen,
 					    if_hw_tsomaxsegsize);
 
 					/* Handle special case: Zero Length Mbuf */
 					if (frags == 0)
 						frags = 1;
 
 					/*
 					 * Check if the fragment limit
 					 * will be reached or exceeded:
 					 */
 					if (frags >= if_hw_tsomaxsegcount) {
 						max_len += min(mlen,
 						    if_hw_tsomaxsegcount *
 						    if_hw_tsomaxsegsize);
 						break;
 					}
 					max_len += mlen;
 					if_hw_tsomaxsegcount -= frags;
 					moff = 0;
 					mb = mb->m_next;
 				}
 				if (max_len <= 0) {
 					len = 0;
 				} else if (len > (u_int)max_len) {
 					sendalot = 1;
 					len = (u_int)max_len;
 				}
 			}
 
 			/*
 			 * Prevent the last segment from being
 			 * fractional unless the send sockbuf can be
 			 * emptied:
 			 */
 			max_len = (tp->t_maxopd - optlen);
 			if ((off + len) < so->so_snd.sb_cc) {
 				moff = len % (u_int)max_len;
 				if (moff != 0) {
 					len -= moff;
 					sendalot = 1;
 				}
 			}
 
 			/*
 			 * In case there are too many small fragments
 			 * don't use TSO:
 			 */
 			if (len <= (u_int)max_len) {
 				len = (u_int)max_len;
 				sendalot = 1;
 				tso = 0;
 			}
 
 			/*
 			 * Send the FIN in a separate segment
 			 * after the bulk sending is done.
 			 * We don't trust the TSO implementations
 			 * to clear the FIN flag on all but the
 			 * last segment.
 			 */
 			if (tp->t_flags & TF_NEEDFIN)
 				sendalot = 1;
 
 		} else {
 			len = tp->t_maxopd - optlen - ipoptlen;
 			sendalot = 1;
 		}
 	} else
 		tso = 0;
 
 	KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
 	    ("%s: len > IP_MAXPACKET", __func__));
 
 /*#ifdef DIAGNOSTIC*/
 #ifdef INET6
 	if (max_linkhdr + hdrlen > MCLBYTES)
 #else
 	if (max_linkhdr + hdrlen > MHLEN)
 #endif
 		panic("tcphdr too big");
 /*#endif*/
 
 	/*
 	 * This KASSERT is here to catch edge cases at a well defined place.
 	 * Before, those had triggered (random) panic conditions further down.
 	 */
 	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
 
 	/*
 	 * Grab a header mbuf, attaching a copy of data to
 	 * be transmitted, and initialize the header from
 	 * the template for sends on this connection.
 	 */
 	if (len) {
 		struct mbuf *mb;
 		u_int moff;
 
 		if ((tp->t_flags & TF_FORCEDATA) && len == 1)
 			TCPSTAT_INC(tcps_sndprobe);
 		else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
 			tp->t_sndrexmitpack++;
 			TCPSTAT_INC(tcps_sndrexmitpack);
 			TCPSTAT_ADD(tcps_sndrexmitbyte, len);
 		} else {
 			TCPSTAT_INC(tcps_sndpack);
 			TCPSTAT_ADD(tcps_sndbyte, len);
 		}
 #ifdef INET6
 		if (MHLEN < hdrlen + max_linkhdr)
 			m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		else
 #endif
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 
 		if (m == NULL) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = ENOBUFS;
 			sack_rxmit = 0;
 			goto out;
 		}
 
 		m->m_data += max_linkhdr;
 		m->m_len = hdrlen;
 
 		/*
 		 * Start the m_copy functions from the closest mbuf
 		 * to the offset in the socket buffer chain.
 		 */
 		mb = sbsndptr(&so->so_snd, off, len, &moff);
 
 		if (len <= MHLEN - hdrlen - max_linkhdr) {
 			m_copydata(mb, moff, (int)len,
 			    mtod(m, caddr_t) + hdrlen);
 			m->m_len += len;
 		} else {
 			m->m_next = m_copy(mb, moff, (int)len);
 			if (m->m_next == NULL) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				(void) m_free(m);
 				error = ENOBUFS;
 				sack_rxmit = 0;
 				goto out;
 			}
 		}
 
 		/*
 		 * If we're sending everything we've got, set PUSH.
 		 * (This will keep happy those implementations which only
 		 * give data to the user when a buffer fills or
 		 * a PUSH comes in.)
 		 */
 		if (off + len == so->so_snd.sb_cc)
 			flags |= TH_PUSH;
 		SOCKBUF_UNLOCK(&so->so_snd);
 	} else {
 		SOCKBUF_UNLOCK(&so->so_snd);
 		if (tp->t_flags & TF_ACKNOW)
 			TCPSTAT_INC(tcps_sndacks);
 		else if (flags & (TH_SYN|TH_FIN|TH_RST))
 			TCPSTAT_INC(tcps_sndctrl);
 		else if (SEQ_GT(tp->snd_up, tp->snd_una))
 			TCPSTAT_INC(tcps_sndurg);
 		else
 			TCPSTAT_INC(tcps_sndwinup);
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			error = ENOBUFS;
 			sack_rxmit = 0;
 			goto out;
 		}
 #ifdef INET6
 		if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
 		    MHLEN >= hdrlen) {
 			MH_ALIGN(m, hdrlen);
 		} else
 #endif
 		m->m_data += max_linkhdr;
 		m->m_len = hdrlen;
 	}
 	SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
 	m->m_pkthdr.rcvif = (struct ifnet *)0;
 #ifdef MAC
 	mac_inpcb_create_mbuf(tp->t_inpcb, m);
 #endif
 #ifdef INET6
 	if (isipv6) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		th = (struct tcphdr *)(ip6 + 1);
 		tcpip_fillheaders(tp->t_inpcb, ip6, th);
 	} else
 #endif /* INET6 */
 	{
 		ip = mtod(m, struct ip *);
 		ipov = (struct ipovly *)ip;
 		th = (struct tcphdr *)(ip + 1);
 		tcpip_fillheaders(tp->t_inpcb, ip, th);
 	}
 
 	/*
 	 * Fill in fields, remembering maximum advertised
 	 * window for use in delaying messages about window sizes.
 	 * If resending a FIN, be sure not to use a new sequence number.
 	 */
 	if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
 	    tp->snd_nxt == tp->snd_max)
 		tp->snd_nxt--;
 	/*
 	 * If we are starting a connection, send ECN setup
 	 * SYN packet. If we are on a retransmit, we may
 	 * resend those bits a number of times as per
 	 * RFC 3168.
 	 */
 	if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) {
 		if (tp->t_rxtshift >= 1) {
 			if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
 				flags |= TH_ECE|TH_CWR;
 		} else
 			flags |= TH_ECE|TH_CWR;
 	}
 	
 	if (tp->t_state == TCPS_ESTABLISHED &&
 	    (tp->t_flags & TF_ECN_PERMIT)) {
 		/*
 		 * If the peer has ECN, mark data packets with
 		 * ECN capable transmission (ECT).
 		 * Ignore pure ack packets, retransmissions and window probes.
 		 */
 		if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
 		    !((tp->t_flags & TF_FORCEDATA) && len == 1)) {
 #ifdef INET6
 			if (isipv6)
 				ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
 			else
 #endif
 				ip->ip_tos |= IPTOS_ECN_ECT0;
 			TCPSTAT_INC(tcps_ecn_ect0);
 		}
 		
 		/*
 		 * Reply with proper ECN notifications.
 		 */
 		if (tp->t_flags & TF_ECN_SND_CWR) {
 			flags |= TH_CWR;
 			tp->t_flags &= ~TF_ECN_SND_CWR;
 		} 
 		if (tp->t_flags & TF_ECN_SND_ECE)
 			flags |= TH_ECE;
 	}
 	
 	/*
 	 * If we are doing retransmissions, then snd_nxt will
 	 * not reflect the first unsent octet.  For ACK only
 	 * packets, we do not want the sequence number of the
 	 * retransmitted packet, we want the sequence number
 	 * of the next unsent octet.  So, if there is no data
 	 * (and no SYN or FIN), use snd_max instead of snd_nxt
 	 * when filling in ti_seq.  But if we are in persist
 	 * state, snd_max might reflect one byte beyond the
 	 * right edge of the window, so use snd_nxt in that
 	 * case, since we know we aren't doing a retransmission.
 	 * (retransmit and persist are mutually exclusive...)
 	 */
 	if (sack_rxmit == 0) {
 		if (len || (flags & (TH_SYN|TH_FIN)) ||
 		    tcp_timer_active(tp, TT_PERSIST))
 			th->th_seq = htonl(tp->snd_nxt);
 		else
 			th->th_seq = htonl(tp->snd_max);
 	} else {
 		th->th_seq = htonl(p->rxmit);
 		p->rxmit += len;
 		tp->sackhint.sack_bytes_rexmit += len;
 	}
 	th->th_ack = htonl(tp->rcv_nxt);
 	if (optlen) {
 		bcopy(opt, th + 1, optlen);
 		th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
 	}
 	th->th_flags = flags;
 	/*
 	 * Calculate receive window.  Don't shrink window,
 	 * but avoid silly window syndrome.
 	 */
 	if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
 	    recwin < (long)tp->t_maxseg)
 		recwin = 0;
 	if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
 	    recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
 		recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
 	if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
 		recwin = (long)TCP_MAXWIN << tp->rcv_scale;
 
 	/*
 	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
 	 * or <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK>
 	 * case is handled in syncache.
 	 */
 	if (flags & TH_SYN)
 		th->th_win = htons((u_short)
 				(min(sbspace(&so->so_rcv), TCP_MAXWIN)));
 	else
 		th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
 
 	/*
 	 * Adjust the RXWIN0SENT flag - indicate that we have advertised
 	 * a 0 window.  This may cause the remote transmitter to stall.  This
 	 * flag tells soreceive() to disable delayed acknowledgements when
 	 * draining the buffer.  This can occur if the receiver is attempting
 	 * to read more data than can be buffered prior to transmitting on
 	 * the connection.
 	 */
 	if (th->th_win == 0) {
 		tp->t_sndzerowin++;
 		tp->t_flags |= TF_RXWIN0SENT;
 	} else
 		tp->t_flags &= ~TF_RXWIN0SENT;
 	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
 		th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
 		th->th_flags |= TH_URG;
 	} else
 		/*
 		 * If no urgent pointer to send, then we pull
 		 * the urgent pointer to the left edge of the send window
 		 * so that it doesn't drift into the send window on sequence
 		 * number wraparound.
 		 */
 		tp->snd_up = tp->snd_una;		/* drag it along */
 
 #ifdef TCP_SIGNATURE
 	if (tp->t_flags & TF_SIGNATURE) {
 		int sigoff = to.to_signature - opt;
 		tcp_signature_compute(m, 0, len, optlen,
 		    (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND);
 	}
 #endif
 
 	/*
 	 * Put TCP length in extended header, and then
 	 * checksum extended header and data.
 	 */
 	m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
 	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 #ifdef INET6
 	if (isipv6) {
 		/*
 		 * ip6_plen is not need to be filled now, and will be filled
 		 * in ip6_output.
 		 */
 		m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
 		th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
 		    optlen + len, IPPROTO_TCP, 0);
 	}
 #endif
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		m->m_pkthdr.csum_flags = CSUM_TCP;
 		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 		    htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen));
 
 		/* IP version must be set here for ipv4/ipv6 checking later */
 		KASSERT(ip->ip_v == IPVERSION,
 		    ("%s: IP version incorrect: %d", __func__, ip->ip_v));
 	}
 #endif
 
 	/*
 	 * Enable TSO and specify the size of the segments.
 	 * The TCP pseudo header checksum is always provided.
 	 * XXX: Fixme: This is currently not the case for IPv6.
 	 */
 	if (tso) {
 		KASSERT(len > tp->t_maxopd - optlen,
 		    ("%s: len <= tso_segsz", __func__));
 		m->m_pkthdr.csum_flags |= CSUM_TSO;
 		m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen;
 	}
 
 #ifdef IPSEC
 	KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL),
 	    ("%s: mbuf chain shorter than expected: %ld + %u + %u - %u != %u",
 	    __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL)));
 #else
 	KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL),
 	    ("%s: mbuf chain shorter than expected: %ld + %u + %u != %u",
 	    __func__, len, hdrlen, ipoptlen, m_length(m, NULL)));
 #endif
 
 	/* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
 	hhook_run_tcp_est_out(tp, th, &to, len, tso);
 
 #ifdef TCPDEBUG
 	/*
 	 * Trace.
 	 */
 	if (so->so_options & SO_DEBUG) {
 		u_short save = 0;
 #ifdef INET6
 		if (!isipv6)
 #endif
 		{
 			save = ipov->ih_len;
 			ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */);
 		}
 		tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
 #ifdef INET6
 		if (!isipv6)
 #endif
 		ipov->ih_len = save;
 	}
 #endif /* TCPDEBUG */
 
 	/*
 	 * Fill in IP length and desired time to live and
 	 * send to IP level.  There should be a better way
 	 * to handle ttl and tos; we could keep them in
 	 * the template, but need a way to checksum without them.
 	 */
 	/*
 	 * m->m_pkthdr.len should have been set before checksum calculation,
 	 * because in6_cksum() need it.
 	 */
 #ifdef INET6
 	if (isipv6) {
 		struct route_in6 ro;
 
 		bzero(&ro, sizeof(ro));
 		/*
 		 * we separately set hoplimit for every segment, since the
 		 * user might want to change the value via setsockopt.
 		 * Also, desired default hop limit might be changed via
 		 * Neighbor Discovery.
 		 */
 		ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
 
 		/*
 		 * Set the packet size here for the benefit of DTrace probes.
 		 * ip6_output() will set it properly; it's supposed to include
 		 * the option header lengths as well.
 		 */
 		ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
 
 		if (V_path_mtu_discovery && tp->t_maxopd > V_tcp_minmss)
 			tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 		else
 			tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 
 		if (tp->t_state == TCPS_SYN_SENT)
 			TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
 
 		TCP_PROBE5(send, NULL, tp, ip6, tp, th);
 
 		/* TODO: IPv6 IP6TOS_ECT bit on */
 		error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &ro,
 		    ((so->so_options & SO_DONTROUTE) ?  IP_ROUTETOIF : 0),
 		    NULL, NULL, tp->t_inpcb);
 
 		if (error == EMSGSIZE && ro.ro_rt != NULL)
 			mtu = ro.ro_rt->rt_mtu;
 		RO_RTFREE(&ro);
 	}
 #endif /* INET6 */
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
     {
 	struct route ro;
 
 	bzero(&ro, sizeof(ro));
 	ip->ip_len = htons(m->m_pkthdr.len);
 #ifdef INET6
 	if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO)
 		ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL);
 #endif /* INET6 */
 	/*
 	 * If we do path MTU discovery, then we set DF on every packet.
 	 * This might not be the best thing to do according to RFC3390
 	 * Section 2. However the tcp hostcache migitates the problem
 	 * so it affects only the first tcp connection with a host.
 	 *
 	 * NB: Don't set DF on small MTU/MSS to have a safe fallback.
 	 */
 	if (V_path_mtu_discovery && tp->t_maxopd > V_tcp_minmss) {
 		ip->ip_off |= htons(IP_DF);
 		tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 	} else {
 		tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 	}
 
 	if (tp->t_state == TCPS_SYN_SENT)
 		TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
 
 	TCP_PROBE5(send, NULL, tp, ip, tp, th);
 
 	error = ip_output(m, tp->t_inpcb->inp_options, &ro,
 	    ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
 	    tp->t_inpcb);
 
 	if (error == EMSGSIZE && ro.ro_rt != NULL)
 		mtu = ro.ro_rt->rt_mtu;
 	RO_RTFREE(&ro);
     }
 #endif /* INET */
 
 out:
 	/*
 	 * In transmit state, time the transmission and arrange for
 	 * the retransmit.  In persist state, just set snd_max.
 	 */
 	if ((tp->t_flags & TF_FORCEDATA) == 0 || 
 	    !tcp_timer_active(tp, TT_PERSIST)) {
 		tcp_seq startseq = tp->snd_nxt;
 
 		/*
 		 * Advance snd_nxt over sequence space of this segment.
 		 */
 		if (flags & (TH_SYN|TH_FIN)) {
 			if (flags & TH_SYN)
 				tp->snd_nxt++;
 			if (flags & TH_FIN) {
 				tp->snd_nxt++;
 				tp->t_flags |= TF_SENTFIN;
 			}
 		}
 		if (sack_rxmit)
 			goto timer;
 		tp->snd_nxt += len;
 		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
 			tp->snd_max = tp->snd_nxt;
 			/*
 			 * Time this transmission if not a retransmission and
 			 * not currently timing anything.
 			 */
 			if (tp->t_rtttime == 0) {
 				tp->t_rtttime = ticks;
 				tp->t_rtseq = startseq;
 				TCPSTAT_INC(tcps_segstimed);
 			}
 		}
 
 		/*
 		 * Set retransmit timer if not currently set,
 		 * and not doing a pure ack or a keep-alive probe.
 		 * Initial value for retransmit timer is smoothed
 		 * round-trip time + 2 * round-trip time variance.
 		 * Initialize shift counter which is used for backoff
 		 * of retransmit time.
 		 */
 timer:
 		if (!tcp_timer_active(tp, TT_REXMT) &&
 		    ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
 		     (tp->snd_nxt != tp->snd_una))) {
 			if (tcp_timer_active(tp, TT_PERSIST)) {
 				tcp_timer_activate(tp, TT_PERSIST, 0);
 				tp->t_rxtshift = 0;
 			}
 			tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 		}
 	} else {
 		/*
 		 * Persist case, update snd_max but since we are in
 		 * persist mode (no window) we do not update snd_nxt.
 		 */
 		int xlen = len;
 		if (flags & TH_SYN)
 			++xlen;
 		if (flags & TH_FIN) {
 			++xlen;
 			tp->t_flags |= TF_SENTFIN;
 		}
 		if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
 			tp->snd_max = tp->snd_nxt + len;
 	}
 
 	if (error) {
 
 		/*
 		 * We know that the packet was lost, so back out the
 		 * sequence number advance, if any.
 		 *
 		 * If the error is EPERM the packet got blocked by the
 		 * local firewall.  Normally we should terminate the
 		 * connection but the blocking may have been spurious
 		 * due to a firewall reconfiguration cycle.  So we treat
 		 * it like a packet loss and let the retransmit timer and
 		 * timeouts do their work over time.
 		 * XXX: It is a POLA question whether calling tcp_drop right
 		 * away would be the really correct behavior instead.
 		 */
 		if (((tp->t_flags & TF_FORCEDATA) == 0 ||
 		    !tcp_timer_active(tp, TT_PERSIST)) &&
 		    ((flags & TH_SYN) == 0) &&
 		    (error != EPERM)) {
 			if (sack_rxmit) {
 				p->rxmit -= len;
 				tp->sackhint.sack_bytes_rexmit -= len;
 				KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
 				    ("sackhint bytes rtx >= 0"));
 			} else
 				tp->snd_nxt -= len;
 		}
 		SOCKBUF_UNLOCK_ASSERT(&so->so_snd);	/* Check gotos. */
 		switch (error) {
 		case EPERM:
 			tp->t_softerror = error;
 			return (error);
 		case ENOBUFS:
 	                if (!tcp_timer_active(tp, TT_REXMT) &&
 			    !tcp_timer_active(tp, TT_PERSIST))
 	                        tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 			tp->snd_cwnd = tp->t_maxseg;
 			return (0);
 		case EMSGSIZE:
 			/*
 			 * For some reason the interface we used initially
 			 * to send segments changed to another or lowered
 			 * its MTU.
 			 * If TSO was active we either got an interface
 			 * without TSO capabilits or TSO was turned off.
 			 * If we obtained mtu from ip_output() then update
 			 * it and try again.
 			 */
 			if (tso)
 				tp->t_flags &= ~TF_TSO;
 			if (mtu != 0) {
 				tcp_mss_update(tp, -1, mtu, NULL, NULL);
 				goto again;
 			}
 			return (error);
 		case EHOSTDOWN:
 		case EHOSTUNREACH:
 		case ENETDOWN:
 		case ENETUNREACH:
 			if (TCPS_HAVERCVDSYN(tp->t_state)) {
 				tp->t_softerror = error;
 				return (0);
 			}
 			/* FALLTHROUGH */
 		default:
 			return (error);
 		}
 	}
 	TCPSTAT_INC(tcps_sndtotal);
 
 	/*
 	 * Data sent (as far as we can tell).
 	 * If this advertises a larger window than any other segment,
 	 * then remember the size of the advertised window.
 	 * Any pending ACK has now been sent.
 	 */
 	if (recwin >= 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
 		tp->rcv_adv = tp->rcv_nxt + recwin;
 	tp->last_ack_sent = tp->rcv_nxt;
 	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
 	if (tcp_timer_active(tp, TT_DELACK))
 		tcp_timer_activate(tp, TT_DELACK, 0);
 #if 0
 	/*
 	 * This completely breaks TCP if newreno is turned on.  What happens
 	 * is that if delayed-acks are turned on on the receiver, this code
 	 * on the transmitter effectively destroys the TCP window, forcing
 	 * it to four packets (1.5Kx4 = 6K window).
 	 */
 	if (sendalot && --maxburst)
 		goto again;
 #endif
 	if (sendalot)
 		goto again;
 	return (0);
 }
 
 void
 tcp_setpersist(struct tcpcb *tp)
 {
 	int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
 	int tt;
 
 	tp->t_flags &= ~TF_PREVVALID;
 	if (tcp_timer_active(tp, TT_REXMT))
 		panic("tcp_setpersist: retransmit pending");
 	/*
 	 * Start/restart persistance timer.
 	 */
 	TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
 		      TCPTV_PERSMIN, TCPTV_PERSMAX);
 	tcp_timer_activate(tp, TT_PERSIST, tt);
 	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
 		tp->t_rxtshift++;
 }
 
 /*
  * Insert TCP options according to the supplied parameters to the place
  * optp in a consistent way.  Can handle unaligned destinations.
  *
  * The order of the option processing is crucial for optimal packing and
  * alignment for the scarce option space.
  *
  * The optimal order for a SYN/SYN-ACK segment is:
  *   MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) +
  *   Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40.
  *
  * The SACK options should be last.  SACK blocks consume 8*n+2 bytes.
  * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks).
  * At minimum we need 10 bytes (to generate 1 SACK block).  If both
  * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present,
  * we only have 10 bytes for SACK options (40 - (12 + 18)).
  */
 int
 tcp_addoptions(struct tcpopt *to, u_char *optp)
 {
 	u_int mask, optlen = 0;
 
 	for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) {
 		if ((to->to_flags & mask) != mask)
 			continue;
 		if (optlen == TCP_MAXOLEN)
 			break;
 		switch (to->to_flags & mask) {
 		case TOF_MSS:
 			while (optlen % 4) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG)
 				continue;
 			optlen += TCPOLEN_MAXSEG;
 			*optp++ = TCPOPT_MAXSEG;
 			*optp++ = TCPOLEN_MAXSEG;
 			to->to_mss = htons(to->to_mss);
 			bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss));
 			optp += sizeof(to->to_mss);
 			break;
 		case TOF_SCALE:
 			while (!optlen || optlen % 2 != 1) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW)
 				continue;
 			optlen += TCPOLEN_WINDOW;
 			*optp++ = TCPOPT_WINDOW;
 			*optp++ = TCPOLEN_WINDOW;
 			*optp++ = to->to_wscale;
 			break;
 		case TOF_SACKPERM:
 			while (optlen % 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED)
 				continue;
 			optlen += TCPOLEN_SACK_PERMITTED;
 			*optp++ = TCPOPT_SACK_PERMITTED;
 			*optp++ = TCPOLEN_SACK_PERMITTED;
 			break;
 		case TOF_TS:
 			while (!optlen || optlen % 4 != 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP)
 				continue;
 			optlen += TCPOLEN_TIMESTAMP;
 			*optp++ = TCPOPT_TIMESTAMP;
 			*optp++ = TCPOLEN_TIMESTAMP;
 			to->to_tsval = htonl(to->to_tsval);
 			to->to_tsecr = htonl(to->to_tsecr);
 			bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval));
 			optp += sizeof(to->to_tsval);
 			bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr));
 			optp += sizeof(to->to_tsecr);
 			break;
 		case TOF_SIGNATURE:
 			{
 			int siglen = TCPOLEN_SIGNATURE - 2;
 
 			while (!optlen || optlen % 4 != 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE)
 				continue;
 			optlen += TCPOLEN_SIGNATURE;
 			*optp++ = TCPOPT_SIGNATURE;
 			*optp++ = TCPOLEN_SIGNATURE;
 			to->to_signature = optp;
 			while (siglen--)
 				 *optp++ = 0;
 			break;
 			}
 		case TOF_SACK:
 			{
 			int sackblks = 0;
 			struct sackblk *sack = (struct sackblk *)to->to_sacks;
 			tcp_seq sack_seq;
 
 			while (!optlen || optlen % 4 != 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK)
 				continue;
 			optlen += TCPOLEN_SACKHDR;
 			*optp++ = TCPOPT_SACK;
 			sackblks = min(to->to_nsacks,
 					(TCP_MAXOLEN - optlen) / TCPOLEN_SACK);
 			*optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK;
 			while (sackblks--) {
 				sack_seq = htonl(sack->start);
 				bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
 				optp += sizeof(sack_seq);
 				sack_seq = htonl(sack->end);
 				bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
 				optp += sizeof(sack_seq);
 				optlen += TCPOLEN_SACK;
 				sack++;
 			}
 			TCPSTAT_INC(tcps_sack_send_blocks);
 			break;
 			}
 		default:
 			panic("%s: unknown TCP option type", __func__);
 			break;
 		}
 	}
 
 	/* Terminate and pad TCP options to a 4 byte boundary. */
 	if (optlen % 4) {
 		optlen += TCPOLEN_EOL;
 		*optp++ = TCPOPT_EOL;
 	}
 	/*
 	 * According to RFC 793 (STD0007):
 	 *   "The content of the header beyond the End-of-Option option
 	 *    must be header padding (i.e., zero)."
 	 *   and later: "The padding is composed of zeros."
 	 */
 	while (optlen % 4) {
 		optlen += TCPOLEN_PAD;
 		*optp++ = TCPOPT_PAD;
 	}
 
 	KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__));
 	return (optlen);
 }
Index: user/dchagin/lemul/sys/netinet/tcp_sack.c
===================================================================
--- user/dchagin/lemul/sys/netinet/tcp_sack.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/tcp_sack.c	(revision 274241)
@@ -1,687 +1,687 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
  *	The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_sack.c	8.12 (Berkeley) 5/24/95
  */
 
 /*-
  *	@@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
  *
  * NRL grants permission for redistribution and use in source and binary
  * forms, with or without modification, of the software and documentation
  * created at NRL provided that the following conditions are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgements:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  *	This product includes software developed at the Information
  *	Technology Division, US Naval Research Laboratory.
  * 4. Neither the name of the NRL nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * The views and conclusions contained in the software and documentation
  * are those of the authors and should not be interpreted as representing
  * official policies, either expressed or implied, of the US Naval
  * Research Laboratory (NRL).
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>		/* for proc0 declaration */
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 
 #include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/nd6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet6/tcp6_var.h>
 #include <netinet/tcpip.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif /* TCPDEBUG */
 
 #include <machine/in_cksum.h>
 
 VNET_DECLARE(struct uma_zone *, sack_hole_zone);
 #define	V_sack_hole_zone		VNET(sack_hole_zone)
 
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW, 0, "TCP SACK");
 VNET_DEFINE(int, tcp_do_sack) = 1;
 #define	V_tcp_do_sack			VNET(tcp_do_sack)
-SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_sack), 0, "Enable/Disable TCP SACK support");
 
 VNET_DEFINE(int, tcp_sack_maxholes) = 128;
 #define	V_tcp_sack_maxholes		VNET(tcp_sack_maxholes)
-SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_sack_maxholes), 0,
     "Maximum number of TCP SACK holes allowed per connection");
 
 VNET_DEFINE(int, tcp_sack_globalmaxholes) = 65536;
 #define	V_tcp_sack_globalmaxholes	VNET(tcp_sack_globalmaxholes)
-SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_sack_globalmaxholes), 0, 
     "Global maximum number of TCP SACK holes");
 
 VNET_DEFINE(int, tcp_sack_globalholes) = 0;
 #define	V_tcp_sack_globalholes		VNET(tcp_sack_globalholes)
-SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_RD,
+SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_VNET | CTLFLAG_RD,
     &VNET_NAME(tcp_sack_globalholes), 0,
     "Global number of TCP SACK holes currently allocated");
 
 /*
  * This function is called upon receipt of new valid data (while not in
  * header prediction mode), and it updates the ordered list of sacks.
  */
 void
 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
 {
 	/*
 	 * First reported block MUST be the most recent one.  Subsequent
 	 * blocks SHOULD be in the order in which they arrived at the
 	 * receiver.  These two conditions make the implementation fully
 	 * compliant with RFC 2018.
 	 */
 	struct sackblk head_blk, saved_blks[MAX_SACK_BLKS];
 	int num_head, num_saved, i;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/* Check arguments. */
 	KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end"));
 
 	/* SACK block for the received segment. */
 	head_blk.start = rcv_start;
 	head_blk.end = rcv_end;
 
 	/*
 	 * Merge updated SACK blocks into head_blk, and save unchanged SACK
 	 * blocks into saved_blks[].  num_saved will have the number of the
 	 * saved SACK blocks.
 	 */
 	num_saved = 0;
 	for (i = 0; i < tp->rcv_numsacks; i++) {
 		tcp_seq start = tp->sackblks[i].start;
 		tcp_seq end = tp->sackblks[i].end;
 		if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) {
 			/*
 			 * Discard this SACK block.
 			 */
 		} else if (SEQ_LEQ(head_blk.start, end) &&
 			   SEQ_GEQ(head_blk.end, start)) {
 			/*
 			 * Merge this SACK block into head_blk.  This SACK
 			 * block itself will be discarded.
 			 */
 			if (SEQ_GT(head_blk.start, start))
 				head_blk.start = start;
 			if (SEQ_LT(head_blk.end, end))
 				head_blk.end = end;
 		} else {
 			/*
 			 * Save this SACK block.
 			 */
 			saved_blks[num_saved].start = start;
 			saved_blks[num_saved].end = end;
 			num_saved++;
 		}
 	}
 
 	/*
 	 * Update SACK list in tp->sackblks[].
 	 */
 	num_head = 0;
 	if (SEQ_GT(head_blk.start, tp->rcv_nxt)) {
 		/*
 		 * The received data segment is an out-of-order segment.  Put
 		 * head_blk at the top of SACK list.
 		 */
 		tp->sackblks[0] = head_blk;
 		num_head = 1;
 		/*
 		 * If the number of saved SACK blocks exceeds its limit,
 		 * discard the last SACK block.
 		 */
 		if (num_saved >= MAX_SACK_BLKS)
 			num_saved--;
 	}
 	if (num_saved > 0) {
 		/*
 		 * Copy the saved SACK blocks back.
 		 */
 		bcopy(saved_blks, &tp->sackblks[num_head],
 		      sizeof(struct sackblk) * num_saved);
 	}
 
 	/* Save the number of SACK blocks. */
 	tp->rcv_numsacks = num_head + num_saved;
 }
 
 /*
  * Delete all receiver-side SACK information.
  */
 void
 tcp_clean_sackreport(struct tcpcb *tp)
 {
 	int i;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	tp->rcv_numsacks = 0;
 	for (i = 0; i < MAX_SACK_BLKS; i++)
 		tp->sackblks[i].start = tp->sackblks[i].end=0;
 }
 
 /*
  * Allocate struct sackhole.
  */
 static struct sackhole *
 tcp_sackhole_alloc(struct tcpcb *tp, tcp_seq start, tcp_seq end)
 {
 	struct sackhole *hole;
 
 	if (tp->snd_numholes >= V_tcp_sack_maxholes ||
 	    V_tcp_sack_globalholes >= V_tcp_sack_globalmaxholes) {
 		TCPSTAT_INC(tcps_sack_sboverflow);
 		return NULL;
 	}
 
 	hole = (struct sackhole *)uma_zalloc(V_sack_hole_zone, M_NOWAIT);
 	if (hole == NULL)
 		return NULL;
 
 	hole->start = start;
 	hole->end = end;
 	hole->rxmit = start;
 
 	tp->snd_numholes++;
 	atomic_add_int(&V_tcp_sack_globalholes, 1);
 
 	return hole;
 }
 
 /*
  * Free struct sackhole.
  */
 static void
 tcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole)
 {
 
 	uma_zfree(V_sack_hole_zone, hole);
 
 	tp->snd_numholes--;
 	atomic_subtract_int(&V_tcp_sack_globalholes, 1);
 
 	KASSERT(tp->snd_numholes >= 0, ("tp->snd_numholes >= 0"));
 	KASSERT(V_tcp_sack_globalholes >= 0, ("tcp_sack_globalholes >= 0"));
 }
 
 /*
  * Insert new SACK hole into scoreboard.
  */
 static struct sackhole *
 tcp_sackhole_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end,
     struct sackhole *after)
 {
 	struct sackhole *hole;
 
 	/* Allocate a new SACK hole. */
 	hole = tcp_sackhole_alloc(tp, start, end);
 	if (hole == NULL)
 		return NULL;
 
 	/* Insert the new SACK hole into scoreboard. */
 	if (after != NULL)
 		TAILQ_INSERT_AFTER(&tp->snd_holes, after, hole, scblink);
 	else
 		TAILQ_INSERT_TAIL(&tp->snd_holes, hole, scblink);
 
 	/* Update SACK hint. */
 	if (tp->sackhint.nexthole == NULL)
 		tp->sackhint.nexthole = hole;
 
 	return hole;
 }
 
 /*
  * Remove SACK hole from scoreboard.
  */
 static void
 tcp_sackhole_remove(struct tcpcb *tp, struct sackhole *hole)
 {
 
 	/* Update SACK hint. */
 	if (tp->sackhint.nexthole == hole)
 		tp->sackhint.nexthole = TAILQ_NEXT(hole, scblink);
 
 	/* Remove this SACK hole. */
 	TAILQ_REMOVE(&tp->snd_holes, hole, scblink);
 
 	/* Free this SACK hole. */
 	tcp_sackhole_free(tp, hole);
 }
 
 /*
  * Process cumulative ACK and the TCP SACK option to update the scoreboard.
  * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of
  * the sequence space).
  */
 void
 tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
 {
 	struct sackhole *cur, *temp;
 	struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp;
 	int i, j, num_sack_blks;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	num_sack_blks = 0;
 	/*
 	 * If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist,
 	 * treat [SND.UNA, SEG.ACK) as if it is a SACK block.
 	 */
 	if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) {
 		sack_blocks[num_sack_blks].start = tp->snd_una;
 		sack_blocks[num_sack_blks++].end = th_ack;
 	}
 	/*
 	 * Append received valid SACK blocks to sack_blocks[], but only if we
 	 * received new blocks from the other side.
 	 */
 	if (to->to_flags & TOF_SACK) {
 		for (i = 0; i < to->to_nsacks; i++) {
 			bcopy((to->to_sacks + i * TCPOLEN_SACK),
 			    &sack, sizeof(sack));
 			sack.start = ntohl(sack.start);
 			sack.end = ntohl(sack.end);
 			if (SEQ_GT(sack.end, sack.start) &&
 			    SEQ_GT(sack.start, tp->snd_una) &&
 			    SEQ_GT(sack.start, th_ack) &&
 			    SEQ_LT(sack.start, tp->snd_max) &&
 			    SEQ_GT(sack.end, tp->snd_una) &&
 			    SEQ_LEQ(sack.end, tp->snd_max))
 				sack_blocks[num_sack_blks++] = sack;
 		}
 	}
 	/*
 	 * Return if SND.UNA is not advanced and no valid SACK block is
 	 * received.
 	 */
 	if (num_sack_blks == 0)
 		return;
 
 	/*
 	 * Sort the SACK blocks so we can update the scoreboard with just one
 	 * pass. The overhead of sorting upto 4+1 elements is less than
 	 * making upto 4+1 passes over the scoreboard.
 	 */
 	for (i = 0; i < num_sack_blks; i++) {
 		for (j = i + 1; j < num_sack_blks; j++) {
 			if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
 				sack = sack_blocks[i];
 				sack_blocks[i] = sack_blocks[j];
 				sack_blocks[j] = sack;
 			}
 		}
 	}
 	if (TAILQ_EMPTY(&tp->snd_holes))
 		/*
 		 * Empty scoreboard. Need to initialize snd_fack (it may be
 		 * uninitialized or have a bogus value). Scoreboard holes
 		 * (from the sack blocks received) are created later below
 		 * (in the logic that adds holes to the tail of the
 		 * scoreboard).
 		 */
 		tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack);
 	/*
 	 * In the while-loop below, incoming SACK blocks (sack_blocks[]) and
 	 * SACK holes (snd_holes) are traversed from their tails with just
 	 * one pass in order to reduce the number of compares especially when
 	 * the bandwidth-delay product is large.
 	 *
 	 * Note: Typically, in the first RTT of SACK recovery, the highest
 	 * three or four SACK blocks with the same ack number are received.
 	 * In the second RTT, if retransmitted data segments are not lost,
 	 * the highest three or four SACK blocks with ack number advancing
 	 * are received.
 	 */
 	sblkp = &sack_blocks[num_sack_blks - 1];	/* Last SACK block */
 	tp->sackhint.last_sack_ack = sblkp->end;
 	if (SEQ_LT(tp->snd_fack, sblkp->start)) {
 		/*
 		 * The highest SACK block is beyond fack.  Append new SACK
 		 * hole at the tail.  If the second or later highest SACK
 		 * blocks are also beyond the current fack, they will be
 		 * inserted by way of hole splitting in the while-loop below.
 		 */
 		temp = tcp_sackhole_insert(tp, tp->snd_fack,sblkp->start,NULL);
 		if (temp != NULL) {
 			tp->snd_fack = sblkp->end;
 			/* Go to the previous sack block. */
 			sblkp--;
 		} else {
 			/* 
 			 * We failed to add a new hole based on the current 
 			 * sack block.  Skip over all the sack blocks that 
 			 * fall completely to the right of snd_fack and
 			 * proceed to trim the scoreboard based on the
 			 * remaining sack blocks.  This also trims the
 			 * scoreboard for th_ack (which is sack_blocks[0]).
 			 */
 			while (sblkp >= sack_blocks && 
 			       SEQ_LT(tp->snd_fack, sblkp->start))
 				sblkp--;
 			if (sblkp >= sack_blocks && 
 			    SEQ_LT(tp->snd_fack, sblkp->end))
 				tp->snd_fack = sblkp->end;
 		}
 	} else if (SEQ_LT(tp->snd_fack, sblkp->end))
 		/* fack is advanced. */
 		tp->snd_fack = sblkp->end;
 	/* We must have at least one SACK hole in scoreboard. */
 	KASSERT(!TAILQ_EMPTY(&tp->snd_holes),
 	    ("SACK scoreboard must not be empty"));
 	cur = TAILQ_LAST(&tp->snd_holes, sackhole_head); /* Last SACK hole. */
 	/*
 	 * Since the incoming sack blocks are sorted, we can process them
 	 * making one sweep of the scoreboard.
 	 */
 	while (sblkp >= sack_blocks  && cur != NULL) {
 		if (SEQ_GEQ(sblkp->start, cur->end)) {
 			/*
 			 * SACKs data beyond the current hole.  Go to the
 			 * previous sack block.
 			 */
 			sblkp--;
 			continue;
 		}
 		if (SEQ_LEQ(sblkp->end, cur->start)) {
 			/*
 			 * SACKs data before the current hole.  Go to the
 			 * previous hole.
 			 */
 			cur = TAILQ_PREV(cur, sackhole_head, scblink);
 			continue;
 		}
 		tp->sackhint.sack_bytes_rexmit -= (cur->rxmit - cur->start);
 		KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
 		    ("sackhint bytes rtx >= 0"));
 		if (SEQ_LEQ(sblkp->start, cur->start)) {
 			/* Data acks at least the beginning of hole. */
 			if (SEQ_GEQ(sblkp->end, cur->end)) {
 				/* Acks entire hole, so delete hole. */
 				temp = cur;
 				cur = TAILQ_PREV(cur, sackhole_head, scblink);
 				tcp_sackhole_remove(tp, temp);
 				/*
 				 * The sack block may ack all or part of the
 				 * next hole too, so continue onto the next
 				 * hole.
 				 */
 				continue;
 			} else {
 				/* Move start of hole forward. */
 				cur->start = sblkp->end;
 				cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
 			}
 		} else {
 			/* Data acks at least the end of hole. */
 			if (SEQ_GEQ(sblkp->end, cur->end)) {
 				/* Move end of hole backward. */
 				cur->end = sblkp->start;
 				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
 			} else {
 				/*
 				 * ACKs some data in middle of a hole; need
 				 * to split current hole
 				 */
 				temp = tcp_sackhole_insert(tp, sblkp->end,
 				    cur->end, cur);
 				if (temp != NULL) {
 					if (SEQ_GT(cur->rxmit, temp->rxmit)) {
 						temp->rxmit = cur->rxmit;
 						tp->sackhint.sack_bytes_rexmit
 						    += (temp->rxmit
 						    - temp->start);
 					}
 					cur->end = sblkp->start;
 					cur->rxmit = SEQ_MIN(cur->rxmit,
 					    cur->end);
 				}
 			}
 		}
 		tp->sackhint.sack_bytes_rexmit += (cur->rxmit - cur->start);
 		/*
 		 * Testing sblkp->start against cur->start tells us whether
 		 * we're done with the sack block or the sack hole.
 		 * Accordingly, we advance one or the other.
 		 */
 		if (SEQ_LEQ(sblkp->start, cur->start))
 			cur = TAILQ_PREV(cur, sackhole_head, scblink);
 		else
 			sblkp--;
 	}
 }
 
 /*
  * Free all SACK holes to clear the scoreboard.
  */
 void
 tcp_free_sackholes(struct tcpcb *tp)
 {
 	struct sackhole *q;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	while ((q = TAILQ_FIRST(&tp->snd_holes)) != NULL)
 		tcp_sackhole_remove(tp, q);
 	tp->sackhint.sack_bytes_rexmit = 0;
 
 	KASSERT(tp->snd_numholes == 0, ("tp->snd_numholes == 0"));
 	KASSERT(tp->sackhint.nexthole == NULL,
 		("tp->sackhint.nexthole == NULL"));
 }
 
 /*
  * Partial ack handling within a sack recovery episode.  Keeping this very
  * simple for now.  When a partial ack is received, force snd_cwnd to a value
  * that will allow the sender to transmit no more than 2 segments.  If
  * necessary, a better scheme can be adopted at a later point, but for now,
  * the goal is to prevent the sender from bursting a large amount of data in
  * the midst of sack recovery.
  */
 void
 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
 {
 	int num_segs = 1;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	tcp_timer_activate(tp, TT_REXMT, 0);
 	tp->t_rtttime = 0;
 	/* Send one or 2 segments based on how much new data was acked. */
 	if ((BYTES_THIS_ACK(tp, th) / tp->t_maxseg) >= 2)
 		num_segs = 2;
 	tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit +
 	    (tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_maxseg);
 	if (tp->snd_cwnd > tp->snd_ssthresh)
 		tp->snd_cwnd = tp->snd_ssthresh;
 	tp->t_flags |= TF_ACKNOW;
 	(void) tcp_output(tp);
 }
 
 #if 0
 /*
  * Debug version of tcp_sack_output() that walks the scoreboard.  Used for
  * now to sanity check the hint.
  */
 static struct sackhole *
 tcp_sack_output_debug(struct tcpcb *tp, int *sack_bytes_rexmt)
 {
 	struct sackhole *p;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	*sack_bytes_rexmt = 0;
 	TAILQ_FOREACH(p, &tp->snd_holes, scblink) {
 		if (SEQ_LT(p->rxmit, p->end)) {
 			if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */
 				continue;
 			}
 			*sack_bytes_rexmt += (p->rxmit - p->start);
 			break;
 		}
 		*sack_bytes_rexmt += (p->rxmit - p->start);
 	}
 	return (p);
 }
 #endif
 
 /*
  * Returns the next hole to retransmit and the number of retransmitted bytes
  * from the scoreboard.  We store both the next hole and the number of
  * retransmitted bytes as hints (and recompute these on the fly upon SACK/ACK
  * reception).  This avoids scoreboard traversals completely.
  *
  * The loop here will traverse *at most* one link.  Here's the argument.  For
  * the loop to traverse more than 1 link before finding the next hole to
  * retransmit, we would need to have at least 1 node following the current
  * hint with (rxmit == end).  But, for all holes following the current hint,
  * (start == rxmit), since we have not yet retransmitted from them.
  * Therefore, in order to traverse more 1 link in the loop below, we need to
  * have at least one node following the current hint with (start == rxmit ==
  * end).  But that can't happen, (start == end) means that all the data in
  * that hole has been sacked, in which case, the hole would have been removed
  * from the scoreboard.
  */
 struct sackhole *
 tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
 {
 	struct sackhole *hole = NULL;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	*sack_bytes_rexmt = tp->sackhint.sack_bytes_rexmit;
 	hole = tp->sackhint.nexthole;
 	if (hole == NULL || SEQ_LT(hole->rxmit, hole->end))
 		goto out;
 	while ((hole = TAILQ_NEXT(hole, scblink)) != NULL) {
 		if (SEQ_LT(hole->rxmit, hole->end)) {
 			tp->sackhint.nexthole = hole;
 			break;
 		}
 	}
 out:
 	return (hole);
 }
 
 /*
  * After a timeout, the SACK list may be rebuilt.  This SACK information
  * should be used to avoid retransmitting SACKed data.  This function
  * traverses the SACK list to see if snd_nxt should be moved forward.
  */
 void
 tcp_sack_adjust(struct tcpcb *tp)
 {
 	struct sackhole *p, *cur = TAILQ_FIRST(&tp->snd_holes);
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	if (cur == NULL)
 		return; /* No holes */
 	if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack))
 		return; /* We're already beyond any SACKed blocks */
 	/*-
 	 * Two cases for which we want to advance snd_nxt:
 	 * i) snd_nxt lies between end of one hole and beginning of another
 	 * ii) snd_nxt lies between end of last hole and snd_fack
 	 */
 	while ((p = TAILQ_NEXT(cur, scblink)) != NULL) {
 		if (SEQ_LT(tp->snd_nxt, cur->end))
 			return;
 		if (SEQ_GEQ(tp->snd_nxt, p->start))
 			cur = p;
 		else {
 			tp->snd_nxt = p->start;
 			return;
 		}
 	}
 	if (SEQ_LT(tp->snd_nxt, cur->end))
 		return;
 	tp->snd_nxt = tp->snd_fack;
 }
Index: user/dchagin/lemul/sys/netinet/tcp_subr.c
===================================================================
--- user/dchagin/lemul/sys/netinet/tcp_subr.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/tcp_subr.c	(revision 274241)
@@ -1,2453 +1,2453 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/callout.h>
 #include <sys/hhook.h>
 #include <sys/kernel.h>
 #include <sys/khelp.h>
 #include <sys/sysctl.h>
 #include <sys/jail.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #ifdef INET6
 #include <sys/domain.h>
 #endif
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/random.h>
 
 #include <vm/uma.h>
 
 #include <net/route.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/vnet.h>
 
 #include <netinet/cc.h>
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #endif
 
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_syncache.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 #include <netinet/tcpip.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
 #ifdef INET6
 #include <netinet6/ip6protosw.h>
 #endif
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/xform.h>
 #ifdef INET6
 #include <netipsec/ipsec6.h>
 #endif
 #include <netipsec/key.h>
 #include <sys/syslog.h>
 #endif /*IPSEC*/
 
 #include <machine/in_cksum.h>
 #include <sys/md5.h>
 
 #include <security/mac/mac_framework.h>
 
 VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS;
 #ifdef INET6
 VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS;
 #endif
 
 static int
 sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS)
 {
 	int error, new;
 
 	new = V_tcp_mssdflt;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr) {
 		if (new < TCP_MINMSS)
 			error = EINVAL;
 		else
 			V_tcp_mssdflt = new;
 	}
 	return (error);
 }
 
-SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt,
-    CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0,
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt,
+    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0,
     &sysctl_net_inet_tcp_mss_check, "I",
     "Default TCP Maximum Segment Size");
 
 #ifdef INET6
 static int
 sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS)
 {
 	int error, new;
 
 	new = V_tcp_v6mssdflt;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr) {
 		if (new < TCP_MINMSS)
 			error = EINVAL;
 		else
 			V_tcp_v6mssdflt = new;
 	}
 	return (error);
 }
 
-SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
-    CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0,
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
+    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0,
     &sysctl_net_inet_tcp_mss_v6_check, "I",
    "Default TCP Maximum Segment Size for IPv6");
 #endif /* INET6 */
 
 /*
  * Minimum MSS we accept and use. This prevents DoS attacks where
  * we are forced to a ridiculous low MSS like 20 and send hundreds
  * of packets instead of one. The effect scales with the available
  * bandwidth and quickly saturates the CPU and network interface
  * with packet generation and sending. Set to zero to disable MINMSS
  * checking. This setting prevents us from sending too small packets.
  */
 VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS;
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW,
      &VNET_NAME(tcp_minmss), 0,
     "Minimum TCP Maximum Segment Size");
 
 VNET_DEFINE(int, tcp_do_rfc1323) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc1323), 0,
     "Enable rfc1323 (high performance TCP) extensions");
 
 static int	tcp_log_debug = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW,
     &tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
 
 static int	tcp_tcbhashsize;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
 
 static int	do_tcpdrain = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
     "Enable tcp_drain routine for extra help when low on mbufs");
 
-SYSCTL_VNET_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
+SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD,
     &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs");
 
 static VNET_DEFINE(int, icmp_may_rst) = 1;
 #define	V_icmp_may_rst			VNET(icmp_may_rst)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(icmp_may_rst), 0,
     "Certain ICMP unreachable messages may abort connections in SYN_SENT");
 
 static VNET_DEFINE(int, tcp_isn_reseed_interval) = 0;
 #define	V_tcp_isn_reseed_interval	VNET(tcp_isn_reseed_interval)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_isn_reseed_interval), 0,
     "Seconds between reseeding of ISN secret");
 
 static int	tcp_soreceive_stream;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN,
     &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets");
 
 #ifdef TCP_SIGNATURE
 static int	tcp_sig_checksigs = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, signature_verify_input, CTLFLAG_RW,
     &tcp_sig_checksigs, 0, "Verify RFC2385 digests on inbound traffic");
 #endif
 
 VNET_DEFINE(uma_zone_t, sack_hole_zone);
 #define	V_sack_hole_zone		VNET(sack_hole_zone)
 
 VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]);
 
 static struct inpcb *tcp_notify(struct inpcb *, int);
 static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int);
 static char *	tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
 		    void *ip4hdr, const void *ip6hdr);
 
 /*
  * Target size of TCP PCB hash tables. Must be a power of two.
  *
  * Note that this can be overridden by the kernel environment
  * variable net.inet.tcp.tcbhashsize
  */
 #ifndef TCBHASHSIZE
 #define TCBHASHSIZE	0
 #endif
 
 /*
  * XXX
  * Callouts should be moved into struct tcp directly.  They are currently
  * separate because the tcpcb structure is exported to userland for sysctl
  * parsing purposes, which do not know about callouts.
  */
 struct tcpcb_mem {
 	struct	tcpcb		tcb;
 	struct	tcp_timer	tt;
 	struct	cc_var		ccv;
 	struct	osd		osd;
 };
 
 static VNET_DEFINE(uma_zone_t, tcpcb_zone);
 #define	V_tcpcb_zone			VNET(tcpcb_zone)
 
 MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
 static struct mtx isn_mtx;
 
 #define	ISN_LOCK_INIT()	mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF)
 #define	ISN_LOCK()	mtx_lock(&isn_mtx)
 #define	ISN_UNLOCK()	mtx_unlock(&isn_mtx)
 
 /*
  * TCP initialization.
  */
 static void
 tcp_zone_change(void *tag)
 {
 
 	uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets);
 	uma_zone_set_max(V_tcpcb_zone, maxsockets);
 	tcp_tw_zone_change();
 }
 
 static int
 tcp_inpcb_init(void *mem, int size, int flags)
 {
 	struct inpcb *inp = mem;
 
 	INP_LOCK_INIT(inp, "inp", "tcpinp");
 	return (0);
 }
 
 /*
  * Take a value and get the next power of 2 that doesn't overflow.
  * Used to size the tcp_inpcb hash buckets.
  */
 static int
 maketcp_hashsize(int size)
 {
 	int hashsize;
 
 	/*
 	 * auto tune.
 	 * get the next power of 2 higher than maxsockets.
 	 */
 	hashsize = 1 << fls(size);
 	/* catch overflow, and just go one power of 2 smaller */
 	if (hashsize < size) {
 		hashsize = 1 << (fls(size) - 1);
 	}
 	return (hashsize);
 }
 
 void
 tcp_init(void)
 {
 	const char *tcbhash_tuneable;
 	int hashsize;
 
 	tcbhash_tuneable = "net.inet.tcp.tcbhashsize";
 
 	if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN,
 	    &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register helper hook\n", __func__);
 	if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT,
 	    &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register helper hook\n", __func__);
 
 	hashsize = TCBHASHSIZE;
 	TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize);
 	if (hashsize == 0) {
 		/*
 		 * Auto tune the hash size based on maxsockets.
 		 * A perfect hash would have a 1:1 mapping
 		 * (hashsize = maxsockets) however it's been
 		 * suggested that O(2) average is better.
 		 */
 		hashsize = maketcp_hashsize(maxsockets / 4);
 		/*
 		 * Our historical default is 512,
 		 * do not autotune lower than this.
 		 */
 		if (hashsize < 512)
 			hashsize = 512;
 		if (bootverbose)
 			printf("%s: %s auto tuned to %d\n", __func__,
 			    tcbhash_tuneable, hashsize);
 	}
 	/*
 	 * We require a hashsize to be a power of two.
 	 * Previously if it was not a power of two we would just reset it
 	 * back to 512, which could be a nasty surprise if you did not notice
 	 * the error message.
 	 * Instead what we do is clip it to the closest power of two lower
 	 * than the specified hash value.
 	 */
 	if (!powerof2(hashsize)) {
 		int oldhashsize = hashsize;
 
 		hashsize = maketcp_hashsize(hashsize);
 		/* prevent absurdly low value */
 		if (hashsize < 16)
 			hashsize = 16;
 		printf("%s: WARNING: TCB hash size not a power of 2, "
 		    "clipped from %d to %d.\n", __func__, oldhashsize,
 		    hashsize);
 	}
 	in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize,
 	    "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE,
 	    IPI_HASHFIELDS_4TUPLE);
 
 	/*
 	 * These have to be type stable for the benefit of the timers.
 	 */
 	V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_zone_set_max(V_tcpcb_zone, maxsockets);
 	uma_zone_set_warning(V_tcpcb_zone, "kern.ipc.maxsockets limit reached");
 
 	tcp_tw_init();
 	syncache_init();
 	tcp_hc_init();
 
 	TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack);
 	V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 
 	/* Skip initialization of globals for non-default instances. */
 	if (!IS_DEFAULT_VNET(curvnet))
 		return;
 
 	/* XXX virtualize those bellow? */
 	tcp_delacktime = TCPTV_DELACK;
 	tcp_keepinit = TCPTV_KEEP_INIT;
 	tcp_keepidle = TCPTV_KEEP_IDLE;
 	tcp_keepintvl = TCPTV_KEEPINTVL;
 	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
 	tcp_msl = TCPTV_MSL;
 	tcp_rexmit_min = TCPTV_MIN;
 	if (tcp_rexmit_min < 1)
 		tcp_rexmit_min = 1;
 	tcp_rexmit_slop = TCPTV_CPU_VAR;
 	tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;
 	tcp_tcbhashsize = hashsize;
 
 	if (tcp_soreceive_stream) {
 #ifdef INET
 		tcp_usrreqs.pru_soreceive = soreceive_stream;
 #endif
 #ifdef INET6
 		tcp6_usrreqs.pru_soreceive = soreceive_stream;
 #endif /* INET6 */
 	}
 
 #ifdef INET6
 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
 #else /* INET6 */
 #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
 #endif /* INET6 */
 	if (max_protohdr < TCP_MINPROTOHDR)
 		max_protohdr = TCP_MINPROTOHDR;
 	if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
 		panic("tcp_init");
 #undef TCP_MINPROTOHDR
 
 	ISN_LOCK_INIT();
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
 		SHUTDOWN_PRI_DEFAULT);
 	EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL,
 		EVENTHANDLER_PRI_ANY);
 }
 
 #ifdef VIMAGE
 void
 tcp_destroy(void)
 {
 	int error;
 
 	tcp_hc_destroy();
 	syncache_destroy();
 	tcp_tw_destroy();
 	in_pcbinfo_destroy(&V_tcbinfo);
 	uma_zdestroy(V_sack_hole_zone);
 	uma_zdestroy(V_tcpcb_zone);
 
 	error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister helper hook "
 		    "type=%d, id=%d: error %d returned\n", __func__,
 		    HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error);
 	}
 	error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister helper hook "
 		    "type=%d, id=%d: error %d returned\n", __func__,
 		    HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error);
 	}
 }
 #endif
 
 void
 tcp_fini(void *xtp)
 {
 
 }
 
 /*
  * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
  * tcp_template used to store this data in mbufs, but we now recopy it out
  * of the tcpcb each time to conserve mbufs.
  */
 void
 tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
 {
 	struct tcphdr *th = (struct tcphdr *)tcp_ptr;
 
 	INP_WLOCK_ASSERT(inp);
 
 #ifdef INET6
 	if ((inp->inp_vflag & INP_IPV6) != 0) {
 		struct ip6_hdr *ip6;
 
 		ip6 = (struct ip6_hdr *)ip_ptr;
 		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
 			(inp->inp_flow & IPV6_FLOWINFO_MASK);
 		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
 			(IPV6_VERSION & IPV6_VERSION_MASK);
 		ip6->ip6_nxt = IPPROTO_TCP;
 		ip6->ip6_plen = htons(sizeof(struct tcphdr));
 		ip6->ip6_src = inp->in6p_laddr;
 		ip6->ip6_dst = inp->in6p_faddr;
 	}
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		struct ip *ip;
 
 		ip = (struct ip *)ip_ptr;
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = 5;
 		ip->ip_tos = inp->inp_ip_tos;
 		ip->ip_len = 0;
 		ip->ip_id = 0;
 		ip->ip_off = 0;
 		ip->ip_ttl = inp->inp_ip_ttl;
 		ip->ip_sum = 0;
 		ip->ip_p = IPPROTO_TCP;
 		ip->ip_src = inp->inp_laddr;
 		ip->ip_dst = inp->inp_faddr;
 	}
 #endif /* INET */
 	th->th_sport = inp->inp_lport;
 	th->th_dport = inp->inp_fport;
 	th->th_seq = 0;
 	th->th_ack = 0;
 	th->th_x2 = 0;
 	th->th_off = 5;
 	th->th_flags = 0;
 	th->th_win = 0;
 	th->th_urp = 0;
 	th->th_sum = 0;		/* in_pseudo() is called later for ipv4 */
 }
 
 /*
  * Create template to be used to send tcp packets on a connection.
  * Allocates an mbuf and fills in a skeletal tcp/ip header.  The only
  * use for this function is in keepalives, which use tcp_respond.
  */
 struct tcptemp *
 tcpip_maketemplate(struct inpcb *inp)
 {
 	struct tcptemp *t;
 
 	t = malloc(sizeof(*t), M_TEMP, M_NOWAIT);
 	if (t == NULL)
 		return (NULL);
 	tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t);
 	return (t);
 }
 
 /*
  * Send a single message to the TCP at address specified by
  * the given TCP/IP header.  If m == NULL, then we make a copy
  * of the tcpiphdr at th and send directly to the addressed host.
  * This is used to force keep alive messages out using the TCP
  * template for a connection.  If flags are given then we send
  * a message back to the TCP which originated the segment th,
  * and discard the mbuf containing it and any other attached mbufs.
  *
  * In any case the ack and sequence number of the transmitted
  * segment are as specified by the parameters.
  *
  * NOTE: If m != NULL, then th must point to *inside* the mbuf.
  */
 void
 tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
     tcp_seq ack, tcp_seq seq, int flags)
 {
 	int tlen;
 	int win = 0;
 	struct ip *ip;
 	struct tcphdr *nth;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 	int isipv6;
 #endif /* INET6 */
 	int ipflags = 0;
 	struct inpcb *inp;
 
 	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
 
 #ifdef INET6
 	isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4);
 	ip6 = ipgen;
 #endif /* INET6 */
 	ip = ipgen;
 
 	if (tp != NULL) {
 		inp = tp->t_inpcb;
 		KASSERT(inp != NULL, ("tcp control block w/o inpcb"));
 		INP_WLOCK_ASSERT(inp);
 	} else
 		inp = NULL;
 
 	if (tp != NULL) {
 		if (!(flags & TH_RST)) {
 			win = sbspace(&inp->inp_socket->so_rcv);
 			if (win > (long)TCP_MAXWIN << tp->rcv_scale)
 				win = (long)TCP_MAXWIN << tp->rcv_scale;
 		}
 	}
 	if (m == NULL) {
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL)
 			return;
 		tlen = 0;
 		m->m_data += max_linkhdr;
 #ifdef INET6
 		if (isipv6) {
 			bcopy((caddr_t)ip6, mtod(m, caddr_t),
 			      sizeof(struct ip6_hdr));
 			ip6 = mtod(m, struct ip6_hdr *);
 			nth = (struct tcphdr *)(ip6 + 1);
 		} else
 #endif /* INET6 */
 		{
 			bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
 			ip = mtod(m, struct ip *);
 			nth = (struct tcphdr *)(ip + 1);
 		}
 		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
 		flags = TH_ACK;
 	} else {
 		/*
 		 *  reuse the mbuf. 
 		 * XXX MRT We inherrit the FIB, which is lucky.
 		 */
 		m_freem(m->m_next);
 		m->m_next = NULL;
 		m->m_data = (caddr_t)ipgen;
 		/* m_len is set later */
 		tlen = 0;
 #define xchg(a,b,type) { type t; t=a; a=b; b=t; }
 #ifdef INET6
 		if (isipv6) {
 			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
 			nth = (struct tcphdr *)(ip6 + 1);
 		} else
 #endif /* INET6 */
 		{
 			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t);
 			nth = (struct tcphdr *)(ip + 1);
 		}
 		if (th != nth) {
 			/*
 			 * this is usually a case when an extension header
 			 * exists between the IPv6 header and the
 			 * TCP header.
 			 */
 			nth->th_sport = th->th_sport;
 			nth->th_dport = th->th_dport;
 		}
 		xchg(nth->th_dport, nth->th_sport, uint16_t);
 #undef xchg
 	}
 #ifdef INET6
 	if (isipv6) {
 		ip6->ip6_flow = 0;
 		ip6->ip6_vfc = IPV6_VERSION;
 		ip6->ip6_nxt = IPPROTO_TCP;
 		tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
 		ip6->ip6_plen = htons(tlen - sizeof(*ip6));
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		tlen += sizeof (struct tcpiphdr);
 		ip->ip_len = htons(tlen);
 		ip->ip_ttl = V_ip_defttl;
 		if (V_path_mtu_discovery)
 			ip->ip_off |= htons(IP_DF);
 	}
 #endif
 	m->m_len = tlen;
 	m->m_pkthdr.len = tlen;
 	m->m_pkthdr.rcvif = NULL;
 #ifdef MAC
 	if (inp != NULL) {
 		/*
 		 * Packet is associated with a socket, so allow the
 		 * label of the response to reflect the socket label.
 		 */
 		INP_WLOCK_ASSERT(inp);
 		mac_inpcb_create_mbuf(inp, m);
 	} else {
 		/*
 		 * Packet is not associated with a socket, so possibly
 		 * update the label in place.
 		 */
 		mac_netinet_tcp_reply(m);
 	}
 #endif
 	nth->th_seq = htonl(seq);
 	nth->th_ack = htonl(ack);
 	nth->th_x2 = 0;
 	nth->th_off = sizeof (struct tcphdr) >> 2;
 	nth->th_flags = flags;
 	if (tp != NULL)
 		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
 	else
 		nth->th_win = htons((u_short)win);
 	nth->th_urp = 0;
 
 	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 #ifdef INET6
 	if (isipv6) {
 		m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
 		nth->th_sum = in6_cksum_pseudo(ip6,
 		    tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0);
 		ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb :
 		    NULL, NULL);
 	}
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		m->m_pkthdr.csum_flags = CSUM_TCP;
 		nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 		    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
 	}
 #endif /* INET */
 #ifdef TCPDEBUG
 	if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
 #endif
 	if (flags & TH_RST)
 		TCP_PROBE5(accept__refused, NULL, NULL, mtod(m, const char *),
 		    tp, nth);
 
 	TCP_PROBE5(send, NULL, tp, mtod(m, const char *), tp, nth);
 #ifdef INET6
 	if (isipv6)
 		(void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp);
 #endif /* INET6 */
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 		(void) ip_output(m, NULL, NULL, ipflags, NULL, inp);
 #endif
 }
 
 /*
  * Create a new TCP control block, making an
  * empty reassembly queue and hooking it to the argument
  * protocol control block.  The `inp' parameter must have
  * come from the zone allocator set up in tcp_init().
  */
 struct tcpcb *
 tcp_newtcpcb(struct inpcb *inp)
 {
 	struct tcpcb_mem *tm;
 	struct tcpcb *tp;
 #ifdef INET6
 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 #endif /* INET6 */
 
 	tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO);
 	if (tm == NULL)
 		return (NULL);
 	tp = &tm->tcb;
 
 	/* Initialise cc_var struct for this tcpcb. */
 	tp->ccv = &tm->ccv;
 	tp->ccv->type = IPPROTO_TCP;
 	tp->ccv->ccvc.tcp = tp;
 
 	/*
 	 * Use the current system default CC algorithm.
 	 */
 	CC_LIST_RLOCK();
 	KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!"));
 	CC_ALGO(tp) = CC_DEFAULT();
 	CC_LIST_RUNLOCK();
 
 	if (CC_ALGO(tp)->cb_init != NULL)
 		if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) {
 			uma_zfree(V_tcpcb_zone, tm);
 			return (NULL);
 		}
 
 	tp->osd = &tm->osd;
 	if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) {
 		uma_zfree(V_tcpcb_zone, tm);
 		return (NULL);
 	}
 
 #ifdef VIMAGE
 	tp->t_vnet = inp->inp_vnet;
 #endif
 	tp->t_timers = &tm->tt;
 	/*	LIST_INIT(&tp->t_segq); */	/* XXX covered by M_ZERO */
 	tp->t_maxseg = tp->t_maxopd =
 #ifdef INET6
 		isipv6 ? V_tcp_v6mssdflt :
 #endif /* INET6 */
 		V_tcp_mssdflt;
 
 	/* Set up our timeouts. */
 	callout_init(&tp->t_timers->tt_rexmt, CALLOUT_MPSAFE);
 	callout_init(&tp->t_timers->tt_persist, CALLOUT_MPSAFE);
 	callout_init(&tp->t_timers->tt_keep, CALLOUT_MPSAFE);
 	callout_init(&tp->t_timers->tt_2msl, CALLOUT_MPSAFE);
 	callout_init(&tp->t_timers->tt_delack, CALLOUT_MPSAFE);
 
 	if (V_tcp_do_rfc1323)
 		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
 	if (V_tcp_do_sack)
 		tp->t_flags |= TF_SACK_PERMIT;
 	TAILQ_INIT(&tp->snd_holes);
 	tp->t_inpcb = inp;	/* XXX */
 	/*
 	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
 	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
 	 * reasonable initial retransmit time.
 	 */
 	tp->t_srtt = TCPTV_SRTTBASE;
 	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
 	tp->t_rttmin = tcp_rexmit_min;
 	tp->t_rxtcur = TCPTV_RTOBASE;
 	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->t_rcvtime = ticks;
 	/*
 	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
 	 * because the socket may be bound to an IPv6 wildcard address,
 	 * which may match an IPv4-mapped IPv6 address.
 	 */
 	inp->inp_ip_ttl = V_ip_defttl;
 	inp->inp_ppcb = tp;
 	return (tp);		/* XXX */
 }
 
 /*
  * Switch the congestion control algorithm back to NewReno for any active
  * control blocks using an algorithm which is about to go away.
  * This ensures the CC framework can allow the unload to proceed without leaving
  * any dangling pointers which would trigger a panic.
  * Returning non-zero would inform the CC framework that something went wrong
  * and it would be unsafe to allow the unload to proceed. However, there is no
  * way for this to occur with this implementation so we always return zero.
  */
 int
 tcp_ccalgounload(struct cc_algo *unload_algo)
 {
 	struct cc_algo *tmpalgo;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	/*
 	 * Check all active control blocks across all network stacks and change
 	 * any that are using "unload_algo" back to NewReno. If "unload_algo"
 	 * requires cleanup code to be run, call it.
 	 */
 	VNET_LIST_RLOCK();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		INP_INFO_RLOCK(&V_tcbinfo);
 		/*
 		 * New connections already part way through being initialised
 		 * with the CC algo we're removing will not race with this code
 		 * because the INP_INFO_WLOCK is held during initialisation. We
 		 * therefore don't enter the loop below until the connection
 		 * list has stabilised.
 		 */
 		LIST_FOREACH(inp, &V_tcb, inp_list) {
 			INP_WLOCK(inp);
 			/* Important to skip tcptw structs. */
 			if (!(inp->inp_flags & INP_TIMEWAIT) &&
 			    (tp = intotcpcb(inp)) != NULL) {
 				/*
 				 * By holding INP_WLOCK here, we are assured
 				 * that the connection is not currently
 				 * executing inside the CC module's functions
 				 * i.e. it is safe to make the switch back to
 				 * NewReno.
 				 */
 				if (CC_ALGO(tp) == unload_algo) {
 					tmpalgo = CC_ALGO(tp);
 					/* NewReno does not require any init. */
 					CC_ALGO(tp) = &newreno_cc_algo;
 					if (tmpalgo->cb_destroy != NULL)
 						tmpalgo->cb_destroy(tp->ccv);
 				}
 			}
 			INP_WUNLOCK(inp);
 		}
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK();
 
 	return (0);
 }
 
 /*
  * Drop a TCP connection, reporting
  * the specified error.  If connection is synchronized,
  * then send a RST to peer.
  */
 struct tcpcb *
 tcp_drop(struct tcpcb *tp, int errno)
 {
 	struct socket *so = tp->t_inpcb->inp_socket;
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	if (TCPS_HAVERCVDSYN(tp->t_state)) {
 		tcp_state_change(tp, TCPS_CLOSED);
 		(void) tcp_output(tp);
 		TCPSTAT_INC(tcps_drops);
 	} else
 		TCPSTAT_INC(tcps_conndrops);
 	if (errno == ETIMEDOUT && tp->t_softerror)
 		errno = tp->t_softerror;
 	so->so_error = errno;
 	return (tcp_close(tp));
 }
 
 void
 tcp_discardcb(struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp->inp_socket;
 #ifdef INET6
 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 #endif /* INET6 */
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Make sure that all of our timers are stopped before we delete the
 	 * PCB.
 	 *
 	 * XXXRW: Really, we would like to use callout_drain() here in order
 	 * to avoid races experienced in tcp_timer.c where a timer is already
 	 * executing at this point.  However, we can't, both because we're
 	 * running in a context where we can't sleep, and also because we
 	 * hold locks required by the timers.  What we instead need to do is
 	 * test to see if callout_drain() is required, and if so, defer some
 	 * portion of the remainder of tcp_discardcb() to an asynchronous
 	 * context that can callout_drain() and then continue.  Some care
 	 * will be required to ensure that no further processing takes place
 	 * on the tcpcb, even though it hasn't been freed (a flag?).
 	 */
 	callout_stop(&tp->t_timers->tt_rexmt);
 	callout_stop(&tp->t_timers->tt_persist);
 	callout_stop(&tp->t_timers->tt_keep);
 	callout_stop(&tp->t_timers->tt_2msl);
 	callout_stop(&tp->t_timers->tt_delack);
 
 	/*
 	 * If we got enough samples through the srtt filter,
 	 * save the rtt and rttvar in the routing entry.
 	 * 'Enough' is arbitrarily defined as 4 rtt samples.
 	 * 4 samples is enough for the srtt filter to converge
 	 * to within enough % of the correct value; fewer samples
 	 * and we could save a bogus rtt. The danger is not high
 	 * as tcp quickly recovers from everything.
 	 * XXX: Works very well but needs some more statistics!
 	 */
 	if (tp->t_rttupdated >= 4) {
 		struct hc_metrics_lite metrics;
 		u_long ssthresh;
 
 		bzero(&metrics, sizeof(metrics));
 		/*
 		 * Update the ssthresh always when the conditions below
 		 * are satisfied. This gives us better new start value
 		 * for the congestion avoidance for new connections.
 		 * ssthresh is only set if packet loss occured on a session.
 		 *
 		 * XXXRW: 'so' may be NULL here, and/or socket buffer may be
 		 * being torn down.  Ideally this code would not use 'so'.
 		 */
 		ssthresh = tp->snd_ssthresh;
 		if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
 			/*
 			 * convert the limit from user data bytes to
 			 * packets then to packet data bytes.
 			 */
 			ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
 			if (ssthresh < 2)
 				ssthresh = 2;
 			ssthresh *= (u_long)(tp->t_maxseg +
 #ifdef INET6
 			    (isipv6 ? sizeof (struct ip6_hdr) +
 				sizeof (struct tcphdr) :
 #endif
 				sizeof (struct tcpiphdr)
 #ifdef INET6
 			    )
 #endif
 			    );
 		} else
 			ssthresh = 0;
 		metrics.rmx_ssthresh = ssthresh;
 
 		metrics.rmx_rtt = tp->t_srtt;
 		metrics.rmx_rttvar = tp->t_rttvar;
 		metrics.rmx_cwnd = tp->snd_cwnd;
 		metrics.rmx_sendpipe = 0;
 		metrics.rmx_recvpipe = 0;
 
 		tcp_hc_update(&inp->inp_inc, &metrics);
 	}
 
 	/* free the reassembly queue, if any */
 	tcp_reass_flush(tp);
 
 #ifdef TCP_OFFLOAD
 	/* Disconnect offload device, if any. */
 	if (tp->t_flags & TF_TOE)
 		tcp_offload_detach(tp);
 #endif
 		
 	tcp_free_sackholes(tp);
 
 	/* Allow the CC algorithm to clean up after itself. */
 	if (CC_ALGO(tp)->cb_destroy != NULL)
 		CC_ALGO(tp)->cb_destroy(tp->ccv);
 
 	khelp_destroy_osd(tp->osd);
 
 	CC_ALGO(tp) = NULL;
 	inp->inp_ppcb = NULL;
 	tp->t_inpcb = NULL;
 	uma_zfree(V_tcpcb_zone, tp);
 }
 
 /*
  * Attempt to close a TCP control block, marking it as dropped, and freeing
  * the socket if we hold the only reference.
  */
 struct tcpcb *
 tcp_close(struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so;
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 #ifdef TCP_OFFLOAD
 	if (tp->t_state == TCPS_LISTEN)
 		tcp_offload_listen_stop(tp);
 #endif
 	in_pcbdrop(inp);
 	TCPSTAT_INC(tcps_closed);
 	KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
 	so = inp->inp_socket;
 	soisdisconnected(so);
 	if (inp->inp_flags & INP_SOCKREF) {
 		KASSERT(so->so_state & SS_PROTOREF,
 		    ("tcp_close: !SS_PROTOREF"));
 		inp->inp_flags &= ~INP_SOCKREF;
 		INP_WUNLOCK(inp);
 		ACCEPT_LOCK();
 		SOCK_LOCK(so);
 		so->so_state &= ~SS_PROTOREF;
 		sofree(so);
 		return (NULL);
 	}
 	return (tp);
 }
 
 void
 tcp_drain(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	if (!do_tcpdrain)
 		return;
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		struct inpcb *inpb;
 		struct tcpcb *tcpb;
 
 	/*
 	 * Walk the tcpbs, if existing, and flush the reassembly queue,
 	 * if there is one...
 	 * XXX: The "Net/3" implementation doesn't imply that the TCP
 	 *      reassembly queue should be flushed, but in a situation
 	 *	where we're really low on mbufs, this is potentially
 	 *	useful.
 	 */
 		INP_INFO_RLOCK(&V_tcbinfo);
 		LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) {
 			if (inpb->inp_flags & INP_TIMEWAIT)
 				continue;
 			INP_WLOCK(inpb);
 			if ((tcpb = intotcpcb(inpb)) != NULL) {
 				tcp_reass_flush(tcpb);
 				tcp_clean_sackreport(tcpb);
 			}
 			INP_WUNLOCK(inpb);
 		}
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Notify a tcp user of an asynchronous error;
  * store error as soft error, but wake up user
  * (for now, won't do anything until can select for soft error).
  *
  * Do not wake up user since there currently is no mechanism for
  * reporting soft errors (yet - a kqueue filter may be added).
  */
 static struct inpcb *
 tcp_notify(struct inpcb *inp, int error)
 {
 	struct tcpcb *tp;
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	if ((inp->inp_flags & INP_TIMEWAIT) ||
 	    (inp->inp_flags & INP_DROPPED))
 		return (inp);
 
 	tp = intotcpcb(inp);
 	KASSERT(tp != NULL, ("tcp_notify: tp == NULL"));
 
 	/*
 	 * Ignore some errors if we are hooked up.
 	 * If connection hasn't completed, has retransmitted several times,
 	 * and receives a second error, give up now.  This is better
 	 * than waiting a long time to establish a connection that
 	 * can never complete.
 	 */
 	if (tp->t_state == TCPS_ESTABLISHED &&
 	    (error == EHOSTUNREACH || error == ENETUNREACH ||
 	     error == EHOSTDOWN)) {
 		return (inp);
 	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
 	    tp->t_softerror) {
 		tp = tcp_drop(tp, error);
 		if (tp != NULL)
 			return (inp);
 		else
 			return (NULL);
 	} else {
 		tp->t_softerror = error;
 		return (inp);
 	}
 #if 0
 	wakeup( &so->so_timeo);
 	sorwakeup(so);
 	sowwakeup(so);
 #endif
 }
 
 static int
 tcp_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	int error, i, m, n, pcb_count;
 	struct inpcb *inp, **inp_list;
 	inp_gen_t gencnt;
 	struct xinpgen xig;
 
 	/*
 	 * The process of preparing the TCB list is too time-consuming and
 	 * resource-intensive to repeat twice on every request.
 	 */
 	if (req->oldptr == NULL) {
 		n = V_tcbinfo.ipi_count + syncache_pcbcount();
 		n += imax(n / 8, 10);
 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
 		return (0);
 	}
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	/*
 	 * OK, now we're committed to doing something.
 	 */
 	INP_INFO_RLOCK(&V_tcbinfo);
 	gencnt = V_tcbinfo.ipi_gencnt;
 	n = V_tcbinfo.ipi_count;
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 
 	m = syncache_pcbcount();
 
 	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
 		+ (n + m) * sizeof(struct xtcpcb));
 	if (error != 0)
 		return (error);
 
 	xig.xig_len = sizeof xig;
 	xig.xig_count = n + m;
 	xig.xig_gen = gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return (error);
 
 	error = syncache_pcblist(req, m, &pcb_count);
 	if (error)
 		return (error);
 
 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
 	if (inp_list == NULL)
 		return (ENOMEM);
 
 	INP_INFO_RLOCK(&V_tcbinfo);
 	for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0;
 	    inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) {
 		INP_WLOCK(inp);
 		if (inp->inp_gencnt <= gencnt) {
 			/*
 			 * XXX: This use of cr_cansee(), introduced with
 			 * TCP state changes, is not quite right, but for
 			 * now, better than nothing.
 			 */
 			if (inp->inp_flags & INP_TIMEWAIT) {
 				if (intotw(inp) != NULL)
 					error = cr_cansee(req->td->td_ucred,
 					    intotw(inp)->tw_cred);
 				else
 					error = EINVAL;	/* Skip this inp. */
 			} else
 				error = cr_canseeinpcb(req->td->td_ucred, inp);
 			if (error == 0) {
 				in_pcbref(inp);
 				inp_list[i++] = inp;
 			}
 		}
 		INP_WUNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	n = i;
 
 	error = 0;
 	for (i = 0; i < n; i++) {
 		inp = inp_list[i];
 		INP_RLOCK(inp);
 		if (inp->inp_gencnt <= gencnt) {
 			struct xtcpcb xt;
 			void *inp_ppcb;
 
 			bzero(&xt, sizeof(xt));
 			xt.xt_len = sizeof xt;
 			/* XXX should avoid extra copy */
 			bcopy(inp, &xt.xt_inp, sizeof *inp);
 			inp_ppcb = inp->inp_ppcb;
 			if (inp_ppcb == NULL)
 				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
 			else if (inp->inp_flags & INP_TIMEWAIT) {
 				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
 				xt.xt_tp.t_state = TCPS_TIME_WAIT;
 			} else {
 				bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
 				if (xt.xt_tp.t_timers)
 					tcp_timer_to_xtimer(&xt.xt_tp, xt.xt_tp.t_timers, &xt.xt_timer);
 			}
 			if (inp->inp_socket != NULL)
 				sotoxsocket(inp->inp_socket, &xt.xt_socket);
 			else {
 				bzero(&xt.xt_socket, sizeof xt.xt_socket);
 				xt.xt_socket.xso_protocol = IPPROTO_TCP;
 			}
 			xt.xt_inp.inp_gencnt = inp->inp_gencnt;
 			INP_RUNLOCK(inp);
 			error = SYSCTL_OUT(req, &xt, sizeof xt);
 		} else
 			INP_RUNLOCK(inp);
 	}
 	INP_INFO_WLOCK(&V_tcbinfo);
 	for (i = 0; i < n; i++) {
 		inp = inp_list[i];
 		INP_RLOCK(inp);
 		if (!in_pcbrele_rlocked(inp))
 			INP_RUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.
 		 * If the generation differs from what we told
 		 * her before, she knows that something happened
 		 * while we were processing this request, and it
 		 * might be necessary to retry.
 		 */
 		INP_INFO_RLOCK(&V_tcbinfo);
 		xig.xig_gen = V_tcbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = V_tcbinfo.ipi_count + pcb_count;
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 	free(inp_list, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist,
     CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0,
     tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
 
 #ifdef INET
 static int
 tcp_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct xucred xuc;
 	struct sockaddr_in addrs[2];
 	struct inpcb *inp;
 	int error;
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error)
 		return (error);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 	inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
 	    addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL);
 	if (inp != NULL) {
 		if (inp->inp_socket == NULL)
 			error = ENOENT;
 		if (error == 0)
 			error = cr_canseeinpcb(req->td->td_ucred, inp);
 		if (error == 0)
 			cru2x(inp->inp_cred, &xuc);
 		INP_RUNLOCK(inp);
 	} else
 		error = ENOENT;
 	if (error == 0)
 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
     tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");
 #endif /* INET */
 
 #ifdef INET6
 static int
 tcp6_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct xucred xuc;
 	struct sockaddr_in6 addrs[2];
 	struct inpcb *inp;
 	int error;
 #ifdef INET
 	int mapped = 0;
 #endif
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error)
 		return (error);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 	if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 ||
 	    (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) {
 		return (error);
 	}
 	if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
 #ifdef INET
 		if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
 			mapped = 1;
 		else
 #endif
 			return (EINVAL);
 	}
 
 #ifdef INET
 	if (mapped == 1)
 		inp = in_pcblookup(&V_tcbinfo,
 			*(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
 			addrs[1].sin6_port,
 			*(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
 			addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL);
 	else
 #endif
 		inp = in6_pcblookup(&V_tcbinfo,
 			&addrs[1].sin6_addr, addrs[1].sin6_port,
 			&addrs[0].sin6_addr, addrs[0].sin6_port,
 			INPLOOKUP_RLOCKPCB, NULL);
 	if (inp != NULL) {
 		if (inp->inp_socket == NULL)
 			error = ENOENT;
 		if (error == 0)
 			error = cr_canseeinpcb(req->td->td_ucred, inp);
 		if (error == 0)
 			cru2x(inp->inp_cred, &xuc);
 		INP_RUNLOCK(inp);
 	} else
 		error = ENOENT;
 	if (error == 0)
 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
     tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
 #endif /* INET6 */
 
 
 #ifdef INET
 void
 tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 	struct ip *ip = vip;
 	struct tcphdr *th;
 	struct in_addr faddr;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
 	struct icmp *icp;
 	struct in_conninfo inc;
 	tcp_seq icmp_tcp_seq;
 	int mtu;
 
 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
 		return;
 
 	if (cmd == PRC_MSGSIZE)
 		notify = tcp_mtudisc_notify;
 	else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
 		cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
 		notify = tcp_drop_syn_sent;
 	/*
 	 * Redirects don't need to be handled up here.
 	 */
 	else if (PRC_IS_REDIRECT(cmd))
 		return;
 	/*
 	 * Source quench is depreciated.
 	 */
 	else if (cmd == PRC_QUENCH)
 		return;
 	/*
 	 * Hostdead is ugly because it goes linearly through all PCBs.
 	 * XXX: We never get this from ICMP, otherwise it makes an
 	 * excellent DoS attack on machines with many connections.
 	 */
 	else if (cmd == PRC_HOSTDEAD)
 		ip = NULL;
 	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
 		return;
 	if (ip != NULL) {
 		icp = (struct icmp *)((caddr_t)ip
 				      - offsetof(struct icmp, icmp_ip));
 		th = (struct tcphdr *)((caddr_t)ip
 				       + (ip->ip_hl << 2));
 		INP_INFO_WLOCK(&V_tcbinfo);
 		inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport,
 		    ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL);
 		if (inp != NULL)  {
 			if (!(inp->inp_flags & INP_TIMEWAIT) &&
 			    !(inp->inp_flags & INP_DROPPED) &&
 			    !(inp->inp_socket == NULL)) {
 				icmp_tcp_seq = htonl(th->th_seq);
 				tp = intotcpcb(inp);
 				if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
 				    SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
 					if (cmd == PRC_MSGSIZE) {
 					    /*
 					     * MTU discovery:
 					     * If we got a needfrag set the MTU
 					     * in the route to the suggested new
 					     * value (if given) and then notify.
 					     */
 					    bzero(&inc, sizeof(inc));
 					    inc.inc_faddr = faddr;
 					    inc.inc_fibnum =
 						inp->inp_inc.inc_fibnum;
 
 					    mtu = ntohs(icp->icmp_nextmtu);
 					    /*
 					     * If no alternative MTU was
 					     * proposed, try the next smaller
 					     * one.
 					     */
 					    if (!mtu)
 						mtu = ip_next_mtu(
 						 ntohs(ip->ip_len), 1);
 					    if (mtu < V_tcp_minmss
 						 + sizeof(struct tcpiphdr))
 						mtu = V_tcp_minmss
 						 + sizeof(struct tcpiphdr);
 					    /*
 					     * Only cache the MTU if it
 					     * is smaller than the interface
 					     * or route MTU.  tcp_mtudisc()
 					     * will do right thing by itself.
 					     */
 					    if (mtu <= tcp_maxmtu(&inc, NULL))
 						tcp_hc_updatemtu(&inc, mtu);
 					    tcp_mtudisc(inp, mtu);
 					} else
 						inp = (*notify)(inp,
 						    inetctlerrmap[cmd]);
 				}
 			}
 			if (inp != NULL)
 				INP_WUNLOCK(inp);
 		} else {
 			bzero(&inc, sizeof(inc));
 			inc.inc_fport = th->th_dport;
 			inc.inc_lport = th->th_sport;
 			inc.inc_faddr = faddr;
 			inc.inc_laddr = ip->ip_src;
 			syncache_unreach(&inc, th);
 		}
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 	} else
 		in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify);
 }
 #endif /* INET */
 
 #ifdef INET6
 void
 tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
 {
 	struct tcphdr th;
 	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
 	struct ip6_hdr *ip6;
 	struct mbuf *m;
 	struct ip6ctlparam *ip6cp = NULL;
 	const struct sockaddr_in6 *sa6_src = NULL;
 	int off;
 	struct tcp_portonly {
 		u_int16_t th_sport;
 		u_int16_t th_dport;
 	} *thp;
 
 	if (sa->sa_family != AF_INET6 ||
 	    sa->sa_len != sizeof(struct sockaddr_in6))
 		return;
 
 	if (cmd == PRC_MSGSIZE)
 		notify = tcp_mtudisc_notify;
 	else if (!PRC_IS_REDIRECT(cmd) &&
 		 ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
 		return;
 	/* Source quench is depreciated. */
 	else if (cmd == PRC_QUENCH)
 		return;
 
 	/* if the parameter is from icmp6, decode it. */
 	if (d != NULL) {
 		ip6cp = (struct ip6ctlparam *)d;
 		m = ip6cp->ip6c_m;
 		ip6 = ip6cp->ip6c_ip6;
 		off = ip6cp->ip6c_off;
 		sa6_src = ip6cp->ip6c_src;
 	} else {
 		m = NULL;
 		ip6 = NULL;
 		off = 0;	/* fool gcc */
 		sa6_src = &sa6_any;
 	}
 
 	if (ip6 != NULL) {
 		struct in_conninfo inc;
 		/*
 		 * XXX: We assume that when IPV6 is non NULL,
 		 * M and OFF are valid.
 		 */
 
 		/* check if we can safely examine src and dst ports */
 		if (m->m_pkthdr.len < off + sizeof(*thp))
 			return;
 
 		bzero(&th, sizeof(th));
 		m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
 
 		in6_pcbnotify(&V_tcbinfo, sa, th.th_dport,
 		    (struct sockaddr *)ip6cp->ip6c_src,
 		    th.th_sport, cmd, NULL, notify);
 
 		bzero(&inc, sizeof(inc));
 		inc.inc_fport = th.th_dport;
 		inc.inc_lport = th.th_sport;
 		inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
 		inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
 		inc.inc_flags |= INC_ISIPV6;
 		INP_INFO_WLOCK(&V_tcbinfo);
 		syncache_unreach(&inc, &th);
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 	} else
 		in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src,
 			      0, cmd, NULL, notify);
 }
 #endif /* INET6 */
 
 
 /*
  * Following is where TCP initial sequence number generation occurs.
  *
  * There are two places where we must use initial sequence numbers:
  * 1.  In SYN-ACK packets.
  * 2.  In SYN packets.
  *
  * All ISNs for SYN-ACK packets are generated by the syncache.  See
  * tcp_syncache.c for details.
  *
  * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
  * depends on this property.  In addition, these ISNs should be
  * unguessable so as to prevent connection hijacking.  To satisfy
  * the requirements of this situation, the algorithm outlined in
  * RFC 1948 is used, with only small modifications.
  *
  * Implementation details:
  *
  * Time is based off the system timer, and is corrected so that it
  * increases by one megabyte per second.  This allows for proper
  * recycling on high speed LANs while still leaving over an hour
  * before rollover.
  *
  * As reading the *exact* system time is too expensive to be done
  * whenever setting up a TCP connection, we increment the time
  * offset in two ways.  First, a small random positive increment
  * is added to isn_offset for each connection that is set up.
  * Second, the function tcp_isn_tick fires once per clock tick
  * and increments isn_offset as necessary so that sequence numbers
  * are incremented at approximately ISN_BYTES_PER_SECOND.  The
  * random positive increments serve only to ensure that the same
  * exact sequence number is never sent out twice (as could otherwise
  * happen when a port is recycled in less than the system tick
  * interval.)
  *
  * net.inet.tcp.isn_reseed_interval controls the number of seconds
  * between seeding of isn_secret.  This is normally set to zero,
  * as reseeding should not be necessary.
  *
  * Locking of the global variables isn_secret, isn_last_reseed, isn_offset,
  * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock.  In
  * general, this means holding an exclusive (write) lock.
  */
 
 #define ISN_BYTES_PER_SECOND 1048576
 #define ISN_STATIC_INCREMENT 4096
 #define ISN_RANDOM_INCREMENT (4096 - 1)
 
 static VNET_DEFINE(u_char, isn_secret[32]);
 static VNET_DEFINE(int, isn_last);
 static VNET_DEFINE(int, isn_last_reseed);
 static VNET_DEFINE(u_int32_t, isn_offset);
 static VNET_DEFINE(u_int32_t, isn_offset_old);
 
 #define	V_isn_secret			VNET(isn_secret)
 #define	V_isn_last			VNET(isn_last)
 #define	V_isn_last_reseed		VNET(isn_last_reseed)
 #define	V_isn_offset			VNET(isn_offset)
 #define	V_isn_offset_old		VNET(isn_offset_old)
 
 tcp_seq
 tcp_new_isn(struct tcpcb *tp)
 {
 	MD5_CTX isn_ctx;
 	u_int32_t md5_buffer[4];
 	tcp_seq new_isn;
 	u_int32_t projected_offset;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	ISN_LOCK();
 	/* Seed if this is the first use, reseed if requested. */
 	if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) &&
 	     (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz)
 		< (u_int)ticks))) {
 		read_random(&V_isn_secret, sizeof(V_isn_secret));
 		V_isn_last_reseed = ticks;
 	}
 
 	/* Compute the md5 hash and return the ISN. */
 	MD5Init(&isn_ctx);
 	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
 	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
 #ifdef INET6
 	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
 			  sizeof(struct in6_addr));
 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
 			  sizeof(struct in6_addr));
 	} else
 #endif
 	{
 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
 			  sizeof(struct in_addr));
 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
 			  sizeof(struct in_addr));
 	}
 	MD5Update(&isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret));
 	MD5Final((u_char *) &md5_buffer, &isn_ctx);
 	new_isn = (tcp_seq) md5_buffer[0];
 	V_isn_offset += ISN_STATIC_INCREMENT +
 		(arc4random() & ISN_RANDOM_INCREMENT);
 	if (ticks != V_isn_last) {
 		projected_offset = V_isn_offset_old +
 		    ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last);
 		if (SEQ_GT(projected_offset, V_isn_offset))
 			V_isn_offset = projected_offset;
 		V_isn_offset_old = V_isn_offset;
 		V_isn_last = ticks;
 	}
 	new_isn += V_isn_offset;
 	ISN_UNLOCK();
 	return (new_isn);
 }
 
 /*
  * When a specific ICMP unreachable message is received and the
  * connection state is SYN-SENT, drop the connection.  This behavior
  * is controlled by the icmp_may_rst sysctl.
  */
 struct inpcb *
 tcp_drop_syn_sent(struct inpcb *inp, int errno)
 {
 	struct tcpcb *tp;
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	if ((inp->inp_flags & INP_TIMEWAIT) ||
 	    (inp->inp_flags & INP_DROPPED))
 		return (inp);
 
 	tp = intotcpcb(inp);
 	if (tp->t_state != TCPS_SYN_SENT)
 		return (inp);
 
 	tp = tcp_drop(tp, errno);
 	if (tp != NULL)
 		return (inp);
 	else
 		return (NULL);
 }
 
 /*
  * When `need fragmentation' ICMP is received, update our idea of the MSS
  * based on the new value. Also nudge TCP to send something, since we
  * know the packet we just sent was dropped.
  * This duplicates some code in the tcp_mss() function in tcp_input.c.
  */
 static struct inpcb *
 tcp_mtudisc_notify(struct inpcb *inp, int error)
 {
 
 	return (tcp_mtudisc(inp, -1));
 }
 
 struct inpcb *
 tcp_mtudisc(struct inpcb *inp, int mtuoffer)
 {
 	struct tcpcb *tp;
 	struct socket *so;
 
 	INP_WLOCK_ASSERT(inp);
 	if ((inp->inp_flags & INP_TIMEWAIT) ||
 	    (inp->inp_flags & INP_DROPPED))
 		return (inp);
 
 	tp = intotcpcb(inp);
 	KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL"));
 
 	tcp_mss_update(tp, -1, mtuoffer, NULL, NULL);
   
 	so = inp->inp_socket;
 	SOCKBUF_LOCK(&so->so_snd);
 	/* If the mss is larger than the socket buffer, decrease the mss. */
 	if (so->so_snd.sb_hiwat < tp->t_maxseg)
 		tp->t_maxseg = so->so_snd.sb_hiwat;
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	TCPSTAT_INC(tcps_mturesent);
 	tp->t_rtttime = 0;
 	tp->snd_nxt = tp->snd_una;
 	tcp_free_sackholes(tp);
 	tp->snd_recover = tp->snd_max;
 	if (tp->t_flags & TF_SACK_PERMIT)
 		EXIT_FASTRECOVERY(tp->t_flags);
 	tcp_output(tp);
 	return (inp);
 }
 
 #ifdef INET
 /*
  * Look-up the routing entry to the peer of this inpcb.  If no route
  * is found and it cannot be allocated, then return 0.  This routine
  * is called by TCP routines that access the rmx structure and by
  * tcp_mss_update to get the peer/interface MTU.
  */
 u_long
 tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap)
 {
 	struct route sro;
 	struct sockaddr_in *dst;
 	struct ifnet *ifp;
 	u_long maxmtu = 0;
 
 	KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));
 
 	bzero(&sro, sizeof(sro));
 	if (inc->inc_faddr.s_addr != INADDR_ANY) {
 	        dst = (struct sockaddr_in *)&sro.ro_dst;
 		dst->sin_family = AF_INET;
 		dst->sin_len = sizeof(*dst);
 		dst->sin_addr = inc->inc_faddr;
 		in_rtalloc_ign(&sro, 0, inc->inc_fibnum);
 	}
 	if (sro.ro_rt != NULL) {
 		ifp = sro.ro_rt->rt_ifp;
 		if (sro.ro_rt->rt_mtu == 0)
 			maxmtu = ifp->if_mtu;
 		else
 			maxmtu = min(sro.ro_rt->rt_mtu, ifp->if_mtu);
 
 		/* Report additional interface capabilities. */
 		if (cap != NULL) {
 			if (ifp->if_capenable & IFCAP_TSO4 &&
 			    ifp->if_hwassist & CSUM_TSO) {
 				cap->ifcap |= CSUM_TSO;
 				cap->tsomax = ifp->if_hw_tsomax;
 				cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount;
 				cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize;
 			}
 		}
 		RTFREE(sro.ro_rt);
 	}
 	return (maxmtu);
 }
 #endif /* INET */
 
 #ifdef INET6
 u_long
 tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap)
 {
 	struct route_in6 sro6;
 	struct ifnet *ifp;
 	u_long maxmtu = 0;
 
 	KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));
 
 	bzero(&sro6, sizeof(sro6));
 	if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
 		sro6.ro_dst.sin6_family = AF_INET6;
 		sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
 		sro6.ro_dst.sin6_addr = inc->inc6_faddr;
 		in6_rtalloc_ign(&sro6, 0, inc->inc_fibnum);
 	}
 	if (sro6.ro_rt != NULL) {
 		ifp = sro6.ro_rt->rt_ifp;
 		if (sro6.ro_rt->rt_mtu == 0)
 			maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp);
 		else
 			maxmtu = min(sro6.ro_rt->rt_mtu,
 				     IN6_LINKMTU(sro6.ro_rt->rt_ifp));
 
 		/* Report additional interface capabilities. */
 		if (cap != NULL) {
 			if (ifp->if_capenable & IFCAP_TSO6 &&
 			    ifp->if_hwassist & CSUM_TSO) {
 				cap->ifcap |= CSUM_TSO;
 				cap->tsomax = ifp->if_hw_tsomax;
 				cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount;
 				cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize;
 			}
 		}
 		RTFREE(sro6.ro_rt);
 	}
 
 	return (maxmtu);
 }
 #endif /* INET6 */
 
 #ifdef IPSEC
 /* compute ESP/AH header size for TCP, including outer IP header. */
 size_t
 ipsec_hdrsiz_tcp(struct tcpcb *tp)
 {
 	struct inpcb *inp;
 	struct mbuf *m;
 	size_t hdrsiz;
 	struct ip *ip;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 #endif
 	struct tcphdr *th;
 
 	if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
 		return (0);
 	m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (!m)
 		return (0);
 
 #ifdef INET6
 	if ((inp->inp_vflag & INP_IPV6) != 0) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		th = (struct tcphdr *)(ip6 + 1);
 		m->m_pkthdr.len = m->m_len =
 			sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 		tcpip_fillheaders(inp, ip6, th);
 		hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
 	} else
 #endif /* INET6 */
 	{
 		ip = mtod(m, struct ip *);
 		th = (struct tcphdr *)(ip + 1);
 		m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
 		tcpip_fillheaders(inp, ip, th);
 		hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
 	}
 
 	m_free(m);
 	return (hdrsiz);
 }
 #endif /* IPSEC */
 
 #ifdef TCP_SIGNATURE
 /*
  * Callback function invoked by m_apply() to digest TCP segment data
  * contained within an mbuf chain.
  */
 static int
 tcp_signature_apply(void *fstate, void *data, u_int len)
 {
 
 	MD5Update(fstate, (u_char *)data, len);
 	return (0);
 }
 
 /*
  * XXX The key is retrieved from the system's PF_KEY SADB, by keying a
  * search with the destination IP address, and a 'magic SPI' to be
  * determined by the application. This is hardcoded elsewhere to 1179
 */
 struct secasvar *
 tcp_get_sav(struct mbuf *m, u_int direction)
 {
 	union sockaddr_union dst;
 	struct secasvar *sav;
 	struct ip *ip;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 	char ip6buf[INET6_ADDRSTRLEN];
 #endif
 
 	/* Extract the destination from the IP header in the mbuf. */
 	bzero(&dst, sizeof(union sockaddr_union));
 	ip = mtod(m, struct ip *);
 #ifdef INET6
 	ip6 = NULL;	/* Make the compiler happy. */
 #endif
 	switch (ip->ip_v) {
 #ifdef INET
 	case IPVERSION:
 		dst.sa.sa_len = sizeof(struct sockaddr_in);
 		dst.sa.sa_family = AF_INET;
 		dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ?
 		    ip->ip_src : ip->ip_dst;
 		break;
 #endif
 #ifdef INET6
 	case (IPV6_VERSION >> 4):
 		ip6 = mtod(m, struct ip6_hdr *);
 		dst.sa.sa_len = sizeof(struct sockaddr_in6);
 		dst.sa.sa_family = AF_INET6;
 		dst.sin6.sin6_addr = (direction == IPSEC_DIR_INBOUND) ?
 		    ip6->ip6_src : ip6->ip6_dst;
 		break;
 #endif
 	default:
 		return (NULL);
 		/* NOTREACHED */
 		break;
 	}
 
 	/* Look up an SADB entry which matches the address of the peer. */
 	sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI));
 	if (sav == NULL) {
 		ipseclog((LOG_ERR, "%s: SADB lookup failed for %s\n", __func__,
 		    (ip->ip_v == IPVERSION) ? inet_ntoa(dst.sin.sin_addr) :
 #ifdef INET6
 			(ip->ip_v == (IPV6_VERSION >> 4)) ?
 			    ip6_sprintf(ip6buf, &dst.sin6.sin6_addr) :
 #endif
 			"(unsupported)"));
 	}
 
 	return (sav);
 }
 
 /*
  * Compute TCP-MD5 hash of a TCP segment. (RFC2385)
  *
  * Parameters:
  * m		pointer to head of mbuf chain
  * len		length of TCP segment data, excluding options
  * optlen	length of TCP segment options
  * buf		pointer to storage for computed MD5 digest
  * sav		pointer to security assosiation
  *
  * We do this over ip, tcphdr, segment data, and the key in the SADB.
  * When called from tcp_input(), we can be sure that th_sum has been
  * zeroed out and verified already.
  *
  * Releases reference to SADB key before return. 
  *
  * Return 0 if successful, otherwise return -1.
  *
  */
 int
 tcp_signature_do_compute(struct mbuf *m, int len, int optlen,
     u_char *buf, struct secasvar *sav)
 {
 #ifdef INET
 	struct ippseudo ippseudo;
 #endif
 	MD5_CTX ctx;
 	int doff;
 	struct ip *ip;
 #ifdef INET
 	struct ipovly *ipovly;
 #endif
 	struct tcphdr *th;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 	struct in6_addr in6;
 	uint32_t plen;
 	uint16_t nhdr;
 #endif
 	u_short savecsum;
 
 	KASSERT(m != NULL, ("NULL mbuf chain"));
 	KASSERT(buf != NULL, ("NULL signature pointer"));
 
 	/* Extract the destination from the IP header in the mbuf. */
 	ip = mtod(m, struct ip *);
 #ifdef INET6
 	ip6 = NULL;	/* Make the compiler happy. */
 #endif
 
 	MD5Init(&ctx);
 	/*
 	 * Step 1: Update MD5 hash with IP(v6) pseudo-header.
 	 *
 	 * XXX The ippseudo header MUST be digested in network byte order,
 	 * or else we'll fail the regression test. Assume all fields we've
 	 * been doing arithmetic on have been in host byte order.
 	 * XXX One cannot depend on ipovly->ih_len here. When called from
 	 * tcp_output(), the underlying ip_len member has not yet been set.
 	 */
 	switch (ip->ip_v) {
 #ifdef INET
 	case IPVERSION:
 		ipovly = (struct ipovly *)ip;
 		ippseudo.ippseudo_src = ipovly->ih_src;
 		ippseudo.ippseudo_dst = ipovly->ih_dst;
 		ippseudo.ippseudo_pad = 0;
 		ippseudo.ippseudo_p = IPPROTO_TCP;
 		ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) +
 		    optlen);
 		MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo));
 
 		th = (struct tcphdr *)((u_char *)ip + sizeof(struct ip));
 		doff = sizeof(struct ip) + sizeof(struct tcphdr) + optlen;
 		break;
 #endif
 #ifdef INET6
 	/*
 	 * RFC 2385, 2.0  Proposal
 	 * For IPv6, the pseudo-header is as described in RFC 2460, namely the
 	 * 128-bit source IPv6 address, 128-bit destination IPv6 address, zero-
 	 * extended next header value (to form 32 bits), and 32-bit segment
 	 * length.
 	 * Note: Upper-Layer Packet Length comes before Next Header.
 	 */
 	case (IPV6_VERSION >> 4):
 		in6 = ip6->ip6_src;
 		in6_clearscope(&in6);
 		MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr));
 		in6 = ip6->ip6_dst;
 		in6_clearscope(&in6);
 		MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr));
 		plen = htonl(len + sizeof(struct tcphdr) + optlen);
 		MD5Update(&ctx, (char *)&plen, sizeof(uint32_t));
 		nhdr = 0;
 		MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
 		MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
 		MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
 		nhdr = IPPROTO_TCP;
 		MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
 
 		th = (struct tcphdr *)((u_char *)ip6 + sizeof(struct ip6_hdr));
 		doff = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + optlen;
 		break;
 #endif
 	default:
 		return (-1);
 		/* NOTREACHED */
 		break;
 	}
 
 
 	/*
 	 * Step 2: Update MD5 hash with TCP header, excluding options.
 	 * The TCP checksum must be set to zero.
 	 */
 	savecsum = th->th_sum;
 	th->th_sum = 0;
 	MD5Update(&ctx, (char *)th, sizeof(struct tcphdr));
 	th->th_sum = savecsum;
 
 	/*
 	 * Step 3: Update MD5 hash with TCP segment data.
 	 *         Use m_apply() to avoid an early m_pullup().
 	 */
 	if (len > 0)
 		m_apply(m, doff, len, tcp_signature_apply, &ctx);
 
 	/*
 	 * Step 4: Update MD5 hash with shared secret.
 	 */
 	MD5Update(&ctx, sav->key_auth->key_data, _KEYLEN(sav->key_auth));
 	MD5Final(buf, &ctx);
 
 	key_sa_recordxfer(sav, m);
 	KEY_FREESAV(&sav);
 	return (0);
 }
 
 /*
  * Compute TCP-MD5 hash of a TCP segment. (RFC2385)
  *
  * Return 0 if successful, otherwise return -1.
  */
 int
 tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen,
     u_char *buf, u_int direction)
 {
 	struct secasvar *sav;
 
 	if ((sav = tcp_get_sav(m, direction)) == NULL)
 		return (-1);
 
 	return (tcp_signature_do_compute(m, len, optlen, buf, sav));
 }
 
 /*
  * Verify the TCP-MD5 hash of a TCP segment. (RFC2385)
  *
  * Parameters:
  * m		pointer to head of mbuf chain
  * len		length of TCP segment data, excluding options
  * optlen	length of TCP segment options
  * buf		pointer to storage for computed MD5 digest
  * direction	direction of flow (IPSEC_DIR_INBOUND or OUTBOUND)
  *
  * Return 1 if successful, otherwise return 0.
  */
 int
 tcp_signature_verify(struct mbuf *m, int off0, int tlen, int optlen,
     struct tcpopt *to, struct tcphdr *th, u_int tcpbflag)
 {
 	char tmpdigest[TCP_SIGLEN];
 
 	if (tcp_sig_checksigs == 0)
 		return (1);
 	if ((tcpbflag & TF_SIGNATURE) == 0) {
 		if ((to->to_flags & TOF_SIGNATURE) != 0) {
 
 			/*
 			 * If this socket is not expecting signature but
 			 * the segment contains signature just fail.
 			 */
 			TCPSTAT_INC(tcps_sig_err_sigopt);
 			TCPSTAT_INC(tcps_sig_rcvbadsig);
 			return (0);
 		}
 
 		/* Signature is not expected, and not present in segment. */
 		return (1);
 	}
 
 	/*
 	 * If this socket is expecting signature but the segment does not
 	 * contain any just fail.
 	 */
 	if ((to->to_flags & TOF_SIGNATURE) == 0) {
 		TCPSTAT_INC(tcps_sig_err_nosigopt);
 		TCPSTAT_INC(tcps_sig_rcvbadsig);
 		return (0);
 	}
 	if (tcp_signature_compute(m, off0, tlen, optlen, &tmpdigest[0],
 	    IPSEC_DIR_INBOUND) == -1) {
 		TCPSTAT_INC(tcps_sig_err_buildsig);
 		TCPSTAT_INC(tcps_sig_rcvbadsig);
 		return (0);
 	}
 	
 	if (bcmp(to->to_signature, &tmpdigest[0], TCP_SIGLEN) != 0) {
 		TCPSTAT_INC(tcps_sig_rcvbadsig);
 		return (0);
 	}
 	TCPSTAT_INC(tcps_sig_rcvgoodsig);
 	return (1);
 }
 #endif /* TCP_SIGNATURE */
 
 static int
 sysctl_drop(SYSCTL_HANDLER_ARGS)
 {
 	/* addrs[0] is a foreign socket, addrs[1] is a local one. */
 	struct sockaddr_storage addrs[2];
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct tcptw *tw;
 	struct sockaddr_in *fin, *lin;
 #ifdef INET6
 	struct sockaddr_in6 *fin6, *lin6;
 #endif
 	int error;
 
 	inp = NULL;
 	fin = lin = NULL;
 #ifdef INET6
 	fin6 = lin6 = NULL;
 #endif
 	error = 0;
 
 	if (req->oldptr != NULL || req->oldlen != 0)
 		return (EINVAL);
 	if (req->newptr == NULL)
 		return (EPERM);
 	if (req->newlen < sizeof(addrs))
 		return (ENOMEM);
 	error = SYSCTL_IN(req, &addrs, sizeof(addrs));
 	if (error)
 		return (error);
 
 	switch (addrs[0].ss_family) {
 #ifdef INET6
 	case AF_INET6:
 		fin6 = (struct sockaddr_in6 *)&addrs[0];
 		lin6 = (struct sockaddr_in6 *)&addrs[1];
 		if (fin6->sin6_len != sizeof(struct sockaddr_in6) ||
 		    lin6->sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 		if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) {
 			if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr))
 				return (EINVAL);
 			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]);
 			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]);
 			fin = (struct sockaddr_in *)&addrs[0];
 			lin = (struct sockaddr_in *)&addrs[1];
 			break;
 		}
 		error = sa6_embedscope(fin6, V_ip6_use_defzone);
 		if (error)
 			return (error);
 		error = sa6_embedscope(lin6, V_ip6_use_defzone);
 		if (error)
 			return (error);
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		fin = (struct sockaddr_in *)&addrs[0];
 		lin = (struct sockaddr_in *)&addrs[1];
 		if (fin->sin_len != sizeof(struct sockaddr_in) ||
 		    lin->sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 		break;
 #endif
 	default:
 		return (EINVAL);
 	}
 	INP_INFO_WLOCK(&V_tcbinfo);
 	switch (addrs[0].ss_family) {
 #ifdef INET6
 	case AF_INET6:
 		inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr,
 		    fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port,
 		    INPLOOKUP_WLOCKPCB, NULL);
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port,
 		    lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL);
 		break;
 #endif
 	}
 	if (inp != NULL) {
 		if (inp->inp_flags & INP_TIMEWAIT) {
 			/*
 			 * XXXRW: There currently exists a state where an
 			 * inpcb is present, but its timewait state has been
 			 * discarded.  For now, don't allow dropping of this
 			 * type of inpcb.
 			 */
 			tw = intotw(inp);
 			if (tw != NULL)
 				tcp_twclose(tw, 0);
 			else
 				INP_WUNLOCK(inp);
 		} else if (!(inp->inp_flags & INP_DROPPED) &&
 			   !(inp->inp_socket->so_options & SO_ACCEPTCONN)) {
 			tp = intotcpcb(inp);
 			tp = tcp_drop(tp, ECONNABORTED);
 			if (tp != NULL)
 				INP_WUNLOCK(inp);
 		} else
 			INP_WUNLOCK(inp);
 	} else
 		error = ESRCH;
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 	return (error);
 }
 
-SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_DROP, drop,
-    CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL,
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop,
+    CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP, NULL,
     0, sysctl_drop, "", "Drop TCP connection");
 
 /*
  * Generate a standardized TCP log line for use throughout the
  * tcp subsystem.  Memory allocation is done with M_NOWAIT to
  * allow use in the interrupt context.
  *
  * NB: The caller MUST free(s, M_TCPLOG) the returned string.
  * NB: The function may return NULL if memory allocation failed.
  *
  * Due to header inclusion and ordering limitations the struct ip
  * and ip6_hdr pointers have to be passed as void pointers.
  */
 char *
 tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr,
     const void *ip6hdr)
 {
 
 	/* Is logging enabled? */
 	if (tcp_log_in_vain == 0)
 		return (NULL);
 
 	return (tcp_log_addr(inc, th, ip4hdr, ip6hdr));
 }
 
 char *
 tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr,
     const void *ip6hdr)
 {
 
 	/* Is logging enabled? */
 	if (tcp_log_debug == 0)
 		return (NULL);
 
 	return (tcp_log_addr(inc, th, ip4hdr, ip6hdr));
 }
 
 static char *
 tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr,
     const void *ip6hdr)
 {
 	char *s, *sp;
 	size_t size;
 	struct ip *ip;
 #ifdef INET6
 	const struct ip6_hdr *ip6;
 
 	ip6 = (const struct ip6_hdr *)ip6hdr;
 #endif /* INET6 */
 	ip = (struct ip *)ip4hdr;
 
 	/*
 	 * The log line looks like this:
 	 * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2<SYN>"
 	 */
 	size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") +
 	    sizeof(PRINT_TH_FLAGS) + 1 +
 #ifdef INET6
 	    2 * INET6_ADDRSTRLEN;
 #else
 	    2 * INET_ADDRSTRLEN;
 #endif /* INET6 */
 
 	s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT);
 	if (s == NULL)
 		return (NULL);
 
 	strcat(s, "TCP: [");
 	sp = s + strlen(s);
 
 	if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) {
 		inet_ntoa_r(inc->inc_faddr, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
 		sp = s + strlen(s);
 		inet_ntoa_r(inc->inc_laddr, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(inc->inc_lport));
 #ifdef INET6
 	} else if (inc) {
 		ip6_sprintf(sp, &inc->inc6_faddr);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
 		sp = s + strlen(s);
 		ip6_sprintf(sp, &inc->inc6_laddr);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(inc->inc_lport));
 	} else if (ip6 && th) {
 		ip6_sprintf(sp, &ip6->ip6_src);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(th->th_sport));
 		sp = s + strlen(s);
 		ip6_sprintf(sp, &ip6->ip6_dst);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(th->th_dport));
 #endif /* INET6 */
 #ifdef INET
 	} else if (ip && th) {
 		inet_ntoa_r(ip->ip_src, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(th->th_sport));
 		sp = s + strlen(s);
 		inet_ntoa_r(ip->ip_dst, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(th->th_dport));
 #endif /* INET */
 	} else {
 		free(s, M_TCPLOG);
 		return (NULL);
 	}
 	sp = s + strlen(s);
 	if (th)
 		sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS);
 	if (*(s + size - 1) != '\0')
 		panic("%s: string too long", __func__);
 	return (s);
 }
 
 /*
  * A subroutine which makes it easy to track TCP state changes with DTrace.
  * This function shouldn't be called for t_state initializations that don't
  * correspond to actual TCP state transitions.
  */
 void
 tcp_state_change(struct tcpcb *tp, int newstate)
 {
 #if defined(KDTRACE_HOOKS)
 	int pstate = tp->t_state;
 #endif
 
 	tp->t_state = newstate;
 	TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate);
 }
Index: user/dchagin/lemul/sys/netinet/tcp_syncache.c
===================================================================
--- user/dchagin/lemul/sys/netinet/tcp_syncache.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/tcp_syncache.c	(revision 274241)
@@ -1,2046 +1,2046 @@
 /*-
  * Copyright (c) 2001 McAfee, Inc.
  * Copyright (c) 2006,2013 Andre Oppermann, Internet Business Solutions AG
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Jonathan Lemon
  * and McAfee Research, the Security Research Division of McAfee, Inc. under
  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program. [2001 McAfee, Inc.]
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_pcbgroup.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>		/* for proc0 declaration */
 #include <sys/random.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syslog.h>
 #include <sys/ucred.h>
 
 #include <sys/md5.h>
 #include <crypto/siphash/siphash.h>
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/nd6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_pcb.h>
 #endif
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_syncache.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 #ifdef TCP_OFFLOAD
 #include <netinet/toecore.h>
 #endif
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #ifdef INET6
 #include <netipsec/ipsec6.h>
 #endif
 #include <netipsec/key.h>
 #endif /*IPSEC*/
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 static VNET_DEFINE(int, tcp_syncookies) = 1;
 #define	V_tcp_syncookies		VNET(tcp_syncookies)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_syncookies), 0,
     "Use TCP SYN cookies if the syncache overflows");
 
 static VNET_DEFINE(int, tcp_syncookiesonly) = 0;
 #define	V_tcp_syncookiesonly		VNET(tcp_syncookiesonly)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_syncookiesonly), 0,
     "Use only TCP SYN cookies");
 
 #ifdef TCP_OFFLOAD
 #define ADDED_BY_TOE(sc) ((sc)->sc_tod != NULL)
 #endif
 
 static void	 syncache_drop(struct syncache *, struct syncache_head *);
 static void	 syncache_free(struct syncache *);
 static void	 syncache_insert(struct syncache *, struct syncache_head *);
 static int	 syncache_respond(struct syncache *, struct syncache_head *, int);
 static struct	 socket *syncache_socket(struct syncache *, struct socket *,
 		    struct mbuf *m);
 static void	 syncache_timeout(struct syncache *sc, struct syncache_head *sch,
 		    int docallout);
 static void	 syncache_timer(void *);
 
 static uint32_t	 syncookie_mac(struct in_conninfo *, tcp_seq, uint8_t,
 		    uint8_t *, uintptr_t);
 static tcp_seq	 syncookie_generate(struct syncache_head *, struct syncache *);
 static struct syncache
 		*syncookie_lookup(struct in_conninfo *, struct syncache_head *,
 		    struct syncache *, struct tcphdr *, struct tcpopt *,
 		    struct socket *);
 static void	 syncookie_reseed(void *);
 #ifdef INVARIANTS
 static int	 syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch,
 		    struct syncache *sc, struct tcphdr *th, struct tcpopt *to,
 		    struct socket *lso);
 #endif
 
 /*
  * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies.
  * 3 retransmits corresponds to a timeout of 3 * (1 + 2 + 4 + 8) == 45 seconds,
  * the odds are that the user has given up attempting to connect by then.
  */
 #define SYNCACHE_MAXREXMTS		3
 
 /* Arbitrary values */
 #define TCP_SYNCACHE_HASHSIZE		512
 #define TCP_SYNCACHE_BUCKETLIMIT	30
 
 static VNET_DEFINE(struct tcp_syncache, tcp_syncache);
 #define	V_tcp_syncache			VNET(tcp_syncache)
 
 static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0,
     "TCP SYN cache");
 
-SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN,
+SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_VNET | CTLFLAG_RDTUN,
     &VNET_NAME(tcp_syncache.bucket_limit), 0,
     "Per-bucket hash limit for syncache");
 
-SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RDTUN,
+SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_VNET | CTLFLAG_RDTUN,
     &VNET_NAME(tcp_syncache.cache_limit), 0,
     "Overall entry limit for syncache");
 
 SYSCTL_UMA_CUR(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_VNET,
     &VNET_NAME(tcp_syncache.zone), "Current number of entries in syncache");
 
-SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RDTUN,
+SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN,
     &VNET_NAME(tcp_syncache.hashsize), 0,
     "Size of TCP syncache hashtable");
 
-SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW,
+SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_syncache.rexmt_limit), 0,
     "Limit on SYN/ACK retransmissions");
 
 VNET_DEFINE(int, tcp_sc_rst_sock_fail) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail,
-    CTLFLAG_RW, &VNET_NAME(tcp_sc_rst_sock_fail), 0,
+SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sc_rst_sock_fail), 0,
     "Send reset on socket allocation failure");
 
 static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache");
 
 #define SYNCACHE_HASH(inc, mask)					\
 	((V_tcp_syncache.hash_secret ^					\
 	  (inc)->inc_faddr.s_addr ^					\
 	  ((inc)->inc_faddr.s_addr >> 16) ^				\
 	  (inc)->inc_fport ^ (inc)->inc_lport) & mask)
 
 #define SYNCACHE_HASH6(inc, mask)					\
 	((V_tcp_syncache.hash_secret ^					\
 	  (inc)->inc6_faddr.s6_addr32[0] ^				\
 	  (inc)->inc6_faddr.s6_addr32[3] ^				\
 	  (inc)->inc_fport ^ (inc)->inc_lport) & mask)
 
 #define ENDPTS_EQ(a, b) (						\
 	(a)->ie_fport == (b)->ie_fport &&				\
 	(a)->ie_lport == (b)->ie_lport &&				\
 	(a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr &&			\
 	(a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr			\
 )
 
 #define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0)
 
 #define	SCH_LOCK(sch)		mtx_lock(&(sch)->sch_mtx)
 #define	SCH_UNLOCK(sch)		mtx_unlock(&(sch)->sch_mtx)
 #define	SCH_LOCK_ASSERT(sch)	mtx_assert(&(sch)->sch_mtx, MA_OWNED)
 
 /*
  * Requires the syncache entry to be already removed from the bucket list.
  */
 static void
 syncache_free(struct syncache *sc)
 {
 
 	if (sc->sc_ipopts)
 		(void) m_free(sc->sc_ipopts);
 	if (sc->sc_cred)
 		crfree(sc->sc_cred);
 #ifdef MAC
 	mac_syncache_destroy(&sc->sc_label);
 #endif
 
 	uma_zfree(V_tcp_syncache.zone, sc);
 }
 
 void
 syncache_init(void)
 {
 	int i;
 
 	V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
 	V_tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT;
 	V_tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS;
 	V_tcp_syncache.hash_secret = arc4random();
 
 	TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize",
 	    &V_tcp_syncache.hashsize);
 	TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit",
 	    &V_tcp_syncache.bucket_limit);
 	if (!powerof2(V_tcp_syncache.hashsize) ||
 	    V_tcp_syncache.hashsize == 0) {
 		printf("WARNING: syncache hash size is not a power of 2.\n");
 		V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
 	}
 	V_tcp_syncache.hashmask = V_tcp_syncache.hashsize - 1;
 
 	/* Set limits. */
 	V_tcp_syncache.cache_limit =
 	    V_tcp_syncache.hashsize * V_tcp_syncache.bucket_limit;
 	TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit",
 	    &V_tcp_syncache.cache_limit);
 
 	/* Allocate the hash table. */
 	V_tcp_syncache.hashbase = malloc(V_tcp_syncache.hashsize *
 	    sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK | M_ZERO);
 
 #ifdef VIMAGE
 	V_tcp_syncache.vnet = curvnet;
 #endif
 
 	/* Initialize the hash buckets. */
 	for (i = 0; i < V_tcp_syncache.hashsize; i++) {
 		TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket);
 		mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head",
 			 NULL, MTX_DEF);
 		callout_init_mtx(&V_tcp_syncache.hashbase[i].sch_timer,
 			 &V_tcp_syncache.hashbase[i].sch_mtx, 0);
 		V_tcp_syncache.hashbase[i].sch_length = 0;
 		V_tcp_syncache.hashbase[i].sch_sc = &V_tcp_syncache;
 	}
 
 	/* Create the syncache entry zone. */
 	V_tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	V_tcp_syncache.cache_limit = uma_zone_set_max(V_tcp_syncache.zone,
 	    V_tcp_syncache.cache_limit);
 
 	/* Start the SYN cookie reseeder callout. */
 	callout_init(&V_tcp_syncache.secret.reseed, 1);
 	arc4rand(V_tcp_syncache.secret.key[0], SYNCOOKIE_SECRET_SIZE, 0);
 	arc4rand(V_tcp_syncache.secret.key[1], SYNCOOKIE_SECRET_SIZE, 0);
 	callout_reset(&V_tcp_syncache.secret.reseed, SYNCOOKIE_LIFETIME * hz,
 	    syncookie_reseed, &V_tcp_syncache);
 }
 
 #ifdef VIMAGE
 void
 syncache_destroy(void)
 {
 	struct syncache_head *sch;
 	struct syncache *sc, *nsc;
 	int i;
 
 	/* Cleanup hash buckets: stop timers, free entries, destroy locks. */
 	for (i = 0; i < V_tcp_syncache.hashsize; i++) {
 
 		sch = &V_tcp_syncache.hashbase[i];
 		callout_drain(&sch->sch_timer);
 
 		SCH_LOCK(sch);
 		TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc)
 			syncache_drop(sc, sch);
 		SCH_UNLOCK(sch);
 		KASSERT(TAILQ_EMPTY(&sch->sch_bucket),
 		    ("%s: sch->sch_bucket not empty", __func__));
 		KASSERT(sch->sch_length == 0, ("%s: sch->sch_length %d not 0",
 		    __func__, sch->sch_length));
 		mtx_destroy(&sch->sch_mtx);
 	}
 
 	KASSERT(uma_zone_get_cur(V_tcp_syncache.zone) == 0,
 	    ("%s: cache_count not 0", __func__));
 
 	/* Free the allocated global resources. */
 	uma_zdestroy(V_tcp_syncache.zone);
 	free(V_tcp_syncache.hashbase, M_SYNCACHE);
 
 	callout_drain(&V_tcp_syncache.secret.reseed);
 }
 #endif
 
 /*
  * Inserts a syncache entry into the specified bucket row.
  * Locks and unlocks the syncache_head autonomously.
  */
 static void
 syncache_insert(struct syncache *sc, struct syncache_head *sch)
 {
 	struct syncache *sc2;
 
 	SCH_LOCK(sch);
 
 	/*
 	 * Make sure that we don't overflow the per-bucket limit.
 	 * If the bucket is full, toss the oldest element.
 	 */
 	if (sch->sch_length >= V_tcp_syncache.bucket_limit) {
 		KASSERT(!TAILQ_EMPTY(&sch->sch_bucket),
 			("sch->sch_length incorrect"));
 		sc2 = TAILQ_LAST(&sch->sch_bucket, sch_head);
 		syncache_drop(sc2, sch);
 		TCPSTAT_INC(tcps_sc_bucketoverflow);
 	}
 
 	/* Put it into the bucket. */
 	TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash);
 	sch->sch_length++;
 
 #ifdef TCP_OFFLOAD
 	if (ADDED_BY_TOE(sc)) {
 		struct toedev *tod = sc->sc_tod;
 
 		tod->tod_syncache_added(tod, sc->sc_todctx);
 	}
 #endif
 
 	/* Reinitialize the bucket row's timer. */
 	if (sch->sch_length == 1)
 		sch->sch_nextc = ticks + INT_MAX;
 	syncache_timeout(sc, sch, 1);
 
 	SCH_UNLOCK(sch);
 
 	TCPSTAT_INC(tcps_sc_added);
 }
 
 /*
  * Remove and free entry from syncache bucket row.
  * Expects locked syncache head.
  */
 static void
 syncache_drop(struct syncache *sc, struct syncache_head *sch)
 {
 
 	SCH_LOCK_ASSERT(sch);
 
 	TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
 	sch->sch_length--;
 
 #ifdef TCP_OFFLOAD
 	if (ADDED_BY_TOE(sc)) {
 		struct toedev *tod = sc->sc_tod;
 
 		tod->tod_syncache_removed(tod, sc->sc_todctx);
 	}
 #endif
 
 	syncache_free(sc);
 }
 
 /*
  * Engage/reengage time on bucket row.
  */
 static void
 syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout)
 {
 	sc->sc_rxttime = ticks +
 		TCPTV_RTOBASE * (tcp_syn_backoff[sc->sc_rxmits]);
 	sc->sc_rxmits++;
 	if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) {
 		sch->sch_nextc = sc->sc_rxttime;
 		if (docallout)
 			callout_reset(&sch->sch_timer, sch->sch_nextc - ticks,
 			    syncache_timer, (void *)sch);
 	}
 }
 
 /*
  * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
  * If we have retransmitted an entry the maximum number of times, expire it.
  * One separate timer for each bucket row.
  */
 static void
 syncache_timer(void *xsch)
 {
 	struct syncache_head *sch = (struct syncache_head *)xsch;
 	struct syncache *sc, *nsc;
 	int tick = ticks;
 	char *s;
 
 	CURVNET_SET(sch->sch_sc->vnet);
 
 	/* NB: syncache_head has already been locked by the callout. */
 	SCH_LOCK_ASSERT(sch);
 
 	/*
 	 * In the following cycle we may remove some entries and/or
 	 * advance some timeouts, so re-initialize the bucket timer.
 	 */
 	sch->sch_nextc = tick + INT_MAX;
 
 	TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) {
 		/*
 		 * We do not check if the listen socket still exists
 		 * and accept the case where the listen socket may be
 		 * gone by the time we resend the SYN/ACK.  We do
 		 * not expect this to happens often. If it does,
 		 * then the RST will be sent by the time the remote
 		 * host does the SYN/ACK->ACK.
 		 */
 		if (TSTMP_GT(sc->sc_rxttime, tick)) {
 			if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc))
 				sch->sch_nextc = sc->sc_rxttime;
 			continue;
 		}
 		if (sc->sc_rxmits > V_tcp_syncache.rexmt_limit) {
 			if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
 				log(LOG_DEBUG, "%s; %s: Retransmits exhausted, "
 				    "giving up and removing syncache entry\n",
 				    s, __func__);
 				free(s, M_TCPLOG);
 			}
 			syncache_drop(sc, sch);
 			TCPSTAT_INC(tcps_sc_stale);
 			continue;
 		}
 		if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: Response timeout, "
 			    "retransmitting (%u) SYN|ACK\n",
 			    s, __func__, sc->sc_rxmits);
 			free(s, M_TCPLOG);
 		}
 
 		syncache_respond(sc, sch, 1);
 		TCPSTAT_INC(tcps_sc_retransmitted);
 		syncache_timeout(sc, sch, 0);
 	}
 	if (!TAILQ_EMPTY(&(sch)->sch_bucket))
 		callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick,
 			syncache_timer, (void *)(sch));
 	CURVNET_RESTORE();
 }
 
 /*
  * Find an entry in the syncache.
  * Returns always with locked syncache_head plus a matching entry or NULL.
  */
 static struct syncache *
 syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp)
 {
 	struct syncache *sc;
 	struct syncache_head *sch;
 
 #ifdef INET6
 	if (inc->inc_flags & INC_ISIPV6) {
 		sch = &V_tcp_syncache.hashbase[
 		    SYNCACHE_HASH6(inc, V_tcp_syncache.hashmask)];
 		*schp = sch;
 
 		SCH_LOCK(sch);
 
 		/* Circle through bucket row to find matching entry. */
 		TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
 			if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie))
 				return (sc);
 		}
 	} else
 #endif
 	{
 		sch = &V_tcp_syncache.hashbase[
 		    SYNCACHE_HASH(inc, V_tcp_syncache.hashmask)];
 		*schp = sch;
 
 		SCH_LOCK(sch);
 
 		/* Circle through bucket row to find matching entry. */
 		TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
 #ifdef INET6
 			if (sc->sc_inc.inc_flags & INC_ISIPV6)
 				continue;
 #endif
 			if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie))
 				return (sc);
 		}
 	}
 	SCH_LOCK_ASSERT(*schp);
 	return (NULL);			/* always returns with locked sch */
 }
 
 /*
  * This function is called when we get a RST for a
  * non-existent connection, so that we can see if the
  * connection is in the syn cache.  If it is, zap it.
  */
 void
 syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th)
 {
 	struct syncache *sc;
 	struct syncache_head *sch;
 	char *s = NULL;
 
 	sc = syncache_lookup(inc, &sch);	/* returns locked sch */
 	SCH_LOCK_ASSERT(sch);
 
 	/*
 	 * Any RST to our SYN|ACK must not carry ACK, SYN or FIN flags.
 	 * See RFC 793 page 65, section SEGMENT ARRIVES.
 	 */
 	if (th->th_flags & (TH_ACK|TH_SYN|TH_FIN)) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: Spurious RST with ACK, SYN or "
 			    "FIN flag set, segment ignored\n", s, __func__);
 		TCPSTAT_INC(tcps_badrst);
 		goto done;
 	}
 
 	/*
 	 * No corresponding connection was found in syncache.
 	 * If syncookies are enabled and possibly exclusively
 	 * used, or we are under memory pressure, a valid RST
 	 * may not find a syncache entry.  In that case we're
 	 * done and no SYN|ACK retransmissions will happen.
 	 * Otherwise the RST was misdirected or spoofed.
 	 */
 	if (sc == NULL) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: Spurious RST without matching "
 			    "syncache entry (possibly syncookie only), "
 			    "segment ignored\n", s, __func__);
 		TCPSTAT_INC(tcps_badrst);
 		goto done;
 	}
 
 	/*
 	 * If the RST bit is set, check the sequence number to see
 	 * if this is a valid reset segment.
 	 * RFC 793 page 37:
 	 *   In all states except SYN-SENT, all reset (RST) segments
 	 *   are validated by checking their SEQ-fields.  A reset is
 	 *   valid if its sequence number is in the window.
 	 *
 	 *   The sequence number in the reset segment is normally an
 	 *   echo of our outgoing acknowlegement numbers, but some hosts
 	 *   send a reset with the sequence number at the rightmost edge
 	 *   of our receive window, and we have to handle this case.
 	 */
 	if (SEQ_GEQ(th->th_seq, sc->sc_irs) &&
 	    SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) {
 		syncache_drop(sc, sch);
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: Our SYN|ACK was rejected, "
 			    "connection attempt aborted by remote endpoint\n",
 			    s, __func__);
 		TCPSTAT_INC(tcps_sc_reset);
 	} else {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: RST with invalid SEQ %u != "
 			    "IRS %u (+WND %u), segment ignored\n",
 			    s, __func__, th->th_seq, sc->sc_irs, sc->sc_wnd);
 		TCPSTAT_INC(tcps_badrst);
 	}
 
 done:
 	if (s != NULL)
 		free(s, M_TCPLOG);
 	SCH_UNLOCK(sch);
 }
 
 void
 syncache_badack(struct in_conninfo *inc)
 {
 	struct syncache *sc;
 	struct syncache_head *sch;
 
 	sc = syncache_lookup(inc, &sch);	/* returns locked sch */
 	SCH_LOCK_ASSERT(sch);
 	if (sc != NULL) {
 		syncache_drop(sc, sch);
 		TCPSTAT_INC(tcps_sc_badack);
 	}
 	SCH_UNLOCK(sch);
 }
 
 void
 syncache_unreach(struct in_conninfo *inc, struct tcphdr *th)
 {
 	struct syncache *sc;
 	struct syncache_head *sch;
 
 	sc = syncache_lookup(inc, &sch);	/* returns locked sch */
 	SCH_LOCK_ASSERT(sch);
 	if (sc == NULL)
 		goto done;
 
 	/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
 	if (ntohl(th->th_seq) != sc->sc_iss)
 		goto done;
 
 	/*
 	 * If we've rertransmitted 3 times and this is our second error,
 	 * we remove the entry.  Otherwise, we allow it to continue on.
 	 * This prevents us from incorrectly nuking an entry during a
 	 * spurious network outage.
 	 *
 	 * See tcp_notify().
 	 */
 	if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxmits < 3 + 1) {
 		sc->sc_flags |= SCF_UNREACH;
 		goto done;
 	}
 	syncache_drop(sc, sch);
 	TCPSTAT_INC(tcps_sc_unreach);
 done:
 	SCH_UNLOCK(sch);
 }
 
 /*
  * Build a new TCP socket structure from a syncache entry.
  */
 static struct socket *
 syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
 {
 	struct inpcb *inp = NULL;
 	struct socket *so;
 	struct tcpcb *tp;
 	int error;
 	char *s;
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 
 	/*
 	 * Ok, create the full blown connection, and set things up
 	 * as they would have been set up if we had created the
 	 * connection when the SYN arrived.  If we can't create
 	 * the connection, abort it.
 	 */
 	so = sonewconn(lso, 0);
 	if (so == NULL) {
 		/*
 		 * Drop the connection; we will either send a RST or
 		 * have the peer retransmit its SYN again after its
 		 * RTO and try again.
 		 */
 		TCPSTAT_INC(tcps_listendrop);
 		if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: Socket create failed "
 			    "due to limits or memory shortage\n",
 			    s, __func__);
 			free(s, M_TCPLOG);
 		}
 		goto abort2;
 	}
 #ifdef MAC
 	mac_socketpeer_set_from_mbuf(m, so);
 #endif
 
 	inp = sotoinpcb(so);
 	inp->inp_inc.inc_fibnum = so->so_fibnum;
 	INP_WLOCK(inp);
 	INP_HASH_WLOCK(&V_tcbinfo);
 
 	/* Insert new socket into PCB hash list. */
 	inp->inp_inc.inc_flags = sc->sc_inc.inc_flags;
 #ifdef INET6
 	if (sc->sc_inc.inc_flags & INC_ISIPV6) {
 		inp->in6p_laddr = sc->sc_inc.inc6_laddr;
 	} else {
 		inp->inp_vflag &= ~INP_IPV6;
 		inp->inp_vflag |= INP_IPV4;
 #endif
 		inp->inp_laddr = sc->sc_inc.inc_laddr;
 #ifdef INET6
 	}
 #endif
 
 	/*
 	 * If there's an mbuf and it has a flowid, then let's initialise the
 	 * inp with that particular flowid.
 	 */
 	if (m != NULL && m->m_flags & M_FLOWID) {
 		inp->inp_flags |= INP_HW_FLOWID;
 		inp->inp_flags &= ~INP_SW_FLOWID;
 		inp->inp_flowid = m->m_pkthdr.flowid;
 		inp->inp_flowtype = M_HASHTYPE_GET(m);
 	}
 
 	/*
 	 * Install in the reservation hash table for now, but don't yet
 	 * install a connection group since the full 4-tuple isn't yet
 	 * configured.
 	 */
 	inp->inp_lport = sc->sc_inc.inc_lport;
 	if ((error = in_pcbinshash_nopcbgroup(inp)) != 0) {
 		/*
 		 * Undo the assignments above if we failed to
 		 * put the PCB on the hash lists.
 		 */
 #ifdef INET6
 		if (sc->sc_inc.inc_flags & INC_ISIPV6)
 			inp->in6p_laddr = in6addr_any;
 		else
 #endif
 			inp->inp_laddr.s_addr = INADDR_ANY;
 		inp->inp_lport = 0;
 		if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: in_pcbinshash failed "
 			    "with error %i\n",
 			    s, __func__, error);
 			free(s, M_TCPLOG);
 		}
 		INP_HASH_WUNLOCK(&V_tcbinfo);
 		goto abort;
 	}
 #ifdef IPSEC
 	/* Copy old policy into new socket's. */
 	if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp))
 		printf("syncache_socket: could not copy policy\n");
 #endif
 #ifdef INET6
 	if (sc->sc_inc.inc_flags & INC_ISIPV6) {
 		struct inpcb *oinp = sotoinpcb(lso);
 		struct in6_addr laddr6;
 		struct sockaddr_in6 sin6;
 		/*
 		 * Inherit socket options from the listening socket.
 		 * Note that in6p_inputopts are not (and should not be)
 		 * copied, since it stores previously received options and is
 		 * used to detect if each new option is different than the
 		 * previous one and hence should be passed to a user.
 		 * If we copied in6p_inputopts, a user would not be able to
 		 * receive options just after calling the accept system call.
 		 */
 		inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS;
 		if (oinp->in6p_outputopts)
 			inp->in6p_outputopts =
 			    ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT);
 
 		sin6.sin6_family = AF_INET6;
 		sin6.sin6_len = sizeof(sin6);
 		sin6.sin6_addr = sc->sc_inc.inc6_faddr;
 		sin6.sin6_port = sc->sc_inc.inc_fport;
 		sin6.sin6_flowinfo = sin6.sin6_scope_id = 0;
 		laddr6 = inp->in6p_laddr;
 		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
 			inp->in6p_laddr = sc->sc_inc.inc6_laddr;
 		if ((error = in6_pcbconnect_mbuf(inp, (struct sockaddr *)&sin6,
 		    thread0.td_ucred, m)) != 0) {
 			inp->in6p_laddr = laddr6;
 			if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
 				log(LOG_DEBUG, "%s; %s: in6_pcbconnect failed "
 				    "with error %i\n",
 				    s, __func__, error);
 				free(s, M_TCPLOG);
 			}
 			INP_HASH_WUNLOCK(&V_tcbinfo);
 			goto abort;
 		}
 		/* Override flowlabel from in6_pcbconnect. */
 		inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
 		inp->inp_flow |= sc->sc_flowlabel;
 	}
 #endif /* INET6 */
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		struct in_addr laddr;
 		struct sockaddr_in sin;
 
 		inp->inp_options = (m) ? ip_srcroute(m) : NULL;
 		
 		if (inp->inp_options == NULL) {
 			inp->inp_options = sc->sc_ipopts;
 			sc->sc_ipopts = NULL;
 		}
 
 		sin.sin_family = AF_INET;
 		sin.sin_len = sizeof(sin);
 		sin.sin_addr = sc->sc_inc.inc_faddr;
 		sin.sin_port = sc->sc_inc.inc_fport;
 		bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero));
 		laddr = inp->inp_laddr;
 		if (inp->inp_laddr.s_addr == INADDR_ANY)
 			inp->inp_laddr = sc->sc_inc.inc_laddr;
 		if ((error = in_pcbconnect_mbuf(inp, (struct sockaddr *)&sin,
 		    thread0.td_ucred, m)) != 0) {
 			inp->inp_laddr = laddr;
 			if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
 				log(LOG_DEBUG, "%s; %s: in_pcbconnect failed "
 				    "with error %i\n",
 				    s, __func__, error);
 				free(s, M_TCPLOG);
 			}
 			INP_HASH_WUNLOCK(&V_tcbinfo);
 			goto abort;
 		}
 	}
 #endif /* INET */
 	INP_HASH_WUNLOCK(&V_tcbinfo);
 	tp = intotcpcb(inp);
 	tcp_state_change(tp, TCPS_SYN_RECEIVED);
 	tp->iss = sc->sc_iss;
 	tp->irs = sc->sc_irs;
 	tcp_rcvseqinit(tp);
 	tcp_sendseqinit(tp);
 	tp->snd_wl1 = sc->sc_irs;
 	tp->snd_max = tp->iss + 1;
 	tp->snd_nxt = tp->iss + 1;
 	tp->rcv_up = sc->sc_irs + 1;
 	tp->rcv_wnd = sc->sc_wnd;
 	tp->rcv_adv += tp->rcv_wnd;
 	tp->last_ack_sent = tp->rcv_nxt;
 
 	tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY);
 	if (sc->sc_flags & SCF_NOOPT)
 		tp->t_flags |= TF_NOOPT;
 	else {
 		if (sc->sc_flags & SCF_WINSCALE) {
 			tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
 			tp->snd_scale = sc->sc_requested_s_scale;
 			tp->request_r_scale = sc->sc_requested_r_scale;
 		}
 		if (sc->sc_flags & SCF_TIMESTAMP) {
 			tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
 			tp->ts_recent = sc->sc_tsreflect;
 			tp->ts_recent_age = tcp_ts_getticks();
 			tp->ts_offset = sc->sc_tsoff;
 		}
 #ifdef TCP_SIGNATURE
 		if (sc->sc_flags & SCF_SIGNATURE)
 			tp->t_flags |= TF_SIGNATURE;
 #endif
 		if (sc->sc_flags & SCF_SACK)
 			tp->t_flags |= TF_SACK_PERMIT;
 	}
 
 	if (sc->sc_flags & SCF_ECN)
 		tp->t_flags |= TF_ECN_PERMIT;
 
 	/*
 	 * Set up MSS and get cached values from tcp_hostcache.
 	 * This might overwrite some of the defaults we just set.
 	 */
 	tcp_mss(tp, sc->sc_peer_mss);
 
 	/*
 	 * If the SYN,ACK was retransmitted, indicate that CWND to be
 	 * limited to one segment in cc_conn_init().
 	 * NB: sc_rxmits counts all SYN,ACK transmits, not just retransmits.
 	 */
 	if (sc->sc_rxmits > 1)
 		tp->snd_cwnd = 1;
 
 #ifdef TCP_OFFLOAD
 	/*
 	 * Allow a TOE driver to install its hooks.  Note that we hold the
 	 * pcbinfo lock too and that prevents tcp_usr_accept from accepting a
 	 * new connection before the TOE driver has done its thing.
 	 */
 	if (ADDED_BY_TOE(sc)) {
 		struct toedev *tod = sc->sc_tod;
 
 		tod->tod_offload_socket(tod, sc->sc_todctx, so);
 	}
 #endif
 	/*
 	 * Copy and activate timers.
 	 */
 	tp->t_keepinit = sototcpcb(lso)->t_keepinit;
 	tp->t_keepidle = sototcpcb(lso)->t_keepidle;
 	tp->t_keepintvl = sototcpcb(lso)->t_keepintvl;
 	tp->t_keepcnt = sototcpcb(lso)->t_keepcnt;
 	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
 
 	INP_WUNLOCK(inp);
 
 	soisconnected(so);
 
 	TCPSTAT_INC(tcps_accepts);
 	return (so);
 
 abort:
 	INP_WUNLOCK(inp);
 abort2:
 	if (so != NULL)
 		soabort(so);
 	return (NULL);
 }
 
 /*
  * This function gets called when we receive an ACK for a
  * socket in the LISTEN state.  We look up the connection
  * in the syncache, and if its there, we pull it out of
  * the cache and turn it into a full-blown connection in
  * the SYN-RECEIVED state.
  */
 int
 syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
     struct socket **lsop, struct mbuf *m)
 {
 	struct syncache *sc;
 	struct syncache_head *sch;
 	struct syncache scs;
 	char *s;
 
 	/*
 	 * Global TCP locks are held because we manipulate the PCB lists
 	 * and create a new socket.
 	 */
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK,
 	    ("%s: can handle only ACK", __func__));
 
 	sc = syncache_lookup(inc, &sch);	/* returns locked sch */
 	SCH_LOCK_ASSERT(sch);
 
 #ifdef INVARIANTS
 	/*
 	 * Test code for syncookies comparing the syncache stored
 	 * values with the reconstructed values from the cookie.
 	 */
 	if (sc != NULL)
 		syncookie_cmp(inc, sch, sc, th, to, *lsop);
 #endif
 
 	if (sc == NULL) {
 		/*
 		 * There is no syncache entry, so see if this ACK is
 		 * a returning syncookie.  To do this, first:
 		 *  A. See if this socket has had a syncache entry dropped in
 		 *     the past.  We don't want to accept a bogus syncookie
 		 *     if we've never received a SYN.
 		 *  B. check that the syncookie is valid.  If it is, then
 		 *     cobble up a fake syncache entry, and return.
 		 */
 		if (!V_tcp_syncookies) {
 			SCH_UNLOCK(sch);
 			if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: Spurious ACK, "
 				    "segment rejected (syncookies disabled)\n",
 				    s, __func__);
 			goto failed;
 		}
 		bzero(&scs, sizeof(scs));
 		sc = syncookie_lookup(inc, sch, &scs, th, to, *lsop);
 		SCH_UNLOCK(sch);
 		if (sc == NULL) {
 			if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: Segment failed "
 				    "SYNCOOKIE authentication, segment rejected "
 				    "(probably spoofed)\n", s, __func__);
 			goto failed;
 		}
 	} else {
 		/* Pull out the entry to unlock the bucket row. */
 		TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
 		sch->sch_length--;
 #ifdef TCP_OFFLOAD
 		if (ADDED_BY_TOE(sc)) {
 			struct toedev *tod = sc->sc_tod;
 
 			tod->tod_syncache_removed(tod, sc->sc_todctx);
 		}
 #endif
 		SCH_UNLOCK(sch);
 	}
 
 	/*
 	 * Segment validation:
 	 * ACK must match our initial sequence number + 1 (the SYN|ACK).
 	 */
 	if (th->th_ack != sc->sc_iss + 1) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment "
 			    "rejected\n", s, __func__, th->th_ack, sc->sc_iss);
 		goto failed;
 	}
 
 	/*
 	 * The SEQ must fall in the window starting at the received
 	 * initial receive sequence number + 1 (the SYN).
 	 */
 	if (SEQ_LEQ(th->th_seq, sc->sc_irs) ||
 	    SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment "
 			    "rejected\n", s, __func__, th->th_seq, sc->sc_irs);
 		goto failed;
 	}
 
 	/*
 	 * If timestamps were not negotiated during SYN/ACK they
 	 * must not appear on any segment during this session.
 	 */
 	if (!(sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
 			    "segment rejected\n", s, __func__);
 		goto failed;
 	}
 
 	/*
 	 * If timestamps were negotiated during SYN/ACK they should
 	 * appear on every segment during this session.
 	 * XXXAO: This is only informal as there have been unverified
 	 * reports of non-compliants stacks.
 	 */
 	if ((sc->sc_flags & SCF_TIMESTAMP) && !(to->to_flags & TOF_TS)) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: Timestamp missing, "
 			    "no action\n", s, __func__);
 			free(s, M_TCPLOG);
 			s = NULL;
 		}
 	}
 
 	/*
 	 * If timestamps were negotiated the reflected timestamp
 	 * must be equal to what we actually sent in the SYN|ACK.
 	 */
 	if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: TSECR %u != TS %u, "
 			    "segment rejected\n",
 			    s, __func__, to->to_tsecr, sc->sc_ts);
 		goto failed;
 	}
 
 	*lsop = syncache_socket(sc, *lsop, m);
 
 	if (*lsop == NULL)
 		TCPSTAT_INC(tcps_sc_aborted);
 	else
 		TCPSTAT_INC(tcps_sc_completed);
 
 /* how do we find the inp for the new socket? */
 	if (sc != &scs)
 		syncache_free(sc);
 	return (1);
 failed:
 	if (sc != NULL && sc != &scs)
 		syncache_free(sc);
 	if (s != NULL)
 		free(s, M_TCPLOG);
 	*lsop = NULL;
 	return (0);
 }
 
 /*
  * Given a LISTEN socket and an inbound SYN request, add
  * this to the syn cache, and send back a segment:
  *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
  * to the source.
  *
  * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
  * Doing so would require that we hold onto the data and deliver it
  * to the application.  However, if we are the target of a SYN-flood
  * DoS attack, an attacker could send data which would eventually
  * consume all available buffer space if it were ACKed.  By not ACKing
  * the data, we avoid this DoS scenario.
  */
 void
 syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
     struct inpcb *inp, struct socket **lsop, struct mbuf *m, void *tod,
     void *todctx)
 {
 	struct tcpcb *tp;
 	struct socket *so;
 	struct syncache *sc = NULL;
 	struct syncache_head *sch;
 	struct mbuf *ipopts = NULL;
 	u_int ltflags;
 	int win, sb_hiwat, ip_ttl, ip_tos;
 	char *s;
 #ifdef INET6
 	int autoflowlabel = 0;
 #endif
 #ifdef MAC
 	struct label *maclabel;
 #endif
 	struct syncache scs;
 	struct ucred *cred;
 
 	INP_WLOCK_ASSERT(inp);			/* listen socket */
 	KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN,
 	    ("%s: unexpected tcp flags", __func__));
 
 	/*
 	 * Combine all so/tp operations very early to drop the INP lock as
 	 * soon as possible.
 	 */
 	so = *lsop;
 	tp = sototcpcb(so);
 	cred = crhold(so->so_cred);
 
 #ifdef INET6
 	if ((inc->inc_flags & INC_ISIPV6) &&
 	    (inp->inp_flags & IN6P_AUTOFLOWLABEL))
 		autoflowlabel = 1;
 #endif
 	ip_ttl = inp->inp_ip_ttl;
 	ip_tos = inp->inp_ip_tos;
 	win = sbspace(&so->so_rcv);
 	sb_hiwat = so->so_rcv.sb_hiwat;
 	ltflags = (tp->t_flags & (TF_NOOPT | TF_SIGNATURE));
 
 	/* By the time we drop the lock these should no longer be used. */
 	so = NULL;
 	tp = NULL;
 
 #ifdef MAC
 	if (mac_syncache_init(&maclabel) != 0) {
 		INP_WUNLOCK(inp);
 		goto done;
 	} else
 		mac_syncache_create(maclabel, inp);
 #endif
 	INP_WUNLOCK(inp);
 
 	/*
 	 * Remember the IP options, if any.
 	 */
 #ifdef INET6
 	if (!(inc->inc_flags & INC_ISIPV6))
 #endif
 #ifdef INET
 		ipopts = (m) ? ip_srcroute(m) : NULL;
 #else
 		ipopts = NULL;
 #endif
 
 	/*
 	 * See if we already have an entry for this connection.
 	 * If we do, resend the SYN,ACK, and reset the retransmit timer.
 	 *
 	 * XXX: should the syncache be re-initialized with the contents
 	 * of the new SYN here (which may have different options?)
 	 *
 	 * XXX: We do not check the sequence number to see if this is a
 	 * real retransmit or a new connection attempt.  The question is
 	 * how to handle such a case; either ignore it as spoofed, or
 	 * drop the current entry and create a new one?
 	 */
 	sc = syncache_lookup(inc, &sch);	/* returns locked entry */
 	SCH_LOCK_ASSERT(sch);
 	if (sc != NULL) {
 		TCPSTAT_INC(tcps_sc_dupsyn);
 		if (ipopts) {
 			/*
 			 * If we were remembering a previous source route,
 			 * forget it and use the new one we've been given.
 			 */
 			if (sc->sc_ipopts)
 				(void) m_free(sc->sc_ipopts);
 			sc->sc_ipopts = ipopts;
 		}
 		/*
 		 * Update timestamp if present.
 		 */
 		if ((sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS))
 			sc->sc_tsreflect = to->to_tsval;
 		else
 			sc->sc_flags &= ~SCF_TIMESTAMP;
 #ifdef MAC
 		/*
 		 * Since we have already unconditionally allocated label
 		 * storage, free it up.  The syncache entry will already
 		 * have an initialized label we can use.
 		 */
 		mac_syncache_destroy(&maclabel);
 #endif
 		/* Retransmit SYN|ACK and reset retransmit count. */
 		if ((s = tcp_log_addrs(&sc->sc_inc, th, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: Received duplicate SYN, "
 			    "resetting timer and retransmitting SYN|ACK\n",
 			    s, __func__);
 			free(s, M_TCPLOG);
 		}
 		if (syncache_respond(sc, sch, 1) == 0) {
 			sc->sc_rxmits = 0;
 			syncache_timeout(sc, sch, 1);
 			TCPSTAT_INC(tcps_sndacks);
 			TCPSTAT_INC(tcps_sndtotal);
 		}
 		SCH_UNLOCK(sch);
 		goto done;
 	}
 
 	sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO);
 	if (sc == NULL) {
 		/*
 		 * The zone allocator couldn't provide more entries.
 		 * Treat this as if the cache was full; drop the oldest
 		 * entry and insert the new one.
 		 */
 		TCPSTAT_INC(tcps_sc_zonefail);
 		if ((sc = TAILQ_LAST(&sch->sch_bucket, sch_head)) != NULL)
 			syncache_drop(sc, sch);
 		sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO);
 		if (sc == NULL) {
 			if (V_tcp_syncookies) {
 				bzero(&scs, sizeof(scs));
 				sc = &scs;
 			} else {
 				SCH_UNLOCK(sch);
 				if (ipopts)
 					(void) m_free(ipopts);
 				goto done;
 			}
 		}
 	}
 	
 	/*
 	 * Fill in the syncache values.
 	 */
 #ifdef MAC
 	sc->sc_label = maclabel;
 #endif
 	sc->sc_cred = cred;
 	cred = NULL;
 	sc->sc_ipopts = ipopts;
 	bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo));
 #ifdef INET6
 	if (!(inc->inc_flags & INC_ISIPV6))
 #endif
 	{
 		sc->sc_ip_tos = ip_tos;
 		sc->sc_ip_ttl = ip_ttl;
 	}
 #ifdef TCP_OFFLOAD
 	sc->sc_tod = tod;
 	sc->sc_todctx = todctx;
 #endif
 	sc->sc_irs = th->th_seq;
 	sc->sc_iss = arc4random();
 	sc->sc_flags = 0;
 	sc->sc_flowlabel = 0;
 
 	/*
 	 * Initial receive window: clip sbspace to [0 .. TCP_MAXWIN].
 	 * win was derived from socket earlier in the function.
 	 */
 	win = imax(win, 0);
 	win = imin(win, TCP_MAXWIN);
 	sc->sc_wnd = win;
 
 	if (V_tcp_do_rfc1323) {
 		/*
 		 * A timestamp received in a SYN makes
 		 * it ok to send timestamp requests and replies.
 		 */
 		if (to->to_flags & TOF_TS) {
 			sc->sc_tsreflect = to->to_tsval;
 			sc->sc_ts = tcp_ts_getticks();
 			sc->sc_flags |= SCF_TIMESTAMP;
 		}
 		if (to->to_flags & TOF_SCALE) {
 			int wscale = 0;
 
 			/*
 			 * Pick the smallest possible scaling factor that
 			 * will still allow us to scale up to sb_max, aka
 			 * kern.ipc.maxsockbuf.
 			 *
 			 * We do this because there are broken firewalls that
 			 * will corrupt the window scale option, leading to
 			 * the other endpoint believing that our advertised
 			 * window is unscaled.  At scale factors larger than
 			 * 5 the unscaled window will drop below 1500 bytes,
 			 * leading to serious problems when traversing these
 			 * broken firewalls.
 			 *
 			 * With the default maxsockbuf of 256K, a scale factor
 			 * of 3 will be chosen by this algorithm.  Those who
 			 * choose a larger maxsockbuf should watch out
 			 * for the compatiblity problems mentioned above.
 			 *
 			 * RFC1323: The Window field in a SYN (i.e., a <SYN>
 			 * or <SYN,ACK>) segment itself is never scaled.
 			 */
 			while (wscale < TCP_MAX_WINSHIFT &&
 			    (TCP_MAXWIN << wscale) < sb_max)
 				wscale++;
 			sc->sc_requested_r_scale = wscale;
 			sc->sc_requested_s_scale = to->to_wscale;
 			sc->sc_flags |= SCF_WINSCALE;
 		}
 	}
 #ifdef TCP_SIGNATURE
 	/*
 	 * If listening socket requested TCP digests, OR received SYN
 	 * contains the option, flag this in the syncache so that
 	 * syncache_respond() will do the right thing with the SYN+ACK.
 	 */
 	if (to->to_flags & TOF_SIGNATURE || ltflags & TF_SIGNATURE)
 		sc->sc_flags |= SCF_SIGNATURE;
 #endif
 	if (to->to_flags & TOF_SACKPERM)
 		sc->sc_flags |= SCF_SACK;
 	if (to->to_flags & TOF_MSS)
 		sc->sc_peer_mss = to->to_mss;	/* peer mss may be zero */
 	if (ltflags & TF_NOOPT)
 		sc->sc_flags |= SCF_NOOPT;
 	if ((th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn)
 		sc->sc_flags |= SCF_ECN;
 
 	if (V_tcp_syncookies)
 		sc->sc_iss = syncookie_generate(sch, sc);
 #ifdef INET6
 	if (autoflowlabel) {
 		if (V_tcp_syncookies)
 			sc->sc_flowlabel = sc->sc_iss;
 		else
 			sc->sc_flowlabel = ip6_randomflowlabel();
 		sc->sc_flowlabel = htonl(sc->sc_flowlabel) & IPV6_FLOWLABEL_MASK;
 	}
 #endif
 	SCH_UNLOCK(sch);
 
 	/*
 	 * Do a standard 3-way handshake.
 	 */
 	if (syncache_respond(sc, sch, 0) == 0) {
 		if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs)
 			syncache_free(sc);
 		else if (sc != &scs)
 			syncache_insert(sc, sch);   /* locks and unlocks sch */
 		TCPSTAT_INC(tcps_sndacks);
 		TCPSTAT_INC(tcps_sndtotal);
 	} else {
 		if (sc != &scs)
 			syncache_free(sc);
 		TCPSTAT_INC(tcps_sc_dropped);
 	}
 
 done:
 	if (cred != NULL)
 		crfree(cred);
 #ifdef MAC
 	if (sc == &scs)
 		mac_syncache_destroy(&maclabel);
 #endif
 	if (m) {
 		
 		*lsop = NULL;
 		m_freem(m);
 	}
 }
 
 static int
 syncache_respond(struct syncache *sc, struct syncache_head *sch, int locked)
 {
 	struct ip *ip = NULL;
 	struct mbuf *m;
 	struct tcphdr *th = NULL;
 	int optlen, error = 0;	/* Make compiler happy */
 	u_int16_t hlen, tlen, mssopt;
 	struct tcpopt to;
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 #endif
 #ifdef TCP_SIGNATURE
 	struct secasvar *sav;
 #endif
 
 	hlen =
 #ifdef INET6
 	       (sc->sc_inc.inc_flags & INC_ISIPV6) ? sizeof(struct ip6_hdr) :
 #endif
 		sizeof(struct ip);
 	tlen = hlen + sizeof(struct tcphdr);
 
 	/* Determine MSS we advertize to other end of connection. */
 	mssopt = tcp_mssopt(&sc->sc_inc);
 	if (sc->sc_peer_mss)
 		mssopt = max( min(sc->sc_peer_mss, mssopt), V_tcp_minmss);
 
 	/* XXX: Assume that the entire packet will fit in a header mbuf. */
 	KASSERT(max_linkhdr + tlen + TCP_MAXOLEN <= MHLEN,
 	    ("syncache: mbuf too small"));
 
 	/* Create the IP+TCP header from scratch. */
 	m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (ENOBUFS);
 #ifdef MAC
 	mac_syncache_create_mbuf(sc->sc_label, m);
 #endif
 	m->m_data += max_linkhdr;
 	m->m_len = tlen;
 	m->m_pkthdr.len = tlen;
 	m->m_pkthdr.rcvif = NULL;
 
 #ifdef INET6
 	if (sc->sc_inc.inc_flags & INC_ISIPV6) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		ip6->ip6_vfc = IPV6_VERSION;
 		ip6->ip6_nxt = IPPROTO_TCP;
 		ip6->ip6_src = sc->sc_inc.inc6_laddr;
 		ip6->ip6_dst = sc->sc_inc.inc6_faddr;
 		ip6->ip6_plen = htons(tlen - hlen);
 		/* ip6_hlim is set after checksum */
 		ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK;
 		ip6->ip6_flow |= sc->sc_flowlabel;
 
 		th = (struct tcphdr *)(ip6 + 1);
 	}
 #endif
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		ip = mtod(m, struct ip *);
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = sizeof(struct ip) >> 2;
 		ip->ip_len = htons(tlen);
 		ip->ip_id = 0;
 		ip->ip_off = 0;
 		ip->ip_sum = 0;
 		ip->ip_p = IPPROTO_TCP;
 		ip->ip_src = sc->sc_inc.inc_laddr;
 		ip->ip_dst = sc->sc_inc.inc_faddr;
 		ip->ip_ttl = sc->sc_ip_ttl;
 		ip->ip_tos = sc->sc_ip_tos;
 
 		/*
 		 * See if we should do MTU discovery.  Route lookups are
 		 * expensive, so we will only unset the DF bit if:
 		 *
 		 *	1) path_mtu_discovery is disabled
 		 *	2) the SCF_UNREACH flag has been set
 		 */
 		if (V_path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0))
 		       ip->ip_off |= htons(IP_DF);
 
 		th = (struct tcphdr *)(ip + 1);
 	}
 #endif /* INET */
 	th->th_sport = sc->sc_inc.inc_lport;
 	th->th_dport = sc->sc_inc.inc_fport;
 
 	th->th_seq = htonl(sc->sc_iss);
 	th->th_ack = htonl(sc->sc_irs + 1);
 	th->th_off = sizeof(struct tcphdr) >> 2;
 	th->th_x2 = 0;
 	th->th_flags = TH_SYN|TH_ACK;
 	th->th_win = htons(sc->sc_wnd);
 	th->th_urp = 0;
 
 	if (sc->sc_flags & SCF_ECN) {
 		th->th_flags |= TH_ECE;
 		TCPSTAT_INC(tcps_ecn_shs);
 	}
 
 	/* Tack on the TCP options. */
 	if ((sc->sc_flags & SCF_NOOPT) == 0) {
 		to.to_flags = 0;
 
 		to.to_mss = mssopt;
 		to.to_flags = TOF_MSS;
 		if (sc->sc_flags & SCF_WINSCALE) {
 			to.to_wscale = sc->sc_requested_r_scale;
 			to.to_flags |= TOF_SCALE;
 		}
 		if (sc->sc_flags & SCF_TIMESTAMP) {
 			/* Virgin timestamp or TCP cookie enhanced one. */
 			to.to_tsval = sc->sc_ts;
 			to.to_tsecr = sc->sc_tsreflect;
 			to.to_flags |= TOF_TS;
 		}
 		if (sc->sc_flags & SCF_SACK)
 			to.to_flags |= TOF_SACKPERM;
 #ifdef TCP_SIGNATURE
 		sav = NULL;
 		if (sc->sc_flags & SCF_SIGNATURE) {
 			sav = tcp_get_sav(m, IPSEC_DIR_OUTBOUND);
 			if (sav != NULL)
 				to.to_flags |= TOF_SIGNATURE;
 			else {
 
 				/*
 				 * We've got SCF_SIGNATURE flag
 				 * inherited from listening socket,
 				 * but no SADB key for given source
 				 * address. Assume signature is not
 				 * required and remove signature flag
 				 * instead of silently dropping
 				 * connection.
 				 */
 				if (locked == 0)
 					SCH_LOCK(sch);
 				sc->sc_flags &= ~SCF_SIGNATURE;
 				if (locked == 0)
 					SCH_UNLOCK(sch);
 			}
 		}
 #endif
 		optlen = tcp_addoptions(&to, (u_char *)(th + 1));
 
 		/* Adjust headers by option size. */
 		th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
 		m->m_len += optlen;
 		m->m_pkthdr.len += optlen;
 
 #ifdef TCP_SIGNATURE
 		if (sc->sc_flags & SCF_SIGNATURE)
 			tcp_signature_do_compute(m, 0, optlen,
 			    to.to_signature, sav);
 #endif
 #ifdef INET6
 		if (sc->sc_inc.inc_flags & INC_ISIPV6)
 			ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) + optlen);
 		else
 #endif
 			ip->ip_len = htons(ntohs(ip->ip_len) + optlen);
 	} else
 		optlen = 0;
 
 	M_SETFIB(m, sc->sc_inc.inc_fibnum);
 	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 #ifdef INET6
 	if (sc->sc_inc.inc_flags & INC_ISIPV6) {
 		m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
 		th->th_sum = in6_cksum_pseudo(ip6, tlen + optlen - hlen,
 		    IPPROTO_TCP, 0);
 		ip6->ip6_hlim = in6_selecthlim(NULL, NULL);
 #ifdef TCP_OFFLOAD
 		if (ADDED_BY_TOE(sc)) {
 			struct toedev *tod = sc->sc_tod;
 
 			error = tod->tod_syncache_respond(tod, sc->sc_todctx, m);
 
 			return (error);
 		}
 #endif
 		error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
 	}
 #endif
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		m->m_pkthdr.csum_flags = CSUM_TCP;
 		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 		    htons(tlen + optlen - hlen + IPPROTO_TCP));
 #ifdef TCP_OFFLOAD
 		if (ADDED_BY_TOE(sc)) {
 			struct toedev *tod = sc->sc_tod;
 
 			error = tod->tod_syncache_respond(tod, sc->sc_todctx, m);
 
 			return (error);
 		}
 #endif
 		error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL);
 	}
 #endif
 	return (error);
 }
 
 /*
  * The purpose of syncookies is to handle spoofed SYN flooding DoS attacks
  * that exceed the capacity of the syncache by avoiding the storage of any
  * of the SYNs we receive.  Syncookies defend against blind SYN flooding
  * attacks where the attacker does not have access to our responses.
  *
  * Syncookies encode and include all necessary information about the
  * connection setup within the SYN|ACK that we send back.  That way we
  * can avoid keeping any local state until the ACK to our SYN|ACK returns
  * (if ever).  Normally the syncache and syncookies are running in parallel
  * with the latter taking over when the former is exhausted.  When matching
  * syncache entry is found the syncookie is ignored.
  *
  * The only reliable information persisting the 3WHS is our inital sequence
  * number ISS of 32 bits.  Syncookies embed a cryptographically sufficient
  * strong hash (MAC) value and a few bits of TCP SYN options in the ISS
  * of our SYN|ACK.  The MAC can be recomputed when the ACK to our SYN|ACK
  * returns and signifies a legitimate connection if it matches the ACK.
  *
  * The available space of 32 bits to store the hash and to encode the SYN
  * option information is very tight and we should have at least 24 bits for
  * the MAC to keep the number of guesses by blind spoofing reasonably high.
  *
  * SYN option information we have to encode to fully restore a connection:
  * MSS: is imporant to chose an optimal segment size to avoid IP level
  *   fragmentation along the path.  The common MSS values can be encoded
  *   in a 3-bit table.  Uncommon values are captured by the next lower value
  *   in the table leading to a slight increase in packetization overhead.
  * WSCALE: is necessary to allow large windows to be used for high delay-
  *   bandwidth product links.  Not scaling the window when it was initially
  *   negotiated is bad for performance as lack of scaling further decreases
  *   the apparent available send window.  We only need to encode the WSCALE
  *   we received from the remote end.  Our end can be recalculated at any
  *   time.  The common WSCALE values can be encoded in a 3-bit table.
  *   Uncommon values are captured by the next lower value in the table
  *   making us under-estimate the available window size halving our
  *   theoretically possible maximum throughput for that connection.
  * SACK: Greatly assists in packet loss recovery and requires 1 bit.
  * TIMESTAMP and SIGNATURE is not encoded because they are permanent options
  *   that are included in all segments on a connection.  We enable them when
  *   the ACK has them.
  *
  * Security of syncookies and attack vectors:
  *
  * The MAC is computed over (faddr||laddr||fport||lport||irs||flags||secmod)
  * together with the gloabl secret to make it unique per connection attempt.
  * Thus any change of any of those parameters results in a different MAC output
  * in an unpredictable way unless a collision is encountered.  24 bits of the
  * MAC are embedded into the ISS.
  *
  * To prevent replay attacks two rotating global secrets are updated with a
  * new random value every 15 seconds.  The life-time of a syncookie is thus
  * 15-30 seconds.
  *
  * Vector 1: Attacking the secret.  This requires finding a weakness in the
  * MAC itself or the way it is used here.  The attacker can do a chosen plain
  * text attack by varying and testing the all parameters under his control.
  * The strength depends on the size and randomness of the secret, and the
  * cryptographic security of the MAC function.  Due to the constant updating
  * of the secret the attacker has at most 29.999 seconds to find the secret
  * and launch spoofed connections.  After that he has to start all over again.
  *
  * Vector 2: Collision attack on the MAC of a single ACK.  With a 24 bit MAC
  * size an average of 4,823 attempts are required for a 50% chance of success
  * to spoof a single syncookie (birthday collision paradox).  However the
  * attacker is blind and doesn't know if one of his attempts succeeded unless
  * he has a side channel to interfere success from.  A single connection setup
  * success average of 90% requires 8,790 packets, 99.99% requires 17,578 packets.
  * This many attempts are required for each one blind spoofed connection.  For
  * every additional spoofed connection he has to launch another N attempts.
  * Thus for a sustained rate 100 spoofed connections per second approximately
  * 1,800,000 packets per second would have to be sent.
  *
  * NB: The MAC function should be fast so that it doesn't become a CPU
  * exhaustion attack vector itself.
  *
  * References:
  *  RFC4987 TCP SYN Flooding Attacks and Common Mitigations
  *  SYN cookies were first proposed by cryptographer Dan J. Bernstein in 1996
  *   http://cr.yp.to/syncookies.html    (overview)
  *   http://cr.yp.to/syncookies/archive (details)
  *
  *
  * Schematic construction of a syncookie enabled Initial Sequence Number:
  *  0        1         2         3
  *  12345678901234567890123456789012
  * |xxxxxxxxxxxxxxxxxxxxxxxxWWWMMMSP|
  *
  *  x 24 MAC (truncated)
  *  W  3 Send Window Scale index
  *  M  3 MSS index
  *  S  1 SACK permitted
  *  P  1 Odd/even secret
  */
 
 /*
  * Distribution and probability of certain MSS values.  Those in between are
  * rounded down to the next lower one.
  * [An Analysis of TCP Maximum Segment Sizes, S. Alcock and R. Nelson, 2011]
  *                            .2%  .3%   5%    7%    7%    20%   15%   45%
  */
 static int tcp_sc_msstab[] = { 216, 536, 1200, 1360, 1400, 1440, 1452, 1460 };
 
 /*
  * Distribution and probability of certain WSCALE values.  We have to map the
  * (send) window scale (shift) option with a range of 0-14 from 4 bits into 3
  * bits based on prevalence of certain values.  Where we don't have an exact
  * match for are rounded down to the next lower one letting us under-estimate
  * the true available window.  At the moment this would happen only for the
  * very uncommon values 3, 5 and those above 8 (more than 16MB socket buffer
  * and window size).  The absence of the WSCALE option (no scaling in either
  * direction) is encoded with index zero.
  * [WSCALE values histograms, Allman, 2012]
  *                            X 10 10 35  5  6 14 10%   by host
  *                            X 11  4  5  5 18 49  3%   by connections
  */
 static int tcp_sc_wstab[] = { 0, 0, 1, 2, 4, 6, 7, 8 };
 
 /*
  * Compute the MAC for the SYN cookie.  SIPHASH-2-4 is chosen for its speed
  * and good cryptographic properties.
  */
 static uint32_t
 syncookie_mac(struct in_conninfo *inc, tcp_seq irs, uint8_t flags,
     uint8_t *secbits, uintptr_t secmod)
 {
 	SIPHASH_CTX ctx;
 	uint32_t siphash[2];
 
 	SipHash24_Init(&ctx);
 	SipHash_SetKey(&ctx, secbits);
 	switch (inc->inc_flags & INC_ISIPV6) {
 #ifdef INET
 	case 0:
 		SipHash_Update(&ctx, &inc->inc_faddr, sizeof(inc->inc_faddr));
 		SipHash_Update(&ctx, &inc->inc_laddr, sizeof(inc->inc_laddr));
 		break;
 #endif
 #ifdef INET6
 	case INC_ISIPV6:
 		SipHash_Update(&ctx, &inc->inc6_faddr, sizeof(inc->inc6_faddr));
 		SipHash_Update(&ctx, &inc->inc6_laddr, sizeof(inc->inc6_laddr));
 		break;
 #endif
 	}
 	SipHash_Update(&ctx, &inc->inc_fport, sizeof(inc->inc_fport));
 	SipHash_Update(&ctx, &inc->inc_lport, sizeof(inc->inc_lport));
 	SipHash_Update(&ctx, &flags, sizeof(flags));
 	SipHash_Update(&ctx, &secmod, sizeof(secmod));
 	SipHash_Final((u_int8_t *)&siphash, &ctx);
 
 	return (siphash[0] ^ siphash[1]);
 }
 
 static tcp_seq
 syncookie_generate(struct syncache_head *sch, struct syncache *sc)
 {
 	u_int i, mss, secbit, wscale;
 	uint32_t iss, hash;
 	uint8_t *secbits;
 	union syncookie cookie;
 
 	SCH_LOCK_ASSERT(sch);
 
 	cookie.cookie = 0;
 
 	/* Map our computed MSS into the 3-bit index. */
 	mss = min(tcp_mssopt(&sc->sc_inc), max(sc->sc_peer_mss, V_tcp_minmss));
 	for (i = sizeof(tcp_sc_msstab) / sizeof(*tcp_sc_msstab) - 1;
 	     tcp_sc_msstab[i] > mss && i > 0;
 	     i--)
 		;
 	cookie.flags.mss_idx = i;
 
 	/*
 	 * Map the send window scale into the 3-bit index but only if
 	 * the wscale option was received.
 	 */
 	if (sc->sc_flags & SCF_WINSCALE) {
 		wscale = sc->sc_requested_s_scale;
 		for (i = sizeof(tcp_sc_wstab) / sizeof(*tcp_sc_wstab) - 1;
 		     tcp_sc_wstab[i] > wscale && i > 0;
 		     i--)
 			;
 		cookie.flags.wscale_idx = i;
 	}
 
 	/* Can we do SACK? */
 	if (sc->sc_flags & SCF_SACK)
 		cookie.flags.sack_ok = 1;
 
 	/* Which of the two secrets to use. */
 	secbit = sch->sch_sc->secret.oddeven & 0x1;
 	cookie.flags.odd_even = secbit;
 
 	secbits = sch->sch_sc->secret.key[secbit];
 	hash = syncookie_mac(&sc->sc_inc, sc->sc_irs, cookie.cookie, secbits,
 	    (uintptr_t)sch);
 
 	/*
 	 * Put the flags into the hash and XOR them to get better ISS number
 	 * variance.  This doesn't enhance the cryptographic strength and is
 	 * done to prevent the 8 cookie bits from showing up directly on the
 	 * wire.
 	 */
 	iss = hash & ~0xff;
 	iss |= cookie.cookie ^ (hash >> 24);
 
 	/* Randomize the timestamp. */
 	if (sc->sc_flags & SCF_TIMESTAMP) {
 		sc->sc_ts = arc4random();
 		sc->sc_tsoff = sc->sc_ts - tcp_ts_getticks();
 	}
 
 	TCPSTAT_INC(tcps_sc_sendcookie);
 	return (iss);
 }
 
 static struct syncache *
 syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, 
     struct syncache *sc, struct tcphdr *th, struct tcpopt *to,
     struct socket *lso)
 {
 	uint32_t hash;
 	uint8_t *secbits;
 	tcp_seq ack, seq;
 	int wnd, wscale = 0;
 	union syncookie cookie;
 
 	SCH_LOCK_ASSERT(sch);
 
 	/*
 	 * Pull information out of SYN-ACK/ACK and revert sequence number
 	 * advances.
 	 */
 	ack = th->th_ack - 1;
 	seq = th->th_seq - 1;
 
 	/*
 	 * Unpack the flags containing enough information to restore the
 	 * connection.
 	 */
 	cookie.cookie = (ack & 0xff) ^ (ack >> 24);
 
 	/* Which of the two secrets to use. */
 	secbits = sch->sch_sc->secret.key[cookie.flags.odd_even];
 
 	hash = syncookie_mac(inc, seq, cookie.cookie, secbits, (uintptr_t)sch);
 
 	/* The recomputed hash matches the ACK if this was a genuine cookie. */
 	if ((ack & ~0xff) != (hash & ~0xff))
 		return (NULL);
 
 	/* Fill in the syncache values. */
 	sc->sc_flags = 0;
 	bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo));
 	sc->sc_ipopts = NULL;
 	
 	sc->sc_irs = seq;
 	sc->sc_iss = ack;
 
 	switch (inc->inc_flags & INC_ISIPV6) {
 #ifdef INET
 	case 0:
 		sc->sc_ip_ttl = sotoinpcb(lso)->inp_ip_ttl;
 		sc->sc_ip_tos = sotoinpcb(lso)->inp_ip_tos;
 		break;
 #endif
 #ifdef INET6
 	case INC_ISIPV6:
 		if (sotoinpcb(lso)->inp_flags & IN6P_AUTOFLOWLABEL)
 			sc->sc_flowlabel = sc->sc_iss & IPV6_FLOWLABEL_MASK;
 		break;
 #endif
 	}
 
 	sc->sc_peer_mss = tcp_sc_msstab[cookie.flags.mss_idx];
 
 	/* We can simply recompute receive window scale we sent earlier. */
 	while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < sb_max)
 		wscale++;
 
 	/* Only use wscale if it was enabled in the orignal SYN. */
 	if (cookie.flags.wscale_idx > 0) {
 		sc->sc_requested_r_scale = wscale;
 		sc->sc_requested_s_scale = tcp_sc_wstab[cookie.flags.wscale_idx];
 		sc->sc_flags |= SCF_WINSCALE;
 	}
 
 	wnd = sbspace(&lso->so_rcv);
 	wnd = imax(wnd, 0);
 	wnd = imin(wnd, TCP_MAXWIN);
 	sc->sc_wnd = wnd;
 
 	if (cookie.flags.sack_ok)
 		sc->sc_flags |= SCF_SACK;
 
 	if (to->to_flags & TOF_TS) {
 		sc->sc_flags |= SCF_TIMESTAMP;
 		sc->sc_tsreflect = to->to_tsval;
 		sc->sc_ts = to->to_tsecr;
 		sc->sc_tsoff = to->to_tsecr - tcp_ts_getticks();
 	}
 
 	if (to->to_flags & TOF_SIGNATURE)
 		sc->sc_flags |= SCF_SIGNATURE;
 
 	sc->sc_rxmits = 0;
 
 	TCPSTAT_INC(tcps_sc_recvcookie);
 	return (sc);
 }
 
 #ifdef INVARIANTS
 static int
 syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch,
     struct syncache *sc, struct tcphdr *th, struct tcpopt *to,
     struct socket *lso)
 {
 	struct syncache scs, *scx;
 	char *s;
 
 	bzero(&scs, sizeof(scs));
 	scx = syncookie_lookup(inc, sch, &scs, th, to, lso);
 
 	if ((s = tcp_log_addrs(inc, th, NULL, NULL)) == NULL)
 		return (0);
 
 	if (scx != NULL) {
 		if (sc->sc_peer_mss != scx->sc_peer_mss)
 			log(LOG_DEBUG, "%s; %s: mss different %i vs %i\n",
 			    s, __func__, sc->sc_peer_mss, scx->sc_peer_mss);
 
 		if (sc->sc_requested_r_scale != scx->sc_requested_r_scale)
 			log(LOG_DEBUG, "%s; %s: rwscale different %i vs %i\n",
 			    s, __func__, sc->sc_requested_r_scale,
 			    scx->sc_requested_r_scale);
 
 		if (sc->sc_requested_s_scale != scx->sc_requested_s_scale)
 			log(LOG_DEBUG, "%s; %s: swscale different %i vs %i\n",
 			    s, __func__, sc->sc_requested_s_scale,
 			    scx->sc_requested_s_scale);
 
 		if ((sc->sc_flags & SCF_SACK) != (scx->sc_flags & SCF_SACK))
 			log(LOG_DEBUG, "%s; %s: SACK different\n", s, __func__);
 	}
 
 	if (s != NULL)
 		free(s, M_TCPLOG);
 	return (0);
 }
 #endif /* INVARIANTS */
 
 static void
 syncookie_reseed(void *arg)
 {
 	struct tcp_syncache *sc = arg;
 	uint8_t *secbits;
 	int secbit;
 
 	/*
 	 * Reseeding the secret doesn't have to be protected by a lock.
 	 * It only must be ensured that the new random values are visible
 	 * to all CPUs in a SMP environment.  The atomic with release
 	 * semantics ensures that.
 	 */
 	secbit = (sc->secret.oddeven & 0x1) ? 0 : 1;
 	secbits = sc->secret.key[secbit];
 	arc4rand(secbits, SYNCOOKIE_SECRET_SIZE, 0);
 	atomic_add_rel_int(&sc->secret.oddeven, 1);
 
 	/* Reschedule ourself. */
 	callout_schedule(&sc->secret.reseed, SYNCOOKIE_LIFETIME * hz);
 }
 
 /*
  * Returns the current number of syncache entries.  This number
  * will probably change before you get around to calling 
  * syncache_pcblist.
  */
 int
 syncache_pcbcount(void)
 {
 	struct syncache_head *sch;
 	int count, i;
 
 	for (count = 0, i = 0; i < V_tcp_syncache.hashsize; i++) {
 		/* No need to lock for a read. */
 		sch = &V_tcp_syncache.hashbase[i];
 		count += sch->sch_length;
 	}
 	return count;
 }
 
 /*
  * Exports the syncache entries to userland so that netstat can display
  * them alongside the other sockets.  This function is intended to be
  * called only from tcp_pcblist.
  *
  * Due to concurrency on an active system, the number of pcbs exported
  * may have no relation to max_pcbs.  max_pcbs merely indicates the
  * amount of space the caller allocated for this function to use.
  */
 int
 syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported)
 {
 	struct xtcpcb xt;
 	struct syncache *sc;
 	struct syncache_head *sch;
 	int count, error, i;
 
 	for (count = 0, error = 0, i = 0; i < V_tcp_syncache.hashsize; i++) {
 		sch = &V_tcp_syncache.hashbase[i];
 		SCH_LOCK(sch);
 		TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
 			if (count >= max_pcbs) {
 				SCH_UNLOCK(sch);
 				goto exit;
 			}
 			if (cr_cansee(req->td->td_ucred, sc->sc_cred) != 0)
 				continue;
 			bzero(&xt, sizeof(xt));
 			xt.xt_len = sizeof(xt);
 			if (sc->sc_inc.inc_flags & INC_ISIPV6)
 				xt.xt_inp.inp_vflag = INP_IPV6;
 			else
 				xt.xt_inp.inp_vflag = INP_IPV4;
 			bcopy(&sc->sc_inc, &xt.xt_inp.inp_inc, sizeof (struct in_conninfo));
 			xt.xt_tp.t_inpcb = &xt.xt_inp;
 			xt.xt_tp.t_state = TCPS_SYN_RECEIVED;
 			xt.xt_socket.xso_protocol = IPPROTO_TCP;
 			xt.xt_socket.xso_len = sizeof (struct xsocket);
 			xt.xt_socket.so_type = SOCK_STREAM;
 			xt.xt_socket.so_state = SS_ISCONNECTING;
 			error = SYSCTL_OUT(req, &xt, sizeof xt);
 			if (error) {
 				SCH_UNLOCK(sch);
 				goto exit;
 			}
 			count++;
 		}
 		SCH_UNLOCK(sch);
 	}
 exit:
 	*pcbs_exported = count;
 	return error;
 }
Index: user/dchagin/lemul/sys/netinet/tcp_timewait.c
===================================================================
--- user/dchagin/lemul/sys/netinet/tcp_timewait.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/tcp_timewait.c	(revision 274241)
@@ -1,723 +1,723 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/callout.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/random.h>
 
 #include <vm/uma.h>
 
 #include <net/route.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #endif
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 #include <netinet/tcpip.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
 #ifdef INET6
 #include <netinet6/ip6protosw.h>
 #endif
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 static VNET_DEFINE(uma_zone_t, tcptw_zone);
 #define	V_tcptw_zone		VNET(tcptw_zone)
 static int	maxtcptw;
 
 /*
  * The timed wait queue contains references to each of the TCP sessions
  * currently in the TIME_WAIT state.  The queue pointers, including the
  * queue pointers in each tcptw structure, are protected using the global
  * timewait lock, which must be held over queue iteration and modification.
  *
  * Rules on tcptw usage:
  *  - a inpcb is always freed _after_ its tcptw
  *  - a tcptw relies on its inpcb reference counting for memory stability
  *  - a tcptw is dereferenceable only while its inpcb is locked
  */
 static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl);
 #define	V_twq_2msl		VNET(twq_2msl)
 
 /* Global timewait lock */
 static VNET_DEFINE(struct rwlock, tw_lock);
 #define	V_tw_lock		VNET(tw_lock)
 
 #define	TW_LOCK_INIT(tw, d)	rw_init_flags(&(tw), (d), 0)
 #define	TW_LOCK_DESTROY(tw)	rw_destroy(&(tw))
 #define	TW_RLOCK(tw)		rw_rlock(&(tw))
 #define	TW_WLOCK(tw)		rw_wlock(&(tw))
 #define	TW_RUNLOCK(tw)		rw_runlock(&(tw))
 #define	TW_WUNLOCK(tw)		rw_wunlock(&(tw))
 #define	TW_LOCK_ASSERT(tw)	rw_assert(&(tw), RA_LOCKED)
 #define	TW_RLOCK_ASSERT(tw)	rw_assert(&(tw), RA_RLOCKED)
 #define	TW_WLOCK_ASSERT(tw)	rw_assert(&(tw), RA_WLOCKED)
 #define	TW_UNLOCK_ASSERT(tw)	rw_assert(&(tw), RA_UNLOCKED)
 
 static void	tcp_tw_2msl_reset(struct tcptw *, int);
 static void	tcp_tw_2msl_stop(struct tcptw *, int);
 static int	tcp_twrespond(struct tcptw *, int);
 
 static int
 tcptw_auto_size(void)
 {
 	int halfrange;
 
 	/*
 	 * Max out at half the ephemeral port range so that TIME_WAIT
 	 * sockets don't tie up too many ephemeral ports.
 	 */
 	if (V_ipport_lastauto > V_ipport_firstauto)
 		halfrange = (V_ipport_lastauto - V_ipport_firstauto) / 2;
 	else
 		halfrange = (V_ipport_firstauto - V_ipport_lastauto) / 2;
 	/* Protect against goofy port ranges smaller than 32. */
 	return (imin(imax(halfrange, 32), maxsockets / 5));
 }
 
 static int
 sysctl_maxtcptw(SYSCTL_HANDLER_ARGS)
 {
 	int error, new;
 
 	if (maxtcptw == 0)
 		new = tcptw_auto_size();
 	else
 		new = maxtcptw;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr)
 		if (new >= 32) {
 			maxtcptw = new;
 			uma_zone_set_max(V_tcptw_zone, maxtcptw);
 		}
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw, CTLTYPE_INT|CTLFLAG_RW,
     &maxtcptw, 0, sysctl_maxtcptw, "IU",
     "Maximum number of compressed TCP TIME_WAIT entries");
 
 VNET_DEFINE(int, nolocaltimewait) = 0;
 #define	V_nolocaltimewait	VNET(nolocaltimewait)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(nolocaltimewait), 0,
     "Do not create compressed TCP TIME_WAIT entries for local connections");
 
 void
 tcp_tw_zone_change(void)
 {
 
 	if (maxtcptw == 0)
 		uma_zone_set_max(V_tcptw_zone, tcptw_auto_size());
 }
 
 void
 tcp_tw_init(void)
 {
 
 	V_tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw);
 	if (maxtcptw == 0)
 		uma_zone_set_max(V_tcptw_zone, tcptw_auto_size());
 	else
 		uma_zone_set_max(V_tcptw_zone, maxtcptw);
 	TAILQ_INIT(&V_twq_2msl);
 	TW_LOCK_INIT(V_tw_lock, "tcptw");
 }
 
 #ifdef VIMAGE
 void
 tcp_tw_destroy(void)
 {
 	struct tcptw *tw;
 
 	INP_INFO_WLOCK(&V_tcbinfo);
 	while ((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL)
 		tcp_twclose(tw, 0);
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 
 	TW_LOCK_DESTROY(V_tw_lock);
 	uma_zdestroy(V_tcptw_zone);
 }
 #endif
 
 /*
  * Move a TCP connection into TIME_WAIT state.
  *    tcbinfo is locked.
  *    inp is locked, and is unlocked before returning.
  */
 void
 tcp_twstart(struct tcpcb *tp)
 {
 	struct tcptw *tw;
 	struct inpcb *inp = tp->t_inpcb;
 	int acknow;
 	struct socket *so;
 #ifdef INET6
 	int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
 #endif
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	if (V_nolocaltimewait) {
 		int error = 0;
 #ifdef INET6
 		if (isipv6)
 			error = in6_localaddr(&inp->in6p_faddr);
 #endif
 #if defined(INET6) && defined(INET)
 		else
 #endif
 #ifdef INET
 			error = in_localip(inp->inp_faddr);
 #endif
 		if (error) {
 			tp = tcp_close(tp);
 			if (tp != NULL)
 				INP_WUNLOCK(inp);
 			return;
 		}
 	}
 
 	tw = uma_zalloc(V_tcptw_zone, M_NOWAIT);
 	if (tw == NULL) {
 		/*
 		 * Reached limit on total number of TIMEWAIT connections
 		 * allowed. Remove a connection from TIMEWAIT queue in LRU
 		 * fashion to make room for this connection.
 		 *
 		 * pcbinfo lock is needed here to prevent deadlock as
 		 * two inpcb locks can be acquired simultaneously.
 		 */
 		tw = tcp_tw_2msl_scan(1);
 		if (tw == NULL) {
 			tp = tcp_close(tp);
 			if (tp != NULL)
 				INP_WUNLOCK(inp);
 			return;
 		}
 	}
 	/*
 	 * The tcptw will hold a reference on its inpcb until tcp_twclose
 	 * is called
 	 */
 	tw->tw_inpcb = inp;
 	in_pcbref(inp);	/* Reference from tw */
 
 	/*
 	 * Recover last window size sent.
 	 */
 	if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
 		tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale;
 	else
 		tw->last_win = 0;
 
 	/*
 	 * Set t_recent if timestamps are used on the connection.
 	 */
 	if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
 	    (TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
 		tw->t_recent = tp->ts_recent;
 		tw->ts_offset = tp->ts_offset;
 	} else {
 		tw->t_recent = 0;
 		tw->ts_offset = 0;
 	}
 
 	tw->snd_nxt = tp->snd_nxt;
 	tw->rcv_nxt = tp->rcv_nxt;
 	tw->iss     = tp->iss;
 	tw->irs     = tp->irs;
 	tw->t_starttime = tp->t_starttime;
 	tw->tw_time = 0;
 
 /* XXX
  * If this code will
  * be used for fin-wait-2 state also, then we may need
  * a ts_recent from the last segment.
  */
 	acknow = tp->t_flags & TF_ACKNOW;
 
 	/*
 	 * First, discard tcpcb state, which includes stopping its timers and
 	 * freeing it.  tcp_discardcb() used to also release the inpcb, but
 	 * that work is now done in the caller.
 	 *
 	 * Note: soisdisconnected() call used to be made in tcp_discardcb(),
 	 * and might not be needed here any longer.
 	 */
 	tcp_discardcb(tp);
 	so = inp->inp_socket;
 	soisdisconnected(so);
 	tw->tw_cred = crhold(so->so_cred);
 	SOCK_LOCK(so);
 	tw->tw_so_options = so->so_options;
 	SOCK_UNLOCK(so);
 	if (acknow)
 		tcp_twrespond(tw, TH_ACK);
 	inp->inp_ppcb = tw;
 	inp->inp_flags |= INP_TIMEWAIT;
 	tcp_tw_2msl_reset(tw, 0);
 
 	/*
 	 * If the inpcb owns the sole reference to the socket, then we can
 	 * detach and free the socket as it is not needed in time wait.
 	 */
 	if (inp->inp_flags & INP_SOCKREF) {
 		KASSERT(so->so_state & SS_PROTOREF,
 		    ("tcp_twstart: !SS_PROTOREF"));
 		inp->inp_flags &= ~INP_SOCKREF;
 		INP_WUNLOCK(inp);
 		ACCEPT_LOCK();
 		SOCK_LOCK(so);
 		so->so_state &= ~SS_PROTOREF;
 		sofree(so);
 	} else
 		INP_WUNLOCK(inp);
 }
 
 /*
  * Returns 1 if the TIME_WAIT state was killed and we should start over,
  * looking for a pcb in the listen state.  Returns 0 otherwise.
  */
 int
 tcp_twcheck(struct inpcb *inp, struct tcpopt *to __unused, struct tcphdr *th,
     struct mbuf *m, int tlen)
 {
 	struct tcptw *tw;
 	int thflags;
 	tcp_seq seq;
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * XXXRW: Time wait state for inpcb has been recycled, but inpcb is
 	 * still present.  This is undesirable, but temporarily necessary
 	 * until we work out how to handle inpcb's who's timewait state has
 	 * been removed.
 	 */
 	tw = intotw(inp);
 	if (tw == NULL)
 		goto drop;
 
 	thflags = th->th_flags;
 
 	/*
 	 * NOTE: for FIN_WAIT_2 (to be added later),
 	 * must validate sequence number before accepting RST
 	 */
 
 	/*
 	 * If the segment contains RST:
 	 *	Drop the segment - see Stevens, vol. 2, p. 964 and
 	 *      RFC 1337.
 	 */
 	if (thflags & TH_RST)
 		goto drop;
 
 #if 0
 /* PAWS not needed at the moment */
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
 	 * and it's less than ts_recent, drop it.
 	 */
 	if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to.to_tsval, tp->ts_recent)) {
 		if ((thflags & TH_ACK) == 0)
 			goto drop;
 		goto ack;
 	}
 	/*
 	 * ts_recent is never updated because we never accept new segments.
 	 */
 #endif
 
 	/*
 	 * If a new connection request is received
 	 * while in TIME_WAIT, drop the old connection
 	 * and start over if the sequence numbers
 	 * are above the previous ones.
 	 */
 	if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) {
 		tcp_twclose(tw, 0);
 		return (1);
 	}
 
 	/*
 	 * Drop the segment if it does not contain an ACK.
 	 */
 	if ((thflags & TH_ACK) == 0)
 		goto drop;
 
 	/*
 	 * Reset the 2MSL timer if this is a duplicate FIN.
 	 */
 	if (thflags & TH_FIN) {
 		seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0);
 		if (seq + 1 == tw->rcv_nxt)
 			tcp_tw_2msl_reset(tw, 1);
 	}
 
 	/*
 	 * Acknowledge the segment if it has data or is not a duplicate ACK.
 	 */
 	if (thflags != TH_ACK || tlen != 0 ||
 	    th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt)
 		tcp_twrespond(tw, TH_ACK);
 drop:
 	INP_WUNLOCK(inp);
 	m_freem(m);
 	return (0);
 }
 
 void
 tcp_twclose(struct tcptw *tw, int reuse)
 {
 	struct socket *so;
 	struct inpcb *inp;
 
 	/*
 	 * At this point, we are in one of two situations:
 	 *
 	 * (1) We have no socket, just an inpcb<->twtcp pair.  We can free
 	 *     all state.
 	 *
 	 * (2) We have a socket -- if we own a reference, release it and
 	 *     notify the socket layer.
 	 */
 	inp = tw->tw_inpcb;
 	KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait"));
 	KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw"));
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);	/* in_pcbfree() */
 	INP_WLOCK_ASSERT(inp);
 
 	tcp_tw_2msl_stop(tw, reuse);
 	inp->inp_ppcb = NULL;
 	in_pcbdrop(inp);
 
 	so = inp->inp_socket;
 	if (so != NULL) {
 		/*
 		 * If there's a socket, handle two cases: first, we own a
 		 * strong reference, which we will now release, or we don't
 		 * in which case another reference exists (XXXRW: think
 		 * about this more), and we don't need to take action.
 		 */
 		if (inp->inp_flags & INP_SOCKREF) {
 			inp->inp_flags &= ~INP_SOCKREF;
 			INP_WUNLOCK(inp);
 			ACCEPT_LOCK();
 			SOCK_LOCK(so);
 			KASSERT(so->so_state & SS_PROTOREF,
 			    ("tcp_twclose: INP_SOCKREF && !SS_PROTOREF"));
 			so->so_state &= ~SS_PROTOREF;
 			sofree(so);
 		} else {
 			/*
 			 * If we don't own the only reference, the socket and
 			 * inpcb need to be left around to be handled by
 			 * tcp_usr_detach() later.
 			 */
 			INP_WUNLOCK(inp);
 		}
 	} else {
 		/*
 		 * The socket has been already cleaned-up for us, only free the
 		 * inpcb.
 		 */
 		in_pcbfree(inp);
 	}
 	TCPSTAT_INC(tcps_closed);
 }
 
 static int
 tcp_twrespond(struct tcptw *tw, int flags)
 {
 	struct inpcb *inp = tw->tw_inpcb;
 #if defined(INET6) || defined(INET)
 	struct tcphdr *th = NULL;
 #endif
 	struct mbuf *m;
 #ifdef INET
 	struct ip *ip = NULL;
 #endif
 	u_int hdrlen, optlen;
 	int error = 0;			/* Keep compiler happy */
 	struct tcpopt to;
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 	int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
 #endif
 	hdrlen = 0;                     /* Keep compiler happy */
 
 	INP_WLOCK_ASSERT(inp);
 
 	m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (ENOBUFS);
 	m->m_data += max_linkhdr;
 
 #ifdef MAC
 	mac_inpcb_create_mbuf(inp, m);
 #endif
 
 #ifdef INET6
 	if (isipv6) {
 		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 		ip6 = mtod(m, struct ip6_hdr *);
 		th = (struct tcphdr *)(ip6 + 1);
 		tcpip_fillheaders(inp, ip6, th);
 	}
 #endif
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		hdrlen = sizeof(struct tcpiphdr);
 		ip = mtod(m, struct ip *);
 		th = (struct tcphdr *)(ip + 1);
 		tcpip_fillheaders(inp, ip, th);
 	}
 #endif
 	to.to_flags = 0;
 
 	/*
 	 * Send a timestamp and echo-reply if both our side and our peer
 	 * have sent timestamps in our SYN's and this is not a RST.
 	 */
 	if (tw->t_recent && flags == TH_ACK) {
 		to.to_flags |= TOF_TS;
 		to.to_tsval = tcp_ts_getticks() + tw->ts_offset;
 		to.to_tsecr = tw->t_recent;
 	}
 	optlen = tcp_addoptions(&to, (u_char *)(th + 1));
 
 	m->m_len = hdrlen + optlen;
 	m->m_pkthdr.len = m->m_len;
 
 	KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small"));
 
 	th->th_seq = htonl(tw->snd_nxt);
 	th->th_ack = htonl(tw->rcv_nxt);
 	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
 	th->th_flags = flags;
 	th->th_win = htons(tw->last_win);
 
 	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 #ifdef INET6
 	if (isipv6) {
 		m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
 		th->th_sum = in6_cksum_pseudo(ip6,
 		    sizeof(struct tcphdr) + optlen, IPPROTO_TCP, 0);
 		ip6->ip6_hlim = in6_selecthlim(inp, NULL);
 		error = ip6_output(m, inp->in6p_outputopts, NULL,
 		    (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
 	}
 #endif
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		m->m_pkthdr.csum_flags = CSUM_TCP;
 		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 		    htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP));
 		ip->ip_len = htons(m->m_pkthdr.len);
 		if (V_path_mtu_discovery)
 			ip->ip_off |= htons(IP_DF);
 		error = ip_output(m, inp->inp_options, NULL,
 		    ((tw->tw_so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
 		    NULL, inp);
 	}
 #endif
 	if (flags & TH_ACK)
 		TCPSTAT_INC(tcps_sndacks);
 	else
 		TCPSTAT_INC(tcps_sndctrl);
 	TCPSTAT_INC(tcps_sndtotal);
 	return (error);
 }
 
 static void
 tcp_tw_2msl_reset(struct tcptw *tw, int rearm)
 {
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(tw->tw_inpcb);
 
 	TW_WLOCK(V_tw_lock);
 	if (rearm)
 		TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
 	tw->tw_time = ticks + 2 * tcp_msl;
 	TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl);
 	TW_WUNLOCK(V_tw_lock);
 }
 
 static void
 tcp_tw_2msl_stop(struct tcptw *tw, int reuse)
 {
 	struct ucred *cred;
 	struct inpcb *inp;
 	int released;
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 
 	TW_WLOCK(V_tw_lock);
 	inp = tw->tw_inpcb;
 	tw->tw_inpcb = NULL;
 
 	TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
 	cred = tw->tw_cred;
 	tw->tw_cred = NULL;
 	TW_WUNLOCK(V_tw_lock);
 
 	if (cred != NULL)
 		crfree(cred);
 
 	released = in_pcbrele_wlocked(inp);
 	KASSERT(!released, ("%s: inp should not be released here", __func__));
 
 	if (!reuse)
 		uma_zfree(V_tcptw_zone, tw);
 }
 
 struct tcptw *
 tcp_tw_2msl_scan(int reuse)
 {
 	struct tcptw *tw;
 	struct inpcb *inp;
 
 #ifdef INVARIANTS
 	if (reuse) {
 		/*
 		 * pcbinfo lock is needed in reuse case to prevent deadlock
 		 * as two inpcb locks can be acquired simultaneously:
 		 *  - the inpcb transitioning to TIME_WAIT state in
 		 *    tcp_tw_start(),
 		 *  - the inpcb closed by tcp_twclose().
 		 */
 		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	}
 #endif
 
 	for (;;) {
 		TW_RLOCK(V_tw_lock);
 		tw = TAILQ_FIRST(&V_twq_2msl);
 		if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) {
 			TW_RUNLOCK(V_tw_lock);
 			break;
 		}
 		KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL",
 		    __func__));
 
 		inp = tw->tw_inpcb;
 		in_pcbref(inp);
 		TW_RUNLOCK(V_tw_lock);
 
 		if (INP_INFO_TRY_WLOCK(&V_tcbinfo)) {
 
 			INP_WLOCK(inp);
 			tw = intotw(inp);
 			if (in_pcbrele_wlocked(inp)) {
 				KASSERT(tw == NULL, ("%s: held last inp "
 				    "reference but tw not NULL", __func__));
 				INP_INFO_WUNLOCK(&V_tcbinfo);
 				continue;
 			}
 
 			if (tw == NULL) {
 				/* tcp_twclose() has already been called */
 				INP_WUNLOCK(inp);
 				INP_INFO_WUNLOCK(&V_tcbinfo);
 				continue;
 			}
 
 			tcp_twclose(tw, reuse);
 			INP_INFO_WUNLOCK(&V_tcbinfo);
 			if (reuse)
 			    return tw;
 		} else {
 			/* INP_INFO lock is busy, continue later. */
 			INP_WLOCK(inp);
 			if (!in_pcbrele_wlocked(inp))
 				INP_WUNLOCK(inp);
 			break;
 		}
 	}
 
 	return NULL;
 }
Index: user/dchagin/lemul/sys/netinet/udp_usrreq.c
===================================================================
--- user/dchagin/lemul/sys/netinet/udp_usrreq.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet/udp_usrreq.c	(revision 274241)
@@ -1,1903 +1,1903 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.
  * Copyright (c) 2008 Robert N. M. Watson
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * Copyright (c) 2014 Kevin Lo
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)udp_usrreq.c	8.6 (Berkeley) 5/23/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ipfw.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/domain.h>
 #include <sys/eventhandler.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif
 #include <netinet/ip_icmp.h>
 #include <netinet/icmp_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #ifdef INET6
 #include <netinet6/ip6_var.h>
 #endif
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <netinet/udplite.h>
 #include <netinet/in_rss.h>
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/esp.h>
 #endif
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 /*
  * UDP and UDP-Lite protocols implementation.
  * Per RFC 768, August, 1980.
  * Per RFC 3828, July, 2004.
  */
 
 /*
  * BSD 4.2 defaulted the udp checksum to be off.  Turning off udp checksums
  * removes the only data integrity mechanism for packets and malformed
  * packets that would otherwise be discarded due to bad checksums, and may
  * cause problems (especially for NFS data blocks).
  */
 VNET_DEFINE(int, udp_cksum) = 1;
-SYSCTL_VNET_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(udp_cksum), 0, "compute udp checksum");
 
 int	udp_log_in_vain = 0;
 SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW,
     &udp_log_in_vain, 0, "Log all incoming UDP packets");
 
 VNET_DEFINE(int, udp_blackhole) = 0;
-SYSCTL_VNET_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(udp_blackhole), 0,
     "Do not send port unreachables for refused connects");
 
 u_long	udp_sendspace = 9216;		/* really max datagram size */
 					/* 40 1K datagrams */
 SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
     &udp_sendspace, 0, "Maximum outgoing UDP datagram size");
 
 u_long	udp_recvspace = 40 * (1024 +
 #ifdef INET6
 				      sizeof(struct sockaddr_in6)
 #else
 				      sizeof(struct sockaddr_in)
 #endif
 				      );
 
 SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
     &udp_recvspace, 0, "Maximum space for incoming UDP datagrams");
 
 VNET_DEFINE(struct inpcbhead, udb);		/* from udp_var.h */
 VNET_DEFINE(struct inpcbinfo, udbinfo);
 VNET_DEFINE(struct inpcbhead, ulitecb);
 VNET_DEFINE(struct inpcbinfo, ulitecbinfo);
 static VNET_DEFINE(uma_zone_t, udpcb_zone);
 #define	V_udpcb_zone			VNET(udpcb_zone)
 
 #ifndef UDBHASHSIZE
 #define	UDBHASHSIZE	128
 #endif
 
 VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat);		/* from udp_var.h */
 VNET_PCPUSTAT_SYSINIT(udpstat);
 SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat,
     udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)");
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(udpstat);
 #endif /* VIMAGE */
 #ifdef INET
 static void	udp_detach(struct socket *so);
 static int	udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
 		    struct mbuf *, struct thread *);
 #endif
 
 #ifdef IPSEC
 #ifdef IPSEC_NAT_T
 #define	UF_ESPINUDP_ALL	(UF_ESPINUDP_NON_IKE|UF_ESPINUDP)
 #ifdef INET
 static struct mbuf *udp4_espdecap(struct inpcb *, struct mbuf *, int);
 #endif
 #endif /* IPSEC_NAT_T */
 #endif /* IPSEC */
 
 static void
 udp_zone_change(void *tag)
 {
 
 	uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets);
 	uma_zone_set_max(V_udpcb_zone, maxsockets);
 }
 
 static int
 udp_inpcb_init(void *mem, int size, int flags)
 {
 	struct inpcb *inp;
 
 	inp = mem;
 	INP_LOCK_INIT(inp, "inp", "udpinp");
 	return (0);
 }
 
 static int
 udplite_inpcb_init(void *mem, int size, int flags)
 {
 	struct inpcb *inp;
 
 	inp = mem;
 	INP_LOCK_INIT(inp, "inp", "udpliteinp");
 	return (0);
 }
 
 void
 udp_init(void)
 {
 
 	/*
 	 * For now default to 2-tuple UDP hashing - until the fragment
 	 * reassembly code can also update the flowid.
 	 *
 	 * Once we can calculate the flowid that way and re-establish
 	 * a 4-tuple, flip this to 4-tuple.
 	 */
 	in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE,
 	    "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE,
 	    IPI_HASHFIELDS_2TUPLE);
 	V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_zone_set_max(V_udpcb_zone, maxsockets);
 	uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached");
 	EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL,
 	    EVENTHANDLER_PRI_ANY);
 }
 
 void
 udplite_init(void)
 {
 
 	in_pcbinfo_init(&V_ulitecbinfo, "udplite", &V_ulitecb, UDBHASHSIZE,
 	    UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init, NULL,
 	    UMA_ZONE_NOFREE, IPI_HASHFIELDS_2TUPLE);
 }
 
 /*
  * Kernel module interface for updating udpstat.  The argument is an index
  * into udpstat treated as an array of u_long.  While this encodes the
  * general layout of udpstat into the caller, it doesn't encode its location,
  * so that future changes to add, for example, per-CPU stats support won't
  * cause binary compatibility problems for kernel modules.
  */
 void
 kmod_udpstat_inc(int statnum)
 {
 
 	counter_u64_add(VNET(udpstat)[statnum], 1);
 }
 
 int
 udp_newudpcb(struct inpcb *inp)
 {
 	struct udpcb *up;
 
 	up = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO);
 	if (up == NULL)
 		return (ENOBUFS);
 	inp->inp_ppcb = up;
 	return (0);
 }
 
 void
 udp_discardcb(struct udpcb *up)
 {
 
 	uma_zfree(V_udpcb_zone, up);
 }
 
 #ifdef VIMAGE
 void
 udp_destroy(void)
 {
 
 	in_pcbinfo_destroy(&V_udbinfo);
 	uma_zdestroy(V_udpcb_zone);
 }
 
 void
 udplite_destroy(void)
 {
 
 	in_pcbinfo_destroy(&V_ulitecbinfo);
 }
 #endif
 
 #ifdef INET
 /*
  * Subroutine of udp_input(), which appends the provided mbuf chain to the
  * passed pcb/socket.  The caller must provide a sockaddr_in via udp_in that
  * contains the source address.  If the socket ends up being an IPv6 socket,
  * udp_append() will convert to a sockaddr_in6 before passing the address
  * into the socket code.
  */
 static void
 udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
     struct sockaddr_in *udp_in)
 {
 	struct sockaddr *append_sa;
 	struct socket *so;
 	struct mbuf *opts = 0;
 #ifdef INET6
 	struct sockaddr_in6 udp_in6;
 #endif
 	struct udpcb *up;
 
 	INP_LOCK_ASSERT(inp);
 
 	/*
 	 * Engage the tunneling protocol.
 	 */
 	up = intoudpcb(inp);
 	if (up->u_tun_func != NULL) {
 		(*up->u_tun_func)(n, off, inp, (struct sockaddr *)udp_in,
 		    up->u_tun_ctx);
 		return;
 	}
 
 	off += sizeof(struct udphdr);
 
 #ifdef IPSEC
 	/* Check AH/ESP integrity. */
 	if (ipsec4_in_reject(n, inp)) {
 		m_freem(n);
 		IPSECSTAT_INC(ips_in_polvio);
 		return;
 	}
 #ifdef IPSEC_NAT_T
 	up = intoudpcb(inp);
 	KASSERT(up != NULL, ("%s: udpcb NULL", __func__));
 	if (up->u_flags & UF_ESPINUDP_ALL) {	/* IPSec UDP encaps. */
 		n = udp4_espdecap(inp, n, off);
 		if (n == NULL)				/* Consumed. */
 			return;
 	}
 #endif /* IPSEC_NAT_T */
 #endif /* IPSEC */
 #ifdef MAC
 	if (mac_inpcb_check_deliver(inp, n) != 0) {
 		m_freem(n);
 		return;
 	}
 #endif /* MAC */
 	if (inp->inp_flags & INP_CONTROLOPTS ||
 	    inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
 #ifdef INET6
 		if (inp->inp_vflag & INP_IPV6)
 			(void)ip6_savecontrol_v4(inp, n, &opts, NULL);
 		else
 #endif /* INET6 */
 			ip_savecontrol(inp, &opts, ip, n);
 	}
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6) {
 		bzero(&udp_in6, sizeof(udp_in6));
 		udp_in6.sin6_len = sizeof(udp_in6);
 		udp_in6.sin6_family = AF_INET6;
 		in6_sin_2_v4mapsin6(udp_in, &udp_in6);
 		append_sa = (struct sockaddr *)&udp_in6;
 	} else
 #endif /* INET6 */
 		append_sa = (struct sockaddr *)udp_in;
 	m_adj(n, off);
 
 	so = inp->inp_socket;
 	SOCKBUF_LOCK(&so->so_rcv);
 	if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		m_freem(n);
 		if (opts)
 			m_freem(opts);
 		UDPSTAT_INC(udps_fullsock);
 	} else
 		sorwakeup_locked(so);
 }
 
 int
 udp_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct ip *ip;
 	struct udphdr *uh;
 	struct ifnet *ifp;
 	struct inpcb *inp;
 	uint16_t len, ip_len;
 	struct inpcbinfo *pcbinfo;
 	struct ip save_ip;
 	struct sockaddr_in udp_in;
 	struct mbuf *m;
 	struct m_tag *fwd_tag;
 	int cscov_partial, iphlen;
 
 	m = *mp;
 	iphlen = *offp;
 	ifp = m->m_pkthdr.rcvif;
 	*mp = NULL;
 	UDPSTAT_INC(udps_ipackets);
 
 	/*
 	 * Strip IP options, if any; should skip this, make available to
 	 * user, and use on returned packets, but we don't yet have a way to
 	 * check the checksum with options still present.
 	 */
 	if (iphlen > sizeof (struct ip)) {
 		ip_stripoptions(m);
 		iphlen = sizeof(struct ip);
 	}
 
 	/*
 	 * Get IP and UDP header together in first mbuf.
 	 */
 	ip = mtod(m, struct ip *);
 	if (m->m_len < iphlen + sizeof(struct udphdr)) {
 		if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) {
 			UDPSTAT_INC(udps_hdrops);
 			return (IPPROTO_DONE);
 		}
 		ip = mtod(m, struct ip *);
 	}
 	uh = (struct udphdr *)((caddr_t)ip + iphlen);
 	cscov_partial = (proto == IPPROTO_UDPLITE) ? 1 : 0;
 
 	/*
 	 * Destination port of 0 is illegal, based on RFC768.
 	 */
 	if (uh->uh_dport == 0)
 		goto badunlocked;
 
 	/*
 	 * Construct sockaddr format source address.  Stuff source address
 	 * and datagram in user buffer.
 	 */
 	bzero(&udp_in, sizeof(udp_in));
 	udp_in.sin_len = sizeof(udp_in);
 	udp_in.sin_family = AF_INET;
 	udp_in.sin_port = uh->uh_sport;
 	udp_in.sin_addr = ip->ip_src;
 
 	/*
 	 * Make mbuf data length reflect UDP length.  If not enough data to
 	 * reflect UDP length, drop.
 	 */
 	len = ntohs((u_short)uh->uh_ulen);
 	ip_len = ntohs(ip->ip_len) - iphlen;
 	if (proto == IPPROTO_UDPLITE && (len == 0 || len == ip_len)) {
 		/* Zero means checksum over the complete packet. */
 		if (len == 0)
 			len = ip_len;
 		cscov_partial = 0;
 	}
 	if (ip_len != len) {
 		if (len > ip_len || len < sizeof(struct udphdr)) {
 			UDPSTAT_INC(udps_badlen);
 			goto badunlocked;
 		}
 		if (proto == IPPROTO_UDP)
 			m_adj(m, len - ip_len);
 	}
 
 	/*
 	 * Save a copy of the IP header in case we want restore it for
 	 * sending an ICMP error message in response.
 	 */
 	if (!V_udp_blackhole)
 		save_ip = *ip;
 	else
 		memset(&save_ip, 0, sizeof(save_ip));
 
 	/*
 	 * Checksum extended UDP header and data.
 	 */
 	if (uh->uh_sum) {
 		u_short uh_sum;
 
 		if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) &&
 		    !cscov_partial) {
 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 				uh_sum = m->m_pkthdr.csum_data;
 			else
 				uh_sum = in_pseudo(ip->ip_src.s_addr,
 				    ip->ip_dst.s_addr, htonl((u_short)len +
 				    m->m_pkthdr.csum_data + proto));
 			uh_sum ^= 0xffff;
 		} else {
 			char b[9];
 
 			bcopy(((struct ipovly *)ip)->ih_x1, b, 9);
 			bzero(((struct ipovly *)ip)->ih_x1, 9);
 			((struct ipovly *)ip)->ih_len = (proto == IPPROTO_UDP) ?
 			    uh->uh_ulen : htons(ip_len);
 			uh_sum = in_cksum(m, len + sizeof (struct ip));
 			bcopy(b, ((struct ipovly *)ip)->ih_x1, 9);
 		}
 		if (uh_sum) {
 			UDPSTAT_INC(udps_badsum);
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 	} else {
 		if (proto == IPPROTO_UDP) {
 			UDPSTAT_INC(udps_nosum);
 		} else {
 			/* UDPLite requires a checksum */
 			/* XXX: What is the right UDPLite MIB counter here? */
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 	}
 
 	pcbinfo = get_inpcbinfo(proto);
 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
 	    in_broadcast(ip->ip_dst, ifp)) {
 		struct inpcb *last;
 		struct inpcbhead *pcblist;
 		struct ip_moptions *imo;
 
 		INP_INFO_RLOCK(pcbinfo);
 		pcblist = get_pcblist(proto);
 		last = NULL;
 		LIST_FOREACH(inp, pcblist, inp_list) {
 			if (inp->inp_lport != uh->uh_dport)
 				continue;
 #ifdef INET6
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_laddr.s_addr != INADDR_ANY &&
 			    inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
 				continue;
 			if (inp->inp_faddr.s_addr != INADDR_ANY &&
 			    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
 				continue;
 			if (inp->inp_fport != 0 &&
 			    inp->inp_fport != uh->uh_sport)
 				continue;
 
 			INP_RLOCK(inp);
 
 			/*
 			 * XXXRW: Because we weren't holding either the inpcb
 			 * or the hash lock when we checked for a match
 			 * before, we should probably recheck now that the
 			 * inpcb lock is held.
 			 */
 
 			/*
 			 * Handle socket delivery policy for any-source
 			 * and source-specific multicast. [RFC3678]
 			 */
 			imo = inp->inp_moptions;
 			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
 				struct sockaddr_in	 group;
 				int			 blocked;
 				if (imo == NULL) {
 					INP_RUNLOCK(inp);
 					continue;
 				}
 				bzero(&group, sizeof(struct sockaddr_in));
 				group.sin_len = sizeof(struct sockaddr_in);
 				group.sin_family = AF_INET;
 				group.sin_addr = ip->ip_dst;
 
 				blocked = imo_multi_filter(imo, ifp,
 					(struct sockaddr *)&group,
 					(struct sockaddr *)&udp_in);
 				if (blocked != MCAST_PASS) {
 					if (blocked == MCAST_NOTGMEMBER)
 						IPSTAT_INC(ips_notmember);
 					if (blocked == MCAST_NOTSMEMBER ||
 					    blocked == MCAST_MUTED)
 						UDPSTAT_INC(udps_filtermcast);
 					INP_RUNLOCK(inp);
 					continue;
 				}
 			}
 			if (last != NULL) {
 				struct mbuf *n;
 
 				if ((n = m_copy(m, 0, M_COPYALL)) != NULL) {
 					UDP_PROBE(receive, NULL, last, ip,
 					    last, uh);
 					udp_append(last, ip, n, iphlen,
 					    &udp_in);
 				}
 				INP_RUNLOCK(last);
 			}
 			last = inp;
 			/*
 			 * Don't look for additional matches if this one does
 			 * not have either the SO_REUSEPORT or SO_REUSEADDR
 			 * socket options set.  This heuristic avoids
 			 * searching through all pcbs in the common case of a
 			 * non-shared port.  It assumes that an application
 			 * will never clear these options after setting them.
 			 */
 			if ((last->inp_socket->so_options &
 			    (SO_REUSEPORT|SO_REUSEADDR)) == 0)
 				break;
 		}
 
 		if (last == NULL) {
 			/*
 			 * No matching pcb found; discard datagram.  (No need
 			 * to send an ICMP Port Unreachable for a broadcast
 			 * or multicast datgram.)
 			 */
 			UDPSTAT_INC(udps_noportbcast);
 			if (inp)
 				INP_RUNLOCK(inp);
 			INP_INFO_RUNLOCK(pcbinfo);
 			goto badunlocked;
 		}
 		UDP_PROBE(receive, NULL, last, ip, last, uh);
 		udp_append(last, ip, m, iphlen, &udp_in);
 		INP_RUNLOCK(last);
 		INP_INFO_RUNLOCK(pcbinfo);
 		return (IPPROTO_DONE);
 	}
 
 	/*
 	 * Locate pcb for datagram.
 	 */
 
 	/*
 	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
 	 */
 	if ((m->m_flags & M_IP_NEXTHOP) &&
 	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
 		struct sockaddr_in *next_hop;
 
 		next_hop = (struct sockaddr_in *)(fwd_tag + 1);
 
 		/*
 		 * Transparently forwarded. Pretend to be the destination.
 		 * Already got one like this?
 		 */
 		inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
 		    ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m);
 		if (!inp) {
 			/*
 			 * It's new.  Try to find the ambushing socket.
 			 * Because we've rewritten the destination address,
 			 * any hardware-generated hash is ignored.
 			 */
 			inp = in_pcblookup(pcbinfo, ip->ip_src,
 			    uh->uh_sport, next_hop->sin_addr,
 			    next_hop->sin_port ? htons(next_hop->sin_port) :
 			    uh->uh_dport, INPLOOKUP_WILDCARD |
 			    INPLOOKUP_RLOCKPCB, ifp);
 		}
 		/* Remove the tag from the packet. We don't need it anymore. */
 		m_tag_delete(m, fwd_tag);
 		m->m_flags &= ~M_IP_NEXTHOP;
 	} else
 		inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
 		    ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD |
 		    INPLOOKUP_RLOCKPCB, ifp, m);
 	if (inp == NULL) {
 		if (udp_log_in_vain) {
 			char buf[4*sizeof "123"];
 
 			strcpy(buf, inet_ntoa(ip->ip_dst));
 			log(LOG_INFO,
 			    "Connection attempt to UDP %s:%d from %s:%d\n",
 			    buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src),
 			    ntohs(uh->uh_sport));
 		}
 		UDPSTAT_INC(udps_noport);
 		if (m->m_flags & (M_BCAST | M_MCAST)) {
 			UDPSTAT_INC(udps_noportbcast);
 			goto badunlocked;
 		}
 		if (V_udp_blackhole)
 			goto badunlocked;
 		if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
 			goto badunlocked;
 		*ip = save_ip;
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
 		return (IPPROTO_DONE);
 	}
 
 	/*
 	 * Check the minimum TTL for socket.
 	 */
 	INP_RLOCK_ASSERT(inp);
 	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) {
 		INP_RUNLOCK(inp);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 	if (cscov_partial) {
 		struct udpcb *up;
 
 		up = intoudpcb(inp);
 		if (up->u_rxcslen == 0 || up->u_rxcslen > len) {
 			INP_RUNLOCK(inp);
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 	}
 
 	UDP_PROBE(receive, NULL, inp, ip, inp, uh);
 	udp_append(inp, ip, m, iphlen, &udp_in);
 	INP_RUNLOCK(inp);
 	return (IPPROTO_DONE);
 
 badunlocked:
 	m_freem(m);
 	return (IPPROTO_DONE);
 }
 #endif /* INET */
 
 /*
  * Notify a udp user of an asynchronous error; just wake up so that they can
  * collect error status.
  */
 struct inpcb *
 udp_notify(struct inpcb *inp, int errno)
 {
 
 	/*
 	 * While udp_ctlinput() always calls udp_notify() with a read lock
 	 * when invoking it directly, in_pcbnotifyall() currently uses write
 	 * locks due to sharing code with TCP.  For now, accept either a read
 	 * or a write lock, but a read lock is sufficient.
 	 */
 	INP_LOCK_ASSERT(inp);
 
 	inp->inp_socket->so_error = errno;
 	sorwakeup(inp->inp_socket);
 	sowwakeup(inp->inp_socket);
 	return (inp);
 }
 
 #ifdef INET
 static void
 udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip,
     struct inpcbinfo *pcbinfo)
 {
 	struct ip *ip = vip;
 	struct udphdr *uh;
 	struct in_addr faddr;
 	struct inpcb *inp;
 
 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
 		return;
 
 	/*
 	 * Redirects don't need to be handled up here.
 	 */
 	if (PRC_IS_REDIRECT(cmd))
 		return;
 
 	/*
 	 * Hostdead is ugly because it goes linearly through all PCBs.
 	 *
 	 * XXX: We never get this from ICMP, otherwise it makes an excellent
 	 * DoS attack on machines with many connections.
 	 */
 	if (cmd == PRC_HOSTDEAD)
 		ip = NULL;
 	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
 		return;
 	if (ip != NULL) {
 		uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 		inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport,
 		    ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL);
 		if (inp != NULL) {
 			INP_RLOCK_ASSERT(inp);
 			if (inp->inp_socket != NULL) {
 				udp_notify(inp, inetctlerrmap[cmd]);
 			}
 			INP_RUNLOCK(inp);
 		}
 	} else
 		in_pcbnotifyall(pcbinfo, faddr, inetctlerrmap[cmd],
 		    udp_notify);
 }
 void
 udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 
 	return (udp_common_ctlinput(cmd, sa, vip, &V_udbinfo));
 }
 
 void
 udplite_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 
 	return (udp_common_ctlinput(cmd, sa, vip, &V_ulitecbinfo));
 }
 #endif /* INET */
 
 static int
 udp_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	int error, i, n;
 	struct inpcb *inp, **inp_list;
 	inp_gen_t gencnt;
 	struct xinpgen xig;
 
 	/*
 	 * The process of preparing the PCB list is too time-consuming and
 	 * resource-intensive to repeat twice on every request.
 	 */
 	if (req->oldptr == 0) {
 		n = V_udbinfo.ipi_count;
 		n += imax(n / 8, 10);
 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
 		return (0);
 	}
 
 	if (req->newptr != 0)
 		return (EPERM);
 
 	/*
 	 * OK, now we're committed to doing something.
 	 */
 	INP_INFO_RLOCK(&V_udbinfo);
 	gencnt = V_udbinfo.ipi_gencnt;
 	n = V_udbinfo.ipi_count;
 	INP_INFO_RUNLOCK(&V_udbinfo);
 
 	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
 		+ n * sizeof(struct xinpcb));
 	if (error != 0)
 		return (error);
 
 	xig.xig_len = sizeof xig;
 	xig.xig_count = n;
 	xig.xig_gen = gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return (error);
 
 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
 	if (inp_list == 0)
 		return (ENOMEM);
 
 	INP_INFO_RLOCK(&V_udbinfo);
 	for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n;
 	     inp = LIST_NEXT(inp, inp_list)) {
 		INP_WLOCK(inp);
 		if (inp->inp_gencnt <= gencnt &&
 		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
 			in_pcbref(inp);
 			inp_list[i++] = inp;
 		}
 		INP_WUNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(&V_udbinfo);
 	n = i;
 
 	error = 0;
 	for (i = 0; i < n; i++) {
 		inp = inp_list[i];
 		INP_RLOCK(inp);
 		if (inp->inp_gencnt <= gencnt) {
 			struct xinpcb xi;
 
 			bzero(&xi, sizeof(xi));
 			xi.xi_len = sizeof xi;
 			/* XXX should avoid extra copy */
 			bcopy(inp, &xi.xi_inp, sizeof *inp);
 			if (inp->inp_socket)
 				sotoxsocket(inp->inp_socket, &xi.xi_socket);
 			xi.xi_inp.inp_gencnt = inp->inp_gencnt;
 			INP_RUNLOCK(inp);
 			error = SYSCTL_OUT(req, &xi, sizeof xi);
 		} else
 			INP_RUNLOCK(inp);
 	}
 	INP_INFO_WLOCK(&V_udbinfo);
 	for (i = 0; i < n; i++) {
 		inp = inp_list[i];
 		INP_RLOCK(inp);
 		if (!in_pcbrele_rlocked(inp))
 			INP_RUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(&V_udbinfo);
 
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.  If the
 		 * generation differs from what we told her before, she knows
 		 * that something happened while we were processing this
 		 * request, and it might be necessary to retry.
 		 */
 		INP_INFO_RLOCK(&V_udbinfo);
 		xig.xig_gen = V_udbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = V_udbinfo.ipi_count;
 		INP_INFO_RUNLOCK(&V_udbinfo);
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 	free(inp_list, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist,
     CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0,
     udp_pcblist, "S,xinpcb", "List of active UDP sockets");
 
 #ifdef INET
 static int
 udp_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct xucred xuc;
 	struct sockaddr_in addrs[2];
 	struct inpcb *inp;
 	int error;
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error)
 		return (error);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 	inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
 	    addrs[0].sin_addr, addrs[0].sin_port,
 	    INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
 	if (inp != NULL) {
 		INP_RLOCK_ASSERT(inp);
 		if (inp->inp_socket == NULL)
 			error = ENOENT;
 		if (error == 0)
 			error = cr_canseeinpcb(req->td->td_ucred, inp);
 		if (error == 0)
 			cru2x(inp->inp_cred, &xuc);
 		INP_RUNLOCK(inp);
 	} else
 		error = ENOENT;
 	if (error == 0)
 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
     udp_getcred, "S,xucred", "Get the xucred of a UDP connection");
 #endif /* INET */
 
 int
 udp_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct inpcb *inp;
 	struct udpcb *up;
 	int isudplite, error, optval;
 
 	error = 0;
 	isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 1 : 0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
 	INP_WLOCK(inp);
 	if (sopt->sopt_level != so->so_proto->pr_protocol) {
 #ifdef INET6
 		if (INP_CHECK_SOCKAF(so, AF_INET6)) {
 			INP_WUNLOCK(inp);
 			error = ip6_ctloutput(so, sopt);
 		}
 #endif
 #if defined(INET) && defined(INET6)
 		else
 #endif
 #ifdef INET
 		{
 			INP_WUNLOCK(inp);
 			error = ip_ctloutput(so, sopt);
 		}
 #endif
 		return (error);
 	}
 
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case UDP_ENCAP:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 			inp = sotoinpcb(so);
 			KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
 			INP_WLOCK(inp);
 #ifdef IPSEC_NAT_T
 			up = intoudpcb(inp);
 			KASSERT(up != NULL, ("%s: up == NULL", __func__));
 #endif
 			switch (optval) {
 			case 0:
 				/* Clear all UDP encap. */
 #ifdef IPSEC_NAT_T
 				up->u_flags &= ~UF_ESPINUDP_ALL;
 #endif
 				break;
 #ifdef IPSEC_NAT_T
 			case UDP_ENCAP_ESPINUDP:
 			case UDP_ENCAP_ESPINUDP_NON_IKE:
 				up->u_flags &= ~UF_ESPINUDP_ALL;
 				if (optval == UDP_ENCAP_ESPINUDP)
 					up->u_flags |= UF_ESPINUDP;
 				else if (optval == UDP_ENCAP_ESPINUDP_NON_IKE)
 					up->u_flags |= UF_ESPINUDP_NON_IKE;
 				break;
 #endif
 			default:
 				error = EINVAL;
 				break;
 			}
 			INP_WUNLOCK(inp);
 			break;
 		case UDPLITE_SEND_CSCOV:
 		case UDPLITE_RECV_CSCOV:
 			if (!isudplite) {
 				INP_WUNLOCK(inp);
 				error = ENOPROTOOPT;
 				break;
 			}
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof(optval),
 			    sizeof(optval));
 			if (error != 0)
 				break;
 			inp = sotoinpcb(so);
 			KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
 			INP_WLOCK(inp);
 			up = intoudpcb(inp);
 			KASSERT(up != NULL, ("%s: up == NULL", __func__));
 			if ((optval != 0 && optval < 8) || (optval > 65535)) {
 				INP_WUNLOCK(inp);
 				error = EINVAL;
 				break;
 			}
 			if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
 				up->u_txcslen = optval;
 			else
 				up->u_rxcslen = optval;
 			INP_WUNLOCK(inp);
 			break;
 		default:
 			INP_WUNLOCK(inp);
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 #ifdef IPSEC_NAT_T
 		case UDP_ENCAP:
 			up = intoudpcb(inp);
 			KASSERT(up != NULL, ("%s: up == NULL", __func__));
 			optval = up->u_flags & UF_ESPINUDP_ALL;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 #endif
 		case UDPLITE_SEND_CSCOV:
 		case UDPLITE_RECV_CSCOV:
 			if (!isudplite) {
 				INP_WUNLOCK(inp);
 				error = ENOPROTOOPT;
 				break;
 			}
 			up = intoudpcb(inp);
 			KASSERT(up != NULL, ("%s: up == NULL", __func__));
 			if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
 				optval = up->u_txcslen;
 			else
 				optval = up->u_rxcslen;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
 		default:
 			INP_WUNLOCK(inp);
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	}	
 	return (error);
 }
 
 #ifdef INET
 #define	UH_WLOCKED	2
 #define	UH_RLOCKED	1
 #define	UH_UNLOCKED	0
 static int
 udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
     struct mbuf *control, struct thread *td)
 {
 	struct udpiphdr *ui;
 	int len = m->m_pkthdr.len;
 	struct in_addr faddr, laddr;
 	struct cmsghdr *cm;
 	struct inpcbinfo *pcbinfo;
 	struct sockaddr_in *sin, src;
 	int cscov_partial = 0;
 	int error = 0;
 	int ipflags;
 	u_short fport, lport;
 	int unlock_udbinfo;
 	u_char tos;
 	uint8_t pr;
 	uint16_t cscov = 0;
 	uint32_t flowid = 0;
 	int flowid_type = 0;
 	int use_flowid = 0;
 
 	/*
 	 * udp_output() may need to temporarily bind or connect the current
 	 * inpcb.  As such, we don't know up front whether we will need the
 	 * pcbinfo lock or not.  Do any work to decide what is needed up
 	 * front before acquiring any locks.
 	 */
 	if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
 		if (control)
 			m_freem(control);
 		m_freem(m);
 		return (EMSGSIZE);
 	}
 
 	src.sin_family = 0;
 	INP_RLOCK(inp);
 	tos = inp->inp_ip_tos;
 	if (control != NULL) {
 		/*
 		 * XXX: Currently, we assume all the optional information is
 		 * stored in a single mbuf.
 		 */
 		if (control->m_next) {
 			INP_RUNLOCK(inp);
 			m_freem(control);
 			m_freem(m);
 			return (EINVAL);
 		}
 		for (; control->m_len > 0;
 		    control->m_data += CMSG_ALIGN(cm->cmsg_len),
 		    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
 			cm = mtod(control, struct cmsghdr *);
 			if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0
 			    || cm->cmsg_len > control->m_len) {
 				error = EINVAL;
 				break;
 			}
 			if (cm->cmsg_level != IPPROTO_IP)
 				continue;
 
 			switch (cm->cmsg_type) {
 			case IP_SENDSRCADDR:
 				if (cm->cmsg_len !=
 				    CMSG_LEN(sizeof(struct in_addr))) {
 					error = EINVAL;
 					break;
 				}
 				bzero(&src, sizeof(src));
 				src.sin_family = AF_INET;
 				src.sin_len = sizeof(src);
 				src.sin_port = inp->inp_lport;
 				src.sin_addr =
 				    *(struct in_addr *)CMSG_DATA(cm);
 				break;
 
 			case IP_TOS:
 				if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) {
 					error = EINVAL;
 					break;
 				}
 				tos = *(u_char *)CMSG_DATA(cm);
 				break;
 
 			case IP_FLOWID:
 				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
 					error = EINVAL;
 					break;
 				}
 				flowid = *(uint32_t *) CMSG_DATA(cm);
 				break;
 
 			case IP_FLOWTYPE:
 				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
 					error = EINVAL;
 					break;
 				}
 				flowid_type = *(uint32_t *) CMSG_DATA(cm);
 				use_flowid = 1;
 				break;
 
 #ifdef	RSS
 			case IP_RSSBUCKETID:
 				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
 					error = EINVAL;
 					break;
 				}
 				/* This is just a placeholder for now */
 				break;
 #endif	/* RSS */
 			default:
 				error = ENOPROTOOPT;
 				break;
 			}
 			if (error)
 				break;
 		}
 		m_freem(control);
 	}
 	if (error) {
 		INP_RUNLOCK(inp);
 		m_freem(m);
 		return (error);
 	}
 
 	/*
 	 * Depending on whether or not the application has bound or connected
 	 * the socket, we may have to do varying levels of work.  The optimal
 	 * case is for a connected UDP socket, as a global lock isn't
 	 * required at all.
 	 *
 	 * In order to decide which we need, we require stability of the
 	 * inpcb binding, which we ensure by acquiring a read lock on the
 	 * inpcb.  This doesn't strictly follow the lock order, so we play
 	 * the trylock and retry game; note that we may end up with more
 	 * conservative locks than required the second time around, so later
 	 * assertions have to accept that.  Further analysis of the number of
 	 * misses under contention is required.
 	 *
 	 * XXXRW: Check that hash locking update here is correct.
 	 */
 	pr = inp->inp_socket->so_proto->pr_protocol;
 	pcbinfo = get_inpcbinfo(pr);
 	sin = (struct sockaddr_in *)addr;
 	if (sin != NULL &&
 	    (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) {
 		INP_RUNLOCK(inp);
 		INP_WLOCK(inp);
 		INP_HASH_WLOCK(pcbinfo);
 		unlock_udbinfo = UH_WLOCKED;
 	} else if ((sin != NULL && (
 	    (sin->sin_addr.s_addr == INADDR_ANY) ||
 	    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
 	    (inp->inp_laddr.s_addr == INADDR_ANY) ||
 	    (inp->inp_lport == 0))) ||
 	    (src.sin_family == AF_INET)) {
 		INP_HASH_RLOCK(pcbinfo);
 		unlock_udbinfo = UH_RLOCKED;
 	} else
 		unlock_udbinfo = UH_UNLOCKED;
 
 	/*
 	 * If the IP_SENDSRCADDR control message was specified, override the
 	 * source address for this datagram.  Its use is invalidated if the
 	 * address thus specified is incomplete or clobbers other inpcbs.
 	 */
 	laddr = inp->inp_laddr;
 	lport = inp->inp_lport;
 	if (src.sin_family == AF_INET) {
 		INP_HASH_LOCK_ASSERT(pcbinfo);
 		if ((lport == 0) ||
 		    (laddr.s_addr == INADDR_ANY &&
 		     src.sin_addr.s_addr == INADDR_ANY)) {
 			error = EINVAL;
 			goto release;
 		}
 		error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
 		    &laddr.s_addr, &lport, td->td_ucred);
 		if (error)
 			goto release;
 	}
 
 	/*
 	 * If a UDP socket has been connected, then a local address/port will
 	 * have been selected and bound.
 	 *
 	 * If a UDP socket has not been connected to, then an explicit
 	 * destination address must be used, in which case a local
 	 * address/port may not have been selected and bound.
 	 */
 	if (sin != NULL) {
 		INP_LOCK_ASSERT(inp);
 		if (inp->inp_faddr.s_addr != INADDR_ANY) {
 			error = EISCONN;
 			goto release;
 		}
 
 		/*
 		 * Jail may rewrite the destination address, so let it do
 		 * that before we use it.
 		 */
 		error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
 		if (error)
 			goto release;
 
 		/*
 		 * If a local address or port hasn't yet been selected, or if
 		 * the destination address needs to be rewritten due to using
 		 * a special INADDR_ constant, invoke in_pcbconnect_setup()
 		 * to do the heavy lifting.  Once a port is selected, we
 		 * commit the binding back to the socket; we also commit the
 		 * binding of the address if in jail.
 		 *
 		 * If we already have a valid binding and we're not
 		 * requesting a destination address rewrite, use a fast path.
 		 */
 		if (inp->inp_laddr.s_addr == INADDR_ANY ||
 		    inp->inp_lport == 0 ||
 		    sin->sin_addr.s_addr == INADDR_ANY ||
 		    sin->sin_addr.s_addr == INADDR_BROADCAST) {
 			INP_HASH_LOCK_ASSERT(pcbinfo);
 			error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
 			    &lport, &faddr.s_addr, &fport, NULL,
 			    td->td_ucred);
 			if (error)
 				goto release;
 
 			/*
 			 * XXXRW: Why not commit the port if the address is
 			 * !INADDR_ANY?
 			 */
 			/* Commit the local port if newly assigned. */
 			if (inp->inp_laddr.s_addr == INADDR_ANY &&
 			    inp->inp_lport == 0) {
 				INP_WLOCK_ASSERT(inp);
 				INP_HASH_WLOCK_ASSERT(pcbinfo);
 				/*
 				 * Remember addr if jailed, to prevent
 				 * rebinding.
 				 */
 				if (prison_flag(td->td_ucred, PR_IP4))
 					inp->inp_laddr = laddr;
 				inp->inp_lport = lport;
 				if (in_pcbinshash(inp) != 0) {
 					inp->inp_lport = 0;
 					error = EAGAIN;
 					goto release;
 				}
 				inp->inp_flags |= INP_ANONPORT;
 			}
 		} else {
 			faddr = sin->sin_addr;
 			fport = sin->sin_port;
 		}
 	} else {
 		INP_LOCK_ASSERT(inp);
 		faddr = inp->inp_faddr;
 		fport = inp->inp_fport;
 		if (faddr.s_addr == INADDR_ANY) {
 			error = ENOTCONN;
 			goto release;
 		}
 	}
 
 	/*
 	 * Calculate data length and get a mbuf for UDP, IP, and possible
 	 * link-layer headers.  Immediate slide the data pointer back forward
 	 * since we won't use that space at this layer.
 	 */
 	M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT);
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto release;
 	}
 	m->m_data += max_linkhdr;
 	m->m_len -= max_linkhdr;
 	m->m_pkthdr.len -= max_linkhdr;
 
 	/*
 	 * Fill in mbuf with extended UDP header and addresses and length put
 	 * into network format.
 	 */
 	ui = mtod(m, struct udpiphdr *);
 	bzero(ui->ui_x1, sizeof(ui->ui_x1));	/* XXX still needed? */
 	ui->ui_pr = pr;
 	ui->ui_src = laddr;
 	ui->ui_dst = faddr;
 	ui->ui_sport = lport;
 	ui->ui_dport = fport;
 	ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
 	if (pr == IPPROTO_UDPLITE) {
 		struct udpcb *up;
 		uint16_t plen;
 
 		up = intoudpcb(inp);
 		cscov = up->u_txcslen;
 		plen = (u_short)len + sizeof(struct udphdr);
 		if (cscov >= plen)
 			cscov = 0;
 		ui->ui_len = htons(plen);
 		ui->ui_ulen = htons(cscov);
 		/*
 		 * For UDP-Lite, checksum coverage length of zero means
 		 * the entire UDPLite packet is covered by the checksum.
 		 */
 		cscov_partial = (cscov == 0) ? 0 : 1;
 	} else
 		ui->ui_v = IPVERSION << 4;
 
 	/*
 	 * Set the Don't Fragment bit in the IP header.
 	 */
 	if (inp->inp_flags & INP_DONTFRAG) {
 		struct ip *ip;
 
 		ip = (struct ip *)&ui->ui_i;
 		ip->ip_off |= htons(IP_DF);
 	}
 
 	ipflags = 0;
 	if (inp->inp_socket->so_options & SO_DONTROUTE)
 		ipflags |= IP_ROUTETOIF;
 	if (inp->inp_socket->so_options & SO_BROADCAST)
 		ipflags |= IP_ALLOWBROADCAST;
 	if (inp->inp_flags & INP_ONESBCAST)
 		ipflags |= IP_SENDONES;
 
 #ifdef MAC
 	mac_inpcb_create_mbuf(inp, m);
 #endif
 
 	/*
 	 * Set up checksum and output datagram.
 	 */
 	ui->ui_sum = 0;
 	if (pr == IPPROTO_UDPLITE) {
 		if (inp->inp_flags & INP_ONESBCAST)
 			faddr.s_addr = INADDR_BROADCAST;
 		if (cscov_partial) {
 			if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0)
 				ui->ui_sum = 0xffff;
 		} else {
 			if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0)
 				ui->ui_sum = 0xffff;
 		}
 	} else if (V_udp_cksum) {
 		if (inp->inp_flags & INP_ONESBCAST)
 			faddr.s_addr = INADDR_BROADCAST;
 		ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
 		    htons((u_short)len + sizeof(struct udphdr) + pr));
 		m->m_pkthdr.csum_flags = CSUM_UDP;
 		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 	}
 	((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len);
 	((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl;	/* XXX */
 	((struct ip *)ui)->ip_tos = tos;		/* XXX */
 	UDPSTAT_INC(udps_opackets);
 
 	/*
 	 * Setup flowid / RSS information for outbound socket.
 	 *
 	 * Once the UDP code decides to set a flowid some other way,
 	 * this allows the flowid to be overridden by userland.
 	 */
 	if (use_flowid) {
 		m->m_flags |= M_FLOWID;
 		m->m_pkthdr.flowid = flowid;
 		M_HASHTYPE_SET(m, flowid_type);
 #ifdef	RSS
 	} else {
 		uint32_t hash_val, hash_type;
 		/*
 		 * Calculate an appropriate RSS hash for UDP and
 		 * UDP Lite.
 		 *
 		 * The called function will take care of figuring out
 		 * whether a 2-tuple or 4-tuple hash is required based
 		 * on the currently configured scheme.
 		 *
 		 * Later later on connected socket values should be
 		 * cached in the inpcb and reused, rather than constantly
 		 * re-calculating it.
 		 *
 		 * UDP Lite is a different protocol number and will
 		 * likely end up being hashed as a 2-tuple until
 		 * RSS / NICs grow UDP Lite protocol awareness.
 		 */
 		if (rss_proto_software_hash_v4(faddr, laddr, fport, lport,
 		    pr, &hash_val, &hash_type) == 0) {
 			m->m_pkthdr.flowid = hash_val;
 			m->m_flags |= M_FLOWID;
 			M_HASHTYPE_SET(m, hash_type);
 		}
 #endif
 	}
 
 #ifdef	RSS
 	/*
 	 * Don't override with the inp cached flowid value.
 	 *
 	 * Depending upon the kind of send being done, the inp
 	 * flowid/flowtype values may actually not be appropriate
 	 * for this particular socket send.
 	 *
 	 * We should either leave the flowid at zero (which is what is
 	 * currently done) or set it to some software generated
 	 * hash value based on the packet contents.
 	 */
 	ipflags |= IP_NODEFAULTFLOWID;
 #endif	/* RSS */
 
 	if (unlock_udbinfo == UH_WLOCKED)
 		INP_HASH_WUNLOCK(pcbinfo);
 	else if (unlock_udbinfo == UH_RLOCKED)
 		INP_HASH_RUNLOCK(pcbinfo);
 	UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
 	error = ip_output(m, inp->inp_options, NULL, ipflags,
 	    inp->inp_moptions, inp);
 	if (unlock_udbinfo == UH_WLOCKED)
 		INP_WUNLOCK(inp);
 	else
 		INP_RUNLOCK(inp);
 	return (error);
 
 release:
 	if (unlock_udbinfo == UH_WLOCKED) {
 		INP_HASH_WUNLOCK(pcbinfo);
 		INP_WUNLOCK(inp);
 	} else if (unlock_udbinfo == UH_RLOCKED) {
 		INP_HASH_RUNLOCK(pcbinfo);
 		INP_RUNLOCK(inp);
 	} else
 		INP_RUNLOCK(inp);
 	m_freem(m);
 	return (error);
 }
 
 
 #if defined(IPSEC) && defined(IPSEC_NAT_T)
 /*
  * Potentially decap ESP in UDP frame.  Check for an ESP header
  * and optional marker; if present, strip the UDP header and
  * push the result through IPSec.
  *
  * Returns mbuf to be processed (potentially re-allocated) or
  * NULL if consumed and/or processed.
  */
 static struct mbuf *
 udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off)
 {
 	size_t minlen, payload, skip, iphlen;
 	caddr_t data;
 	struct udpcb *up;
 	struct m_tag *tag;
 	struct udphdr *udphdr;
 	struct ip *ip;
 
 	INP_RLOCK_ASSERT(inp);
 
 	/* 
 	 * Pull up data so the longest case is contiguous:
 	 *    IP/UDP hdr + non ESP marker + ESP hdr.
 	 */
 	minlen = off + sizeof(uint64_t) + sizeof(struct esp);
 	if (minlen > m->m_pkthdr.len)
 		minlen = m->m_pkthdr.len;
 	if ((m = m_pullup(m, minlen)) == NULL) {
 		IPSECSTAT_INC(ips_in_inval);
 		return (NULL);		/* Bypass caller processing. */
 	}
 	data = mtod(m, caddr_t);	/* Points to ip header. */
 	payload = m->m_len - off;	/* Size of payload. */
 
 	if (payload == 1 && data[off] == '\xff')
 		return (m);		/* NB: keepalive packet, no decap. */
 
 	up = intoudpcb(inp);
 	KASSERT(up != NULL, ("%s: udpcb NULL", __func__));
 	KASSERT((up->u_flags & UF_ESPINUDP_ALL) != 0,
 	    ("u_flags 0x%x", up->u_flags));
 
 	/* 
 	 * Check that the payload is large enough to hold an
 	 * ESP header and compute the amount of data to remove.
 	 *
 	 * NB: the caller has already done a pullup for us.
 	 * XXX can we assume alignment and eliminate bcopys?
 	 */
 	if (up->u_flags & UF_ESPINUDP_NON_IKE) {
 		/*
 		 * draft-ietf-ipsec-nat-t-ike-0[01].txt and
 		 * draft-ietf-ipsec-udp-encaps-(00/)01.txt, ignoring
 		 * possible AH mode non-IKE marker+non-ESP marker
 		 * from draft-ietf-ipsec-udp-encaps-00.txt.
 		 */
 		uint64_t marker;
 
 		if (payload <= sizeof(uint64_t) + sizeof(struct esp))
 			return (m);	/* NB: no decap. */
 		bcopy(data + off, &marker, sizeof(uint64_t));
 		if (marker != 0)	/* Non-IKE marker. */
 			return (m);	/* NB: no decap. */
 		skip = sizeof(uint64_t) + sizeof(struct udphdr);
 	} else {
 		uint32_t spi;
 
 		if (payload <= sizeof(struct esp)) {
 			IPSECSTAT_INC(ips_in_inval);
 			m_freem(m);
 			return (NULL);	/* Discard. */
 		}
 		bcopy(data + off, &spi, sizeof(uint32_t));
 		if (spi == 0)		/* Non-ESP marker. */
 			return (m);	/* NB: no decap. */
 		skip = sizeof(struct udphdr);
 	}
 
 	/*
 	 * Setup a PACKET_TAG_IPSEC_NAT_T_PORT tag to remember
 	 * the UDP ports. This is required if we want to select
 	 * the right SPD for multiple hosts behind same NAT.
 	 *
 	 * NB: ports are maintained in network byte order everywhere
 	 *     in the NAT-T code.
 	 */
 	tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS,
 		2 * sizeof(uint16_t), M_NOWAIT);
 	if (tag == NULL) {
 		IPSECSTAT_INC(ips_in_nomem);
 		m_freem(m);
 		return (NULL);		/* Discard. */
 	}
 	iphlen = off - sizeof(struct udphdr);
 	udphdr = (struct udphdr *)(data + iphlen);
 	((uint16_t *)(tag + 1))[0] = udphdr->uh_sport;
 	((uint16_t *)(tag + 1))[1] = udphdr->uh_dport;
 	m_tag_prepend(m, tag);
 
 	/*
 	 * Remove the UDP header (and possibly the non ESP marker)
 	 * IP header length is iphlen
 	 * Before:
 	 *   <--- off --->
 	 *   +----+------+-----+
 	 *   | IP |  UDP | ESP |
 	 *   +----+------+-----+
 	 *        <-skip->
 	 * After:
 	 *          +----+-----+
 	 *          | IP | ESP |
 	 *          +----+-----+
 	 *   <-skip->
 	 */
 	ovbcopy(data, data + skip, iphlen);
 	m_adj(m, skip);
 
 	ip = mtod(m, struct ip *);
 	ip->ip_len = htons(ntohs(ip->ip_len) - skip);
 	ip->ip_p = IPPROTO_ESP;
 
 	/*
 	 * We cannot yet update the cksums so clear any
 	 * h/w cksum flags as they are no longer valid.
 	 */
 	if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)
 		m->m_pkthdr.csum_flags &= ~(CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
 
 	(void) ipsec4_common_input(m, iphlen, ip->ip_p);
 	return (NULL);			/* NB: consumed, bypass processing. */
 }
 #endif /* defined(IPSEC) && defined(IPSEC_NAT_T) */
 
 static void
 udp_abort(struct socket *so)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 
 	pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_abort: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
 		INP_HASH_WLOCK(pcbinfo);
 		in_pcbdisconnect(inp);
 		inp->inp_laddr.s_addr = INADDR_ANY;
 		INP_HASH_WUNLOCK(pcbinfo);
 		soisdisconnected(so);
 	}
 	INP_WUNLOCK(inp);
 }
 
 static int
 udp_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	int error;
 
 	pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp == NULL, ("udp_attach: inp != NULL"));
 	error = soreserve(so, udp_sendspace, udp_recvspace);
 	if (error)
 		return (error);
 	INP_INFO_WLOCK(pcbinfo);
 	error = in_pcballoc(so, pcbinfo);
 	if (error) {
 		INP_INFO_WUNLOCK(pcbinfo);
 		return (error);
 	}
 
 	inp = sotoinpcb(so);
 	inp->inp_vflag |= INP_IPV4;
 	inp->inp_ip_ttl = V_ip_defttl;
 
 	error = udp_newudpcb(inp);
 	if (error) {
 		in_pcbdetach(inp);
 		in_pcbfree(inp);
 		INP_INFO_WUNLOCK(pcbinfo);
 		return (error);
 	}
 
 	INP_WUNLOCK(inp);
 	INP_INFO_WUNLOCK(pcbinfo);
 	return (0);
 }
 #endif /* INET */
 
 int
 udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, void *ctx)
 {
 	struct inpcb *inp;
 	struct udpcb *up;
 
 	KASSERT(so->so_type == SOCK_DGRAM,
 	    ("udp_set_kernel_tunneling: !dgram"));
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL"));
 	INP_WLOCK(inp);
 	up = intoudpcb(inp);
 	if (up->u_tun_func != NULL) {
 		INP_WUNLOCK(inp);
 		return (EBUSY);
 	}
 	up->u_tun_func = f;
 	up->u_tun_ctx = ctx;
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 #ifdef INET
 static int
 udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	int error;
 
 	pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_bind: inp == NULL"));
 	INP_WLOCK(inp);
 	INP_HASH_WLOCK(pcbinfo);
 	error = in_pcbbind(inp, nam, td->td_ucred);
 	INP_HASH_WUNLOCK(pcbinfo);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 static void
 udp_close(struct socket *so)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 
 	pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_close: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
 		INP_HASH_WLOCK(pcbinfo);
 		in_pcbdisconnect(inp);
 		inp->inp_laddr.s_addr = INADDR_ANY;
 		INP_HASH_WUNLOCK(pcbinfo);
 		soisdisconnected(so);
 	}
 	INP_WUNLOCK(inp);
 }
 
 static int
 udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	struct sockaddr_in *sin;
 	int error;
 
 	pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_connect: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
 		INP_WUNLOCK(inp);
 		return (EISCONN);
 	}
 	sin = (struct sockaddr_in *)nam;
 	error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
 	if (error != 0) {
 		INP_WUNLOCK(inp);
 		return (error);
 	}
 	INP_HASH_WLOCK(pcbinfo);
 	error = in_pcbconnect(inp, nam, td->td_ucred);
 	INP_HASH_WUNLOCK(pcbinfo);
 	if (error == 0)
 		soisconnected(so);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 static void
 udp_detach(struct socket *so)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	struct udpcb *up;
 
 	pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
 	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
 	    ("udp_detach: not disconnected"));
 	INP_INFO_WLOCK(pcbinfo);
 	INP_WLOCK(inp);
 	up = intoudpcb(inp);
 	KASSERT(up != NULL, ("%s: up == NULL", __func__));
 	inp->inp_ppcb = NULL;
 	in_pcbdetach(inp);
 	in_pcbfree(inp);
 	INP_INFO_WUNLOCK(pcbinfo);
 	udp_discardcb(up);
 }
 
 static int
 udp_disconnect(struct socket *so)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 
 	pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_disconnect: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_faddr.s_addr == INADDR_ANY) {
 		INP_WUNLOCK(inp);
 		return (ENOTCONN);
 	}
 	INP_HASH_WLOCK(pcbinfo);
 	in_pcbdisconnect(inp);
 	inp->inp_laddr.s_addr = INADDR_ANY;
 	INP_HASH_WUNLOCK(pcbinfo);
 	SOCK_LOCK(so);
 	so->so_state &= ~SS_ISCONNECTED;		/* XXX */
 	SOCK_UNLOCK(so);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 static int
 udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
     struct mbuf *control, struct thread *td)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_send: inp == NULL"));
 	return (udp_output(inp, m, addr, control, td));
 }
 #endif /* INET */
 
 int
 udp_shutdown(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_shutdown: inp == NULL"));
 	INP_WLOCK(inp);
 	socantsendmore(so);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 #ifdef INET
 struct pr_usrreqs udp_usrreqs = {
 	.pru_abort =		udp_abort,
 	.pru_attach =		udp_attach,
 	.pru_bind =		udp_bind,
 	.pru_connect =		udp_connect,
 	.pru_control =		in_control,
 	.pru_detach =		udp_detach,
 	.pru_disconnect =	udp_disconnect,
 	.pru_peeraddr =		in_getpeeraddr,
 	.pru_send =		udp_send,
 	.pru_soreceive =	soreceive_dgram,
 	.pru_sosend =		sosend_dgram,
 	.pru_shutdown =		udp_shutdown,
 	.pru_sockaddr =		in_getsockaddr,
 	.pru_sosetlabel =	in_pcbsosetlabel,
 	.pru_close =		udp_close,
 };
 #endif /* INET */
Index: user/dchagin/lemul/sys/netinet6/in6_gif.c
===================================================================
--- user/dchagin/lemul/sys/netinet6/in6_gif.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet6/in6_gif.c	(revision 274241)
@@ -1,249 +1,249 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: in6_gif.c,v 1.49 2001/05/14 14:02:17 itojun Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/systm.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/mbuf.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/queue.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 #include <sys/protosw.h>
 #include <sys/malloc.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #ifdef INET
 #include <netinet/ip.h>
 #endif
 #include <netinet/ip_encap.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_gif.h>
 #include <netinet6/in6_var.h>
 #endif
 #include <netinet/ip_ecn.h>
 #ifdef INET6
 #include <netinet6/ip6_ecn.h>
 #endif
 
 #include <net/if_gif.h>
 
 VNET_DEFINE(int, ip6_gif_hlim) = GIF_HLIM;
 #define	V_ip6_gif_hlim			VNET(ip6_gif_hlim)
 
 SYSCTL_DECL(_net_inet6_ip6);
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_GIF_HLIM, gifhlim, CTLFLAG_RW,
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_GIF_HLIM, gifhlim, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ip6_gif_hlim), 0, "");
 
 static int gif_validate6(const struct ip6_hdr *, struct gif_softc *,
 			 struct ifnet *);
 
 extern  struct domain inet6domain;
 struct protosw in6_gif_protosw = {
 	.pr_type =	SOCK_RAW,
 	.pr_domain =	&inet6domain,
 	.pr_protocol =	0,			/* IPPROTO_IPV[46] */
 	.pr_flags =	PR_ATOMIC|PR_ADDR,
 	.pr_input =	in6_gif_input,
 	.pr_output =	rip6_output,
 	.pr_ctloutput =	rip6_ctloutput,
 	.pr_usrreqs =	&rip6_usrreqs
 };
 
 int
 in6_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn)
 {
 	GIF_RLOCK_TRACKER;
 	struct gif_softc *sc = ifp->if_softc;
 	struct ip6_hdr *ip6;
 	int len;
 
 	/* prepend new IP header */
 	len = sizeof(struct ip6_hdr);
 #ifndef __NO_STRICT_ALIGNMENT
 	if (proto == IPPROTO_ETHERIP)
 		len += ETHERIP_ALIGN;
 #endif
 	M_PREPEND(m, len, M_NOWAIT);
 	if (m == NULL)
 		return (ENOBUFS);
 #ifndef __NO_STRICT_ALIGNMENT
 	if (proto == IPPROTO_ETHERIP) {
 		len = mtod(m, vm_offset_t) & 3;
 		KASSERT(len == 0 || len == ETHERIP_ALIGN,
 		    ("in6_gif_output: unexpected misalignment"));
 		m->m_data += len;
 		m->m_len -= ETHERIP_ALIGN;
 	}
 #endif
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	GIF_RLOCK(sc);
 	if (sc->gif_family != AF_INET6) {
 		m_freem(m);
 		GIF_RUNLOCK(sc);
 		return (ENETDOWN);
 	}
 	bcopy(sc->gif_ip6hdr, ip6, sizeof(struct ip6_hdr));
 	GIF_RUNLOCK(sc);
 
 	ip6->ip6_flow  |= htonl((uint32_t)ecn << 20);
 	ip6->ip6_nxt	= proto;
 	ip6->ip6_hlim	= V_ip6_gif_hlim;
 	/*
 	 * force fragmentation to minimum MTU, to avoid path MTU discovery.
 	 * it is too painful to ask for resend of inner packet, to achieve
 	 * path MTU discovery for encapsulated packets.
 	 */
 	return (ip6_output(m, 0, NULL, IPV6_MINMTU, 0, NULL, NULL));
 }
 
 int
 in6_gif_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m = *mp;
 	struct ifnet *gifp;
 	struct gif_softc *sc;
 	struct ip6_hdr *ip6;
 	uint8_t ecn;
 
 	sc = encap_getarg(m);
 	if (sc == NULL) {
 		m_freem(m);
 		IP6STAT_INC(ip6s_nogif);
 		return (IPPROTO_DONE);
 	}
 	gifp = GIF2IFP(sc);
 	if ((gifp->if_flags & IFF_UP) != 0) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		ecn = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
 		m_adj(m, *offp);
 		gif_input(m, gifp, proto, ecn);
 	} else {
 		m_freem(m);
 		IP6STAT_INC(ip6s_nogif);
 	}
 	return (IPPROTO_DONE);
 }
 
 /*
  * validate outer address.
  */
 static int
 gif_validate6(const struct ip6_hdr *ip6, struct gif_softc *sc,
     struct ifnet *ifp)
 {
 
 	GIF_RLOCK_ASSERT(sc);
 	/*
 	 * Check for address match.  Note that the check is for an incoming
 	 * packet.  We should compare the *source* address in our configuration
 	 * and the *destination* address of the packet, and vice versa.
 	 */
 	if (!IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_src, &ip6->ip6_dst) ||
 	    !IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_dst, &ip6->ip6_src))
 		return (0);
 
 	/* martian filters on outer source - done in ip6_input */
 
 	/* ingress filters on outer source */
 	if ((GIF2IFP(sc)->if_flags & IFF_LINK2) == 0 && ifp) {
 		struct sockaddr_in6 sin6;
 		struct rtentry *rt;
 
 		bzero(&sin6, sizeof(sin6));
 		sin6.sin6_family = AF_INET6;
 		sin6.sin6_len = sizeof(struct sockaddr_in6);
 		sin6.sin6_addr = ip6->ip6_src;
 		sin6.sin6_scope_id = 0; /* XXX */
 
 		rt = in6_rtalloc1((struct sockaddr *)&sin6, 0, 0UL,
 		    sc->gif_fibnum);
 		if (!rt || rt->rt_ifp != ifp) {
 			if (rt)
 				RTFREE_LOCKED(rt);
 			return (0);
 		}
 		RTFREE_LOCKED(rt);
 	}
 
 	return (128 * 2);
 }
 
 /*
  * we know that we are in IFF_UP, outer address available, and outer family
  * matched the physical addr family.  see gif_encapcheck().
  */
 int
 in6_gif_encapcheck(const struct mbuf *m, int off, int proto, void *arg)
 {
 	struct ip6_hdr ip6;
 	struct gif_softc *sc;
 	struct ifnet *ifp;
 
 	/* sanity check done in caller */
 	sc = (struct gif_softc *)arg;
 	GIF_RLOCK_ASSERT(sc);
 
 	m_copydata(m, 0, sizeof(ip6), (caddr_t)&ip6);
 	ifp = ((m->m_flags & M_PKTHDR) != 0) ? m->m_pkthdr.rcvif : NULL;
 	return (gif_validate6(&ip6, sc, ifp));
 }
 
 int
 in6_gif_attach(struct gif_softc *sc)
 {
 
 	KASSERT(sc->gif_ecookie == NULL, ("gif_ecookie isn't NULL"));
 	sc->gif_ecookie = encap_attach_func(AF_INET6, -1, gif_encapcheck,
 	    (void *)&in6_gif_protosw, sc);
 	if (sc->gif_ecookie == NULL)
 		return (EEXIST);
 	return (0);
 }
Index: user/dchagin/lemul/sys/netinet6/in6_proto.c
===================================================================
--- user/dchagin/lemul/sys/netinet6/in6_proto.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet6/in6_proto.c	(revision 274241)
@@ -1,620 +1,619 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: in6_proto.c,v 1.91 2001/05/27 13:28:35 itojun Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_proto.c	8.1 (Berkeley) 6/10/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_ipstealth.h"
 #include "opt_sctp.h"
 #include "opt_mpath.h"
 #include "opt_route.h"
 
 #include <sys/param.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/domain.h>
 #include <sys/mbuf.h>
 #include <sys/systm.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/radix.h>
 #include <net/route.h>
 #ifdef RADIX_MPATH
 #include <net/radix_mpath.h>
 #endif
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_encap.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/icmp6.h>
 
 #include <netinet/tcp.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <netinet6/tcp6_var.h>
 #include <netinet6/raw_ip6.h>
 #include <netinet6/udp6_var.h>
 #include <netinet6/pim6_var.h>
 #include <netinet6/nd6.h>
 
 #ifdef SCTP
 #include <netinet/in_pcb.h>
 #include <netinet/sctp_pcb.h>
 #include <netinet/sctp.h>
 #include <netinet/sctp_var.h>
 #include <netinet6/sctp6_var.h>
 #endif /* SCTP */
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/ipsec6.h>
 #endif /* IPSEC */
 
 #include <netinet6/ip6protosw.h>
 
 /*
  * TCP/IP protocol family: IP6, ICMP6, UDP, TCP.
  */
 FEATURE(inet6, "Internet Protocol version 6");
 
 extern	struct domain inet6domain;
 static	struct pr_usrreqs nousrreqs;
 
 #define PR_LISTEN	0
 #define PR_ABRTACPTDIS	0
 
 /* Spacer for loadable protocols. */
 #define IP6PROTOSPACER   			\
 {						\
 	.pr_domain =		&inet6domain,	\
 	.pr_protocol =		PROTO_SPACER,	\
 	.pr_usrreqs =		&nousrreqs	\
 }
 
 struct protosw inet6sw[] = {
 {
 	.pr_type =		0,
 	.pr_domain =		&inet6domain,
 	.pr_protocol =		IPPROTO_IPV6,
 	.pr_init =		ip6_init,
 #ifdef VIMAGE
 	.pr_destroy =		ip6_destroy,
 #endif
 	.pr_slowtimo =		frag6_slowtimo,
 	.pr_drain =		frag6_drain,
 	.pr_usrreqs =		&nousrreqs,
 },
 {
 	.pr_type =		SOCK_DGRAM,
 	.pr_domain =		&inet6domain,
 	.pr_protocol =		IPPROTO_UDP,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_input =		udp6_input,
 	.pr_ctlinput =		udp6_ctlinput,
 	.pr_ctloutput =		ip6_ctloutput,
 #ifndef INET	/* Do not call initialization twice. */
 	.pr_init =		udp_init,
 #endif
 	.pr_usrreqs =		&udp6_usrreqs,
 },
 {
 	.pr_type =		SOCK_STREAM,
 	.pr_domain =		&inet6domain,
 	.pr_protocol =		IPPROTO_TCP,
 	.pr_flags =		PR_CONNREQUIRED|PR_WANTRCVD|PR_LISTEN,
 	.pr_input =		tcp6_input,
 	.pr_ctlinput =		tcp6_ctlinput,
 	.pr_ctloutput =		tcp_ctloutput,
 #ifndef INET	/* don't call initialization and timeout routines twice */
 	.pr_init =		tcp_init,
 	.pr_slowtimo =		tcp_slowtimo,
 #endif
 	.pr_drain =		tcp_drain,
 	.pr_usrreqs =		&tcp6_usrreqs,
 },
 #ifdef SCTP
 {
 	.pr_type =		SOCK_SEQPACKET,
 	.pr_domain =		&inet6domain,
 	.pr_protocol =		IPPROTO_SCTP,
 	.pr_flags =		PR_WANTRCVD,
 	.pr_input =		sctp6_input,
 	.pr_ctlinput =		sctp6_ctlinput,
 	.pr_ctloutput =	sctp_ctloutput,
 	.pr_drain =		sctp_drain,
 #ifndef INET	/* Do not call initialization twice. */
 	.pr_init =		sctp_init,
 #endif
 	.pr_usrreqs =		&sctp6_usrreqs
 },
 {
 	.pr_type =		SOCK_STREAM,
 	.pr_domain =		&inet6domain,
 	.pr_protocol =		IPPROTO_SCTP,
 	.pr_flags =		PR_WANTRCVD,
 	.pr_input =		sctp6_input,
 	.pr_ctlinput =		sctp6_ctlinput,
 	.pr_ctloutput =		sctp_ctloutput,
 	.pr_drain =		sctp_drain,
 	.pr_usrreqs =		&sctp6_usrreqs
 },
 #endif /* SCTP */
 {
 	.pr_type =		SOCK_DGRAM,
 	.pr_domain =		&inet6domain,
 	.pr_protocol =		IPPROTO_UDPLITE,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_input =		udp6_input,
 	.pr_ctlinput =		udplite6_ctlinput,
 	.pr_ctloutput =		udp_ctloutput,
 #ifndef INET	/* Do not call initialization twice. */
 	.pr_init =		udplite_init,
 #endif
 	.pr_usrreqs =		&udp6_usrreqs,
 },
 {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inet6domain,
 	.pr_protocol =		IPPROTO_RAW,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_input =		rip6_input,
 	.pr_output =		rip6_output,
 	.pr_ctlinput =		rip6_ctlinput,
 	.pr_ctloutput =		rip6_ctloutput,
 #ifndef INET	/* Do not call initialization twice. */
 	.pr_init =		rip_init,
 #endif
 	.pr_usrreqs =		&rip6_usrreqs
 },
 {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inet6domain,
 	.pr_protocol =		IPPROTO_ICMPV6,
 	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_LASTHDR,
 	.pr_input =		icmp6_input,
 	.pr_output =		rip6_output,
 	.pr_ctlinput =		rip6_ctlinput,
 	.pr_ctloutput =		rip6_ctloutput,
 	.pr_fasttimo =		icmp6_fasttimo,
 	.pr_slowtimo =		icmp6_slowtimo,
 	.pr_usrreqs =		&rip6_usrreqs
 },
 {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inet6domain,
 	.pr_protocol =		IPPROTO_DSTOPTS,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_input =		dest6_input,
 	.pr_usrreqs =		&nousrreqs
 },
 {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inet6domain,
 	.pr_protocol =		IPPROTO_ROUTING,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_input =		route6_input,
 	.pr_usrreqs =		&nousrreqs
 },
 {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inet6domain,
 	.pr_protocol =		IPPROTO_FRAGMENT,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_input =		frag6_input,
 	.pr_usrreqs =		&nousrreqs
 },
 #ifdef IPSEC
 {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inet6domain,
 	.pr_protocol =		IPPROTO_AH,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_input =		ipsec6_common_input,
 	.pr_usrreqs =		&nousrreqs,
 },
 {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inet6domain,
 	.pr_protocol =		IPPROTO_ESP,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
         .pr_input =		ipsec6_common_input,
 	.pr_ctlinput =		esp6_ctlinput,
 	.pr_usrreqs =		&nousrreqs,
 },
 {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inet6domain,
 	.pr_protocol =		IPPROTO_IPCOMP,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
         .pr_input =		ipsec6_common_input,
 	.pr_usrreqs =		&nousrreqs,
 },
 #endif /* IPSEC */
 #ifdef INET
 {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inet6domain,
 	.pr_protocol =		IPPROTO_IPV4,
 	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_LASTHDR,
 	.pr_input =		encap6_input,
 	.pr_output =		rip6_output,
 	.pr_ctloutput =		rip6_ctloutput,
 	.pr_init =		encap_init,
 	.pr_usrreqs =		&rip6_usrreqs
 },
 #endif /* INET */
 {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inet6domain,
 	.pr_protocol =		IPPROTO_IPV6,
 	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_LASTHDR,
 	.pr_input =		encap6_input,
 	.pr_output =		rip6_output,
 	.pr_ctloutput =		rip6_ctloutput,
 	.pr_init =		encap_init,
 	.pr_usrreqs =		&rip6_usrreqs
 },
 {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inet6domain,
 	.pr_protocol =		IPPROTO_PIM,
 	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_LASTHDR,
 	.pr_input =		encap6_input,
 	.pr_output =		rip6_output,
 	.pr_ctloutput =		rip6_ctloutput,
 	.pr_usrreqs =		&rip6_usrreqs
 },
 /* Spacer n-times for loadable protocols. */
 IP6PROTOSPACER,
 IP6PROTOSPACER,
 IP6PROTOSPACER,
 IP6PROTOSPACER,
 IP6PROTOSPACER,
 IP6PROTOSPACER,
 IP6PROTOSPACER,
 IP6PROTOSPACER,
 /* raw wildcard */
 {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inet6domain,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_input =		rip6_input,
 	.pr_output =		rip6_output,
 	.pr_ctloutput =		rip6_ctloutput,
 	.pr_usrreqs =		&rip6_usrreqs
 },
 };
 
 extern int in6_inithead(void **, int);
 #ifdef VIMAGE
 extern int in6_detachhead(void **, int);
 #endif
 
 struct domain inet6domain = {
 	.dom_family =		AF_INET6,
 	.dom_name =		"internet6",
 	.dom_protosw =		(struct protosw *)inet6sw,
 	.dom_protoswNPROTOSW =	(struct protosw *)
 				&inet6sw[sizeof(inet6sw)/sizeof(inet6sw[0])],
 #ifdef RADIX_MPATH
 	.dom_rtattach =		rn6_mpath_inithead,
 #else
 	.dom_rtattach =		in6_inithead,
 #endif
 #ifdef VIMAGE
 	.dom_rtdetach =		in6_detachhead,
 #endif
 	.dom_ifattach =		in6_domifattach,
 	.dom_ifdetach =		in6_domifdetach,
 	.dom_ifmtu    =		in6_domifmtu
 };
 
 VNET_DOMAIN_SET(inet6);
 
 /*
  * Internet configuration info
  */
 #ifndef	IPV6FORWARDING
 #ifdef GATEWAY6
 #define	IPV6FORWARDING	1	/* forward IP6 packets not for us */
 #else
 #define	IPV6FORWARDING	0	/* don't forward IP6 packets not for us */
 #endif /* GATEWAY6 */
 #endif /* !IPV6FORWARDING */
 
 #ifndef	IPV6_SENDREDIRECTS
 #define	IPV6_SENDREDIRECTS	1
 #endif
 
 VNET_DEFINE(int, ip6_forwarding) = IPV6FORWARDING;	/* act as router? */
 VNET_DEFINE(int, ip6_sendredirects) = IPV6_SENDREDIRECTS;
 VNET_DEFINE(int, ip6_defhlim) = IPV6_DEFHLIM;
 VNET_DEFINE(int, ip6_defmcasthlim) = IPV6_DEFAULT_MULTICAST_HOPS;
 VNET_DEFINE(int, ip6_accept_rtadv) = 0;
 VNET_DEFINE(int, ip6_no_radr) = 0;
 VNET_DEFINE(int, ip6_norbit_raif) = 0;
 VNET_DEFINE(int, ip6_rfc6204w3) = 0;
 VNET_DEFINE(int, ip6_maxfragpackets);	/* initialized in frag6.c:frag6_init() */
 VNET_DEFINE(int, ip6_maxfrags);		/* initialized in frag6.c:frag6_init() */
 VNET_DEFINE(int, ip6_log_interval) = 5;
 VNET_DEFINE(int, ip6_hdrnestlimit) = 15;/* How many header options will we
 					 * process? */
 VNET_DEFINE(int, ip6_dad_count) = 1;	/* DupAddrDetectionTransmits */
 VNET_DEFINE(int, ip6_auto_flowlabel) = 1;
 VNET_DEFINE(int, ip6_use_deprecated) = 1;/* allow deprecated addr
 					 * (RFC2462 5.5.4) */
 VNET_DEFINE(int, ip6_rr_prune) = 5;	/* router renumbering prefix
 					 * walk list every 5 sec. */
 VNET_DEFINE(int, ip6_mcast_pmtu) = 0;	/* enable pMTU discovery for multicast? */
 VNET_DEFINE(int, ip6_v6only) = 1;
 
 VNET_DEFINE(int, ip6_keepfaith) = 0;
 VNET_DEFINE(time_t, ip6_log_time) = (time_t)0L;
 #ifdef IPSTEALTH
 VNET_DEFINE(int, ip6stealth) = 0;
 #endif
 VNET_DEFINE(int, nd6_onlink_ns_rfc4861) = 0;/* allow 'on-link' nd6 NS
 					     * (RFC 4861) */
 
 /* icmp6 */
 /*
  * BSDI4 defines these variables in in_proto.c...
  * XXX: what if we don't define INET? Should we define pmtu6_expire
  * or so? (jinmei@kame.net 19990310)
  */
 VNET_DEFINE(int, pmtu_expire) = 60*10;
 VNET_DEFINE(int, pmtu_probe) = 60*2;
 
 /* ICMPV6 parameters */
 VNET_DEFINE(int, icmp6_rediraccept) = 1;/* accept and process redirects */
 VNET_DEFINE(int, icmp6_redirtimeout) = 10 * 60;	/* 10 minutes */
 VNET_DEFINE(int, icmp6errppslim) = 100;		/* 100pps */
 /* control how to respond to NI queries */
 VNET_DEFINE(int, icmp6_nodeinfo) =
     (ICMP6_NODEINFO_FQDNOK|ICMP6_NODEINFO_NODEADDROK);
 VNET_DEFINE(int, icmp6_nodeinfo_oldmcprefix) = 1;
 
 /*
  * sysctl related items.
  */
 SYSCTL_NODE(_net,	PF_INET6,	inet6,	CTLFLAG_RW,	0,
 	"Internet6 Family");
 
 /* net.inet6 */
 SYSCTL_NODE(_net_inet6,	IPPROTO_IPV6,	ip6,	CTLFLAG_RW, 0,	"IP6");
 SYSCTL_NODE(_net_inet6,	IPPROTO_ICMPV6,	icmp6,	CTLFLAG_RW, 0,	"ICMP6");
 SYSCTL_NODE(_net_inet6,	IPPROTO_UDP,	udp6,	CTLFLAG_RW, 0,	"UDP6");
 SYSCTL_NODE(_net_inet6,	IPPROTO_TCP,	tcp6,	CTLFLAG_RW, 0,	"TCP6");
 #ifdef SCTP
 SYSCTL_NODE(_net_inet6,	IPPROTO_SCTP,	sctp6,	CTLFLAG_RW, 0,	"SCTP6");
 #endif
 #ifdef IPSEC
 SYSCTL_NODE(_net_inet6,	IPPROTO_ESP,	ipsec6,	CTLFLAG_RW, 0,	"IPSEC6");
 #endif /* IPSEC */
 
 /* net.inet6.ip6 */
 static int
 sysctl_ip6_temppltime(SYSCTL_HANDLER_ARGS)
 {
 	int error = 0;
 	int old;
 
-	VNET_SYSCTL_ARG(req, arg1);
-
 	error = SYSCTL_OUT(req, arg1, sizeof(int));
 	if (error || !req->newptr)
 		return (error);
 	old = V_ip6_temp_preferred_lifetime;
 	error = SYSCTL_IN(req, arg1, sizeof(int));
 	if (V_ip6_temp_preferred_lifetime <
 	    V_ip6_desync_factor + V_ip6_temp_regen_advance) {
 		V_ip6_temp_preferred_lifetime = old;
 		return (EINVAL);
 	}
 	return (error);
 }
 
 static int
 sysctl_ip6_tempvltime(SYSCTL_HANDLER_ARGS)
 {
 	int error = 0;
 	int old;
 
-	VNET_SYSCTL_ARG(req, arg1);
-
 	error = SYSCTL_OUT(req, arg1, sizeof(int));
 	if (error || !req->newptr)
 		return (error);
 	old = V_ip6_temp_valid_lifetime;
 	error = SYSCTL_IN(req, arg1, sizeof(int));
 	if (V_ip6_temp_valid_lifetime < V_ip6_temp_preferred_lifetime) {
 		V_ip6_temp_preferred_lifetime = old;
 		return (EINVAL);
 	}
 	return (error);
 }
 
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_FORWARDING, forwarding, CTLFLAG_RW,
-	&VNET_NAME(ip6_forwarding), 0, "");
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_SENDREDIRECTS, redirect, CTLFLAG_RW,
-	&VNET_NAME(ip6_sendredirects), 0, "");
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_DEFHLIM, hlim, CTLFLAG_RW,
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_FORWARDING, forwarding,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_forwarding), 0, "");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_SENDREDIRECTS, redirect,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_sendredirects), 0, "");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFHLIM, hlim, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ip6_defhlim), 0, "");
 SYSCTL_VNET_PCPUSTAT(_net_inet6_ip6, IPV6CTL_STATS, stats, struct ip6stat,
-    ip6stat, "IP6 statistics (struct ip6stat, netinet6/ip6_var.h)");
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets,
-	CTLFLAG_RW, &VNET_NAME(ip6_maxfragpackets), 0, "");
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_ACCEPT_RTADV, accept_rtadv,
-	CTLFLAG_RW, &VNET_NAME(ip6_accept_rtadv), 0,
+	ip6stat, "IP6 statistics (struct ip6stat, netinet6/ip6_var.h)");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragpackets), 0, "");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_ACCEPT_RTADV, accept_rtadv,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_accept_rtadv), 0,
 	"Default value of per-interface flag for accepting ICMPv6 Router"
 	"Advertisement messages");
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_NO_RADR, no_radr,
-	CTLFLAG_RW, &VNET_NAME(ip6_no_radr), 0,
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_NO_RADR, no_radr,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_no_radr), 0,
 	"Default value of per-interface flag to control whether routers "
 	"sending ICMPv6 RA messages on that interface are added into the "
 	"default router list.");
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_NORBIT_RAIF, norbit_raif, CTLFLAG_RW,
-	&VNET_NAME(ip6_norbit_raif), 0,
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_NORBIT_RAIF, norbit_raif,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_norbit_raif), 0,
 	"Always set 0 to R flag in ICMPv6 NA messages when accepting RA"
 	" on the interface.");
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_RFC6204W3, rfc6204w3,
-	CTLFLAG_RW, &VNET_NAME(ip6_rfc6204w3), 0,
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RFC6204W3, rfc6204w3,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_rfc6204w3), 0,
 	"Accept the default router list from ICMPv6 RA messages even "
 	"when packet forwarding enabled.");
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_KEEPFAITH, keepfaith, CTLFLAG_RW,
-	&VNET_NAME(ip6_keepfaith), 0, "");
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_LOG_INTERVAL, log_interval,
-	CTLFLAG_RW, &VNET_NAME(ip6_log_interval), 0, "");
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_HDRNESTLIMIT, hdrnestlimit,
-	CTLFLAG_RW, &VNET_NAME(ip6_hdrnestlimit), 0, "");
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_DAD_COUNT, dad_count, CTLFLAG_RW,
-	&VNET_NAME(ip6_dad_count), 0, "");
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_AUTO_FLOWLABEL, auto_flowlabel,
-	CTLFLAG_RW, &VNET_NAME(ip6_auto_flowlabel), 0, "");
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_DEFMCASTHLIM, defmcasthlim,
-	CTLFLAG_RW, &VNET_NAME(ip6_defmcasthlim), 0, "");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_KEEPFAITH, keepfaith,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_keepfaith), 0, "");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_LOG_INTERVAL, log_interval,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_log_interval), 0, "");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_HDRNESTLIMIT, hdrnestlimit,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_hdrnestlimit), 0, "");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DAD_COUNT, dad_count,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_dad_count), 0, "");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_FLOWLABEL, auto_flowlabel,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_auto_flowlabel), 0, "");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFMCASTHLIM, defmcasthlim,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_defmcasthlim), 0, "");
 SYSCTL_STRING(_net_inet6_ip6, IPV6CTL_KAME_VERSION, kame_version,
 	CTLFLAG_RD, __KAME_VERSION, 0, "");
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_USE_DEPRECATED, use_deprecated,
-	CTLFLAG_RW, &VNET_NAME(ip6_use_deprecated), 0, "");
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_RR_PRUNE, rr_prune, CTLFLAG_RW,
-	&VNET_NAME(ip6_rr_prune), 0, "");
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_USETEMPADDR, use_tempaddr,
-	CTLFLAG_RW, &VNET_NAME(ip6_use_tempaddr), 0, "");
-SYSCTL_VNET_PROC(_net_inet6_ip6, IPV6CTL_TEMPPLTIME, temppltime,
-	CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ip6_temp_preferred_lifetime), 0,
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEPRECATED, use_deprecated,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_deprecated), 0, "");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RR_PRUNE, rr_prune,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_rr_prune), 0, "");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USETEMPADDR, use_tempaddr,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_tempaddr), 0, "");
+SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_TEMPPLTIME, temppltime,
+	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
+	&VNET_NAME(ip6_temp_preferred_lifetime), 0,
    	sysctl_ip6_temppltime, "I", "");
-SYSCTL_VNET_PROC(_net_inet6_ip6, IPV6CTL_TEMPVLTIME, tempvltime,
-	CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ip6_temp_valid_lifetime), 0,
+SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_TEMPVLTIME, tempvltime,
+	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
+	&VNET_NAME(ip6_temp_valid_lifetime), 0,
    	sysctl_ip6_tempvltime, "I", "");
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_V6ONLY, v6only,	CTLFLAG_RW,
-	&VNET_NAME(ip6_v6only), 0, "");
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_AUTO_LINKLOCAL, auto_linklocal,
-	CTLFLAG_RW, &VNET_NAME(ip6_auto_linklocal), 0,
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_V6ONLY, v6only,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_v6only), 0, "");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_LINKLOCAL, auto_linklocal,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_auto_linklocal), 0,
 	"Default value of per-interface flag for automatically adding an IPv6"
 	" link-local address to interfaces when attached");
 SYSCTL_VNET_PCPUSTAT(_net_inet6_ip6, IPV6CTL_RIP6STATS, rip6stats,
-    struct rip6stat, rip6stat,
-    "Raw IP6 statistics (struct rip6stat, netinet6/raw_ip6.h)");
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_PREFER_TEMPADDR, prefer_tempaddr,
-	CTLFLAG_RW, &VNET_NAME(ip6_prefer_tempaddr), 0, "");
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE, use_defaultzone,
-	CTLFLAG_RW, &VNET_NAME(ip6_use_defzone), 0,"");
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags, CTLFLAG_RW,
-	&VNET_NAME(ip6_maxfrags), 0, "");
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_MCAST_PMTU, mcast_pmtu, CTLFLAG_RW,
-	&VNET_NAME(ip6_mcast_pmtu), 0, "");
+	struct rip6stat, rip6stat,
+	"Raw IP6 statistics (struct rip6stat, netinet6/raw_ip6.h)");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_PREFER_TEMPADDR, prefer_tempaddr,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_prefer_tempaddr), 0, "");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE, use_defaultzone,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_defzone), 0,"");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfrags), 0, "");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MCAST_PMTU, mcast_pmtu,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_mcast_pmtu), 0, "");
 #ifdef IPSTEALTH
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_STEALTH, stealth, CTLFLAG_RW,
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_STEALTH, stealth, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ip6stealth), 0, "");
 #endif
 
 /* net.inet6.icmp6 */
-SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, rediraccept,
-	CTLFLAG_RW, &VNET_NAME(icmp6_rediraccept), 0, "");
-SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRTIMEOUT, redirtimeout,
-	CTLFLAG_RW, &VNET_NAME(icmp6_redirtimeout), 0, "");
+SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, rediraccept,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_rediraccept), 0, "");
+SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRTIMEOUT, redirtimeout,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_redirtimeout), 0, "");
 SYSCTL_VNET_PCPUSTAT(_net_inet6_icmp6, ICMPV6CTL_STATS, stats,
-    struct icmp6stat, icmp6stat,
-    "ICMPv6 statistics (struct icmp6stat, netinet/icmp6.h)");
-SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_PRUNE, nd6_prune, CTLFLAG_RW,
-	&VNET_NAME(nd6_prune), 0, "");
-SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DELAY, nd6_delay, CTLFLAG_RW,
-	&VNET_NAME(nd6_delay), 0, "");
-SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_UMAXTRIES, nd6_umaxtries,
-	CTLFLAG_RW, &VNET_NAME(nd6_umaxtries), 0, "");
-SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MMAXTRIES, nd6_mmaxtries,
-	CTLFLAG_RW, &VNET_NAME(nd6_mmaxtries), 0, "");
-SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_USELOOPBACK, nd6_useloopback,
-	CTLFLAG_RW, &VNET_NAME(nd6_useloopback), 0, "");
-SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO, nodeinfo, CTLFLAG_RW,
-	&VNET_NAME(icmp6_nodeinfo), 0, "");
-SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO_OLDMCPREFIX,
-	nodeinfo_oldmcprefix, CTLFLAG_RW,
+	struct icmp6stat, icmp6stat,
+	"ICMPv6 statistics (struct icmp6stat, netinet/icmp6.h)");
+SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_PRUNE, nd6_prune,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_prune), 0, "");
+SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DELAY, nd6_delay,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_delay), 0, "");
+SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_UMAXTRIES, nd6_umaxtries,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_umaxtries), 0, "");
+SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MMAXTRIES, nd6_mmaxtries,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_mmaxtries), 0, "");
+SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_USELOOPBACK, nd6_useloopback,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_useloopback), 0, "");
+SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO, nodeinfo,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_nodeinfo), 0, "");
+SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO_OLDMCPREFIX,
+	nodeinfo_oldmcprefix, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmp6_nodeinfo_oldmcprefix), 0, 
 	"Join old IPv6 NI group address in draft-ietf-ipngwg-icmp-name-lookup"
 	" for compatibility with KAME implememtation.");
-SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT, errppslimit,
-	CTLFLAG_RW, &VNET_NAME(icmp6errppslim), 0, "");
-SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXNUDHINT, nd6_maxnudhint,
-	CTLFLAG_RW, &VNET_NAME(nd6_maxnudhint), 0, "");
-SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG, nd6_debug, CTLFLAG_RW,
-	&VNET_NAME(nd6_debug), 0, "");
-SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_ONLINKNSRFC4861,
-	nd6_onlink_ns_rfc4861, CTLFLAG_RW, &VNET_NAME(nd6_onlink_ns_rfc4861),
-	0, "Accept 'on-link' nd6 NS in compliance with RFC 4861.");
+SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT, errppslimit,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6errppslim), 0, "");
+SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXNUDHINT, nd6_maxnudhint,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxnudhint), 0, "");
+SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG, nd6_debug,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_debug), 0, "");
+SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_ONLINKNSRFC4861,
+	nd6_onlink_ns_rfc4861, CTLFLAG_VNET | CTLFLAG_RW,
+	&VNET_NAME(nd6_onlink_ns_rfc4861), 0,
+	"Accept 'on-link' nd6 NS in compliance with RFC 4861.");
Index: user/dchagin/lemul/sys/netinet6/in6_rmx.c
===================================================================
--- user/dchagin/lemul/sys/netinet6/in6_rmx.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet6/in6_rmx.c	(revision 274241)
@@ -1,341 +1,341 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: in6_rmx.c,v 1.11 2001/07/26 06:53:16 jinmei Exp $
  */
 
 /*-
  * Copyright 1994, 1995 Massachusetts Institute of Technology
  *
  * Permission to use, copy, modify, and distribute this software and
  * its documentation for any purpose and without fee is hereby
  * granted, provided that both the above copyright notice and this
  * permission notice appear in all copies, that both the above
  * copyright notice and this permission notice appear in all
  * supporting documentation, and that the name of M.I.T. not be used
  * in advertising or publicity pertaining to distribution of the
  * software without specific, written prior permission.  M.I.T. makes
  * no representations about the suitability of this software for any
  * purpose.  It is provided "as is" without express or implied
  * warranty.
  *
  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
  * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/sysctl.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/mbuf.h>
 #include <sys/rwlock.h>
 #include <sys/syslog.h>
 #include <sys/callout.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/ip_var.h>
 #include <netinet/in_var.h>
 
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 
 #include <netinet/icmp6.h>
 #include <netinet6/nd6.h>
 
 #include <netinet/tcp.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 
 extern int	in6_inithead(void **head, int off);
 #ifdef VIMAGE
 extern int	in6_detachhead(void **head, int off);
 #endif
 
 /*
  * Do what we need to do when inserting a route.
  */
 static struct radix_node *
 in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
     struct radix_node *treenodes)
 {
 	struct rtentry *rt = (struct rtentry *)treenodes;
 	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)rt_key(rt);
 	struct radix_node *ret;
 
 	RADIX_NODE_HEAD_WLOCK_ASSERT(head);
 	if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
 		rt->rt_flags |= RTF_MULTICAST;
 
 	/*
 	 * A little bit of help for both IPv6 output and input:
 	 *   For local addresses, we make sure that RTF_LOCAL is set,
 	 *   with the thought that this might one day be used to speed up
 	 *   ip_input().
 	 *
 	 * We also mark routes to multicast addresses as such, because
 	 * it's easy to do and might be useful (but this is much more
 	 * dubious since it's so easy to inspect the address).  (This
 	 * is done above.)
 	 *
 	 * XXX
 	 * should elaborate the code.
 	 */
 	if (rt->rt_flags & RTF_HOST) {
 		if (IN6_ARE_ADDR_EQUAL(&satosin6(rt->rt_ifa->ifa_addr)
 					->sin6_addr,
 				       &sin6->sin6_addr)) {
 			rt->rt_flags |= RTF_LOCAL;
 		}
 	}
 
 	if (rt->rt_ifp != NULL) {
 
 		/*
 		 * Check route MTU:
 		 * inherit interface MTU if not set or
 		 * check if MTU is too large.
 		 */
 		if (rt->rt_mtu == 0) {
 			rt->rt_mtu = IN6_LINKMTU(rt->rt_ifp);
 		} else if (rt->rt_mtu > IN6_LINKMTU(rt->rt_ifp))
 			rt->rt_mtu = IN6_LINKMTU(rt->rt_ifp);
 	}
 
 	ret = rn_addroute(v_arg, n_arg, head, treenodes);
 	if (ret == NULL) {
 		struct rtentry *rt2;
 		/*
 		 * We are trying to add a net route, but can't.
 		 * The following case should be allowed, so we'll make a
 		 * special check for this:
 		 *	Two IPv6 addresses with the same prefix is assigned
 		 *	to a single interrface.
 		 *	# ifconfig if0 inet6 3ffe:0501::1 prefix 64 alias (*1)
 		 *	# ifconfig if0 inet6 3ffe:0501::2 prefix 64 alias (*2)
 		 *	In this case, (*1) and (*2) want to add the same
 		 *	net route entry, 3ffe:0501:: -> if0.
 		 *	This case should not raise an error.
 		 */
 		rt2 = in6_rtalloc1((struct sockaddr *)sin6, 0, RTF_RNH_LOCKED,
 		    rt->rt_fibnum);
 		if (rt2) {
 			if (((rt2->rt_flags & (RTF_HOST|RTF_GATEWAY)) == 0)
 			 && rt2->rt_gateway
 			 && rt2->rt_gateway->sa_family == AF_LINK
 			 && rt2->rt_ifp == rt->rt_ifp) {
 				ret = rt2->rt_nodes;
 			}
 			RTFREE_LOCKED(rt2);
 		}
 	}
 	return (ret);
 }
 
 SYSCTL_DECL(_net_inet6_ip6);
 
 static VNET_DEFINE(int, rtq_toomany6) = 128;
 	/* 128 cached routes is ``too many'' */
 #define	V_rtq_toomany6			VNET(rtq_toomany6)
-SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW,
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTMAXCACHE, rtmaxcache, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(rtq_toomany6) , 0, "");
 
 struct rtqk_arg {
 	struct radix_node_head *rnh;
 	int mode;
 	int updating;
 	int draining;
 	int killed;
 	int found;
 	time_t nextstop;
 };
 
 /*
  * Age old PMTUs.
  */
 struct mtuex_arg {
 	struct radix_node_head *rnh;
 	time_t nextstop;
 };
 static VNET_DEFINE(struct callout, rtq_mtutimer);
 #define	V_rtq_mtutimer			VNET(rtq_mtutimer)
 
 static int
 in6_mtuexpire(struct radix_node *rn, void *rock)
 {
 	struct rtentry *rt = (struct rtentry *)rn;
 	struct mtuex_arg *ap = rock;
 
 	/* sanity */
 	if (!rt)
 		panic("rt == NULL in in6_mtuexpire");
 
 	if (rt->rt_expire && !(rt->rt_flags & RTF_PROBEMTU)) {
 		if (rt->rt_expire <= time_uptime) {
 			rt->rt_flags |= RTF_PROBEMTU;
 		} else {
 			ap->nextstop = lmin(ap->nextstop, rt->rt_expire);
 		}
 	}
 
 	return 0;
 }
 
 #define	MTUTIMO_DEFAULT	(60*1)
 
 static void
 in6_mtutimo_one(struct radix_node_head *rnh)
 {
 	struct mtuex_arg arg;
 
 	arg.rnh = rnh;
 	arg.nextstop = time_uptime + MTUTIMO_DEFAULT;
 	RADIX_NODE_HEAD_LOCK(rnh);
 	rnh->rnh_walktree(rnh, in6_mtuexpire, &arg);
 	RADIX_NODE_HEAD_UNLOCK(rnh);
 }
 
 static void
 in6_mtutimo(void *rock)
 {
 	CURVNET_SET_QUIET((struct vnet *) rock);
 	struct radix_node_head *rnh;
 	struct timeval atv;
 	u_int fibnum;
 
 	for (fibnum = 0; fibnum < rt_numfibs; fibnum++) {
 		rnh = rt_tables_get_rnh(fibnum, AF_INET6);
 		if (rnh != NULL)
 			in6_mtutimo_one(rnh);
 	}
 
 	atv.tv_sec = MTUTIMO_DEFAULT;
 	atv.tv_usec = 0;
 	callout_reset(&V_rtq_mtutimer, tvtohz(&atv), in6_mtutimo, rock);
 	CURVNET_RESTORE();
 }
 
 /*
  * Initialize our routing tree.
  */
 static VNET_DEFINE(int, _in6_rt_was_here);
 #define	V__in6_rt_was_here	VNET(_in6_rt_was_here)
 
 int
 in6_inithead(void **head, int off)
 {
 	struct radix_node_head *rnh;
 
 	if (!rn_inithead(head, offsetof(struct sockaddr_in6, sin6_addr) << 3))
 		return (0);
 
 	rnh = *head;
 	RADIX_NODE_HEAD_LOCK_INIT(rnh);
 
 	rnh->rnh_addaddr = in6_addroute;
 
 	if (V__in6_rt_was_here == 0) {
 		callout_init(&V_rtq_mtutimer, CALLOUT_MPSAFE);
 		in6_mtutimo(curvnet);	/* kick off timeout first time */
 		V__in6_rt_was_here = 1;
 	}
 
 	return (1);
 }
 
 #ifdef VIMAGE
 int
 in6_detachhead(void **head, int off)
 {
 
 	callout_drain(&V_rtq_mtutimer);
 	return (1);
 }
 #endif
 
 /*
  * Extended API for IPv6 FIB support.
  */
 void
 in6_rtredirect(struct sockaddr *dst, struct sockaddr *gw, struct sockaddr *nm,
     int flags, struct sockaddr *src, u_int fibnum)
 {
 
 	rtredirect_fib(dst, gw, nm, flags, src, fibnum);
 }
 
 int
 in6_rtrequest(int req, struct sockaddr *dst, struct sockaddr *gw,
     struct sockaddr *mask, int flags, struct rtentry **ret_nrt, u_int fibnum)
 {
 
 	return (rtrequest_fib(req, dst, gw, mask, flags, ret_nrt, fibnum));
 }
 
 void
 in6_rtalloc(struct route_in6 *ro, u_int fibnum)
 {
 
 	rtalloc_ign_fib((struct route *)ro, 0ul, fibnum);
 }
 
 void
 in6_rtalloc_ign(struct route_in6 *ro, u_long ignflags, u_int fibnum)
 {
 
 	rtalloc_ign_fib((struct route *)ro, ignflags, fibnum);
 }
 
 struct rtentry *
 in6_rtalloc1(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum)
 {
 
 	return (rtalloc1_fib(dst, report, ignflags, fibnum));
 }
Index: user/dchagin/lemul/sys/netinet6/ip6_ipsec.c
===================================================================
--- user/dchagin/lemul/sys/netinet6/ip6_ipsec.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet6/ip6_ipsec.c	(revision 274241)
@@ -1,410 +1,410 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *      The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 
 #include <machine/in_cksum.h>
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/ipsec6.h>
 #include <netipsec/xform.h>
 #include <netipsec/key.h>
 #ifdef IPSEC_DEBUG
 #include <netipsec/key_debug.h>
 #else
 #define	KEYDEBUG(lev,arg)
 #endif
 #endif /*IPSEC*/
 
 #include <netinet6/ip6_ipsec.h>
 #include <netinet6/ip6_var.h>
 
 extern	struct protosw inet6sw[];
 
 
 #ifdef INET6 
 #ifdef IPSEC
 #ifdef IPSEC_FILTERTUNNEL
 static VNET_DEFINE(int, ip6_ipsec6_filtertunnel) = 1;
 #else
 static VNET_DEFINE(int, ip6_ipsec6_filtertunnel) = 0;
 #endif
 #define	V_ip6_ipsec6_filtertunnel	VNET(ip6_ipsec6_filtertunnel)
 
 SYSCTL_DECL(_net_inet6_ipsec6);
-SYSCTL_VNET_INT(_net_inet6_ipsec6, OID_AUTO,
-	filtertunnel, CTLFLAG_RW, &VNET_NAME(ip6_ipsec6_filtertunnel),  0,
+SYSCTL_INT(_net_inet6_ipsec6, OID_AUTO, filtertunnel,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_ipsec6_filtertunnel),  0,
 	"If set filter packets from an IPsec tunnel.");
 #endif /* IPSEC */
 #endif /* INET6 */
 
 /*
  * Check if we have to jump over firewall processing for this packet.
  * Called from ip6_input().
  * 1 = jump over firewall, 0 = packet goes through firewall.
  */
 int
 ip6_ipsec_filtertunnel(struct mbuf *m)
 {
 #ifdef IPSEC
 
 	/*
 	 * Bypass packet filtering for packets previously handled by IPsec.
 	 */
 	if (!V_ip6_ipsec6_filtertunnel &&
 	    m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL)
 		return 1;
 #endif
 	return 0;
 }
 
 /*
  * Check if this packet has an active SA and needs to be dropped instead
  * of forwarded.
  * Called from ip6_input().
  * 1 = drop packet, 0 = forward packet.
  */
 int
 ip6_ipsec_fwd(struct mbuf *m)
 {
 #ifdef IPSEC
 	struct m_tag *mtag;
 	struct tdb_ident *tdbi;
 	struct secpolicy *sp;
 	int error;
 	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
 	if (mtag != NULL) {
 		tdbi = (struct tdb_ident *)(mtag + 1);
 		sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND);
 	} else {
 		sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND,
 					   IP_FORWARDING, &error);
 	}
 	if (sp == NULL) {	/* NB: can happen if error */
 		/*XXX error stat???*/
 		DPRINTF(("%s: no SP for forwarding\n", __func__));	/*XXX*/
 		return 1;
 	}
 
 	/*
 	 * Check security policy against packet attributes.
 	 */
 	error = ipsec_in_reject(sp, m);
 	KEY_FREESP(&sp);
 	if (error) {
 		IP6STAT_INC(ip6s_cantforward);
 		return 1;
 	}
 #endif /* IPSEC */
 	return 0;
 }
 
 /*
  * Check if protocol type doesn't have a further header and do IPSEC
  * decryption or reject right now.  Protocols with further headers get
  * their IPSEC treatment within the protocol specific processing.
  * Called from ip6_input().
  * 1 = drop packet, 0 = continue processing packet.
  */
 int
 ip6_ipsec_input(struct mbuf *m, int nxt)
 {
 #ifdef IPSEC
 	struct m_tag *mtag;
 	struct tdb_ident *tdbi;
 	struct secpolicy *sp;
 	int error;
 	/*
 	 * enforce IPsec policy checking if we are seeing last header.
 	 * note that we do not visit this with protocols with pcb layer
 	 * code - like udp/tcp/raw ip.
 	 */
 	if ((inet6sw[ip6_protox[nxt]].pr_flags & PR_LASTHDR) != 0 &&
 	    ipsec6_in_reject(m, NULL)) {
 
 		/*
 		 * Check if the packet has already had IPsec processing
 		 * done.  If so, then just pass it along.  This tag gets
 		 * set during AH, ESP, etc. input handling, before the
 		 * packet is returned to the ip input queue for delivery.
 		 */
 		mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
 		if (mtag != NULL) {
 			tdbi = (struct tdb_ident *)(mtag + 1);
 			sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND);
 		} else {
 			sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND,
 						   IP_FORWARDING, &error);
 		}
 		if (sp != NULL) {
 			/*
 			 * Check security policy against packet attributes.
 			 */
 			error = ipsec_in_reject(sp, m);
 			KEY_FREESP(&sp);
 		} else {
 			/* XXX error stat??? */
 			error = EINVAL;
 			DPRINTF(("%s: no SP, packet discarded\n", __func__));/*XXX*/
 			return 1;
 		}
 		if (error)
 			return 1;
 	}
 #endif /* IPSEC */
 	return 0;
 }
 
 /*
  * Called from ip6_output().
  * 1 = drop packet, 0 = continue processing packet,
  * -1 = packet was reinjected and stop processing packet
  */
 
 int
 ip6_ipsec_output(struct mbuf **m, struct inpcb *inp, int *flags, int *error,
     struct ifnet **ifp)
 {
 #ifdef IPSEC
 	struct secpolicy *sp = NULL;
 	struct tdb_ident *tdbi;
 	struct m_tag *mtag;
 	/* XXX int s; */
 	mtag = m_tag_find(*m, PACKET_TAG_IPSEC_PENDING_TDB, NULL);
 	if (mtag != NULL) {
 		tdbi = (struct tdb_ident *)(mtag + 1);
 		sp = ipsec_getpolicy(tdbi, IPSEC_DIR_OUTBOUND);
 		if (sp == NULL)
 			*error = -EINVAL;	/* force silent drop */
 		m_tag_delete(*m, mtag);
 	} else {
 		sp = ipsec4_checkpolicy(*m, IPSEC_DIR_OUTBOUND, *flags,
 					error, inp);
 	}
 
 	/*
 	 * There are four return cases:
 	 *    sp != NULL		    apply IPsec policy
 	 *    sp == NULL, error == 0	    no IPsec handling needed
 	 *    sp == NULL, error == -EINVAL  discard packet w/o error
 	 *    sp == NULL, error != 0	    discard packet, report error
 	 */
 	if (sp != NULL) {
 		/* Loop detection, check if ipsec processing already done */
 		KASSERT(sp->req != NULL, ("ip_output: no ipsec request"));
 		for (mtag = m_tag_first(*m); mtag != NULL;
 		     mtag = m_tag_next(*m, mtag)) {
 			if (mtag->m_tag_cookie != MTAG_ABI_COMPAT)
 				continue;
 			if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE &&
 			    mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED)
 				continue;
 			/*
 			 * Check if policy has no SA associated with it.
 			 * This can happen when an SP has yet to acquire
 			 * an SA; e.g. on first reference.  If it occurs,
 			 * then we let ipsec4_process_packet do its thing.
 			 */
 			if (sp->req->sav == NULL)
 				break;
 			tdbi = (struct tdb_ident *)(mtag + 1);
 			if (tdbi->spi == sp->req->sav->spi &&
 			    tdbi->proto == sp->req->sav->sah->saidx.proto &&
 			    bcmp(&tdbi->dst, &sp->req->sav->sah->saidx.dst,
 				 sizeof (union sockaddr_union)) == 0) {
 				/*
 				 * No IPsec processing is needed, free
 				 * reference to SP.
 				 *
 				 * NB: null pointer to avoid free at
 				 *     done: below.
 				 */
 				KEY_FREESP(&sp), sp = NULL;
 				goto done;
 			}
 		}
 
 		/*
 		 * Do delayed checksums now because we send before
 		 * this is done in the normal processing path.
 		 */
 #ifdef INET
 		if ((*m)->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 			in_delayed_cksum(*m);
 			(*m)->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 		}
 #endif
 		if ((*m)->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 			in6_delayed_cksum(*m, (*m)->m_pkthdr.len - sizeof(struct ip6_hdr),
 							sizeof(struct ip6_hdr));
 			(*m)->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
 		}
 #ifdef SCTP
 		if ((*m)->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) {
 			sctp_delayed_cksum(*m, sizeof(struct ip6_hdr));
 			(*m)->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6;
 		}
 #endif
 
 		/* NB: callee frees mbuf */
 		*error = ipsec6_process_packet(*m, sp->req);
 
 		if (*error == EJUSTRETURN) {
 			/*
 			 * We had a SP with a level of 'use' and no SA. We
 			 * will just continue to process the packet without
 			 * IPsec processing.
 			 */
 			*error = 0;
 			goto done;
 		}
 
 		/*
 		 * Preserve KAME behaviour: ENOENT can be returned
 		 * when an SA acquire is in progress.  Don't propagate
 		 * this to user-level; it confuses applications.
 		 *
 		 * XXX this will go away when the SADB is redone.
 		 */
 		if (*error == ENOENT)
 			*error = 0;
 		goto reinjected;
 	} else {	/* sp == NULL */
 		if (*error != 0) {
 			/*
 			 * Hack: -EINVAL is used to signal that a packet
 			 * should be silently discarded.  This is typically
 			 * because we asked key management for an SA and
 			 * it was delayed (e.g. kicked up to IKE).
 			 */
 			if (*error == -EINVAL)
 				*error = 0;
 			goto bad;
 		} else {
 			/* No IPsec processing for this packet. */
 		}
 	}
 done:
 	if (sp != NULL)
 		KEY_FREESP(&sp);
 	return 0;
 reinjected:
 	if (sp != NULL)
 		KEY_FREESP(&sp);
 	return -1;
 bad:
 	if (sp != NULL)
 		KEY_FREESP(&sp);
 	return 1;
 #endif /* IPSEC */
 	return 0;
 }
 
 #if 0
 /*
  * Compute the MTU for a forwarded packet that gets IPSEC encapsulated.
  * Called from ip_forward().
  * Returns MTU suggestion for ICMP needfrag reply.
  */
 int
 ip6_ipsec_mtu(struct mbuf *m)
 {
 	int mtu = 0;
 	/*
 	 * If the packet is routed over IPsec tunnel, tell the
 	 * originator the tunnel MTU.
 	 *	tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz
 	 * XXX quickhack!!!
 	 */
 #ifdef IPSEC
 	struct secpolicy *sp = NULL;
 	int ipsecerror;
 	int ipsechdr;
 	struct route *ro;
 	sp = ipsec_getpolicybyaddr(m,
 				   IPSEC_DIR_OUTBOUND,
 				   IP_FORWARDING,
 				   &ipsecerror);
 	if (sp != NULL) {
 		/* count IPsec header size */
 		ipsechdr = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, NULL);
 
 		/*
 		 * find the correct route for outer IPv4
 		 * header, compute tunnel MTU.
 		 */
 		if (sp->req != NULL &&
 		    sp->req->sav != NULL &&
 		    sp->req->sav->sah != NULL) {
 			ro = &sp->req->sav->sah->route_cache.sa_route;
 			if (ro->ro_rt && ro->ro_rt->rt_ifp) {
 				mtu = ro->ro_rt->rt_mtu ? ro->ro_rt->rt_mtu :
 				    ro->ro_rt->rt_ifp->if_mtu;
 				mtu -= ipsechdr;
 			}
 		}
 		KEY_FREESP(&sp);
 	}
 #endif /* IPSEC */
 	/* XXX else case missing. */
 	return mtu;
 }
 #endif
Index: user/dchagin/lemul/sys/netinet6/mld6.c
===================================================================
--- user/dchagin/lemul/sys/netinet6/mld6.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet6/mld6.c	(revision 274241)
@@ -1,3313 +1,3313 @@
 /*-
  * Copyright (c) 2009 Bruce Simpson.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: mld6.c,v 1.27 2001/04/04 05:17:30 itojun Exp $
  */
 
 /*-
  * Copyright (c) 1988 Stephen Deering.
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Stephen Deering of Stanford University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)igmp.c	8.1 (Berkeley) 7/19/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/protosw.h>
 #include <sys/sysctl.h>
 #include <sys/kernel.h>
 #include <sys/callout.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/ktr.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet/icmp6.h>
 #include <netinet6/mld6.h>
 #include <netinet6/mld6_var.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifndef KTR_MLD
 #define KTR_MLD KTR_INET6
 #endif
 
 static struct mld_ifinfo *
 		mli_alloc_locked(struct ifnet *);
 static void	mli_delete_locked(const struct ifnet *);
 static void	mld_dispatch_packet(struct mbuf *);
 static void	mld_dispatch_queue(struct ifqueue *, int);
 static void	mld_final_leave(struct in6_multi *, struct mld_ifinfo *);
 static void	mld_fasttimo_vnet(void);
 static int	mld_handle_state_change(struct in6_multi *,
 		    struct mld_ifinfo *);
 static int	mld_initial_join(struct in6_multi *, struct mld_ifinfo *,
 		    const int);
 #ifdef KTR
 static char *	mld_rec_type_to_str(const int);
 #endif
 static void	mld_set_version(struct mld_ifinfo *, const int);
 static void	mld_slowtimo_vnet(void);
 static int	mld_v1_input_query(struct ifnet *, const struct ip6_hdr *,
 		    /*const*/ struct mld_hdr *);
 static int	mld_v1_input_report(struct ifnet *, const struct ip6_hdr *,
 		    /*const*/ struct mld_hdr *);
 static void	mld_v1_process_group_timer(struct mld_ifinfo *,
 		    struct in6_multi *);
 static void	mld_v1_process_querier_timers(struct mld_ifinfo *);
 static int	mld_v1_transmit_report(struct in6_multi *, const int);
 static void	mld_v1_update_group(struct in6_multi *, const int);
 static void	mld_v2_cancel_link_timers(struct mld_ifinfo *);
 static void	mld_v2_dispatch_general_query(struct mld_ifinfo *);
 static struct mbuf *
 		mld_v2_encap_report(struct ifnet *, struct mbuf *);
 static int	mld_v2_enqueue_filter_change(struct ifqueue *,
 		    struct in6_multi *);
 static int	mld_v2_enqueue_group_record(struct ifqueue *,
 		    struct in6_multi *, const int, const int, const int,
 		    const int);
 static int	mld_v2_input_query(struct ifnet *, const struct ip6_hdr *,
 		    struct mbuf *, const int, const int);
 static int	mld_v2_merge_state_changes(struct in6_multi *,
 		    struct ifqueue *);
 static void	mld_v2_process_group_timers(struct mld_ifinfo *,
 		    struct ifqueue *, struct ifqueue *,
 		    struct in6_multi *, const int);
 static int	mld_v2_process_group_query(struct in6_multi *,
 		    struct mld_ifinfo *mli, int, struct mbuf *, const int);
 static int	sysctl_mld_gsr(SYSCTL_HANDLER_ARGS);
 static int	sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS);
 
 /*
  * Normative references: RFC 2710, RFC 3590, RFC 3810.
  *
  * Locking:
  *  * The MLD subsystem lock ends up being system-wide for the moment,
  *    but could be per-VIMAGE later on.
  *  * The permitted lock order is: IN6_MULTI_LOCK, MLD_LOCK, IF_ADDR_LOCK.
  *    Any may be taken independently; if any are held at the same
  *    time, the above lock order must be followed.
  *  * IN6_MULTI_LOCK covers in_multi.
  *  * MLD_LOCK covers per-link state and any global variables in this file.
  *  * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of
  *    per-link state iterators.
  *
  *  XXX LOR PREVENTION
  *  A special case for IPv6 is the in6_setscope() routine. ip6_output()
  *  will not accept an ifp; it wants an embedded scope ID, unlike
  *  ip_output(), which happily takes the ifp given to it. The embedded
  *  scope ID is only used by MLD to select the outgoing interface.
  *
  *  During interface attach and detach, MLD will take MLD_LOCK *after*
  *  the IF_AFDATA_LOCK.
  *  As in6_setscope() takes IF_AFDATA_LOCK then SCOPE_LOCK, we can't call
  *  it with MLD_LOCK held without triggering an LOR. A netisr with indirect
  *  dispatch could work around this, but we'd rather not do that, as it
  *  can introduce other races.
  *
  *  As such, we exploit the fact that the scope ID is just the interface
  *  index, and embed it in the IPv6 destination address accordingly.
  *  This is potentially NOT VALID for MLDv1 reports, as they
  *  are always sent to the multicast group itself; as MLDv2
  *  reports are always sent to ff02::16, this is not an issue
  *  when MLDv2 is in use.
  *
  *  This does not however eliminate the LOR when ip6_output() itself
  *  calls in6_setscope() internally whilst MLD_LOCK is held. This will
  *  trigger a LOR warning in WITNESS when the ifnet is detached.
  *
  *  The right answer is probably to make IF_AFDATA_LOCK an rwlock, given
  *  how it's used across the network stack. Here we're simply exploiting
  *  the fact that MLD runs at a similar layer in the stack to scope6.c.
  *
  * VIMAGE:
  *  * Each in6_multi corresponds to an ifp, and each ifp corresponds
  *    to a vnet in ifp->if_vnet.
  */
 static struct mtx		 mld_mtx;
 static MALLOC_DEFINE(M_MLD, "mld", "mld state");
 
 #define	MLD_EMBEDSCOPE(pin6, zoneid)					\
 	if (IN6_IS_SCOPE_LINKLOCAL(pin6) ||				\
 	    IN6_IS_ADDR_MC_INTFACELOCAL(pin6))				\
 		(pin6)->s6_addr16[1] = htons((zoneid) & 0xFFFF)		\
 
 /*
  * VIMAGE-wide globals.
  */
 static VNET_DEFINE(struct timeval, mld_gsrdelay) = {10, 0};
 static VNET_DEFINE(LIST_HEAD(, mld_ifinfo), mli_head);
 static VNET_DEFINE(int, interface_timers_running6);
 static VNET_DEFINE(int, state_change_timers_running6);
 static VNET_DEFINE(int, current_state_timers_running6);
 
 #define	V_mld_gsrdelay			VNET(mld_gsrdelay)
 #define	V_mli_head			VNET(mli_head)
 #define	V_interface_timers_running6	VNET(interface_timers_running6)
 #define	V_state_change_timers_running6	VNET(state_change_timers_running6)
 #define	V_current_state_timers_running6	VNET(current_state_timers_running6)
 
 SYSCTL_DECL(_net_inet6);	/* Note: Not in any common header. */
 
 SYSCTL_NODE(_net_inet6, OID_AUTO, mld, CTLFLAG_RW, 0,
     "IPv6 Multicast Listener Discovery");
 
 /*
  * Virtualized sysctls.
  */
-SYSCTL_VNET_PROC(_net_inet6_mld, OID_AUTO, gsrdelay,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+SYSCTL_PROC(_net_inet6_mld, OID_AUTO, gsrdelay,
+    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &VNET_NAME(mld_gsrdelay.tv_sec), 0, sysctl_mld_gsr, "I",
     "Rate limit for MLDv2 Group-and-Source queries in seconds");
 
 /*
  * Non-virtualized sysctls.
  */
 static SYSCTL_NODE(_net_inet6_mld, OID_AUTO, ifinfo,
     CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mld_ifinfo,
     "Per-interface MLDv2 state");
 
 static int	mld_v1enable = 1;
 SYSCTL_INT(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_RWTUN,
     &mld_v1enable, 0, "Enable fallback to MLDv1");
 
 static int	mld_use_allow = 1;
 SYSCTL_INT(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_RWTUN,
     &mld_use_allow, 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves");
 
 /*
  * Packed Router Alert option structure declaration.
  */
 struct mld_raopt {
 	struct ip6_hbh		hbh;
 	struct ip6_opt		pad;
 	struct ip6_opt_router	ra;
 } __packed;
 
 /*
  * Router Alert hop-by-hop option header.
  */
 static struct mld_raopt mld_ra = {
 	.hbh = { 0, 0 },
 	.pad = { .ip6o_type = IP6OPT_PADN, 0 },
 	.ra = {
 	    .ip6or_type = IP6OPT_ROUTER_ALERT,
 	    .ip6or_len = IP6OPT_RTALERT_LEN - 2,
 	    .ip6or_value[0] = ((IP6OPT_RTALERT_MLD >> 8) & 0xFF),
 	    .ip6or_value[1] = (IP6OPT_RTALERT_MLD & 0xFF)
 	}
 };
 static struct ip6_pktopts mld_po;
 
 static __inline void
 mld_save_context(struct mbuf *m, struct ifnet *ifp)
 {
 
 #ifdef VIMAGE
 	m->m_pkthdr.PH_loc.ptr = ifp->if_vnet;
 #endif /* VIMAGE */
 	m->m_pkthdr.flowid = ifp->if_index;
 }
 
 static __inline void
 mld_scrub_context(struct mbuf *m)
 {
 
 	m->m_pkthdr.PH_loc.ptr = NULL;
 	m->m_pkthdr.flowid = 0;
 }
 
 /*
  * Restore context from a queued output chain.
  * Return saved ifindex.
  *
  * VIMAGE: The assertion is there to make sure that we
  * actually called CURVNET_SET() with what's in the mbuf chain.
  */
 static __inline uint32_t
 mld_restore_context(struct mbuf *m)
 {
 
 #if defined(VIMAGE) && defined(INVARIANTS)
 	KASSERT(curvnet == m->m_pkthdr.PH_loc.ptr,
 	    ("%s: called when curvnet was not restored", __func__));
 #endif
 	return (m->m_pkthdr.flowid);
 }
 
 /*
  * Retrieve or set threshold between group-source queries in seconds.
  *
  * VIMAGE: Assume curvnet set by caller.
  * SMPng: NOTE: Serialized by MLD lock.
  */
 static int
 sysctl_mld_gsr(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	int i;
 
 	error = sysctl_wire_old_buffer(req, sizeof(int));
 	if (error)
 		return (error);
 
 	MLD_LOCK();
 
 	i = V_mld_gsrdelay.tv_sec;
 
 	error = sysctl_handle_int(oidp, &i, 0, req);
 	if (error || !req->newptr)
 		goto out_locked;
 
 	if (i < -1 || i >= 60) {
 		error = EINVAL;
 		goto out_locked;
 	}
 
 	CTR2(KTR_MLD, "change mld_gsrdelay from %d to %d",
 	     V_mld_gsrdelay.tv_sec, i);
 	V_mld_gsrdelay.tv_sec = i;
 
 out_locked:
 	MLD_UNLOCK();
 	return (error);
 }
 
 /*
  * Expose struct mld_ifinfo to userland, keyed by ifindex.
  * For use by ifmcstat(8).
  *
  * SMPng: NOTE: Does an unlocked ifindex space read.
  * VIMAGE: Assume curvnet set by caller. The node handler itself
  * is not directly virtualized.
  */
 static int
 sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS)
 {
 	int			*name;
 	int			 error;
 	u_int			 namelen;
 	struct ifnet		*ifp;
 	struct mld_ifinfo	*mli;
 
 	name = (int *)arg1;
 	namelen = arg2;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	if (namelen != 1)
 		return (EINVAL);
 
 	error = sysctl_wire_old_buffer(req, sizeof(struct mld_ifinfo));
 	if (error)
 		return (error);
 
 	IN6_MULTI_LOCK();
 	MLD_LOCK();
 
 	if (name[0] <= 0 || name[0] > V_if_index) {
 		error = ENOENT;
 		goto out_locked;
 	}
 
 	error = ENOENT;
 
 	ifp = ifnet_byindex(name[0]);
 	if (ifp == NULL)
 		goto out_locked;
 
 	LIST_FOREACH(mli, &V_mli_head, mli_link) {
 		if (ifp == mli->mli_ifp) {
 			error = SYSCTL_OUT(req, mli,
 			    sizeof(struct mld_ifinfo));
 			break;
 		}
 	}
 
 out_locked:
 	MLD_UNLOCK();
 	IN6_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Dispatch an entire queue of pending packet chains.
  * VIMAGE: Assumes the vnet pointer has been set.
  */
 static void
 mld_dispatch_queue(struct ifqueue *ifq, int limit)
 {
 	struct mbuf *m;
 
 	for (;;) {
 		_IF_DEQUEUE(ifq, m);
 		if (m == NULL)
 			break;
 		CTR3(KTR_MLD, "%s: dispatch %p from %p", __func__, ifq, m);
 		mld_dispatch_packet(m);
 		if (--limit == 0)
 			break;
 	}
 }
 
 /*
  * Filter outgoing MLD report state by group.
  *
  * Reports are ALWAYS suppressed for ALL-HOSTS (ff02::1)
  * and node-local addresses. However, kernel and socket consumers
  * always embed the KAME scope ID in the address provided, so strip it
  * when performing comparison.
  * Note: This is not the same as the *multicast* scope.
  *
  * Return zero if the given group is one for which MLD reports
  * should be suppressed, or non-zero if reports should be issued.
  */
 static __inline int
 mld_is_addr_reported(const struct in6_addr *addr)
 {
 
 	KASSERT(IN6_IS_ADDR_MULTICAST(addr), ("%s: not multicast", __func__));
 
 	if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_NODELOCAL)
 		return (0);
 
 	if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_LINKLOCAL) {
 		struct in6_addr tmp = *addr;
 		in6_clearscope(&tmp);
 		if (IN6_ARE_ADDR_EQUAL(&tmp, &in6addr_linklocal_allnodes))
 			return (0);
 	}
 
 	return (1);
 }
 
 /*
  * Attach MLD when PF_INET6 is attached to an interface.
  *
  * SMPng: Normally called with IF_AFDATA_LOCK held.
  */
 struct mld_ifinfo *
 mld_domifattach(struct ifnet *ifp)
 {
 	struct mld_ifinfo *mli;
 
 	CTR3(KTR_MLD, "%s: called for ifp %p(%s)",
 	    __func__, ifp, if_name(ifp));
 
 	MLD_LOCK();
 
 	mli = mli_alloc_locked(ifp);
 	if (!(ifp->if_flags & IFF_MULTICAST))
 		mli->mli_flags |= MLIF_SILENT;
 	if (mld_use_allow)
 		mli->mli_flags |= MLIF_USEALLOW;
 
 	MLD_UNLOCK();
 
 	return (mli);
 }
 
 /*
  * VIMAGE: assume curvnet set by caller.
  */
 static struct mld_ifinfo *
 mli_alloc_locked(/*const*/ struct ifnet *ifp)
 {
 	struct mld_ifinfo *mli;
 
 	MLD_LOCK_ASSERT();
 
 	mli = malloc(sizeof(struct mld_ifinfo), M_MLD, M_NOWAIT|M_ZERO);
 	if (mli == NULL)
 		goto out;
 
 	mli->mli_ifp = ifp;
 	mli->mli_version = MLD_VERSION_2;
 	mli->mli_flags = 0;
 	mli->mli_rv = MLD_RV_INIT;
 	mli->mli_qi = MLD_QI_INIT;
 	mli->mli_qri = MLD_QRI_INIT;
 	mli->mli_uri = MLD_URI_INIT;
 
 	SLIST_INIT(&mli->mli_relinmhead);
 
 	/*
 	 * Responses to general queries are subject to bounds.
 	 */
 	IFQ_SET_MAXLEN(&mli->mli_gq, MLD_MAX_RESPONSE_PACKETS);
 
 	LIST_INSERT_HEAD(&V_mli_head, mli, mli_link);
 
 	CTR2(KTR_MLD, "allocate mld_ifinfo for ifp %p(%s)",
 	     ifp, if_name(ifp));
 
 out:
 	return (mli);
 }
 
 /*
  * Hook for ifdetach.
  *
  * NOTE: Some finalization tasks need to run before the protocol domain
  * is detached, but also before the link layer does its cleanup.
  * Run before link-layer cleanup; cleanup groups, but do not free MLD state.
  *
  * SMPng: Caller must hold IN6_MULTI_LOCK().
  * Must take IF_ADDR_LOCK() to cover if_multiaddrs iterator.
  * XXX This routine is also bitten by unlocked ifma_protospec access.
  */
 void
 mld_ifdetach(struct ifnet *ifp)
 {
 	struct mld_ifinfo	*mli;
 	struct ifmultiaddr	*ifma;
 	struct in6_multi	*inm, *tinm;
 
 	CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp,
 	    if_name(ifp));
 
 	IN6_MULTI_LOCK_ASSERT();
 	MLD_LOCK();
 
 	mli = MLD_IFINFO(ifp);
 	if (mli->mli_version == MLD_VERSION_2) {
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (ifma->ifma_addr->sa_family != AF_INET6 ||
 			    ifma->ifma_protospec == NULL)
 				continue;
 			inm = (struct in6_multi *)ifma->ifma_protospec;
 			if (inm->in6m_state == MLD_LEAVING_MEMBER) {
 				SLIST_INSERT_HEAD(&mli->mli_relinmhead,
 				    inm, in6m_nrele);
 			}
 			in6m_clear_recorded(inm);
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, in6m_nrele,
 		    tinm) {
 			SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele);
 			in6m_release_locked(inm);
 		}
 	}
 
 	MLD_UNLOCK();
 }
 
 /*
  * Hook for domifdetach.
  * Runs after link-layer cleanup; free MLD state.
  *
  * SMPng: Normally called with IF_AFDATA_LOCK held.
  */
 void
 mld_domifdetach(struct ifnet *ifp)
 {
 
 	CTR3(KTR_MLD, "%s: called for ifp %p(%s)",
 	    __func__, ifp, if_name(ifp));
 
 	MLD_LOCK();
 	mli_delete_locked(ifp);
 	MLD_UNLOCK();
 }
 
 static void
 mli_delete_locked(const struct ifnet *ifp)
 {
 	struct mld_ifinfo *mli, *tmli;
 
 	CTR3(KTR_MLD, "%s: freeing mld_ifinfo for ifp %p(%s)",
 	    __func__, ifp, if_name(ifp));
 
 	MLD_LOCK_ASSERT();
 
 	LIST_FOREACH_SAFE(mli, &V_mli_head, mli_link, tmli) {
 		if (mli->mli_ifp == ifp) {
 			/*
 			 * Free deferred General Query responses.
 			 */
 			_IF_DRAIN(&mli->mli_gq);
 
 			LIST_REMOVE(mli, mli_link);
 
 			KASSERT(SLIST_EMPTY(&mli->mli_relinmhead),
 			    ("%s: there are dangling in_multi references",
 			    __func__));
 
 			free(mli, M_MLD);
 			return;
 		}
 	}
 #ifdef INVARIANTS
 	panic("%s: mld_ifinfo not found for ifp %p\n", __func__,  ifp);
 #endif
 }
 
 /*
  * Process a received MLDv1 general or address-specific query.
  * Assumes that the query header has been pulled up to sizeof(mld_hdr).
  *
  * NOTE: Can't be fully const correct as we temporarily embed scope ID in
  * mld_addr. This is OK as we own the mbuf chain.
  */
 static int
 mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6,
     /*const*/ struct mld_hdr *mld)
 {
 	struct ifmultiaddr	*ifma;
 	struct mld_ifinfo	*mli;
 	struct in6_multi	*inm;
 	int			 is_general_query;
 	uint16_t		 timer;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	is_general_query = 0;
 
 	if (!mld_v1enable) {
 		CTR3(KTR_MLD, "ignore v1 query %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &mld->mld_addr),
 		    ifp, if_name(ifp));
 		return (0);
 	}
 
 	/*
 	 * RFC3810 Section 6.2: MLD queries must originate from
 	 * a router's link-local address.
 	 */
 	if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) {
 		CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &ip6->ip6_src),
 		    ifp, if_name(ifp));
 		return (0);
 	}
 
 	/*
 	 * Do address field validation upfront before we accept
 	 * the query.
 	 */
 	if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) {
 		/*
 		 * MLDv1 General Query.
 		 * If this was not sent to the all-nodes group, ignore it.
 		 */
 		struct in6_addr		 dst;
 
 		dst = ip6->ip6_dst;
 		in6_clearscope(&dst);
 		if (!IN6_ARE_ADDR_EQUAL(&dst, &in6addr_linklocal_allnodes))
 			return (EINVAL);
 		is_general_query = 1;
 	} else {
 		/*
 		 * Embed scope ID of receiving interface in MLD query for
 		 * lookup whilst we don't hold other locks.
 		 */
 		in6_setscope(&mld->mld_addr, ifp, NULL);
 	}
 
 	IN6_MULTI_LOCK();
 	MLD_LOCK();
 
 	/*
 	 * Switch to MLDv1 host compatibility mode.
 	 */
 	mli = MLD_IFINFO(ifp);
 	KASSERT(mli != NULL, ("%s: no mld_ifinfo for ifp %p", __func__, ifp));
 	mld_set_version(mli, MLD_VERSION_1);
 
 	timer = (ntohs(mld->mld_maxdelay) * PR_FASTHZ) / MLD_TIMER_SCALE;
 	if (timer == 0)
 		timer = 1;
 
 	IF_ADDR_RLOCK(ifp);
 	if (is_general_query) {
 		/*
 		 * For each reporting group joined on this
 		 * interface, kick the report timer.
 		 */
 		CTR2(KTR_MLD, "process v1 general query on ifp %p(%s)",
 		    ifp, if_name(ifp));
 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (ifma->ifma_addr->sa_family != AF_INET6 ||
 			    ifma->ifma_protospec == NULL)
 				continue;
 			inm = (struct in6_multi *)ifma->ifma_protospec;
 			mld_v1_update_group(inm, timer);
 		}
 	} else {
 		/*
 		 * MLDv1 Group-Specific Query.
 		 * If this is a group-specific MLDv1 query, we need only
 		 * look up the single group to process it.
 		 */
 		inm = in6m_lookup_locked(ifp, &mld->mld_addr);
 		if (inm != NULL) {
 			CTR3(KTR_MLD, "process v1 query %s on ifp %p(%s)",
 			    ip6_sprintf(ip6tbuf, &mld->mld_addr),
 			    ifp, if_name(ifp));
 			mld_v1_update_group(inm, timer);
 		}
 		/* XXX Clear embedded scope ID as userland won't expect it. */
 		in6_clearscope(&mld->mld_addr);
 	}
 
 	IF_ADDR_RUNLOCK(ifp);
 	MLD_UNLOCK();
 	IN6_MULTI_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Update the report timer on a group in response to an MLDv1 query.
  *
  * If we are becoming the reporting member for this group, start the timer.
  * If we already are the reporting member for this group, and timer is
  * below the threshold, reset it.
  *
  * We may be updating the group for the first time since we switched
  * to MLDv2. If we are, then we must clear any recorded source lists,
  * and transition to REPORTING state; the group timer is overloaded
  * for group and group-source query responses. 
  *
  * Unlike MLDv2, the delay per group should be jittered
  * to avoid bursts of MLDv1 reports.
  */
 static void
 mld_v1_update_group(struct in6_multi *inm, const int timer)
 {
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	CTR4(KTR_MLD, "%s: %s/%s timer=%d", __func__,
 	    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    if_name(inm->in6m_ifp), timer);
 
 	IN6_MULTI_LOCK_ASSERT();
 
 	switch (inm->in6m_state) {
 	case MLD_NOT_MEMBER:
 	case MLD_SILENT_MEMBER:
 		break;
 	case MLD_REPORTING_MEMBER:
 		if (inm->in6m_timer != 0 &&
 		    inm->in6m_timer <= timer) {
 			CTR1(KTR_MLD, "%s: REPORTING and timer running, "
 			    "skipping.", __func__);
 			break;
 		}
 		/* FALLTHROUGH */
 	case MLD_SG_QUERY_PENDING_MEMBER:
 	case MLD_G_QUERY_PENDING_MEMBER:
 	case MLD_IDLE_MEMBER:
 	case MLD_LAZY_MEMBER:
 	case MLD_AWAKENING_MEMBER:
 		CTR1(KTR_MLD, "%s: ->REPORTING", __func__);
 		inm->in6m_state = MLD_REPORTING_MEMBER;
 		inm->in6m_timer = MLD_RANDOM_DELAY(timer);
 		V_current_state_timers_running6 = 1;
 		break;
 	case MLD_SLEEPING_MEMBER:
 		CTR1(KTR_MLD, "%s: ->AWAKENING", __func__);
 		inm->in6m_state = MLD_AWAKENING_MEMBER;
 		break;
 	case MLD_LEAVING_MEMBER:
 		break;
 	}
 }
 
 /*
  * Process a received MLDv2 general, group-specific or
  * group-and-source-specific query.
  *
  * Assumes that the query header has been pulled up to sizeof(mldv2_query).
  *
  * Return 0 if successful, otherwise an appropriate error code is returned.
  */
 static int
 mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6,
     struct mbuf *m, const int off, const int icmp6len)
 {
 	struct mld_ifinfo	*mli;
 	struct mldv2_query	*mld;
 	struct in6_multi	*inm;
 	uint32_t		 maxdelay, nsrc, qqi;
 	int			 is_general_query;
 	uint16_t		 timer;
 	uint8_t			 qrv;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	is_general_query = 0;
 
 	/*
 	 * RFC3810 Section 6.2: MLD queries must originate from
 	 * a router's link-local address.
 	 */
 	if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) {
 		CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &ip6->ip6_src),
 		    ifp, if_name(ifp));
 		return (0);
 	}
 
 	CTR2(KTR_MLD, "input v2 query on ifp %p(%s)", ifp, if_name(ifp));
 
 	mld = (struct mldv2_query *)(mtod(m, uint8_t *) + off);
 
 	maxdelay = ntohs(mld->mld_maxdelay);	/* in 1/10ths of a second */
 	if (maxdelay >= 32768) {
 		maxdelay = (MLD_MRC_MANT(maxdelay) | 0x1000) <<
 			   (MLD_MRC_EXP(maxdelay) + 3);
 	}
 	timer = (maxdelay * PR_FASTHZ) / MLD_TIMER_SCALE;
 	if (timer == 0)
 		timer = 1;
 
 	qrv = MLD_QRV(mld->mld_misc);
 	if (qrv < 2) {
 		CTR3(KTR_MLD, "%s: clamping qrv %d to %d", __func__,
 		    qrv, MLD_RV_INIT);
 		qrv = MLD_RV_INIT;
 	}
 
 	qqi = mld->mld_qqi;
 	if (qqi >= 128) {
 		qqi = MLD_QQIC_MANT(mld->mld_qqi) <<
 		     (MLD_QQIC_EXP(mld->mld_qqi) + 3);
 	}
 
 	nsrc = ntohs(mld->mld_numsrc);
 	if (nsrc > MLD_MAX_GS_SOURCES)
 		return (EMSGSIZE);
 	if (icmp6len < sizeof(struct mldv2_query) +
 	    (nsrc * sizeof(struct in6_addr)))
 		return (EMSGSIZE);
 
 	/*
 	 * Do further input validation upfront to avoid resetting timers
 	 * should we need to discard this query.
 	 */
 	if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) {
 		/*
 		 * A general query with a source list has undefined
 		 * behaviour; discard it.
 		 */
 		if (nsrc > 0)
 			return (EINVAL);
 		is_general_query = 1;
 	} else {
 		/*
 		 * Embed scope ID of receiving interface in MLD query for
 		 * lookup whilst we don't hold other locks (due to KAME
 		 * locking lameness). We own this mbuf chain just now.
 		 */
 		in6_setscope(&mld->mld_addr, ifp, NULL);
 	}
 
 	IN6_MULTI_LOCK();
 	MLD_LOCK();
 
 	mli = MLD_IFINFO(ifp);
 	KASSERT(mli != NULL, ("%s: no mld_ifinfo for ifp %p", __func__, ifp));
 
 	/*
 	 * Discard the v2 query if we're in Compatibility Mode.
 	 * The RFC is pretty clear that hosts need to stay in MLDv1 mode
 	 * until the Old Version Querier Present timer expires.
 	 */
 	if (mli->mli_version != MLD_VERSION_2)
 		goto out_locked;
 
 	mld_set_version(mli, MLD_VERSION_2);
 	mli->mli_rv = qrv;
 	mli->mli_qi = qqi;
 	mli->mli_qri = maxdelay;
 
 	CTR4(KTR_MLD, "%s: qrv %d qi %d maxdelay %d", __func__, qrv, qqi,
 	    maxdelay);
 
 	if (is_general_query) {
 		/*
 		 * MLDv2 General Query.
 		 *
 		 * Schedule a current-state report on this ifp for
 		 * all groups, possibly containing source lists.
 		 *
 		 * If there is a pending General Query response
 		 * scheduled earlier than the selected delay, do
 		 * not schedule any other reports.
 		 * Otherwise, reset the interface timer.
 		 */
 		CTR2(KTR_MLD, "process v2 general query on ifp %p(%s)",
 		    ifp, if_name(ifp));
 		if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) {
 			mli->mli_v2_timer = MLD_RANDOM_DELAY(timer);
 			V_interface_timers_running6 = 1;
 		}
 	} else {
 		/*
 		 * MLDv2 Group-specific or Group-and-source-specific Query.
 		 *
 		 * Group-source-specific queries are throttled on
 		 * a per-group basis to defeat denial-of-service attempts.
 		 * Queries for groups we are not a member of on this
 		 * link are simply ignored.
 		 */
 		IF_ADDR_RLOCK(ifp);
 		inm = in6m_lookup_locked(ifp, &mld->mld_addr);
 		if (inm == NULL) {
 			IF_ADDR_RUNLOCK(ifp);
 			goto out_locked;
 		}
 		if (nsrc > 0) {
 			if (!ratecheck(&inm->in6m_lastgsrtv,
 			    &V_mld_gsrdelay)) {
 				CTR1(KTR_MLD, "%s: GS query throttled.",
 				    __func__);
 				IF_ADDR_RUNLOCK(ifp);
 				goto out_locked;
 			}
 		}
 		CTR2(KTR_MLD, "process v2 group query on ifp %p(%s)",
 		     ifp, if_name(ifp));
 		/*
 		 * If there is a pending General Query response
 		 * scheduled sooner than the selected delay, no
 		 * further report need be scheduled.
 		 * Otherwise, prepare to respond to the
 		 * group-specific or group-and-source query.
 		 */
 		if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer)
 			mld_v2_process_group_query(inm, mli, timer, m, off);
 
 		/* XXX Clear embedded scope ID as userland won't expect it. */
 		in6_clearscope(&mld->mld_addr);
 		IF_ADDR_RUNLOCK(ifp);
 	}
 
 out_locked:
 	MLD_UNLOCK();
 	IN6_MULTI_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Process a recieved MLDv2 group-specific or group-and-source-specific
  * query.
  * Return <0 if any error occured. Currently this is ignored.
  */
 static int
 mld_v2_process_group_query(struct in6_multi *inm, struct mld_ifinfo *mli,
     int timer, struct mbuf *m0, const int off)
 {
 	struct mldv2_query	*mld;
 	int			 retval;
 	uint16_t		 nsrc;
 
 	IN6_MULTI_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	retval = 0;
 	mld = (struct mldv2_query *)(mtod(m0, uint8_t *) + off);
 
 	switch (inm->in6m_state) {
 	case MLD_NOT_MEMBER:
 	case MLD_SILENT_MEMBER:
 	case MLD_SLEEPING_MEMBER:
 	case MLD_LAZY_MEMBER:
 	case MLD_AWAKENING_MEMBER:
 	case MLD_IDLE_MEMBER:
 	case MLD_LEAVING_MEMBER:
 		return (retval);
 		break;
 	case MLD_REPORTING_MEMBER:
 	case MLD_G_QUERY_PENDING_MEMBER:
 	case MLD_SG_QUERY_PENDING_MEMBER:
 		break;
 	}
 
 	nsrc = ntohs(mld->mld_numsrc);
 
 	/*
 	 * Deal with group-specific queries upfront.
 	 * If any group query is already pending, purge any recorded
 	 * source-list state if it exists, and schedule a query response
 	 * for this group-specific query.
 	 */
 	if (nsrc == 0) {
 		if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER ||
 		    inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) {
 			in6m_clear_recorded(inm);
 			timer = min(inm->in6m_timer, timer);
 		}
 		inm->in6m_state = MLD_G_QUERY_PENDING_MEMBER;
 		inm->in6m_timer = MLD_RANDOM_DELAY(timer);
 		V_current_state_timers_running6 = 1;
 		return (retval);
 	}
 
 	/*
 	 * Deal with the case where a group-and-source-specific query has
 	 * been received but a group-specific query is already pending.
 	 */
 	if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER) {
 		timer = min(inm->in6m_timer, timer);
 		inm->in6m_timer = MLD_RANDOM_DELAY(timer);
 		V_current_state_timers_running6 = 1;
 		return (retval);
 	}
 
 	/*
 	 * Finally, deal with the case where a group-and-source-specific
 	 * query has been received, where a response to a previous g-s-r
 	 * query exists, or none exists.
 	 * In this case, we need to parse the source-list which the Querier
 	 * has provided us with and check if we have any source list filter
 	 * entries at T1 for these sources. If we do not, there is no need
 	 * schedule a report and the query may be dropped.
 	 * If we do, we must record them and schedule a current-state
 	 * report for those sources.
 	 */
 	if (inm->in6m_nsrc > 0) {
 		struct mbuf		*m;
 		uint8_t			*sp;
 		int			 i, nrecorded;
 		int			 soff;
 
 		m = m0;
 		soff = off + sizeof(struct mldv2_query);
 		nrecorded = 0;
 		for (i = 0; i < nsrc; i++) {
 			sp = mtod(m, uint8_t *) + soff;
 			retval = in6m_record_source(inm,
 			    (const struct in6_addr *)sp);
 			if (retval < 0)
 				break;
 			nrecorded += retval;
 			soff += sizeof(struct in6_addr);
 			if (soff >= m->m_len) {
 				soff = soff - m->m_len;
 				m = m->m_next;
 				if (m == NULL)
 					break;
 			}
 		}
 		if (nrecorded > 0) {
 			CTR1(KTR_MLD,
 			    "%s: schedule response to SG query", __func__);
 			inm->in6m_state = MLD_SG_QUERY_PENDING_MEMBER;
 			inm->in6m_timer = MLD_RANDOM_DELAY(timer);
 			V_current_state_timers_running6 = 1;
 		}
 	}
 
 	return (retval);
 }
 
 /*
  * Process a received MLDv1 host membership report.
  * Assumes mld points to mld_hdr in pulled up mbuf chain.
  *
  * NOTE: Can't be fully const correct as we temporarily embed scope ID in
  * mld_addr. This is OK as we own the mbuf chain.
  */
 static int
 mld_v1_input_report(struct ifnet *ifp, const struct ip6_hdr *ip6,
     /*const*/ struct mld_hdr *mld)
 {
 	struct in6_addr		 src, dst;
 	struct in6_ifaddr	*ia;
 	struct in6_multi	*inm;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	if (!mld_v1enable) {
 		CTR3(KTR_MLD, "ignore v1 report %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &mld->mld_addr),
 		    ifp, if_name(ifp));
 		return (0);
 	}
 
 	if (ifp->if_flags & IFF_LOOPBACK)
 		return (0);
 
 	/*
 	 * MLDv1 reports must originate from a host's link-local address,
 	 * or the unspecified address (when booting).
 	 */
 	src = ip6->ip6_src;
 	in6_clearscope(&src);
 	if (!IN6_IS_SCOPE_LINKLOCAL(&src) && !IN6_IS_ADDR_UNSPECIFIED(&src)) {
 		CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &ip6->ip6_src),
 		    ifp, if_name(ifp));
 		return (EINVAL);
 	}
 
 	/*
 	 * RFC2710 Section 4: MLDv1 reports must pertain to a multicast
 	 * group, and must be directed to the group itself.
 	 */
 	dst = ip6->ip6_dst;
 	in6_clearscope(&dst);
 	if (!IN6_IS_ADDR_MULTICAST(&mld->mld_addr) ||
 	    !IN6_ARE_ADDR_EQUAL(&mld->mld_addr, &dst)) {
 		CTR3(KTR_MLD, "ignore v1 query dst %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &ip6->ip6_dst),
 		    ifp, if_name(ifp));
 		return (EINVAL);
 	}
 
 	/*
 	 * Make sure we don't hear our own membership report, as fast
 	 * leave requires knowing that we are the only member of a
 	 * group. Assume we used the link-local address if available,
 	 * otherwise look for ::.
 	 *
 	 * XXX Note that scope ID comparison is needed for the address
 	 * returned by in6ifa_ifpforlinklocal(), but SHOULD NOT be
 	 * performed for the on-wire address.
 	 */
 	ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST);
 	if ((ia && IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, IA6_IN6(ia))) ||
 	    (ia == NULL && IN6_IS_ADDR_UNSPECIFIED(&src))) {
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		return (0);
 	}
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 
 	CTR3(KTR_MLD, "process v1 report %s on ifp %p(%s)",
 	    ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp));
 
 	/*
 	 * Embed scope ID of receiving interface in MLD query for lookup
 	 * whilst we don't hold other locks (due to KAME locking lameness).
 	 */
 	if (!IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr))
 		in6_setscope(&mld->mld_addr, ifp, NULL);
 
 	IN6_MULTI_LOCK();
 	MLD_LOCK();
 	IF_ADDR_RLOCK(ifp);
 
 	/*
 	 * MLDv1 report suppression.
 	 * If we are a member of this group, and our membership should be
 	 * reported, and our group timer is pending or about to be reset,
 	 * stop our group timer by transitioning to the 'lazy' state.
 	 */
 	inm = in6m_lookup_locked(ifp, &mld->mld_addr);
 	if (inm != NULL) {
 		struct mld_ifinfo *mli;
 
 		mli = inm->in6m_mli;
 		KASSERT(mli != NULL,
 		    ("%s: no mli for ifp %p", __func__, ifp));
 
 		/*
 		 * If we are in MLDv2 host mode, do not allow the
 		 * other host's MLDv1 report to suppress our reports.
 		 */
 		if (mli->mli_version == MLD_VERSION_2)
 			goto out_locked;
 
 		inm->in6m_timer = 0;
 
 		switch (inm->in6m_state) {
 		case MLD_NOT_MEMBER:
 		case MLD_SILENT_MEMBER:
 		case MLD_SLEEPING_MEMBER:
 			break;
 		case MLD_REPORTING_MEMBER:
 		case MLD_IDLE_MEMBER:
 		case MLD_AWAKENING_MEMBER:
 			CTR3(KTR_MLD,
 			    "report suppressed for %s on ifp %p(%s)",
 			    ip6_sprintf(ip6tbuf, &mld->mld_addr),
 			    ifp, if_name(ifp));
 		case MLD_LAZY_MEMBER:
 			inm->in6m_state = MLD_LAZY_MEMBER;
 			break;
 		case MLD_G_QUERY_PENDING_MEMBER:
 		case MLD_SG_QUERY_PENDING_MEMBER:
 		case MLD_LEAVING_MEMBER:
 			break;
 		}
 	}
 
 out_locked:
 	IF_ADDR_RUNLOCK(ifp);
 	MLD_UNLOCK();
 	IN6_MULTI_UNLOCK();
 
 	/* XXX Clear embedded scope ID as userland won't expect it. */
 	in6_clearscope(&mld->mld_addr);
 
 	return (0);
 }
 
 /*
  * MLD input path.
  *
  * Assume query messages which fit in a single ICMPv6 message header
  * have been pulled up.
  * Assume that userland will want to see the message, even if it
  * otherwise fails kernel input validation; do not free it.
  * Pullup may however free the mbuf chain m if it fails.
  *
  * Return IPPROTO_DONE if we freed m. Otherwise, return 0.
  */
 int
 mld_input(struct mbuf *m, int off, int icmp6len)
 {
 	struct ifnet	*ifp;
 	struct ip6_hdr	*ip6;
 	struct mld_hdr	*mld;
 	int		 mldlen;
 
 	CTR3(KTR_MLD, "%s: called w/mbuf (%p,%d)", __func__, m, off);
 
 	ifp = m->m_pkthdr.rcvif;
 
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/* Pullup to appropriate size. */
 	mld = (struct mld_hdr *)(mtod(m, uint8_t *) + off);
 	if (mld->mld_type == MLD_LISTENER_QUERY &&
 	    icmp6len >= sizeof(struct mldv2_query)) {
 		mldlen = sizeof(struct mldv2_query);
 	} else {
 		mldlen = sizeof(struct mld_hdr);
 	}
 	IP6_EXTHDR_GET(mld, struct mld_hdr *, m, off, mldlen);
 	if (mld == NULL) {
 		ICMP6STAT_INC(icp6s_badlen);
 		return (IPPROTO_DONE);
 	}
 
 	/*
 	 * Userland needs to see all of this traffic for implementing
 	 * the endpoint discovery portion of multicast routing.
 	 */
 	switch (mld->mld_type) {
 	case MLD_LISTENER_QUERY:
 		icmp6_ifstat_inc(ifp, ifs6_in_mldquery);
 		if (icmp6len == sizeof(struct mld_hdr)) {
 			if (mld_v1_input_query(ifp, ip6, mld) != 0)
 				return (0);
 		} else if (icmp6len >= sizeof(struct mldv2_query)) {
 			if (mld_v2_input_query(ifp, ip6, m, off,
 			    icmp6len) != 0)
 				return (0);
 		}
 		break;
 	case MLD_LISTENER_REPORT:
 		icmp6_ifstat_inc(ifp, ifs6_in_mldreport);
 		if (mld_v1_input_report(ifp, ip6, mld) != 0)
 			return (0);
 		break;
 	case MLDV2_LISTENER_REPORT:
 		icmp6_ifstat_inc(ifp, ifs6_in_mldreport);
 		break;
 	case MLD_LISTENER_DONE:
 		icmp6_ifstat_inc(ifp, ifs6_in_mlddone);
 		break;
 	default:
 		break;
 	}
 
 	return (0);
 }
 
 /*
  * Fast timeout handler (global).
  * VIMAGE: Timeout handlers are expected to service all vimages.
  */
 void
 mld_fasttimo(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		mld_fasttimo_vnet();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Fast timeout handler (per-vnet).
  *
  * VIMAGE: Assume caller has set up our curvnet.
  */
 static void
 mld_fasttimo_vnet(void)
 {
 	struct ifqueue		 scq;	/* State-change packets */
 	struct ifqueue		 qrq;	/* Query response packets */
 	struct ifnet		*ifp;
 	struct mld_ifinfo	*mli;
 	struct ifmultiaddr	*ifma;
 	struct in6_multi	*inm, *tinm;
 	int			 uri_fasthz;
 
 	uri_fasthz = 0;
 
 	/*
 	 * Quick check to see if any work needs to be done, in order to
 	 * minimize the overhead of fasttimo processing.
 	 * SMPng: XXX Unlocked reads.
 	 */
 	if (!V_current_state_timers_running6 &&
 	    !V_interface_timers_running6 &&
 	    !V_state_change_timers_running6)
 		return;
 
 	IN6_MULTI_LOCK();
 	MLD_LOCK();
 
 	/*
 	 * MLDv2 General Query response timer processing.
 	 */
 	if (V_interface_timers_running6) {
 		CTR1(KTR_MLD, "%s: interface timers running", __func__);
 
 		V_interface_timers_running6 = 0;
 		LIST_FOREACH(mli, &V_mli_head, mli_link) {
 			if (mli->mli_v2_timer == 0) {
 				/* Do nothing. */
 			} else if (--mli->mli_v2_timer == 0) {
 				mld_v2_dispatch_general_query(mli);
 			} else {
 				V_interface_timers_running6 = 1;
 			}
 		}
 	}
 
 	if (!V_current_state_timers_running6 &&
 	    !V_state_change_timers_running6)
 		goto out_locked;
 
 	V_current_state_timers_running6 = 0;
 	V_state_change_timers_running6 = 0;
 
 	CTR1(KTR_MLD, "%s: state change timers running", __func__);
 
 	/*
 	 * MLD host report and state-change timer processing.
 	 * Note: Processing a v2 group timer may remove a node.
 	 */
 	LIST_FOREACH(mli, &V_mli_head, mli_link) {
 		ifp = mli->mli_ifp;
 
 		if (mli->mli_version == MLD_VERSION_2) {
 			uri_fasthz = MLD_RANDOM_DELAY(mli->mli_uri *
 			    PR_FASTHZ);
 
 			memset(&qrq, 0, sizeof(struct ifqueue));
 			IFQ_SET_MAXLEN(&qrq, MLD_MAX_G_GS_PACKETS);
 
 			memset(&scq, 0, sizeof(struct ifqueue));
 			IFQ_SET_MAXLEN(&scq, MLD_MAX_STATE_CHANGE_PACKETS);
 		}
 
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (ifma->ifma_addr->sa_family != AF_INET6 ||
 			    ifma->ifma_protospec == NULL)
 				continue;
 			inm = (struct in6_multi *)ifma->ifma_protospec;
 			switch (mli->mli_version) {
 			case MLD_VERSION_1:
 				mld_v1_process_group_timer(mli, inm);
 				break;
 			case MLD_VERSION_2:
 				mld_v2_process_group_timers(mli, &qrq,
 				    &scq, inm, uri_fasthz);
 				break;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 
 		switch (mli->mli_version) {
 		case MLD_VERSION_1:
 			/*
 			 * Transmit reports for this lifecycle.  This
 			 * is done while not holding IF_ADDR_LOCK
 			 * since this can call
 			 * in6ifa_ifpforlinklocal() which locks
 			 * IF_ADDR_LOCK internally as well as
 			 * ip6_output() to transmit a packet.
 			 */
 			SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead,
 			    in6m_nrele, tinm) {
 				SLIST_REMOVE_HEAD(&mli->mli_relinmhead,
 				    in6m_nrele);
 				(void)mld_v1_transmit_report(inm,
 				    MLD_LISTENER_REPORT);
 			}
 			break;
 		case MLD_VERSION_2:
 			mld_dispatch_queue(&qrq, 0);
 			mld_dispatch_queue(&scq, 0);
 
 			/*
 			 * Free the in_multi reference(s) for
 			 * this lifecycle.
 			 */
 			SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead,
 			    in6m_nrele, tinm) {
 				SLIST_REMOVE_HEAD(&mli->mli_relinmhead,
 				    in6m_nrele);
 				in6m_release_locked(inm);
 			}
 			break;
 		}
 	}
 
 out_locked:
 	MLD_UNLOCK();
 	IN6_MULTI_UNLOCK();
 }
 
 /*
  * Update host report group timer.
  * Will update the global pending timer flags.
  */
 static void
 mld_v1_process_group_timer(struct mld_ifinfo *mli, struct in6_multi *inm)
 {
 	int report_timer_expired;
 
 	IN6_MULTI_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	if (inm->in6m_timer == 0) {
 		report_timer_expired = 0;
 	} else if (--inm->in6m_timer == 0) {
 		report_timer_expired = 1;
 	} else {
 		V_current_state_timers_running6 = 1;
 		return;
 	}
 
 	switch (inm->in6m_state) {
 	case MLD_NOT_MEMBER:
 	case MLD_SILENT_MEMBER:
 	case MLD_IDLE_MEMBER:
 	case MLD_LAZY_MEMBER:
 	case MLD_SLEEPING_MEMBER:
 	case MLD_AWAKENING_MEMBER:
 		break;
 	case MLD_REPORTING_MEMBER:
 		if (report_timer_expired) {
 			inm->in6m_state = MLD_IDLE_MEMBER;
 			SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm,
 			    in6m_nrele);
 		}
 		break;
 	case MLD_G_QUERY_PENDING_MEMBER:
 	case MLD_SG_QUERY_PENDING_MEMBER:
 	case MLD_LEAVING_MEMBER:
 		break;
 	}
 }
 
 /*
  * Update a group's timers for MLDv2.
  * Will update the global pending timer flags.
  * Note: Unlocked read from mli.
  */
 static void
 mld_v2_process_group_timers(struct mld_ifinfo *mli,
     struct ifqueue *qrq, struct ifqueue *scq,
     struct in6_multi *inm, const int uri_fasthz)
 {
 	int query_response_timer_expired;
 	int state_change_retransmit_timer_expired;
 #ifdef KTR
 	char ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	IN6_MULTI_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	query_response_timer_expired = 0;
 	state_change_retransmit_timer_expired = 0;
 
 	/*
 	 * During a transition from compatibility mode back to MLDv2,
 	 * a group record in REPORTING state may still have its group
 	 * timer active. This is a no-op in this function; it is easier
 	 * to deal with it here than to complicate the slow-timeout path.
 	 */
 	if (inm->in6m_timer == 0) {
 		query_response_timer_expired = 0;
 	} else if (--inm->in6m_timer == 0) {
 		query_response_timer_expired = 1;
 	} else {
 		V_current_state_timers_running6 = 1;
 	}
 
 	if (inm->in6m_sctimer == 0) {
 		state_change_retransmit_timer_expired = 0;
 	} else if (--inm->in6m_sctimer == 0) {
 		state_change_retransmit_timer_expired = 1;
 	} else {
 		V_state_change_timers_running6 = 1;
 	}
 
 	/* We are in fasttimo, so be quick about it. */
 	if (!state_change_retransmit_timer_expired &&
 	    !query_response_timer_expired)
 		return;
 
 	switch (inm->in6m_state) {
 	case MLD_NOT_MEMBER:
 	case MLD_SILENT_MEMBER:
 	case MLD_SLEEPING_MEMBER:
 	case MLD_LAZY_MEMBER:
 	case MLD_AWAKENING_MEMBER:
 	case MLD_IDLE_MEMBER:
 		break;
 	case MLD_G_QUERY_PENDING_MEMBER:
 	case MLD_SG_QUERY_PENDING_MEMBER:
 		/*
 		 * Respond to a previously pending Group-Specific
 		 * or Group-and-Source-Specific query by enqueueing
 		 * the appropriate Current-State report for
 		 * immediate transmission.
 		 */
 		if (query_response_timer_expired) {
 			int retval;
 
 			retval = mld_v2_enqueue_group_record(qrq, inm, 0, 1,
 			    (inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER),
 			    0);
 			CTR2(KTR_MLD, "%s: enqueue record = %d",
 			    __func__, retval);
 			inm->in6m_state = MLD_REPORTING_MEMBER;
 			in6m_clear_recorded(inm);
 		}
 		/* FALLTHROUGH */
 	case MLD_REPORTING_MEMBER:
 	case MLD_LEAVING_MEMBER:
 		if (state_change_retransmit_timer_expired) {
 			/*
 			 * State-change retransmission timer fired.
 			 * If there are any further pending retransmissions,
 			 * set the global pending state-change flag, and
 			 * reset the timer.
 			 */
 			if (--inm->in6m_scrv > 0) {
 				inm->in6m_sctimer = uri_fasthz;
 				V_state_change_timers_running6 = 1;
 			}
 			/*
 			 * Retransmit the previously computed state-change
 			 * report. If there are no further pending
 			 * retransmissions, the mbuf queue will be consumed.
 			 * Update T0 state to T1 as we have now sent
 			 * a state-change.
 			 */
 			(void)mld_v2_merge_state_changes(inm, scq);
 
 			in6m_commit(inm);
 			CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__,
 			    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 			    if_name(inm->in6m_ifp));
 
 			/*
 			 * If we are leaving the group for good, make sure
 			 * we release MLD's reference to it.
 			 * This release must be deferred using a SLIST,
 			 * as we are called from a loop which traverses
 			 * the in_ifmultiaddr TAILQ.
 			 */
 			if (inm->in6m_state == MLD_LEAVING_MEMBER &&
 			    inm->in6m_scrv == 0) {
 				inm->in6m_state = MLD_NOT_MEMBER;
 				SLIST_INSERT_HEAD(&mli->mli_relinmhead,
 				    inm, in6m_nrele);
 			}
 		}
 		break;
 	}
 }
 
 /*
  * Switch to a different version on the given interface,
  * as per Section 9.12.
  */
 static void
 mld_set_version(struct mld_ifinfo *mli, const int version)
 {
 	int old_version_timer;
 
 	MLD_LOCK_ASSERT();
 
 	CTR4(KTR_MLD, "%s: switching to v%d on ifp %p(%s)", __func__,
 	    version, mli->mli_ifp, if_name(mli->mli_ifp));
 
 	if (version == MLD_VERSION_1) {
 		/*
 		 * Compute the "Older Version Querier Present" timer as per
 		 * Section 9.12.
 		 */
 		old_version_timer = (mli->mli_rv * mli->mli_qi) + mli->mli_qri;
 		old_version_timer *= PR_SLOWHZ;
 		mli->mli_v1_timer = old_version_timer;
 	}
 
 	if (mli->mli_v1_timer > 0 && mli->mli_version != MLD_VERSION_1) {
 		mli->mli_version = MLD_VERSION_1;
 		mld_v2_cancel_link_timers(mli);
 	}
 }
 
 /*
  * Cancel pending MLDv2 timers for the given link and all groups
  * joined on it; state-change, general-query, and group-query timers.
  */
 static void
 mld_v2_cancel_link_timers(struct mld_ifinfo *mli)
 {
 	struct ifmultiaddr	*ifma;
 	struct ifnet		*ifp;
 	struct in6_multi	*inm, *tinm;
 
 	CTR3(KTR_MLD, "%s: cancel v2 timers on ifp %p(%s)", __func__,
 	    mli->mli_ifp, if_name(mli->mli_ifp));
 
 	IN6_MULTI_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	/*
 	 * Fast-track this potentially expensive operation
 	 * by checking all the global 'timer pending' flags.
 	 */
 	if (!V_interface_timers_running6 &&
 	    !V_state_change_timers_running6 &&
 	    !V_current_state_timers_running6)
 		return;
 
 	mli->mli_v2_timer = 0;
 
 	ifp = mli->mli_ifp;
 
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_INET6)
 			continue;
 		inm = (struct in6_multi *)ifma->ifma_protospec;
 		switch (inm->in6m_state) {
 		case MLD_NOT_MEMBER:
 		case MLD_SILENT_MEMBER:
 		case MLD_IDLE_MEMBER:
 		case MLD_LAZY_MEMBER:
 		case MLD_SLEEPING_MEMBER:
 		case MLD_AWAKENING_MEMBER:
 			break;
 		case MLD_LEAVING_MEMBER:
 			/*
 			 * If we are leaving the group and switching
 			 * version, we need to release the final
 			 * reference held for issuing the INCLUDE {}.
 			 */
 			SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm,
 			    in6m_nrele);
 			/* FALLTHROUGH */
 		case MLD_G_QUERY_PENDING_MEMBER:
 		case MLD_SG_QUERY_PENDING_MEMBER:
 			in6m_clear_recorded(inm);
 			/* FALLTHROUGH */
 		case MLD_REPORTING_MEMBER:
 			inm->in6m_sctimer = 0;
 			inm->in6m_timer = 0;
 			inm->in6m_state = MLD_REPORTING_MEMBER;
 			/*
 			 * Free any pending MLDv2 state-change records.
 			 */
 			_IF_DRAIN(&inm->in6m_scq);
 			break;
 		}
 	}
 	IF_ADDR_RUNLOCK(ifp);
 	SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, in6m_nrele, tinm) {
 		SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele);
 		in6m_release_locked(inm);
 	}
 }
 
 /*
  * Global slowtimo handler.
  * VIMAGE: Timeout handlers are expected to service all vimages.
  */
 void
 mld_slowtimo(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		mld_slowtimo_vnet();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Per-vnet slowtimo handler.
  */
 static void
 mld_slowtimo_vnet(void)
 {
 	struct mld_ifinfo *mli;
 
 	MLD_LOCK();
 
 	LIST_FOREACH(mli, &V_mli_head, mli_link) {
 		mld_v1_process_querier_timers(mli);
 	}
 
 	MLD_UNLOCK();
 }
 
 /*
  * Update the Older Version Querier Present timers for a link.
  * See Section 9.12 of RFC 3810.
  */
 static void
 mld_v1_process_querier_timers(struct mld_ifinfo *mli)
 {
 
 	MLD_LOCK_ASSERT();
 
 	if (mli->mli_version != MLD_VERSION_2 && --mli->mli_v1_timer == 0) {
 		/*
 		 * MLDv1 Querier Present timer expired; revert to MLDv2.
 		 */
 		CTR5(KTR_MLD,
 		    "%s: transition from v%d -> v%d on %p(%s)",
 		    __func__, mli->mli_version, MLD_VERSION_2,
 		    mli->mli_ifp, if_name(mli->mli_ifp));
 		mli->mli_version = MLD_VERSION_2;
 	}
 }
 
 /*
  * Transmit an MLDv1 report immediately.
  */
 static int
 mld_v1_transmit_report(struct in6_multi *in6m, const int type)
 {
 	struct ifnet		*ifp;
 	struct in6_ifaddr	*ia;
 	struct ip6_hdr		*ip6;
 	struct mbuf		*mh, *md;
 	struct mld_hdr		*mld;
 
 	IN6_MULTI_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	ifp = in6m->in6m_ifp;
 	ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST);
 	/* ia may be NULL if link-local address is tentative. */
 
 	mh = m_gethdr(M_NOWAIT, MT_DATA);
 	if (mh == NULL) {
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		return (ENOMEM);
 	}
 	md = m_get(M_NOWAIT, MT_DATA);
 	if (md == NULL) {
 		m_free(mh);
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		return (ENOMEM);
 	}
 	mh->m_next = md;
 
 	/*
 	 * FUTURE: Consider increasing alignment by ETHER_HDR_LEN, so
 	 * that ether_output() does not need to allocate another mbuf
 	 * for the header in the most common case.
 	 */
 	MH_ALIGN(mh, sizeof(struct ip6_hdr));
 	mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr);
 	mh->m_len = sizeof(struct ip6_hdr);
 
 	ip6 = mtod(mh, struct ip6_hdr *);
 	ip6->ip6_flow = 0;
 	ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
 	ip6->ip6_vfc |= IPV6_VERSION;
 	ip6->ip6_nxt = IPPROTO_ICMPV6;
 	ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any;
 	ip6->ip6_dst = in6m->in6m_addr;
 
 	md->m_len = sizeof(struct mld_hdr);
 	mld = mtod(md, struct mld_hdr *);
 	mld->mld_type = type;
 	mld->mld_code = 0;
 	mld->mld_cksum = 0;
 	mld->mld_maxdelay = 0;
 	mld->mld_reserved = 0;
 	mld->mld_addr = in6m->in6m_addr;
 	in6_clearscope(&mld->mld_addr);
 	mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6,
 	    sizeof(struct ip6_hdr), sizeof(struct mld_hdr));
 
 	mld_save_context(mh, ifp);
 	mh->m_flags |= M_MLDV1;
 
 	mld_dispatch_packet(mh);
 
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 	return (0);
 }
 
 /*
  * Process a state change from the upper layer for the given IPv6 group.
  *
  * Each socket holds a reference on the in_multi in its own ip_moptions.
  * The socket layer will have made the necessary updates to.the group
  * state, it is now up to MLD to issue a state change report if there
  * has been any change between T0 (when the last state-change was issued)
  * and T1 (now).
  *
  * We use the MLDv2 state machine at group level. The MLd module
  * however makes the decision as to which MLD protocol version to speak.
  * A state change *from* INCLUDE {} always means an initial join.
  * A state change *to* INCLUDE {} always means a final leave.
  *
  * If delay is non-zero, and the state change is an initial multicast
  * join, the state change report will be delayed by 'delay' ticks
  * in units of PR_FASTHZ if MLDv1 is active on the link; otherwise
  * the initial MLDv2 state change report will be delayed by whichever
  * is sooner, a pending state-change timer or delay itself.
  *
  * VIMAGE: curvnet should have been set by caller, as this routine
  * is called from the socket option handlers.
  */
 int
 mld_change_state(struct in6_multi *inm, const int delay)
 {
 	struct mld_ifinfo *mli;
 	struct ifnet *ifp;
 	int error;
 
 	IN6_MULTI_LOCK_ASSERT();
 
 	error = 0;
 
 	/*
 	 * Try to detect if the upper layer just asked us to change state
 	 * for an interface which has now gone away.
 	 */
 	KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__));
 	ifp = inm->in6m_ifma->ifma_ifp;
 	if (ifp != NULL) {
 		/*
 		 * Sanity check that netinet6's notion of ifp is the
 		 * same as net's.
 		 */
 		KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__));
 	}
 
 	MLD_LOCK();
 
 	mli = MLD_IFINFO(ifp);
 	KASSERT(mli != NULL, ("%s: no mld_ifinfo for ifp %p", __func__, ifp));
 
 	/*
 	 * If we detect a state transition to or from MCAST_UNDEFINED
 	 * for this group, then we are starting or finishing an MLD
 	 * life cycle for this group.
 	 */
 	if (inm->in6m_st[1].iss_fmode != inm->in6m_st[0].iss_fmode) {
 		CTR3(KTR_MLD, "%s: inm transition %d -> %d", __func__,
 		    inm->in6m_st[0].iss_fmode, inm->in6m_st[1].iss_fmode);
 		if (inm->in6m_st[0].iss_fmode == MCAST_UNDEFINED) {
 			CTR1(KTR_MLD, "%s: initial join", __func__);
 			error = mld_initial_join(inm, mli, delay);
 			goto out_locked;
 		} else if (inm->in6m_st[1].iss_fmode == MCAST_UNDEFINED) {
 			CTR1(KTR_MLD, "%s: final leave", __func__);
 			mld_final_leave(inm, mli);
 			goto out_locked;
 		}
 	} else {
 		CTR1(KTR_MLD, "%s: filter set change", __func__);
 	}
 
 	error = mld_handle_state_change(inm, mli);
 
 out_locked:
 	MLD_UNLOCK();
 	return (error);
 }
 
 /*
  * Perform the initial join for an MLD group.
  *
  * When joining a group:
  *  If the group should have its MLD traffic suppressed, do nothing.
  *  MLDv1 starts sending MLDv1 host membership reports.
  *  MLDv2 will schedule an MLDv2 state-change report containing the
  *  initial state of the membership.
  *
  * If the delay argument is non-zero, then we must delay sending the
  * initial state change for delay ticks (in units of PR_FASTHZ).
  */
 static int
 mld_initial_join(struct in6_multi *inm, struct mld_ifinfo *mli,
     const int delay)
 {
 	struct ifnet		*ifp;
 	struct ifqueue		*ifq;
 	int			 error, retval, syncstates;
 	int			 odelay;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	CTR4(KTR_MLD, "%s: initial join %s on ifp %p(%s)",
 	    __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    inm->in6m_ifp, if_name(inm->in6m_ifp));
 
 	error = 0;
 	syncstates = 1;
 
 	ifp = inm->in6m_ifp;
 
 	IN6_MULTI_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	KASSERT(mli && mli->mli_ifp == ifp, ("%s: inconsistent ifp", __func__));
 
 	/*
 	 * Groups joined on loopback or marked as 'not reported',
 	 * enter the MLD_SILENT_MEMBER state and
 	 * are never reported in any protocol exchanges.
 	 * All other groups enter the appropriate state machine
 	 * for the version in use on this link.
 	 * A link marked as MLIF_SILENT causes MLD to be completely
 	 * disabled for the link.
 	 */
 	if ((ifp->if_flags & IFF_LOOPBACK) ||
 	    (mli->mli_flags & MLIF_SILENT) ||
 	    !mld_is_addr_reported(&inm->in6m_addr)) {
 		CTR1(KTR_MLD,
 "%s: not kicking state machine for silent group", __func__);
 		inm->in6m_state = MLD_SILENT_MEMBER;
 		inm->in6m_timer = 0;
 	} else {
 		/*
 		 * Deal with overlapping in_multi lifecycle.
 		 * If this group was LEAVING, then make sure
 		 * we drop the reference we picked up to keep the
 		 * group around for the final INCLUDE {} enqueue.
 		 */
 		if (mli->mli_version == MLD_VERSION_2 &&
 		    inm->in6m_state == MLD_LEAVING_MEMBER)
 			in6m_release_locked(inm);
 
 		inm->in6m_state = MLD_REPORTING_MEMBER;
 
 		switch (mli->mli_version) {
 		case MLD_VERSION_1:
 			/*
 			 * If a delay was provided, only use it if
 			 * it is greater than the delay normally
 			 * used for an MLDv1 state change report,
 			 * and delay sending the initial MLDv1 report
 			 * by not transitioning to the IDLE state.
 			 */
 			odelay = MLD_RANDOM_DELAY(MLD_V1_MAX_RI * PR_FASTHZ);
 			if (delay) {
 				inm->in6m_timer = max(delay, odelay);
 				V_current_state_timers_running6 = 1;
 			} else {
 				inm->in6m_state = MLD_IDLE_MEMBER;
 				error = mld_v1_transmit_report(inm,
 				     MLD_LISTENER_REPORT);
 				if (error == 0) {
 					inm->in6m_timer = odelay;
 					V_current_state_timers_running6 = 1;
 				}
 			}
 			break;
 
 		case MLD_VERSION_2:
 			/*
 			 * Defer update of T0 to T1, until the first copy
 			 * of the state change has been transmitted.
 			 */
 			syncstates = 0;
 
 			/*
 			 * Immediately enqueue a State-Change Report for
 			 * this interface, freeing any previous reports.
 			 * Don't kick the timers if there is nothing to do,
 			 * or if an error occurred.
 			 */
 			ifq = &inm->in6m_scq;
 			_IF_DRAIN(ifq);
 			retval = mld_v2_enqueue_group_record(ifq, inm, 1,
 			    0, 0, (mli->mli_flags & MLIF_USEALLOW));
 			CTR2(KTR_MLD, "%s: enqueue record = %d",
 			    __func__, retval);
 			if (retval <= 0) {
 				error = retval * -1;
 				break;
 			}
 
 			/*
 			 * Schedule transmission of pending state-change
 			 * report up to RV times for this link. The timer
 			 * will fire at the next mld_fasttimo (~200ms),
 			 * giving us an opportunity to merge the reports.
 			 *
 			 * If a delay was provided to this function, only
 			 * use this delay if sooner than the existing one.
 			 */
 			KASSERT(mli->mli_rv > 1,
 			   ("%s: invalid robustness %d", __func__,
 			    mli->mli_rv));
 			inm->in6m_scrv = mli->mli_rv;
 			if (delay) {
 				if (inm->in6m_sctimer > 1) {
 					inm->in6m_sctimer =
 					    min(inm->in6m_sctimer, delay);
 				} else
 					inm->in6m_sctimer = delay;
 			} else
 				inm->in6m_sctimer = 1;
 			V_state_change_timers_running6 = 1;
 
 			error = 0;
 			break;
 		}
 	}
 
 	/*
 	 * Only update the T0 state if state change is atomic,
 	 * i.e. we don't need to wait for a timer to fire before we
 	 * can consider the state change to have been communicated.
 	 */
 	if (syncstates) {
 		in6m_commit(inm);
 		CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__,
 		    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 		    if_name(inm->in6m_ifp));
 	}
 
 	return (error);
 }
 
 /*
  * Issue an intermediate state change during the life-cycle.
  */
 static int
 mld_handle_state_change(struct in6_multi *inm, struct mld_ifinfo *mli)
 {
 	struct ifnet		*ifp;
 	int			 retval;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	CTR4(KTR_MLD, "%s: state change for %s on ifp %p(%s)",
 	    __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    inm->in6m_ifp, if_name(inm->in6m_ifp));
 
 	ifp = inm->in6m_ifp;
 
 	IN6_MULTI_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	KASSERT(mli && mli->mli_ifp == ifp,
 	    ("%s: inconsistent ifp", __func__));
 
 	if ((ifp->if_flags & IFF_LOOPBACK) ||
 	    (mli->mli_flags & MLIF_SILENT) ||
 	    !mld_is_addr_reported(&inm->in6m_addr) ||
 	    (mli->mli_version != MLD_VERSION_2)) {
 		if (!mld_is_addr_reported(&inm->in6m_addr)) {
 			CTR1(KTR_MLD,
 "%s: not kicking state machine for silent group", __func__);
 		}
 		CTR1(KTR_MLD, "%s: nothing to do", __func__);
 		in6m_commit(inm);
 		CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__,
 		    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 		    if_name(inm->in6m_ifp));
 		return (0);
 	}
 
 	_IF_DRAIN(&inm->in6m_scq);
 
 	retval = mld_v2_enqueue_group_record(&inm->in6m_scq, inm, 1, 0, 0,
 	    (mli->mli_flags & MLIF_USEALLOW));
 	CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval);
 	if (retval <= 0)
 		return (-retval);
 
 	/*
 	 * If record(s) were enqueued, start the state-change
 	 * report timer for this group.
 	 */
 	inm->in6m_scrv = mli->mli_rv;
 	inm->in6m_sctimer = 1;
 	V_state_change_timers_running6 = 1;
 
 	return (0);
 }
 
 /*
  * Perform the final leave for a multicast address.
  *
  * When leaving a group:
  *  MLDv1 sends a DONE message, if and only if we are the reporter.
  *  MLDv2 enqueues a state-change report containing a transition
  *  to INCLUDE {} for immediate transmission.
  */
 static void
 mld_final_leave(struct in6_multi *inm, struct mld_ifinfo *mli)
 {
 	int syncstates;
 #ifdef KTR
 	char ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	syncstates = 1;
 
 	CTR4(KTR_MLD, "%s: final leave %s on ifp %p(%s)",
 	    __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    inm->in6m_ifp, if_name(inm->in6m_ifp));
 
 	IN6_MULTI_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	switch (inm->in6m_state) {
 	case MLD_NOT_MEMBER:
 	case MLD_SILENT_MEMBER:
 	case MLD_LEAVING_MEMBER:
 		/* Already leaving or left; do nothing. */
 		CTR1(KTR_MLD,
 "%s: not kicking state machine for silent group", __func__);
 		break;
 	case MLD_REPORTING_MEMBER:
 	case MLD_IDLE_MEMBER:
 	case MLD_G_QUERY_PENDING_MEMBER:
 	case MLD_SG_QUERY_PENDING_MEMBER:
 		if (mli->mli_version == MLD_VERSION_1) {
 #ifdef INVARIANTS
 			if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER ||
 			    inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER)
 			panic("%s: MLDv2 state reached, not MLDv2 mode",
 			     __func__);
 #endif
 			mld_v1_transmit_report(inm, MLD_LISTENER_DONE);
 			inm->in6m_state = MLD_NOT_MEMBER;
 			V_current_state_timers_running6 = 1;
 		} else if (mli->mli_version == MLD_VERSION_2) {
 			/*
 			 * Stop group timer and all pending reports.
 			 * Immediately enqueue a state-change report
 			 * TO_IN {} to be sent on the next fast timeout,
 			 * giving us an opportunity to merge reports.
 			 */
 			_IF_DRAIN(&inm->in6m_scq);
 			inm->in6m_timer = 0;
 			inm->in6m_scrv = mli->mli_rv;
 			CTR4(KTR_MLD, "%s: Leaving %s/%s with %d "
 			    "pending retransmissions.", __func__,
 			    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 			    if_name(inm->in6m_ifp), inm->in6m_scrv);
 			if (inm->in6m_scrv == 0) {
 				inm->in6m_state = MLD_NOT_MEMBER;
 				inm->in6m_sctimer = 0;
 			} else {
 				int retval;
 
 				in6m_acquire_locked(inm);
 
 				retval = mld_v2_enqueue_group_record(
 				    &inm->in6m_scq, inm, 1, 0, 0,
 				    (mli->mli_flags & MLIF_USEALLOW));
 				KASSERT(retval != 0,
 				    ("%s: enqueue record = %d", __func__,
 				     retval));
 
 				inm->in6m_state = MLD_LEAVING_MEMBER;
 				inm->in6m_sctimer = 1;
 				V_state_change_timers_running6 = 1;
 				syncstates = 0;
 			}
 			break;
 		}
 		break;
 	case MLD_LAZY_MEMBER:
 	case MLD_SLEEPING_MEMBER:
 	case MLD_AWAKENING_MEMBER:
 		/* Our reports are suppressed; do nothing. */
 		break;
 	}
 
 	if (syncstates) {
 		in6m_commit(inm);
 		CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__,
 		    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 		    if_name(inm->in6m_ifp));
 		inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED;
 		CTR3(KTR_MLD, "%s: T1 now MCAST_UNDEFINED for %p/%s",
 		    __func__, &inm->in6m_addr, if_name(inm->in6m_ifp));
 	}
 }
 
 /*
  * Enqueue an MLDv2 group record to the given output queue.
  *
  * If is_state_change is zero, a current-state record is appended.
  * If is_state_change is non-zero, a state-change report is appended.
  *
  * If is_group_query is non-zero, an mbuf packet chain is allocated.
  * If is_group_query is zero, and if there is a packet with free space
  * at the tail of the queue, it will be appended to providing there
  * is enough free space.
  * Otherwise a new mbuf packet chain is allocated.
  *
  * If is_source_query is non-zero, each source is checked to see if
  * it was recorded for a Group-Source query, and will be omitted if
  * it is not both in-mode and recorded.
  *
  * If use_block_allow is non-zero, state change reports for initial join
  * and final leave, on an inclusive mode group with a source list, will be
  * rewritten to use the ALLOW_NEW and BLOCK_OLD record types, respectively.
  *
  * The function will attempt to allocate leading space in the packet
  * for the IPv6+ICMP headers to be prepended without fragmenting the chain.
  *
  * If successful the size of all data appended to the queue is returned,
  * otherwise an error code less than zero is returned, or zero if
  * no record(s) were appended.
  */
 static int
 mld_v2_enqueue_group_record(struct ifqueue *ifq, struct in6_multi *inm,
     const int is_state_change, const int is_group_query,
     const int is_source_query, const int use_block_allow)
 {
 	struct mldv2_record	 mr;
 	struct mldv2_record	*pmr;
 	struct ifnet		*ifp;
 	struct ip6_msource	*ims, *nims;
 	struct mbuf		*m0, *m, *md;
 	int			 error, is_filter_list_change;
 	int			 minrec0len, m0srcs, msrcs, nbytes, off;
 	int			 record_has_sources;
 	int			 now;
 	int			 type;
 	uint8_t			 mode;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	IN6_MULTI_LOCK_ASSERT();
 
 	error = 0;
 	ifp = inm->in6m_ifp;
 	is_filter_list_change = 0;
 	m = NULL;
 	m0 = NULL;
 	m0srcs = 0;
 	msrcs = 0;
 	nbytes = 0;
 	nims = NULL;
 	record_has_sources = 1;
 	pmr = NULL;
 	type = MLD_DO_NOTHING;
 	mode = inm->in6m_st[1].iss_fmode;
 
 	/*
 	 * If we did not transition out of ASM mode during t0->t1,
 	 * and there are no source nodes to process, we can skip
 	 * the generation of source records.
 	 */
 	if (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0 &&
 	    inm->in6m_nsrc == 0)
 		record_has_sources = 0;
 
 	if (is_state_change) {
 		/*
 		 * Queue a state change record.
 		 * If the mode did not change, and there are non-ASM
 		 * listeners or source filters present,
 		 * we potentially need to issue two records for the group.
 		 * If there are ASM listeners, and there was no filter
 		 * mode transition of any kind, do nothing.
 		 *
 		 * If we are transitioning to MCAST_UNDEFINED, we need
 		 * not send any sources. A transition to/from this state is
 		 * considered inclusive with some special treatment.
 		 *
 		 * If we are rewriting initial joins/leaves to use
 		 * ALLOW/BLOCK, and the group's membership is inclusive,
 		 * we need to send sources in all cases.
 		 */
 		if (mode != inm->in6m_st[0].iss_fmode) {
 			if (mode == MCAST_EXCLUDE) {
 				CTR1(KTR_MLD, "%s: change to EXCLUDE",
 				    __func__);
 				type = MLD_CHANGE_TO_EXCLUDE_MODE;
 			} else {
 				CTR1(KTR_MLD, "%s: change to INCLUDE",
 				    __func__);
 				if (use_block_allow) {
 					/*
 					 * XXX
 					 * Here we're interested in state
 					 * edges either direction between
 					 * MCAST_UNDEFINED and MCAST_INCLUDE.
 					 * Perhaps we should just check
 					 * the group state, rather than
 					 * the filter mode.
 					 */
 					if (mode == MCAST_UNDEFINED) {
 						type = MLD_BLOCK_OLD_SOURCES;
 					} else {
 						type = MLD_ALLOW_NEW_SOURCES;
 					}
 				} else {
 					type = MLD_CHANGE_TO_INCLUDE_MODE;
 					if (mode == MCAST_UNDEFINED)
 						record_has_sources = 0;
 				}
 			}
 		} else {
 			if (record_has_sources) {
 				is_filter_list_change = 1;
 			} else {
 				type = MLD_DO_NOTHING;
 			}
 		}
 	} else {
 		/*
 		 * Queue a current state record.
 		 */
 		if (mode == MCAST_EXCLUDE) {
 			type = MLD_MODE_IS_EXCLUDE;
 		} else if (mode == MCAST_INCLUDE) {
 			type = MLD_MODE_IS_INCLUDE;
 			KASSERT(inm->in6m_st[1].iss_asm == 0,
 			    ("%s: inm %p is INCLUDE but ASM count is %d",
 			     __func__, inm, inm->in6m_st[1].iss_asm));
 		}
 	}
 
 	/*
 	 * Generate the filter list changes using a separate function.
 	 */
 	if (is_filter_list_change)
 		return (mld_v2_enqueue_filter_change(ifq, inm));
 
 	if (type == MLD_DO_NOTHING) {
 		CTR3(KTR_MLD, "%s: nothing to do for %s/%s",
 		    __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 		    if_name(inm->in6m_ifp));
 		return (0);
 	}
 
 	/*
 	 * If any sources are present, we must be able to fit at least
 	 * one in the trailing space of the tail packet's mbuf,
 	 * ideally more.
 	 */
 	minrec0len = sizeof(struct mldv2_record);
 	if (record_has_sources)
 		minrec0len += sizeof(struct in6_addr);
 
 	CTR4(KTR_MLD, "%s: queueing %s for %s/%s", __func__,
 	    mld_rec_type_to_str(type),
 	    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    if_name(inm->in6m_ifp));
 
 	/*
 	 * Check if we have a packet in the tail of the queue for this
 	 * group into which the first group record for this group will fit.
 	 * Otherwise allocate a new packet.
 	 * Always allocate leading space for IP6+RA+ICMPV6+REPORT.
 	 * Note: Group records for G/GSR query responses MUST be sent
 	 * in their own packet.
 	 */
 	m0 = ifq->ifq_tail;
 	if (!is_group_query &&
 	    m0 != NULL &&
 	    (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= MLD_V2_REPORT_MAXRECS) &&
 	    (m0->m_pkthdr.len + minrec0len) <
 	     (ifp->if_mtu - MLD_MTUSPACE)) {
 		m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
 			    sizeof(struct mldv2_record)) /
 			    sizeof(struct in6_addr);
 		m = m0;
 		CTR1(KTR_MLD, "%s: use existing packet", __func__);
 	} else {
 		if (_IF_QFULL(ifq)) {
 			CTR1(KTR_MLD, "%s: outbound queue full", __func__);
 			return (-ENOMEM);
 		}
 		m = NULL;
 		m0srcs = (ifp->if_mtu - MLD_MTUSPACE -
 		    sizeof(struct mldv2_record)) / sizeof(struct in6_addr);
 		if (!is_state_change && !is_group_query)
 			m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		if (m == NULL)
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL)
 			return (-ENOMEM);
 
 		mld_save_context(m, ifp);
 
 		CTR1(KTR_MLD, "%s: allocated first packet", __func__);
 	}
 
 	/*
 	 * Append group record.
 	 * If we have sources, we don't know how many yet.
 	 */
 	mr.mr_type = type;
 	mr.mr_datalen = 0;
 	mr.mr_numsrc = 0;
 	mr.mr_addr = inm->in6m_addr;
 	in6_clearscope(&mr.mr_addr);
 	if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) {
 		if (m != m0)
 			m_freem(m);
 		CTR1(KTR_MLD, "%s: m_append() failed.", __func__);
 		return (-ENOMEM);
 	}
 	nbytes += sizeof(struct mldv2_record);
 
 	/*
 	 * Append as many sources as will fit in the first packet.
 	 * If we are appending to a new packet, the chain allocation
 	 * may potentially use clusters; use m_getptr() in this case.
 	 * If we are appending to an existing packet, we need to obtain
 	 * a pointer to the group record after m_append(), in case a new
 	 * mbuf was allocated.
 	 *
 	 * Only append sources which are in-mode at t1. If we are
 	 * transitioning to MCAST_UNDEFINED state on the group, and
 	 * use_block_allow is zero, do not include source entries.
 	 * Otherwise, we need to include this source in the report.
 	 *
 	 * Only report recorded sources in our filter set when responding
 	 * to a group-source query.
 	 */
 	if (record_has_sources) {
 		if (m == m0) {
 			md = m_last(m);
 			pmr = (struct mldv2_record *)(mtod(md, uint8_t *) +
 			    md->m_len - nbytes);
 		} else {
 			md = m_getptr(m, 0, &off);
 			pmr = (struct mldv2_record *)(mtod(md, uint8_t *) +
 			    off);
 		}
 		msrcs = 0;
 		RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs,
 		    nims) {
 			CTR2(KTR_MLD, "%s: visit node %s", __func__,
 			    ip6_sprintf(ip6tbuf, &ims->im6s_addr));
 			now = im6s_get_mode(inm, ims, 1);
 			CTR2(KTR_MLD, "%s: node is %d", __func__, now);
 			if ((now != mode) ||
 			    (now == mode &&
 			     (!use_block_allow && mode == MCAST_UNDEFINED))) {
 				CTR1(KTR_MLD, "%s: skip node", __func__);
 				continue;
 			}
 			if (is_source_query && ims->im6s_stp == 0) {
 				CTR1(KTR_MLD, "%s: skip unrecorded node",
 				    __func__);
 				continue;
 			}
 			CTR1(KTR_MLD, "%s: append node", __func__);
 			if (!m_append(m, sizeof(struct in6_addr),
 			    (void *)&ims->im6s_addr)) {
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_MLD, "%s: m_append() failed.",
 				    __func__);
 				return (-ENOMEM);
 			}
 			nbytes += sizeof(struct in6_addr);
 			++msrcs;
 			if (msrcs == m0srcs)
 				break;
 		}
 		CTR2(KTR_MLD, "%s: msrcs is %d this packet", __func__,
 		    msrcs);
 		pmr->mr_numsrc = htons(msrcs);
 		nbytes += (msrcs * sizeof(struct in6_addr));
 	}
 
 	if (is_source_query && msrcs == 0) {
 		CTR1(KTR_MLD, "%s: no recorded sources to report", __func__);
 		if (m != m0)
 			m_freem(m);
 		return (0);
 	}
 
 	/*
 	 * We are good to go with first packet.
 	 */
 	if (m != m0) {
 		CTR1(KTR_MLD, "%s: enqueueing first packet", __func__);
 		m->m_pkthdr.PH_vt.vt_nrecs = 1;
 		_IF_ENQUEUE(ifq, m);
 	} else
 		m->m_pkthdr.PH_vt.vt_nrecs++;
 
 	/*
 	 * No further work needed if no source list in packet(s).
 	 */
 	if (!record_has_sources)
 		return (nbytes);
 
 	/*
 	 * Whilst sources remain to be announced, we need to allocate
 	 * a new packet and fill out as many sources as will fit.
 	 * Always try for a cluster first.
 	 */
 	while (nims != NULL) {
 		if (_IF_QFULL(ifq)) {
 			CTR1(KTR_MLD, "%s: outbound queue full", __func__);
 			return (-ENOMEM);
 		}
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		if (m == NULL)
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL)
 			return (-ENOMEM);
 		mld_save_context(m, ifp);
 		md = m_getptr(m, 0, &off);
 		pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off);
 		CTR1(KTR_MLD, "%s: allocated next packet", __func__);
 
 		if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) {
 			if (m != m0)
 				m_freem(m);
 			CTR1(KTR_MLD, "%s: m_append() failed.", __func__);
 			return (-ENOMEM);
 		}
 		m->m_pkthdr.PH_vt.vt_nrecs = 1;
 		nbytes += sizeof(struct mldv2_record);
 
 		m0srcs = (ifp->if_mtu - MLD_MTUSPACE -
 		    sizeof(struct mldv2_record)) / sizeof(struct in6_addr);
 
 		msrcs = 0;
 		RB_FOREACH_FROM(ims, ip6_msource_tree, nims) {
 			CTR2(KTR_MLD, "%s: visit node %s",
 			    __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr));
 			now = im6s_get_mode(inm, ims, 1);
 			if ((now != mode) ||
 			    (now == mode &&
 			     (!use_block_allow && mode == MCAST_UNDEFINED))) {
 				CTR1(KTR_MLD, "%s: skip node", __func__);
 				continue;
 			}
 			if (is_source_query && ims->im6s_stp == 0) {
 				CTR1(KTR_MLD, "%s: skip unrecorded node",
 				    __func__);
 				continue;
 			}
 			CTR1(KTR_MLD, "%s: append node", __func__);
 			if (!m_append(m, sizeof(struct in6_addr),
 			    (void *)&ims->im6s_addr)) {
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_MLD, "%s: m_append() failed.",
 				    __func__);
 				return (-ENOMEM);
 			}
 			++msrcs;
 			if (msrcs == m0srcs)
 				break;
 		}
 		pmr->mr_numsrc = htons(msrcs);
 		nbytes += (msrcs * sizeof(struct in6_addr));
 
 		CTR1(KTR_MLD, "%s: enqueueing next packet", __func__);
 		_IF_ENQUEUE(ifq, m);
 	}
 
 	return (nbytes);
 }
 
 /*
  * Type used to mark record pass completion.
  * We exploit the fact we can cast to this easily from the
  * current filter modes on each ip_msource node.
  */
 typedef enum {
 	REC_NONE = 0x00,	/* MCAST_UNDEFINED */
 	REC_ALLOW = 0x01,	/* MCAST_INCLUDE */
 	REC_BLOCK = 0x02,	/* MCAST_EXCLUDE */
 	REC_FULL = REC_ALLOW | REC_BLOCK
 } rectype_t;
 
 /*
  * Enqueue an MLDv2 filter list change to the given output queue.
  *
  * Source list filter state is held in an RB-tree. When the filter list
  * for a group is changed without changing its mode, we need to compute
  * the deltas between T0 and T1 for each source in the filter set,
  * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records.
  *
  * As we may potentially queue two record types, and the entire R-B tree
  * needs to be walked at once, we break this out into its own function
  * so we can generate a tightly packed queue of packets.
  *
  * XXX This could be written to only use one tree walk, although that makes
  * serializing into the mbuf chains a bit harder. For now we do two walks
  * which makes things easier on us, and it may or may not be harder on
  * the L2 cache.
  *
  * If successful the size of all data appended to the queue is returned,
  * otherwise an error code less than zero is returned, or zero if
  * no record(s) were appended.
  */
 static int
 mld_v2_enqueue_filter_change(struct ifqueue *ifq, struct in6_multi *inm)
 {
 	static const int MINRECLEN =
 	    sizeof(struct mldv2_record) + sizeof(struct in6_addr);
 	struct ifnet		*ifp;
 	struct mldv2_record	 mr;
 	struct mldv2_record	*pmr;
 	struct ip6_msource	*ims, *nims;
 	struct mbuf		*m, *m0, *md;
 	int			 m0srcs, nbytes, npbytes, off, rsrcs, schanged;
 	int			 nallow, nblock;
 	uint8_t			 mode, now, then;
 	rectype_t		 crt, drt, nrt;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	IN6_MULTI_LOCK_ASSERT();
 
 	if (inm->in6m_nsrc == 0 ||
 	    (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0))
 		return (0);
 
 	ifp = inm->in6m_ifp;			/* interface */
 	mode = inm->in6m_st[1].iss_fmode;	/* filter mode at t1 */
 	crt = REC_NONE;	/* current group record type */
 	drt = REC_NONE;	/* mask of completed group record types */
 	nrt = REC_NONE;	/* record type for current node */
 	m0srcs = 0;	/* # source which will fit in current mbuf chain */
 	npbytes = 0;	/* # of bytes appended this packet */
 	nbytes = 0;	/* # of bytes appended to group's state-change queue */
 	rsrcs = 0;	/* # sources encoded in current record */
 	schanged = 0;	/* # nodes encoded in overall filter change */
 	nallow = 0;	/* # of source entries in ALLOW_NEW */
 	nblock = 0;	/* # of source entries in BLOCK_OLD */
 	nims = NULL;	/* next tree node pointer */
 
 	/*
 	 * For each possible filter record mode.
 	 * The first kind of source we encounter tells us which
 	 * is the first kind of record we start appending.
 	 * If a node transitioned to UNDEFINED at t1, its mode is treated
 	 * as the inverse of the group's filter mode.
 	 */
 	while (drt != REC_FULL) {
 		do {
 			m0 = ifq->ifq_tail;
 			if (m0 != NULL &&
 			    (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <=
 			     MLD_V2_REPORT_MAXRECS) &&
 			    (m0->m_pkthdr.len + MINRECLEN) <
 			     (ifp->if_mtu - MLD_MTUSPACE)) {
 				m = m0;
 				m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
 					    sizeof(struct mldv2_record)) /
 					    sizeof(struct in6_addr);
 				CTR1(KTR_MLD,
 				    "%s: use previous packet", __func__);
 			} else {
 				m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 				if (m == NULL)
 					m = m_gethdr(M_NOWAIT, MT_DATA);
 				if (m == NULL) {
 					CTR1(KTR_MLD,
 					    "%s: m_get*() failed", __func__);
 					return (-ENOMEM);
 				}
 				m->m_pkthdr.PH_vt.vt_nrecs = 0;
 				mld_save_context(m, ifp);
 				m0srcs = (ifp->if_mtu - MLD_MTUSPACE -
 				    sizeof(struct mldv2_record)) /
 				    sizeof(struct in6_addr);
 				npbytes = 0;
 				CTR1(KTR_MLD,
 				    "%s: allocated new packet", __func__);
 			}
 			/*
 			 * Append the MLD group record header to the
 			 * current packet's data area.
 			 * Recalculate pointer to free space for next
 			 * group record, in case m_append() allocated
 			 * a new mbuf or cluster.
 			 */
 			memset(&mr, 0, sizeof(mr));
 			mr.mr_addr = inm->in6m_addr;
 			in6_clearscope(&mr.mr_addr);
 			if (!m_append(m, sizeof(mr), (void *)&mr)) {
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_MLD,
 				    "%s: m_append() failed", __func__);
 				return (-ENOMEM);
 			}
 			npbytes += sizeof(struct mldv2_record);
 			if (m != m0) {
 				/* new packet; offset in chain */
 				md = m_getptr(m, npbytes -
 				    sizeof(struct mldv2_record), &off);
 				pmr = (struct mldv2_record *)(mtod(md,
 				    uint8_t *) + off);
 			} else {
 				/* current packet; offset from last append */
 				md = m_last(m);
 				pmr = (struct mldv2_record *)(mtod(md,
 				    uint8_t *) + md->m_len -
 				    sizeof(struct mldv2_record));
 			}
 			/*
 			 * Begin walking the tree for this record type
 			 * pass, or continue from where we left off
 			 * previously if we had to allocate a new packet.
 			 * Only report deltas in-mode at t1.
 			 * We need not report included sources as allowed
 			 * if we are in inclusive mode on the group,
 			 * however the converse is not true.
 			 */
 			rsrcs = 0;
 			if (nims == NULL) {
 				nims = RB_MIN(ip6_msource_tree,
 				    &inm->in6m_srcs);
 			}
 			RB_FOREACH_FROM(ims, ip6_msource_tree, nims) {
 				CTR2(KTR_MLD, "%s: visit node %s", __func__,
 				    ip6_sprintf(ip6tbuf, &ims->im6s_addr));
 				now = im6s_get_mode(inm, ims, 1);
 				then = im6s_get_mode(inm, ims, 0);
 				CTR3(KTR_MLD, "%s: mode: t0 %d, t1 %d",
 				    __func__, then, now);
 				if (now == then) {
 					CTR1(KTR_MLD,
 					    "%s: skip unchanged", __func__);
 					continue;
 				}
 				if (mode == MCAST_EXCLUDE &&
 				    now == MCAST_INCLUDE) {
 					CTR1(KTR_MLD,
 					    "%s: skip IN src on EX group",
 					    __func__);
 					continue;
 				}
 				nrt = (rectype_t)now;
 				if (nrt == REC_NONE)
 					nrt = (rectype_t)(~mode & REC_FULL);
 				if (schanged++ == 0) {
 					crt = nrt;
 				} else if (crt != nrt)
 					continue;
 				if (!m_append(m, sizeof(struct in6_addr),
 				    (void *)&ims->im6s_addr)) {
 					if (m != m0)
 						m_freem(m);
 					CTR1(KTR_MLD,
 					    "%s: m_append() failed", __func__);
 					return (-ENOMEM);
 				}
 				nallow += !!(crt == REC_ALLOW);
 				nblock += !!(crt == REC_BLOCK);
 				if (++rsrcs == m0srcs)
 					break;
 			}
 			/*
 			 * If we did not append any tree nodes on this
 			 * pass, back out of allocations.
 			 */
 			if (rsrcs == 0) {
 				npbytes -= sizeof(struct mldv2_record);
 				if (m != m0) {
 					CTR1(KTR_MLD,
 					    "%s: m_free(m)", __func__);
 					m_freem(m);
 				} else {
 					CTR1(KTR_MLD,
 					    "%s: m_adj(m, -mr)", __func__);
 					m_adj(m, -((int)sizeof(
 					    struct mldv2_record)));
 				}
 				continue;
 			}
 			npbytes += (rsrcs * sizeof(struct in6_addr));
 			if (crt == REC_ALLOW)
 				pmr->mr_type = MLD_ALLOW_NEW_SOURCES;
 			else if (crt == REC_BLOCK)
 				pmr->mr_type = MLD_BLOCK_OLD_SOURCES;
 			pmr->mr_numsrc = htons(rsrcs);
 			/*
 			 * Count the new group record, and enqueue this
 			 * packet if it wasn't already queued.
 			 */
 			m->m_pkthdr.PH_vt.vt_nrecs++;
 			if (m != m0)
 				_IF_ENQUEUE(ifq, m);
 			nbytes += npbytes;
 		} while (nims != NULL);
 		drt |= crt;
 		crt = (~crt & REC_FULL);
 	}
 
 	CTR3(KTR_MLD, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__,
 	    nallow, nblock);
 
 	return (nbytes);
 }
 
 static int
 mld_v2_merge_state_changes(struct in6_multi *inm, struct ifqueue *ifscq)
 {
 	struct ifqueue	*gq;
 	struct mbuf	*m;		/* pending state-change */
 	struct mbuf	*m0;		/* copy of pending state-change */
 	struct mbuf	*mt;		/* last state-change in packet */
 	int		 docopy, domerge;
 	u_int		 recslen;
 
 	docopy = 0;
 	domerge = 0;
 	recslen = 0;
 
 	IN6_MULTI_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	/*
 	 * If there are further pending retransmissions, make a writable
 	 * copy of each queued state-change message before merging.
 	 */
 	if (inm->in6m_scrv > 0)
 		docopy = 1;
 
 	gq = &inm->in6m_scq;
 #ifdef KTR
 	if (gq->ifq_head == NULL) {
 		CTR2(KTR_MLD, "%s: WARNING: queue for inm %p is empty",
 		    __func__, inm);
 	}
 #endif
 
 	m = gq->ifq_head;
 	while (m != NULL) {
 		/*
 		 * Only merge the report into the current packet if
 		 * there is sufficient space to do so; an MLDv2 report
 		 * packet may only contain 65,535 group records.
 		 * Always use a simple mbuf chain concatentation to do this,
 		 * as large state changes for single groups may have
 		 * allocated clusters.
 		 */
 		domerge = 0;
 		mt = ifscq->ifq_tail;
 		if (mt != NULL) {
 			recslen = m_length(m, NULL);
 
 			if ((mt->m_pkthdr.PH_vt.vt_nrecs +
 			    m->m_pkthdr.PH_vt.vt_nrecs <=
 			    MLD_V2_REPORT_MAXRECS) &&
 			    (mt->m_pkthdr.len + recslen <=
 			    (inm->in6m_ifp->if_mtu - MLD_MTUSPACE)))
 				domerge = 1;
 		}
 
 		if (!domerge && _IF_QFULL(gq)) {
 			CTR2(KTR_MLD,
 			    "%s: outbound queue full, skipping whole packet %p",
 			    __func__, m);
 			mt = m->m_nextpkt;
 			if (!docopy)
 				m_freem(m);
 			m = mt;
 			continue;
 		}
 
 		if (!docopy) {
 			CTR2(KTR_MLD, "%s: dequeueing %p", __func__, m);
 			_IF_DEQUEUE(gq, m0);
 			m = m0->m_nextpkt;
 		} else {
 			CTR2(KTR_MLD, "%s: copying %p", __func__, m);
 			m0 = m_dup(m, M_NOWAIT);
 			if (m0 == NULL)
 				return (ENOMEM);
 			m0->m_nextpkt = NULL;
 			m = m->m_nextpkt;
 		}
 
 		if (!domerge) {
 			CTR3(KTR_MLD, "%s: queueing %p to ifscq %p)",
 			    __func__, m0, ifscq);
 			_IF_ENQUEUE(ifscq, m0);
 		} else {
 			struct mbuf *mtl;	/* last mbuf of packet mt */
 
 			CTR3(KTR_MLD, "%s: merging %p with ifscq tail %p)",
 			    __func__, m0, mt);
 
 			mtl = m_last(mt);
 			m0->m_flags &= ~M_PKTHDR;
 			mt->m_pkthdr.len += recslen;
 			mt->m_pkthdr.PH_vt.vt_nrecs +=
 			    m0->m_pkthdr.PH_vt.vt_nrecs;
 
 			mtl->m_next = m0;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Respond to a pending MLDv2 General Query.
  */
 static void
 mld_v2_dispatch_general_query(struct mld_ifinfo *mli)
 {
 	struct ifmultiaddr	*ifma;
 	struct ifnet		*ifp;
 	struct in6_multi	*inm;
 	int			 retval;
 
 	IN6_MULTI_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	KASSERT(mli->mli_version == MLD_VERSION_2,
 	    ("%s: called when version %d", __func__, mli->mli_version));
 
 	ifp = mli->mli_ifp;
 
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_INET6 ||
 		    ifma->ifma_protospec == NULL)
 			continue;
 
 		inm = (struct in6_multi *)ifma->ifma_protospec;
 		KASSERT(ifp == inm->in6m_ifp,
 		    ("%s: inconsistent ifp", __func__));
 
 		switch (inm->in6m_state) {
 		case MLD_NOT_MEMBER:
 		case MLD_SILENT_MEMBER:
 			break;
 		case MLD_REPORTING_MEMBER:
 		case MLD_IDLE_MEMBER:
 		case MLD_LAZY_MEMBER:
 		case MLD_SLEEPING_MEMBER:
 		case MLD_AWAKENING_MEMBER:
 			inm->in6m_state = MLD_REPORTING_MEMBER;
 			retval = mld_v2_enqueue_group_record(&mli->mli_gq,
 			    inm, 0, 0, 0, 0);
 			CTR2(KTR_MLD, "%s: enqueue record = %d",
 			    __func__, retval);
 			break;
 		case MLD_G_QUERY_PENDING_MEMBER:
 		case MLD_SG_QUERY_PENDING_MEMBER:
 		case MLD_LEAVING_MEMBER:
 			break;
 		}
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	mld_dispatch_queue(&mli->mli_gq, MLD_MAX_RESPONSE_BURST);
 
 	/*
 	 * Slew transmission of bursts over 500ms intervals.
 	 */
 	if (mli->mli_gq.ifq_head != NULL) {
 		mli->mli_v2_timer = 1 + MLD_RANDOM_DELAY(
 		    MLD_RESPONSE_BURST_INTERVAL);
 		V_interface_timers_running6 = 1;
 	}
 }
 
 /*
  * Transmit the next pending message in the output queue.
  *
  * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis.
  * MRT: Nothing needs to be done, as MLD traffic is always local to
  * a link and uses a link-scope multicast address.
  */
 static void
 mld_dispatch_packet(struct mbuf *m)
 {
 	struct ip6_moptions	 im6o;
 	struct ifnet		*ifp;
 	struct ifnet		*oifp;
 	struct mbuf		*m0;
 	struct mbuf		*md;
 	struct ip6_hdr		*ip6;
 	struct mld_hdr		*mld;
 	int			 error;
 	int			 off;
 	int			 type;
 	uint32_t		 ifindex;
 
 	CTR2(KTR_MLD, "%s: transmit %p", __func__, m);
 
 	/*
 	 * Set VNET image pointer from enqueued mbuf chain
 	 * before doing anything else. Whilst we use interface
 	 * indexes to guard against interface detach, they are
 	 * unique to each VIMAGE and must be retrieved.
 	 */
 	ifindex = mld_restore_context(m);
 
 	/*
 	 * Check if the ifnet still exists. This limits the scope of
 	 * any race in the absence of a global ifp lock for low cost
 	 * (an array lookup).
 	 */
 	ifp = ifnet_byindex(ifindex);
 	if (ifp == NULL) {
 		CTR3(KTR_MLD, "%s: dropped %p as ifindex %u went away.",
 		    __func__, m, ifindex);
 		m_freem(m);
 		IP6STAT_INC(ip6s_noroute);
 		goto out;
 	}
 
 	im6o.im6o_multicast_hlim  = 1;
 	im6o.im6o_multicast_loop = (V_ip6_mrouter != NULL);
 	im6o.im6o_multicast_ifp = ifp;
 
 	if (m->m_flags & M_MLDV1) {
 		m0 = m;
 	} else {
 		m0 = mld_v2_encap_report(ifp, m);
 		if (m0 == NULL) {
 			CTR2(KTR_MLD, "%s: dropped %p", __func__, m);
 			IP6STAT_INC(ip6s_odropped);
 			goto out;
 		}
 	}
 
 	mld_scrub_context(m0);
 	m_clrprotoflags(m);
 	m0->m_pkthdr.rcvif = V_loif;
 
 	ip6 = mtod(m0, struct ip6_hdr *);
 #if 0
 	(void)in6_setscope(&ip6->ip6_dst, ifp, NULL);	/* XXX LOR */
 #else
 	/*
 	 * XXX XXX Break some KPI rules to prevent an LOR which would
 	 * occur if we called in6_setscope() at transmission.
 	 * See comments at top of file.
 	 */
 	MLD_EMBEDSCOPE(&ip6->ip6_dst, ifp->if_index);
 #endif
 
 	/*
 	 * Retrieve the ICMPv6 type before handoff to ip6_output(),
 	 * so we can bump the stats.
 	 */
 	md = m_getptr(m0, sizeof(struct ip6_hdr), &off);
 	mld = (struct mld_hdr *)(mtod(md, uint8_t *) + off);
 	type = mld->mld_type;
 
 	error = ip6_output(m0, &mld_po, NULL, IPV6_UNSPECSRC, &im6o,
 	    &oifp, NULL);
 	if (error) {
 		CTR3(KTR_MLD, "%s: ip6_output(%p) = %d", __func__, m0, error);
 		goto out;
 	}
 	ICMP6STAT_INC(icp6s_outhist[type]);
 	if (oifp != NULL) {
 		icmp6_ifstat_inc(oifp, ifs6_out_msg);
 		switch (type) {
 		case MLD_LISTENER_REPORT:
 		case MLDV2_LISTENER_REPORT:
 			icmp6_ifstat_inc(oifp, ifs6_out_mldreport);
 			break;
 		case MLD_LISTENER_DONE:
 			icmp6_ifstat_inc(oifp, ifs6_out_mlddone);
 			break;
 		}
 	}
 out:
 	return;
 }
 
 /*
  * Encapsulate an MLDv2 report.
  *
  * KAME IPv6 requires that hop-by-hop options be passed separately,
  * and that the IPv6 header be prepended in a separate mbuf.
  *
  * Returns a pointer to the new mbuf chain head, or NULL if the
  * allocation failed.
  */
 static struct mbuf *
 mld_v2_encap_report(struct ifnet *ifp, struct mbuf *m)
 {
 	struct mbuf		*mh;
 	struct mldv2_report	*mld;
 	struct ip6_hdr		*ip6;
 	struct in6_ifaddr	*ia;
 	int			 mldreclen;
 
 	KASSERT(ifp != NULL, ("%s: null ifp", __func__));
 	KASSERT((m->m_flags & M_PKTHDR),
 	    ("%s: mbuf chain %p is !M_PKTHDR", __func__, m));
 
 	/*
 	 * RFC3590: OK to send as :: or tentative during DAD.
 	 */
 	ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST);
 	if (ia == NULL)
 		CTR1(KTR_MLD, "%s: warning: ia is NULL", __func__);
 
 	mh = m_gethdr(M_NOWAIT, MT_DATA);
 	if (mh == NULL) {
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		m_freem(m);
 		return (NULL);
 	}
 	MH_ALIGN(mh, sizeof(struct ip6_hdr) + sizeof(struct mldv2_report));
 
 	mldreclen = m_length(m, NULL);
 	CTR2(KTR_MLD, "%s: mldreclen is %d", __func__, mldreclen);
 
 	mh->m_len = sizeof(struct ip6_hdr) + sizeof(struct mldv2_report);
 	mh->m_pkthdr.len = sizeof(struct ip6_hdr) +
 	    sizeof(struct mldv2_report) + mldreclen;
 
 	ip6 = mtod(mh, struct ip6_hdr *);
 	ip6->ip6_flow = 0;
 	ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
 	ip6->ip6_vfc |= IPV6_VERSION;
 	ip6->ip6_nxt = IPPROTO_ICMPV6;
 	ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any;
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 	ip6->ip6_dst = in6addr_linklocal_allv2routers;
 	/* scope ID will be set in netisr */
 
 	mld = (struct mldv2_report *)(ip6 + 1);
 	mld->mld_type = MLDV2_LISTENER_REPORT;
 	mld->mld_code = 0;
 	mld->mld_cksum = 0;
 	mld->mld_v2_reserved = 0;
 	mld->mld_v2_numrecs = htons(m->m_pkthdr.PH_vt.vt_nrecs);
 	m->m_pkthdr.PH_vt.vt_nrecs = 0;
 
 	mh->m_next = m;
 	mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6,
 	    sizeof(struct ip6_hdr), sizeof(struct mldv2_report) + mldreclen);
 	return (mh);
 }
 
 #ifdef KTR
 static char *
 mld_rec_type_to_str(const int type)
 {
 
 	switch (type) {
 		case MLD_CHANGE_TO_EXCLUDE_MODE:
 			return "TO_EX";
 			break;
 		case MLD_CHANGE_TO_INCLUDE_MODE:
 			return "TO_IN";
 			break;
 		case MLD_MODE_IS_EXCLUDE:
 			return "MODE_EX";
 			break;
 		case MLD_MODE_IS_INCLUDE:
 			return "MODE_IN";
 			break;
 		case MLD_ALLOW_NEW_SOURCES:
 			return "ALLOW_NEW";
 			break;
 		case MLD_BLOCK_OLD_SOURCES:
 			return "BLOCK_OLD";
 			break;
 		default:
 			break;
 	}
 	return "unknown";
 }
 #endif
 
 static void
 mld_init(void *unused __unused)
 {
 
 	CTR1(KTR_MLD, "%s: initializing", __func__);
 	MLD_LOCK_INIT();
 
 	ip6_initpktopts(&mld_po);
 	mld_po.ip6po_hlim = 1;
 	mld_po.ip6po_hbh = &mld_ra.hbh;
 	mld_po.ip6po_prefer_tempaddr = IP6PO_TEMPADDR_NOTPREFER;
 	mld_po.ip6po_flags = IP6PO_DONTFRAG;
 }
 SYSINIT(mld_init, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, mld_init, NULL);
 
 static void
 mld_uninit(void *unused __unused)
 {
 
 	CTR1(KTR_MLD, "%s: tearing down", __func__);
 	MLD_LOCK_DESTROY();
 }
 SYSUNINIT(mld_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, mld_uninit, NULL);
 
 static void
 vnet_mld_init(const void *unused __unused)
 {
 
 	CTR1(KTR_MLD, "%s: initializing", __func__);
 
 	LIST_INIT(&V_mli_head);
 }
 VNET_SYSINIT(vnet_mld_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_mld_init,
     NULL);
 
 static void
 vnet_mld_uninit(const void *unused __unused)
 {
 
 	CTR1(KTR_MLD, "%s: tearing down", __func__);
 
 	KASSERT(LIST_EMPTY(&V_mli_head),
 	    ("%s: mli list not empty; ifnets not detached?", __func__));
 }
 VNET_SYSUNINIT(vnet_mld_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_mld_uninit,
     NULL);
 
 static int
 mld_modevent(module_t mod, int type, void *unused __unused)
 {
 
     switch (type) {
     case MOD_LOAD:
     case MOD_UNLOAD:
 	break;
     default:
 	return (EOPNOTSUPP);
     }
     return (0);
 }
 
 static moduledata_t mld_mod = {
     "mld",
     mld_modevent,
     0
 };
 DECLARE_MODULE(mld, mld_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
Index: user/dchagin/lemul/sys/netinet6/nd6.c
===================================================================
--- user/dchagin/lemul/sys/netinet6/nd6.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet6/nd6.c	(revision 274241)
@@ -1,2475 +1,2475 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: nd6.c,v 1.144 2001/05/24 07:44:00 itojun Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/callout.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/protosw.h>
 #include <sys/errno.h>
 #include <sys/syslog.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/queue.h>
 #include <sys/sdt.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_arc.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/iso88025.h>
 #include <net/fddi.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <net/if_llatbl.h>
 #define	L3_ADDR_SIN6(le)	((struct sockaddr_in6 *) L3_ADDR(le))
 #include <netinet/if_ether.h>
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/in6_ifattach.h>
 #include <netinet/icmp6.h>
 #include <netinet6/send.h>
 
 #include <sys/limits.h>
 
 #include <security/mac/mac_framework.h>
 
 #define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */
 #define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */
 
 #define SIN6(s) ((const struct sockaddr_in6 *)(s))
 
 /* timer values */
 VNET_DEFINE(int, nd6_prune)	= 1;	/* walk list every 1 seconds */
 VNET_DEFINE(int, nd6_delay)	= 5;	/* delay first probe time 5 second */
 VNET_DEFINE(int, nd6_umaxtries)	= 3;	/* maximum unicast query */
 VNET_DEFINE(int, nd6_mmaxtries)	= 3;	/* maximum multicast query */
 VNET_DEFINE(int, nd6_useloopback) = 1;	/* use loopback interface for
 					 * local traffic */
 VNET_DEFINE(int, nd6_gctimer)	= (60 * 60 * 24); /* 1 day: garbage
 					 * collection timer */
 
 /* preventing too many loops in ND option parsing */
 static VNET_DEFINE(int, nd6_maxndopt) = 10; /* max # of ND options allowed */
 
 VNET_DEFINE(int, nd6_maxnudhint) = 0;	/* max # of subsequent upper
 					 * layer hints */
 static VNET_DEFINE(int, nd6_maxqueuelen) = 1; /* max pkts cached in unresolved
 					 * ND entries */
 #define	V_nd6_maxndopt			VNET(nd6_maxndopt)
 #define	V_nd6_maxqueuelen		VNET(nd6_maxqueuelen)
 
 #ifdef ND6_DEBUG
 VNET_DEFINE(int, nd6_debug) = 1;
 #else
 VNET_DEFINE(int, nd6_debug) = 0;
 #endif
 
 /* for debugging? */
 #if 0
 static int nd6_inuse, nd6_allocated;
 #endif
 
 VNET_DEFINE(struct nd_drhead, nd_defrouter);
 VNET_DEFINE(struct nd_prhead, nd_prefix);
 
 VNET_DEFINE(int, nd6_recalc_reachtm_interval) = ND6_RECALC_REACHTM_INTERVAL;
 #define	V_nd6_recalc_reachtm_interval	VNET(nd6_recalc_reachtm_interval)
 
 int	(*send_sendso_input_hook)(struct mbuf *, struct ifnet *, int, int);
 
 static int nd6_is_new_addr_neighbor(struct sockaddr_in6 *,
 	struct ifnet *);
 static void nd6_setmtu0(struct ifnet *, struct nd_ifinfo *);
 static void nd6_slowtimo(void *);
 static int regen_tmpaddr(struct in6_ifaddr *);
 static struct llentry *nd6_free(struct llentry *, int);
 static void nd6_llinfo_timer(void *);
 static void clear_llinfo_pqueue(struct llentry *);
 static void nd6_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
 
 static VNET_DEFINE(struct callout, nd6_slowtimo_ch);
 #define	V_nd6_slowtimo_ch		VNET(nd6_slowtimo_ch)
 
 VNET_DEFINE(struct callout, nd6_timer_ch);
 
 void
 nd6_init(void)
 {
 
 	LIST_INIT(&V_nd_prefix);
 
 	/* initialization of the default router list */
 	TAILQ_INIT(&V_nd_defrouter);
 
 	/* start timer */
 	callout_init(&V_nd6_slowtimo_ch, 0);
 	callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz,
 	    nd6_slowtimo, curvnet);
 }
 
 #ifdef VIMAGE
 void
 nd6_destroy()
 {
 
 	callout_drain(&V_nd6_slowtimo_ch);
 	callout_drain(&V_nd6_timer_ch);
 }
 #endif
 
 struct nd_ifinfo *
 nd6_ifattach(struct ifnet *ifp)
 {
 	struct nd_ifinfo *nd;
 
 	nd = (struct nd_ifinfo *)malloc(sizeof(*nd), M_IP6NDP, M_WAITOK|M_ZERO);
 	nd->initialized = 1;
 
 	nd->chlim = IPV6_DEFHLIM;
 	nd->basereachable = REACHABLE_TIME;
 	nd->reachable = ND_COMPUTE_RTIME(nd->basereachable);
 	nd->retrans = RETRANS_TIMER;
 
 	nd->flags = ND6_IFF_PERFORMNUD;
 
 	/* A loopback interface always has ND6_IFF_AUTO_LINKLOCAL.
 	 * XXXHRS: Clear ND6_IFF_AUTO_LINKLOCAL on an IFT_BRIDGE interface by
 	 * default regardless of the V_ip6_auto_linklocal configuration to
 	 * give a reasonable default behavior.
 	 */
 	if ((V_ip6_auto_linklocal && ifp->if_type != IFT_BRIDGE) ||
 	    (ifp->if_flags & IFF_LOOPBACK))
 		nd->flags |= ND6_IFF_AUTO_LINKLOCAL;
 	/*
 	 * A loopback interface does not need to accept RTADV.
 	 * XXXHRS: Clear ND6_IFF_ACCEPT_RTADV on an IFT_BRIDGE interface by
 	 * default regardless of the V_ip6_accept_rtadv configuration to
 	 * prevent the interface from accepting RA messages arrived
 	 * on one of the member interfaces with ND6_IFF_ACCEPT_RTADV.
 	 */
 	if (V_ip6_accept_rtadv &&
 	    !(ifp->if_flags & IFF_LOOPBACK) &&
 	    (ifp->if_type != IFT_BRIDGE))
 			nd->flags |= ND6_IFF_ACCEPT_RTADV;
 	if (V_ip6_no_radr && !(ifp->if_flags & IFF_LOOPBACK))
 		nd->flags |= ND6_IFF_NO_RADR;
 
 	/* XXX: we cannot call nd6_setmtu since ifp is not fully initialized */
 	nd6_setmtu0(ifp, nd);
 
 	return nd;
 }
 
 void
 nd6_ifdetach(struct nd_ifinfo *nd)
 {
 
 	free(nd, M_IP6NDP);
 }
 
 /*
  * Reset ND level link MTU. This function is called when the physical MTU
  * changes, which means we might have to adjust the ND level MTU.
  */
 void
 nd6_setmtu(struct ifnet *ifp)
 {
 
 	nd6_setmtu0(ifp, ND_IFINFO(ifp));
 }
 
 /* XXX todo: do not maintain copy of ifp->if_mtu in ndi->maxmtu */
 void
 nd6_setmtu0(struct ifnet *ifp, struct nd_ifinfo *ndi)
 {
 	u_int32_t omaxmtu;
 
 	omaxmtu = ndi->maxmtu;
 
 	switch (ifp->if_type) {
 	case IFT_ARCNET:
 		ndi->maxmtu = MIN(ARC_PHDS_MAXMTU, ifp->if_mtu); /* RFC2497 */
 		break;
 	case IFT_FDDI:
 		ndi->maxmtu = MIN(FDDIIPMTU, ifp->if_mtu); /* RFC2467 */
 		break;
 	case IFT_ISO88025:
 		 ndi->maxmtu = MIN(ISO88025_MAX_MTU, ifp->if_mtu);
 		 break;
 	default:
 		ndi->maxmtu = ifp->if_mtu;
 		break;
 	}
 
 	/*
 	 * Decreasing the interface MTU under IPV6 minimum MTU may cause
 	 * undesirable situation.  We thus notify the operator of the change
 	 * explicitly.  The check for omaxmtu is necessary to restrict the
 	 * log to the case of changing the MTU, not initializing it.
 	 */
 	if (omaxmtu >= IPV6_MMTU && ndi->maxmtu < IPV6_MMTU) {
 		log(LOG_NOTICE, "nd6_setmtu0: "
 		    "new link MTU on %s (%lu) is too small for IPv6\n",
 		    if_name(ifp), (unsigned long)ndi->maxmtu);
 	}
 
 	if (ndi->maxmtu > V_in6_maxmtu)
 		in6_setmaxmtu(); /* check all interfaces just in case */
 
 }
 
 void
 nd6_option_init(void *opt, int icmp6len, union nd_opts *ndopts)
 {
 
 	bzero(ndopts, sizeof(*ndopts));
 	ndopts->nd_opts_search = (struct nd_opt_hdr *)opt;
 	ndopts->nd_opts_last
 		= (struct nd_opt_hdr *)(((u_char *)opt) + icmp6len);
 
 	if (icmp6len == 0) {
 		ndopts->nd_opts_done = 1;
 		ndopts->nd_opts_search = NULL;
 	}
 }
 
 /*
  * Take one ND option.
  */
 struct nd_opt_hdr *
 nd6_option(union nd_opts *ndopts)
 {
 	struct nd_opt_hdr *nd_opt;
 	int olen;
 
 	KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__));
 	KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts",
 	    __func__));
 	if (ndopts->nd_opts_search == NULL)
 		return NULL;
 	if (ndopts->nd_opts_done)
 		return NULL;
 
 	nd_opt = ndopts->nd_opts_search;
 
 	/* make sure nd_opt_len is inside the buffer */
 	if ((caddr_t)&nd_opt->nd_opt_len >= (caddr_t)ndopts->nd_opts_last) {
 		bzero(ndopts, sizeof(*ndopts));
 		return NULL;
 	}
 
 	olen = nd_opt->nd_opt_len << 3;
 	if (olen == 0) {
 		/*
 		 * Message validation requires that all included
 		 * options have a length that is greater than zero.
 		 */
 		bzero(ndopts, sizeof(*ndopts));
 		return NULL;
 	}
 
 	ndopts->nd_opts_search = (struct nd_opt_hdr *)((caddr_t)nd_opt + olen);
 	if (ndopts->nd_opts_search > ndopts->nd_opts_last) {
 		/* option overruns the end of buffer, invalid */
 		bzero(ndopts, sizeof(*ndopts));
 		return NULL;
 	} else if (ndopts->nd_opts_search == ndopts->nd_opts_last) {
 		/* reached the end of options chain */
 		ndopts->nd_opts_done = 1;
 		ndopts->nd_opts_search = NULL;
 	}
 	return nd_opt;
 }
 
 /*
  * Parse multiple ND options.
  * This function is much easier to use, for ND routines that do not need
  * multiple options of the same type.
  */
 int
 nd6_options(union nd_opts *ndopts)
 {
 	struct nd_opt_hdr *nd_opt;
 	int i = 0;
 
 	KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__));
 	KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts",
 	    __func__));
 	if (ndopts->nd_opts_search == NULL)
 		return 0;
 
 	while (1) {
 		nd_opt = nd6_option(ndopts);
 		if (nd_opt == NULL && ndopts->nd_opts_last == NULL) {
 			/*
 			 * Message validation requires that all included
 			 * options have a length that is greater than zero.
 			 */
 			ICMP6STAT_INC(icp6s_nd_badopt);
 			bzero(ndopts, sizeof(*ndopts));
 			return -1;
 		}
 
 		if (nd_opt == NULL)
 			goto skip1;
 
 		switch (nd_opt->nd_opt_type) {
 		case ND_OPT_SOURCE_LINKADDR:
 		case ND_OPT_TARGET_LINKADDR:
 		case ND_OPT_MTU:
 		case ND_OPT_REDIRECTED_HEADER:
 			if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
 				nd6log((LOG_INFO,
 				    "duplicated ND6 option found (type=%d)\n",
 				    nd_opt->nd_opt_type));
 				/* XXX bark? */
 			} else {
 				ndopts->nd_opt_array[nd_opt->nd_opt_type]
 					= nd_opt;
 			}
 			break;
 		case ND_OPT_PREFIX_INFORMATION:
 			if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0) {
 				ndopts->nd_opt_array[nd_opt->nd_opt_type]
 					= nd_opt;
 			}
 			ndopts->nd_opts_pi_end =
 				(struct nd_opt_prefix_info *)nd_opt;
 			break;
 		/* What about ND_OPT_ROUTE_INFO? RFC 4191 */
 		case ND_OPT_RDNSS:	/* RFC 6106 */
 		case ND_OPT_DNSSL:	/* RFC 6106 */
 			/*
 			 * Silently ignore options we know and do not care about
 			 * in the kernel.
 			 */
 			break;
 		default:
 			/*
 			 * Unknown options must be silently ignored,
 			 * to accomodate future extension to the protocol.
 			 */
 			nd6log((LOG_DEBUG,
 			    "nd6_options: unsupported option %d - "
 			    "option ignored\n", nd_opt->nd_opt_type));
 		}
 
 skip1:
 		i++;
 		if (i > V_nd6_maxndopt) {
 			ICMP6STAT_INC(icp6s_nd_toomanyopt);
 			nd6log((LOG_INFO, "too many loop in nd opt\n"));
 			break;
 		}
 
 		if (ndopts->nd_opts_done)
 			break;
 	}
 
 	return 0;
 }
 
 /*
  * ND6 timer routine to handle ND6 entries
  */
 void
 nd6_llinfo_settimer_locked(struct llentry *ln, long tick)
 {
 	int canceled;
 
 	LLE_WLOCK_ASSERT(ln);
 
 	if (tick < 0) {
 		ln->la_expire = 0;
 		ln->ln_ntick = 0;
 		canceled = callout_stop(&ln->ln_timer_ch);
 	} else {
 		ln->la_expire = time_uptime + tick / hz;
 		LLE_ADDREF(ln);
 		if (tick > INT_MAX) {
 			ln->ln_ntick = tick - INT_MAX;
 			canceled = callout_reset(&ln->ln_timer_ch, INT_MAX,
 			    nd6_llinfo_timer, ln);
 		} else {
 			ln->ln_ntick = 0;
 			canceled = callout_reset(&ln->ln_timer_ch, tick,
 			    nd6_llinfo_timer, ln);
 		}
 	}
 	if (canceled)
 		LLE_REMREF(ln);
 }
 
 void
 nd6_llinfo_settimer(struct llentry *ln, long tick)
 {
 
 	LLE_WLOCK(ln);
 	nd6_llinfo_settimer_locked(ln, tick);
 	LLE_WUNLOCK(ln);
 }
 
 static void
 nd6_llinfo_timer(void *arg)
 {
 	struct llentry *ln;
 	struct in6_addr *dst;
 	struct ifnet *ifp;
 	struct nd_ifinfo *ndi = NULL;
 
 	KASSERT(arg != NULL, ("%s: arg NULL", __func__));
 	ln = (struct llentry *)arg;
 	LLE_WLOCK_ASSERT(ln);
 	ifp = ln->lle_tbl->llt_ifp;
 
 	CURVNET_SET(ifp->if_vnet);
 
 	if (ln->ln_ntick > 0) {
 		if (ln->ln_ntick > INT_MAX) {
 			ln->ln_ntick -= INT_MAX;
 			nd6_llinfo_settimer_locked(ln, INT_MAX);
 		} else {
 			ln->ln_ntick = 0;
 			nd6_llinfo_settimer_locked(ln, ln->ln_ntick);
 		}
 		goto done;
 	}
 
 	ndi = ND_IFINFO(ifp);
 	dst = &L3_ADDR_SIN6(ln)->sin6_addr;
 	if (ln->la_flags & LLE_STATIC) {
 		goto done;
 	}
 
 	if (ln->la_flags & LLE_DELETED) {
 		(void)nd6_free(ln, 0);
 		ln = NULL;
 		goto done;
 	}
 
 	switch (ln->ln_state) {
 	case ND6_LLINFO_INCOMPLETE:
 		if (ln->la_asked < V_nd6_mmaxtries) {
 			ln->la_asked++;
 			nd6_llinfo_settimer_locked(ln, (long)ndi->retrans * hz / 1000);
 			LLE_WUNLOCK(ln);
 			nd6_ns_output(ifp, NULL, dst, ln, 0);
 			LLE_WLOCK(ln);
 		} else {
 			struct mbuf *m = ln->la_hold;
 			if (m) {
 				struct mbuf *m0;
 
 				/*
 				 * assuming every packet in la_hold has the
 				 * same IP header.  Send error after unlock.
 				 */
 				m0 = m->m_nextpkt;
 				m->m_nextpkt = NULL;
 				ln->la_hold = m0;
 				clear_llinfo_pqueue(ln);
 			}
 			EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_TIMEDOUT);
 			(void)nd6_free(ln, 0);
 			ln = NULL;
 			if (m != NULL)
 				icmp6_error2(m, ICMP6_DST_UNREACH,
 				    ICMP6_DST_UNREACH_ADDR, 0, ifp);
 		}
 		break;
 	case ND6_LLINFO_REACHABLE:
 		if (!ND6_LLINFO_PERMANENT(ln)) {
 			ln->ln_state = ND6_LLINFO_STALE;
 			nd6_llinfo_settimer_locked(ln, (long)V_nd6_gctimer * hz);
 		}
 		break;
 
 	case ND6_LLINFO_STALE:
 		/* Garbage Collection(RFC 2461 5.3) */
 		if (!ND6_LLINFO_PERMANENT(ln)) {
 			EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_EXPIRED);
 			(void)nd6_free(ln, 1);
 			ln = NULL;
 		}
 		break;
 
 	case ND6_LLINFO_DELAY:
 		if (ndi && (ndi->flags & ND6_IFF_PERFORMNUD) != 0) {
 			/* We need NUD */
 			ln->la_asked = 1;
 			ln->ln_state = ND6_LLINFO_PROBE;
 			nd6_llinfo_settimer_locked(ln, (long)ndi->retrans * hz / 1000);
 			LLE_WUNLOCK(ln);
 			nd6_ns_output(ifp, dst, dst, ln, 0);
 			LLE_WLOCK(ln);
 		} else {
 			ln->ln_state = ND6_LLINFO_STALE; /* XXX */
 			nd6_llinfo_settimer_locked(ln, (long)V_nd6_gctimer * hz);
 		}
 		break;
 	case ND6_LLINFO_PROBE:
 		if (ln->la_asked < V_nd6_umaxtries) {
 			ln->la_asked++;
 			nd6_llinfo_settimer_locked(ln, (long)ndi->retrans * hz / 1000);
 			LLE_WUNLOCK(ln);
 			nd6_ns_output(ifp, dst, dst, ln, 0);
 			LLE_WLOCK(ln);
 		} else {
 			EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_EXPIRED);
 			(void)nd6_free(ln, 0);
 			ln = NULL;
 		}
 		break;
 	default:
 		panic("%s: paths in a dark night can be confusing: %d",
 		    __func__, ln->ln_state);
 	}
 done:
 	if (ln != NULL)
 		LLE_FREE_LOCKED(ln);
 	CURVNET_RESTORE();
 }
 
 
 /*
  * ND6 timer routine to expire default route list and prefix list
  */
 void
 nd6_timer(void *arg)
 {
 	CURVNET_SET((struct vnet *) arg);
 	struct nd_defrouter *dr, *ndr;
 	struct nd_prefix *pr, *npr;
 	struct in6_ifaddr *ia6, *nia6;
 
 	callout_reset(&V_nd6_timer_ch, V_nd6_prune * hz,
 	    nd6_timer, curvnet);
 
 	/* expire default router list */
 	TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) {
 		if (dr->expire && dr->expire < time_uptime)
 			defrtrlist_del(dr);
 	}
 
 	/*
 	 * expire interface addresses.
 	 * in the past the loop was inside prefix expiry processing.
 	 * However, from a stricter speci-confrmance standpoint, we should
 	 * rather separate address lifetimes and prefix lifetimes.
 	 *
 	 * XXXRW: in6_ifaddrhead locking.
 	 */
   addrloop:
 	TAILQ_FOREACH_SAFE(ia6, &V_in6_ifaddrhead, ia_link, nia6) {
 		/* check address lifetime */
 		if (IFA6_IS_INVALID(ia6)) {
 			int regen = 0;
 
 			/*
 			 * If the expiring address is temporary, try
 			 * regenerating a new one.  This would be useful when
 			 * we suspended a laptop PC, then turned it on after a
 			 * period that could invalidate all temporary
 			 * addresses.  Although we may have to restart the
 			 * loop (see below), it must be after purging the
 			 * address.  Otherwise, we'd see an infinite loop of
 			 * regeneration.
 			 */
 			if (V_ip6_use_tempaddr &&
 			    (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0) {
 				if (regen_tmpaddr(ia6) == 0)
 					regen = 1;
 			}
 
 			in6_purgeaddr(&ia6->ia_ifa);
 
 			if (regen)
 				goto addrloop; /* XXX: see below */
 		} else if (IFA6_IS_DEPRECATED(ia6)) {
 			int oldflags = ia6->ia6_flags;
 
 			ia6->ia6_flags |= IN6_IFF_DEPRECATED;
 
 			/*
 			 * If a temporary address has just become deprecated,
 			 * regenerate a new one if possible.
 			 */
 			if (V_ip6_use_tempaddr &&
 			    (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
 			    (oldflags & IN6_IFF_DEPRECATED) == 0) {
 
 				if (regen_tmpaddr(ia6) == 0) {
 					/*
 					 * A new temporary address is
 					 * generated.
 					 * XXX: this means the address chain
 					 * has changed while we are still in
 					 * the loop.  Although the change
 					 * would not cause disaster (because
 					 * it's not a deletion, but an
 					 * addition,) we'd rather restart the
 					 * loop just for safety.  Or does this
 					 * significantly reduce performance??
 					 */
 					goto addrloop;
 				}
 			}
 		} else {
 			/*
 			 * A new RA might have made a deprecated address
 			 * preferred.
 			 */
 			ia6->ia6_flags &= ~IN6_IFF_DEPRECATED;
 		}
 	}
 
 	/* expire prefix list */
 	LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) {
 		/*
 		 * check prefix lifetime.
 		 * since pltime is just for autoconf, pltime processing for
 		 * prefix is not necessary.
 		 */
 		if (pr->ndpr_vltime != ND6_INFINITE_LIFETIME &&
 		    time_uptime - pr->ndpr_lastupdate > pr->ndpr_vltime) {
 
 			/*
 			 * address expiration and prefix expiration are
 			 * separate.  NEVER perform in6_purgeaddr here.
 			 */
 			prelist_remove(pr);
 		}
 	}
 	CURVNET_RESTORE();
 }
 
 /*
  * ia6 - deprecated/invalidated temporary address
  */
 static int
 regen_tmpaddr(struct in6_ifaddr *ia6)
 {
 	struct ifaddr *ifa;
 	struct ifnet *ifp;
 	struct in6_ifaddr *public_ifa6 = NULL;
 
 	ifp = ia6->ia_ifa.ifa_ifp;
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		struct in6_ifaddr *it6;
 
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 
 		it6 = (struct in6_ifaddr *)ifa;
 
 		/* ignore no autoconf addresses. */
 		if ((it6->ia6_flags & IN6_IFF_AUTOCONF) == 0)
 			continue;
 
 		/* ignore autoconf addresses with different prefixes. */
 		if (it6->ia6_ndpr == NULL || it6->ia6_ndpr != ia6->ia6_ndpr)
 			continue;
 
 		/*
 		 * Now we are looking at an autoconf address with the same
 		 * prefix as ours.  If the address is temporary and is still
 		 * preferred, do not create another one.  It would be rare, but
 		 * could happen, for example, when we resume a laptop PC after
 		 * a long period.
 		 */
 		if ((it6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
 		    !IFA6_IS_DEPRECATED(it6)) {
 			public_ifa6 = NULL;
 			break;
 		}
 
 		/*
 		 * This is a public autoconf address that has the same prefix
 		 * as ours.  If it is preferred, keep it.  We can't break the
 		 * loop here, because there may be a still-preferred temporary
 		 * address with the prefix.
 		 */
 		if (!IFA6_IS_DEPRECATED(it6))
 		    public_ifa6 = it6;
 
 		if (public_ifa6 != NULL)
 			ifa_ref(&public_ifa6->ia_ifa);
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	if (public_ifa6 != NULL) {
 		int e;
 
 		if ((e = in6_tmpifadd(public_ifa6, 0, 0)) != 0) {
 			ifa_free(&public_ifa6->ia_ifa);
 			log(LOG_NOTICE, "regen_tmpaddr: failed to create a new"
 			    " tmp addr,errno=%d\n", e);
 			return (-1);
 		}
 		ifa_free(&public_ifa6->ia_ifa);
 		return (0);
 	}
 
 	return (-1);
 }
 
 /*
  * Nuke neighbor cache/prefix/default router management table, right before
  * ifp goes away.
  */
 void
 nd6_purge(struct ifnet *ifp)
 {
 	struct nd_defrouter *dr, *ndr;
 	struct nd_prefix *pr, *npr;
 
 	/*
 	 * Nuke default router list entries toward ifp.
 	 * We defer removal of default router list entries that is installed
 	 * in the routing table, in order to keep additional side effects as
 	 * small as possible.
 	 */
 	TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) {
 		if (dr->installed)
 			continue;
 
 		if (dr->ifp == ifp)
 			defrtrlist_del(dr);
 	}
 
 	TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) {
 		if (!dr->installed)
 			continue;
 
 		if (dr->ifp == ifp)
 			defrtrlist_del(dr);
 	}
 
 	/* Nuke prefix list entries toward ifp */
 	LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) {
 		if (pr->ndpr_ifp == ifp) {
 			/*
 			 * Because if_detach() does *not* release prefixes
 			 * while purging addresses the reference count will
 			 * still be above zero. We therefore reset it to
 			 * make sure that the prefix really gets purged.
 			 */
 			pr->ndpr_refcnt = 0;
 
 			/*
 			 * Previously, pr->ndpr_addr is removed as well,
 			 * but I strongly believe we don't have to do it.
 			 * nd6_purge() is only called from in6_ifdetach(),
 			 * which removes all the associated interface addresses
 			 * by itself.
 			 * (jinmei@kame.net 20010129)
 			 */
 			prelist_remove(pr);
 		}
 	}
 
 	/* cancel default outgoing interface setting */
 	if (V_nd6_defifindex == ifp->if_index)
 		nd6_setdefaultiface(0);
 
 	if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) {
 		/* Refresh default router list. */
 		defrouter_select();
 	}
 
 	/* XXXXX
 	 * We do not nuke the neighbor cache entries here any more
 	 * because the neighbor cache is kept in if_afdata[AF_INET6].
 	 * nd6_purge() is invoked by in6_ifdetach() which is called
 	 * from if_detach() where everything gets purged. So let
 	 * in6_domifdetach() do the actual L2 table purging work.
 	 */
 }
 
 /* 
  * the caller acquires and releases the lock on the lltbls
  * Returns the llentry locked
  */
 struct llentry *
 nd6_lookup(struct in6_addr *addr6, int flags, struct ifnet *ifp)
 {
 	struct sockaddr_in6 sin6;
 	struct llentry *ln;
 	int llflags;
 	
 	bzero(&sin6, sizeof(sin6));
 	sin6.sin6_len = sizeof(struct sockaddr_in6);
 	sin6.sin6_family = AF_INET6;
 	sin6.sin6_addr = *addr6;
 
 	IF_AFDATA_LOCK_ASSERT(ifp);
 
 	llflags = 0;
 	if (flags & ND6_CREATE)
 	    llflags |= LLE_CREATE;
 	if (flags & ND6_EXCLUSIVE)
 	    llflags |= LLE_EXCLUSIVE;	
 	
 	ln = lla_lookup(LLTABLE6(ifp), llflags, (struct sockaddr *)&sin6);
 	if ((ln != NULL) && (llflags & LLE_CREATE))
 		ln->ln_state = ND6_LLINFO_NOSTATE;
 	
 	return (ln);
 }
 
 /*
  * Test whether a given IPv6 address is a neighbor or not, ignoring
  * the actual neighbor cache.  The neighbor cache is ignored in order
  * to not reenter the routing code from within itself.
  */
 static int
 nd6_is_new_addr_neighbor(struct sockaddr_in6 *addr, struct ifnet *ifp)
 {
 	struct nd_prefix *pr;
 	struct ifaddr *dstaddr;
 
 	/*
 	 * A link-local address is always a neighbor.
 	 * XXX: a link does not necessarily specify a single interface.
 	 */
 	if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) {
 		struct sockaddr_in6 sin6_copy;
 		u_int32_t zone;
 
 		/*
 		 * We need sin6_copy since sa6_recoverscope() may modify the
 		 * content (XXX).
 		 */
 		sin6_copy = *addr;
 		if (sa6_recoverscope(&sin6_copy))
 			return (0); /* XXX: should be impossible */
 		if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone))
 			return (0);
 		if (sin6_copy.sin6_scope_id == zone)
 			return (1);
 		else
 			return (0);
 	}
 
 	/*
 	 * If the address matches one of our addresses,
 	 * it should be a neighbor.
 	 * If the address matches one of our on-link prefixes, it should be a
 	 * neighbor.
 	 */
 	LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 		if (pr->ndpr_ifp != ifp)
 			continue;
 
 		if (!(pr->ndpr_stateflags & NDPRF_ONLINK)) {
 			struct rtentry *rt;
 
 			/* Always use the default FIB here. */
 			rt = in6_rtalloc1((struct sockaddr *)&pr->ndpr_prefix,
 			    0, 0, RT_DEFAULT_FIB);
 			if (rt == NULL)
 				continue;
 			/*
 			 * This is the case where multiple interfaces
 			 * have the same prefix, but only one is installed 
 			 * into the routing table and that prefix entry
 			 * is not the one being examined here. In the case
 			 * where RADIX_MPATH is enabled, multiple route
 			 * entries (of the same rt_key value) will be 
 			 * installed because the interface addresses all
 			 * differ.
 			 */
 			if (!IN6_ARE_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr,
 			       &((struct sockaddr_in6 *)rt_key(rt))->sin6_addr)) {
 				RTFREE_LOCKED(rt);
 				continue;
 			}
 			RTFREE_LOCKED(rt);
 		}
 
 		if (IN6_ARE_MASKED_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr,
 		    &addr->sin6_addr, &pr->ndpr_mask))
 			return (1);
 	}
 
 	/*
 	 * If the address is assigned on the node of the other side of
 	 * a p2p interface, the address should be a neighbor.
 	 */
 	dstaddr = ifa_ifwithdstaddr((struct sockaddr *)addr, RT_ALL_FIBS);
 	if (dstaddr != NULL) {
 		if (dstaddr->ifa_ifp == ifp) {
 			ifa_free(dstaddr);
 			return (1);
 		}
 		ifa_free(dstaddr);
 	}
 
 	/*
 	 * If the default router list is empty, all addresses are regarded
 	 * as on-link, and thus, as a neighbor.
 	 */
 	if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV &&
 	    TAILQ_EMPTY(&V_nd_defrouter) &&
 	    V_nd6_defifindex == ifp->if_index) {
 		return (1);
 	}
 
 	return (0);
 }
 
 
 /*
  * Detect if a given IPv6 address identifies a neighbor on a given link.
  * XXX: should take care of the destination of a p2p link?
  */
 int
 nd6_is_addr_neighbor(struct sockaddr_in6 *addr, struct ifnet *ifp)
 {
 	struct llentry *lle;
 	int rc = 0;
 
 	IF_AFDATA_UNLOCK_ASSERT(ifp);
 	if (nd6_is_new_addr_neighbor(addr, ifp))
 		return (1);
 
 	/*
 	 * Even if the address matches none of our addresses, it might be
 	 * in the neighbor cache.
 	 */
 	IF_AFDATA_RLOCK(ifp);
 	if ((lle = nd6_lookup(&addr->sin6_addr, 0, ifp)) != NULL) {
 		LLE_RUNLOCK(lle);
 		rc = 1;
 	}
 	IF_AFDATA_RUNLOCK(ifp);
 	return (rc);
 }
 
 /*
  * Free an nd6 llinfo entry.
  * Since the function would cause significant changes in the kernel, DO NOT
  * make it global, unless you have a strong reason for the change, and are sure
  * that the change is safe.
  */
 static struct llentry *
 nd6_free(struct llentry *ln, int gc)
 {
         struct llentry *next;
 	struct nd_defrouter *dr;
 	struct ifnet *ifp;
 
 	LLE_WLOCK_ASSERT(ln);
 
 	/*
 	 * we used to have pfctlinput(PRC_HOSTDEAD) here.
 	 * even though it is not harmful, it was not really necessary.
 	 */
 
 	/* cancel timer */
 	nd6_llinfo_settimer_locked(ln, -1);
 
 	ifp = ln->lle_tbl->llt_ifp;
 
 	if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) {
 		dr = defrouter_lookup(&L3_ADDR_SIN6(ln)->sin6_addr, ifp);
 
 		if (dr != NULL && dr->expire &&
 		    ln->ln_state == ND6_LLINFO_STALE && gc) {
 			/*
 			 * If the reason for the deletion is just garbage
 			 * collection, and the neighbor is an active default
 			 * router, do not delete it.  Instead, reset the GC
 			 * timer using the router's lifetime.
 			 * Simply deleting the entry would affect default
 			 * router selection, which is not necessarily a good
 			 * thing, especially when we're using router preference
 			 * values.
 			 * XXX: the check for ln_state would be redundant,
 			 *      but we intentionally keep it just in case.
 			 */
 			if (dr->expire > time_uptime)
 				nd6_llinfo_settimer_locked(ln,
 				    (dr->expire - time_uptime) * hz);
 			else
 				nd6_llinfo_settimer_locked(ln,
 				    (long)V_nd6_gctimer * hz);
 
 			next = LIST_NEXT(ln, lle_next);
 			LLE_REMREF(ln);
 			LLE_WUNLOCK(ln);
 			return (next);
 		}
 
 		if (dr) {
 			/*
 			 * Unreachablity of a router might affect the default
 			 * router selection and on-link detection of advertised
 			 * prefixes.
 			 */
 
 			/*
 			 * Temporarily fake the state to choose a new default
 			 * router and to perform on-link determination of
 			 * prefixes correctly.
 			 * Below the state will be set correctly,
 			 * or the entry itself will be deleted.
 			 */
 			ln->ln_state = ND6_LLINFO_INCOMPLETE;
 		}
 
 		if (ln->ln_router || dr) {
 
 			/*
 			 * We need to unlock to avoid a LOR with rt6_flush() with the
 			 * rnh and for the calls to pfxlist_onlink_check() and
 			 * defrouter_select() in the block further down for calls
 			 * into nd6_lookup().  We still hold a ref.
 			 */
 			LLE_WUNLOCK(ln);
 
 			/*
 			 * rt6_flush must be called whether or not the neighbor
 			 * is in the Default Router List.
 			 * See a corresponding comment in nd6_na_input().
 			 */
 			rt6_flush(&L3_ADDR_SIN6(ln)->sin6_addr, ifp);
 		}
 
 		if (dr) {
 			/*
 			 * Since defrouter_select() does not affect the
 			 * on-link determination and MIP6 needs the check
 			 * before the default router selection, we perform
 			 * the check now.
 			 */
 			pfxlist_onlink_check();
 
 			/*
 			 * Refresh default router list.
 			 */
 			defrouter_select();
 		}
 
 		if (ln->ln_router || dr)
 			LLE_WLOCK(ln);
 	}
 
 	/*
 	 * Before deleting the entry, remember the next entry as the
 	 * return value.  We need this because pfxlist_onlink_check() above
 	 * might have freed other entries (particularly the old next entry) as
 	 * a side effect (XXX).
 	 */
 	next = LIST_NEXT(ln, lle_next);
 
 	/*
 	 * Save to unlock. We still hold an extra reference and will not
 	 * free(9) in llentry_free() if someone else holds one as well.
 	 */
 	LLE_WUNLOCK(ln);
 	IF_AFDATA_LOCK(ifp);
 	LLE_WLOCK(ln);
 
 	/* Guard against race with other llentry_free(). */
 	if (ln->la_flags & LLE_LINKED) {
 		LLE_REMREF(ln);
 		llentry_free(ln);
 	} else
 		LLE_FREE_LOCKED(ln);
 
 	IF_AFDATA_UNLOCK(ifp);
 
 	return (next);
 }
 
 /*
  * Upper-layer reachability hint for Neighbor Unreachability Detection.
  *
  * XXX cost-effective methods?
  */
 void
 nd6_nud_hint(struct rtentry *rt, struct in6_addr *dst6, int force)
 {
 	struct llentry *ln;
 	struct ifnet *ifp;
 
 	if ((dst6 == NULL) || (rt == NULL))
 		return;
 
 	ifp = rt->rt_ifp;
 	IF_AFDATA_RLOCK(ifp);
 	ln = nd6_lookup(dst6, ND6_EXCLUSIVE, NULL);
 	IF_AFDATA_RUNLOCK(ifp);
 	if (ln == NULL)
 		return;
 
 	if (ln->ln_state < ND6_LLINFO_REACHABLE)
 		goto done;
 
 	/*
 	 * if we get upper-layer reachability confirmation many times,
 	 * it is possible we have false information.
 	 */
 	if (!force) {
 		ln->ln_byhint++;
 		if (ln->ln_byhint > V_nd6_maxnudhint) {
 			goto done;
 		}
 	}
 
  	ln->ln_state = ND6_LLINFO_REACHABLE;
 	if (!ND6_LLINFO_PERMANENT(ln)) {
 		nd6_llinfo_settimer_locked(ln,
 		    (long)ND_IFINFO(rt->rt_ifp)->reachable * hz);
 	}
 done:
 	LLE_WUNLOCK(ln);
 }
 
 
 /*
  * Rejuvenate this function for routing operations related
  * processing.
  */
 void
 nd6_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info)
 {
 	struct sockaddr_in6 *gateway;
 	struct nd_defrouter *dr;
 	struct ifnet *ifp;
 
 	gateway = (struct sockaddr_in6 *)rt->rt_gateway;
 	ifp = rt->rt_ifp;
 
 	switch (req) {
 	case RTM_ADD:
 		break;
 
 	case RTM_DELETE:
 		if (!ifp)
 			return;
 		/*
 		 * Only indirect routes are interesting.
 		 */
 		if ((rt->rt_flags & RTF_GATEWAY) == 0)
 			return;
 		/*
 		 * check for default route
 		 */
 		if (IN6_ARE_ADDR_EQUAL(&in6addr_any, 
 				       &SIN6(rt_key(rt))->sin6_addr)) {
 
 			dr = defrouter_lookup(&gateway->sin6_addr, ifp);
 			if (dr != NULL)
 				dr->installed = 0;
 		}
 		break;
 	}
 }
 
 
 int
 nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp)
 {
 	struct in6_drlist *drl = (struct in6_drlist *)data;
 	struct in6_oprlist *oprl = (struct in6_oprlist *)data;
 	struct in6_ndireq *ndi = (struct in6_ndireq *)data;
 	struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data;
 	struct in6_ndifreq *ndif = (struct in6_ndifreq *)data;
 	struct nd_defrouter *dr;
 	struct nd_prefix *pr;
 	int i = 0, error = 0;
 
 	if (ifp->if_afdata[AF_INET6] == NULL)
 		return (EPFNOSUPPORT);
 	switch (cmd) {
 	case SIOCGDRLST_IN6:
 		/*
 		 * obsolete API, use sysctl under net.inet6.icmp6
 		 */
 		bzero(drl, sizeof(*drl));
 		TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) {
 			if (i >= DRLSTSIZ)
 				break;
 			drl->defrouter[i].rtaddr = dr->rtaddr;
 			in6_clearscope(&drl->defrouter[i].rtaddr);
 
 			drl->defrouter[i].flags = dr->flags;
 			drl->defrouter[i].rtlifetime = dr->rtlifetime;
 			drl->defrouter[i].expire = dr->expire +
 			    (time_second - time_uptime);
 			drl->defrouter[i].if_index = dr->ifp->if_index;
 			i++;
 		}
 		break;
 	case SIOCGPRLST_IN6:
 		/*
 		 * obsolete API, use sysctl under net.inet6.icmp6
 		 *
 		 * XXX the structure in6_prlist was changed in backward-
 		 * incompatible manner.  in6_oprlist is used for SIOCGPRLST_IN6,
 		 * in6_prlist is used for nd6_sysctl() - fill_prlist().
 		 */
 		/*
 		 * XXX meaning of fields, especialy "raflags", is very
 		 * differnet between RA prefix list and RR/static prefix list.
 		 * how about separating ioctls into two?
 		 */
 		bzero(oprl, sizeof(*oprl));
 		LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 			struct nd_pfxrouter *pfr;
 			int j;
 
 			if (i >= PRLSTSIZ)
 				break;
 			oprl->prefix[i].prefix = pr->ndpr_prefix.sin6_addr;
 			oprl->prefix[i].raflags = pr->ndpr_raf;
 			oprl->prefix[i].prefixlen = pr->ndpr_plen;
 			oprl->prefix[i].vltime = pr->ndpr_vltime;
 			oprl->prefix[i].pltime = pr->ndpr_pltime;
 			oprl->prefix[i].if_index = pr->ndpr_ifp->if_index;
 			if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME)
 				oprl->prefix[i].expire = 0;
 			else {
 				time_t maxexpire;
 
 				/* XXX: we assume time_t is signed. */
 				maxexpire = (-1) &
 				    ~((time_t)1 <<
 				    ((sizeof(maxexpire) * 8) - 1));
 				if (pr->ndpr_vltime <
 				    maxexpire - pr->ndpr_lastupdate) {
 					oprl->prefix[i].expire =
 					    pr->ndpr_lastupdate +
 					    pr->ndpr_vltime +
 					    (time_second - time_uptime);
 				} else
 					oprl->prefix[i].expire = maxexpire;
 			}
 
 			j = 0;
 			LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) {
 				if (j < DRLSTSIZ) {
 #define RTRADDR oprl->prefix[i].advrtr[j]
 					RTRADDR = pfr->router->rtaddr;
 					in6_clearscope(&RTRADDR);
 #undef RTRADDR
 				}
 				j++;
 			}
 			oprl->prefix[i].advrtrs = j;
 			oprl->prefix[i].origin = PR_ORIG_RA;
 
 			i++;
 		}
 
 		break;
 	case OSIOCGIFINFO_IN6:
 #define ND	ndi->ndi
 		/* XXX: old ndp(8) assumes a positive value for linkmtu. */
 		bzero(&ND, sizeof(ND));
 		ND.linkmtu = IN6_LINKMTU(ifp);
 		ND.maxmtu = ND_IFINFO(ifp)->maxmtu;
 		ND.basereachable = ND_IFINFO(ifp)->basereachable;
 		ND.reachable = ND_IFINFO(ifp)->reachable;
 		ND.retrans = ND_IFINFO(ifp)->retrans;
 		ND.flags = ND_IFINFO(ifp)->flags;
 		ND.recalctm = ND_IFINFO(ifp)->recalctm;
 		ND.chlim = ND_IFINFO(ifp)->chlim;
 		break;
 	case SIOCGIFINFO_IN6:
 		ND = *ND_IFINFO(ifp);
 		break;
 	case SIOCSIFINFO_IN6:
 		/*
 		 * used to change host variables from userland.
 		 * intented for a use on router to reflect RA configurations.
 		 */
 		/* 0 means 'unspecified' */
 		if (ND.linkmtu != 0) {
 			if (ND.linkmtu < IPV6_MMTU ||
 			    ND.linkmtu > IN6_LINKMTU(ifp)) {
 				error = EINVAL;
 				break;
 			}
 			ND_IFINFO(ifp)->linkmtu = ND.linkmtu;
 		}
 
 		if (ND.basereachable != 0) {
 			int obasereachable = ND_IFINFO(ifp)->basereachable;
 
 			ND_IFINFO(ifp)->basereachable = ND.basereachable;
 			if (ND.basereachable != obasereachable)
 				ND_IFINFO(ifp)->reachable =
 				    ND_COMPUTE_RTIME(ND.basereachable);
 		}
 		if (ND.retrans != 0)
 			ND_IFINFO(ifp)->retrans = ND.retrans;
 		if (ND.chlim != 0)
 			ND_IFINFO(ifp)->chlim = ND.chlim;
 		/* FALLTHROUGH */
 	case SIOCSIFINFO_FLAGS:
 	{
 		struct ifaddr *ifa;
 		struct in6_ifaddr *ia;
 
 		if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) &&
 		    !(ND.flags & ND6_IFF_IFDISABLED)) {
 			/* ifdisabled 1->0 transision */
 
 			/*
 			 * If the interface is marked as ND6_IFF_IFDISABLED and
 			 * has an link-local address with IN6_IFF_DUPLICATED,
 			 * do not clear ND6_IFF_IFDISABLED.
 			 * See RFC 4862, Section 5.4.5.
 			 */
 			int duplicated_linklocal = 0;
 
 			IF_ADDR_RLOCK(ifp);
 			TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 				if (ifa->ifa_addr->sa_family != AF_INET6)
 					continue;
 				ia = (struct in6_ifaddr *)ifa;
 				if ((ia->ia6_flags & IN6_IFF_DUPLICATED) &&
 				    IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) {
 					duplicated_linklocal = 1;
 					break;
 				}
 			}
 			IF_ADDR_RUNLOCK(ifp);
 
 			if (duplicated_linklocal) {
 				ND.flags |= ND6_IFF_IFDISABLED;
 				log(LOG_ERR, "Cannot enable an interface"
 				    " with a link-local address marked"
 				    " duplicate.\n");
 			} else {
 				ND_IFINFO(ifp)->flags &= ~ND6_IFF_IFDISABLED;
 				if (ifp->if_flags & IFF_UP)
 					in6_if_up(ifp);
 			}
 		} else if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) &&
 			    (ND.flags & ND6_IFF_IFDISABLED)) {
 			/* ifdisabled 0->1 transision */
 			/* Mark all IPv6 address as tentative. */
 
 			ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED;
 			IF_ADDR_RLOCK(ifp);
 			TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 				if (ifa->ifa_addr->sa_family != AF_INET6)
 					continue;
 				ia = (struct in6_ifaddr *)ifa;
 				ia->ia6_flags |= IN6_IFF_TENTATIVE;
 			}
 			IF_ADDR_RUNLOCK(ifp);
 		}
 
 		if (ND.flags & ND6_IFF_AUTO_LINKLOCAL) {
 			if (!(ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL)) {
 				/* auto_linklocal 0->1 transision */
 
 				/* If no link-local address on ifp, configure */
 				ND_IFINFO(ifp)->flags |= ND6_IFF_AUTO_LINKLOCAL;
 				in6_ifattach(ifp, NULL);
 			} else if (!(ND.flags & ND6_IFF_IFDISABLED) &&
 			    ifp->if_flags & IFF_UP) {
 				/*
 				 * When the IF already has
 				 * ND6_IFF_AUTO_LINKLOCAL, no link-local
 				 * address is assigned, and IFF_UP, try to
 				 * assign one.
 				 */
 				int haslinklocal = 0;
 			
 				IF_ADDR_RLOCK(ifp);
 				TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 					if (ifa->ifa_addr->sa_family != AF_INET6)
 						continue;
 					ia = (struct in6_ifaddr *)ifa;
 					if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) {
 						haslinklocal = 1;
 						break;
 					}
 				}
 				IF_ADDR_RUNLOCK(ifp);
 				if (!haslinklocal)
 					in6_ifattach(ifp, NULL);
 			}
 		}
 	}
 		ND_IFINFO(ifp)->flags = ND.flags;
 		break;
 #undef ND
 	case SIOCSNDFLUSH_IN6:	/* XXX: the ioctl name is confusing... */
 		/* sync kernel routing table with the default router list */
 		defrouter_reset();
 		defrouter_select();
 		break;
 	case SIOCSPFXFLUSH_IN6:
 	{
 		/* flush all the prefix advertised by routers */
 		struct nd_prefix *pr, *next;
 
 		LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, next) {
 			struct in6_ifaddr *ia, *ia_next;
 
 			if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr))
 				continue; /* XXX */
 
 			/* do we really have to remove addresses as well? */
 			/* XXXRW: in6_ifaddrhead locking. */
 			TAILQ_FOREACH_SAFE(ia, &V_in6_ifaddrhead, ia_link,
 			    ia_next) {
 				if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0)
 					continue;
 
 				if (ia->ia6_ndpr == pr)
 					in6_purgeaddr(&ia->ia_ifa);
 			}
 			prelist_remove(pr);
 		}
 		break;
 	}
 	case SIOCSRTRFLUSH_IN6:
 	{
 		/* flush all the default routers */
 		struct nd_defrouter *dr, *next;
 
 		defrouter_reset();
 		TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, next) {
 			defrtrlist_del(dr);
 		}
 		defrouter_select();
 		break;
 	}
 	case SIOCGNBRINFO_IN6:
 	{
 		struct llentry *ln;
 		struct in6_addr nb_addr = nbi->addr; /* make local for safety */
 
 		if ((error = in6_setscope(&nb_addr, ifp, NULL)) != 0)
 			return (error);
 
 		IF_AFDATA_RLOCK(ifp);
 		ln = nd6_lookup(&nb_addr, 0, ifp);
 		IF_AFDATA_RUNLOCK(ifp);
 
 		if (ln == NULL) {
 			error = EINVAL;
 			break;
 		}
 		nbi->state = ln->ln_state;
 		nbi->asked = ln->la_asked;
 		nbi->isrouter = ln->ln_router;
 		if (ln->la_expire == 0)
 			nbi->expire = 0;
 		else
 			nbi->expire = ln->la_expire +
 			    (time_second - time_uptime);
 		LLE_RUNLOCK(ln);
 		break;
 	}
 	case SIOCGDEFIFACE_IN6:	/* XXX: should be implemented as a sysctl? */
 		ndif->ifindex = V_nd6_defifindex;
 		break;
 	case SIOCSDEFIFACE_IN6:	/* XXX: should be implemented as a sysctl? */
 		return (nd6_setdefaultiface(ndif->ifindex));
 	}
 	return (error);
 }
 
 /*
  * Create neighbor cache entry and cache link-layer address,
  * on reception of inbound ND6 packets.  (RS/RA/NS/redirect)
  *
  * type - ICMP6 type
  * code - type dependent information
  *
  * XXXXX
  *  The caller of this function already acquired the ndp 
  *  cache table lock because the cache entry is returned.
  */
 struct llentry *
 nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr,
     int lladdrlen, int type, int code)
 {
 	struct llentry *ln = NULL;
 	int is_newentry;
 	int do_update;
 	int olladdr;
 	int llchange;
 	int flags;
 	int newstate = 0;
 	uint16_t router = 0;
 	struct sockaddr_in6 sin6;
 	struct mbuf *chain = NULL;
 	int static_route = 0;
 
 	IF_AFDATA_UNLOCK_ASSERT(ifp);
 
 	KASSERT(ifp != NULL, ("%s: ifp == NULL", __func__));
 	KASSERT(from != NULL, ("%s: from == NULL", __func__));
 
 	/* nothing must be updated for unspecified address */
 	if (IN6_IS_ADDR_UNSPECIFIED(from))
 		return NULL;
 
 	/*
 	 * Validation about ifp->if_addrlen and lladdrlen must be done in
 	 * the caller.
 	 *
 	 * XXX If the link does not have link-layer adderss, what should
 	 * we do? (ifp->if_addrlen == 0)
 	 * Spec says nothing in sections for RA, RS and NA.  There's small
 	 * description on it in NS section (RFC 2461 7.2.3).
 	 */
 	flags = lladdr ? ND6_EXCLUSIVE : 0;
 	IF_AFDATA_RLOCK(ifp);
 	ln = nd6_lookup(from, flags, ifp);
 	IF_AFDATA_RUNLOCK(ifp);
 	if (ln == NULL) {
 		flags |= ND6_EXCLUSIVE;
 		IF_AFDATA_LOCK(ifp);
 		ln = nd6_lookup(from, flags | ND6_CREATE, ifp);
 		IF_AFDATA_UNLOCK(ifp);
 		is_newentry = 1;
 	} else {
 		/* do nothing if static ndp is set */
 		if (ln->la_flags & LLE_STATIC) {
 			static_route = 1;
 			goto done;
 		}
 		is_newentry = 0;
 	}
 	if (ln == NULL)
 		return (NULL);
 
 	olladdr = (ln->la_flags & LLE_VALID) ? 1 : 0;
 	if (olladdr && lladdr) {
 		llchange = bcmp(lladdr, &ln->ll_addr,
 		    ifp->if_addrlen);
 	} else
 		llchange = 0;
 
 	/*
 	 * newentry olladdr  lladdr  llchange	(*=record)
 	 *	0	n	n	--	(1)
 	 *	0	y	n	--	(2)
 	 *	0	n	y	--	(3) * STALE
 	 *	0	y	y	n	(4) *
 	 *	0	y	y	y	(5) * STALE
 	 *	1	--	n	--	(6)   NOSTATE(= PASSIVE)
 	 *	1	--	y	--	(7) * STALE
 	 */
 
 	if (lladdr) {		/* (3-5) and (7) */
 		/*
 		 * Record source link-layer address
 		 * XXX is it dependent to ifp->if_type?
 		 */
 		bcopy(lladdr, &ln->ll_addr, ifp->if_addrlen);
 		ln->la_flags |= LLE_VALID;
 		EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED);
 	}
 
 	if (!is_newentry) {
 		if ((!olladdr && lladdr != NULL) ||	/* (3) */
 		    (olladdr && lladdr != NULL && llchange)) {	/* (5) */
 			do_update = 1;
 			newstate = ND6_LLINFO_STALE;
 		} else					/* (1-2,4) */
 			do_update = 0;
 	} else {
 		do_update = 1;
 		if (lladdr == NULL)			/* (6) */
 			newstate = ND6_LLINFO_NOSTATE;
 		else					/* (7) */
 			newstate = ND6_LLINFO_STALE;
 	}
 
 	if (do_update) {
 		/*
 		 * Update the state of the neighbor cache.
 		 */
 		ln->ln_state = newstate;
 
 		if (ln->ln_state == ND6_LLINFO_STALE) {
 			/*
 			 * XXX: since nd6_output() below will cause
 			 * state tansition to DELAY and reset the timer,
 			 * we must set the timer now, although it is actually
 			 * meaningless.
 			 */
 			nd6_llinfo_settimer_locked(ln, (long)V_nd6_gctimer * hz);
 
 			if (ln->la_hold) {
 				struct mbuf *m_hold, *m_hold_next;
 
 				/*
 				 * reset the la_hold in advance, to explicitly
 				 * prevent a la_hold lookup in nd6_output()
 				 * (wouldn't happen, though...)
 				 */
 				for (m_hold = ln->la_hold, ln->la_hold = NULL;
 				    m_hold; m_hold = m_hold_next) {
 					m_hold_next = m_hold->m_nextpkt;
 					m_hold->m_nextpkt = NULL;
 
 					/*
 					 * we assume ifp is not a p2p here, so
 					 * just set the 2nd argument as the
 					 * 1st one.
 					 */
 					nd6_output_lle(ifp, ifp, m_hold, L3_ADDR_SIN6(ln), NULL, ln, &chain);
 				}
 				/*
 				 * If we have mbufs in the chain we need to do
 				 * deferred transmit. Copy the address from the
 				 * llentry before dropping the lock down below.
 				 */
 				if (chain != NULL)
 					memcpy(&sin6, L3_ADDR_SIN6(ln), sizeof(sin6));
 			}
 		} else if (ln->ln_state == ND6_LLINFO_INCOMPLETE) {
 			/* probe right away */
 			nd6_llinfo_settimer_locked((void *)ln, 0);
 		}
 	}
 
 	/*
 	 * ICMP6 type dependent behavior.
 	 *
 	 * NS: clear IsRouter if new entry
 	 * RS: clear IsRouter
 	 * RA: set IsRouter if there's lladdr
 	 * redir: clear IsRouter if new entry
 	 *
 	 * RA case, (1):
 	 * The spec says that we must set IsRouter in the following cases:
 	 * - If lladdr exist, set IsRouter.  This means (1-5).
 	 * - If it is old entry (!newentry), set IsRouter.  This means (7).
 	 * So, based on the spec, in (1-5) and (7) cases we must set IsRouter.
 	 * A quetion arises for (1) case.  (1) case has no lladdr in the
 	 * neighbor cache, this is similar to (6).
 	 * This case is rare but we figured that we MUST NOT set IsRouter.
 	 *
 	 * newentry olladdr  lladdr  llchange	    NS  RS  RA	redir
 	 *							D R
 	 *	0	n	n	--	(1)	c   ?     s
 	 *	0	y	n	--	(2)	c   s     s
 	 *	0	n	y	--	(3)	c   s     s
 	 *	0	y	y	n	(4)	c   s     s
 	 *	0	y	y	y	(5)	c   s     s
 	 *	1	--	n	--	(6) c	c	c s
 	 *	1	--	y	--	(7) c	c   s	c s
 	 *
 	 *					(c=clear s=set)
 	 */
 	switch (type & 0xff) {
 	case ND_NEIGHBOR_SOLICIT:
 		/*
 		 * New entry must have is_router flag cleared.
 		 */
 		if (is_newentry)	/* (6-7) */
 			ln->ln_router = 0;
 		break;
 	case ND_REDIRECT:
 		/*
 		 * If the icmp is a redirect to a better router, always set the
 		 * is_router flag.  Otherwise, if the entry is newly created,
 		 * clear the flag.  [RFC 2461, sec 8.3]
 		 */
 		if (code == ND_REDIRECT_ROUTER)
 			ln->ln_router = 1;
 		else if (is_newentry) /* (6-7) */
 			ln->ln_router = 0;
 		break;
 	case ND_ROUTER_SOLICIT:
 		/*
 		 * is_router flag must always be cleared.
 		 */
 		ln->ln_router = 0;
 		break;
 	case ND_ROUTER_ADVERT:
 		/*
 		 * Mark an entry with lladdr as a router.
 		 */
 		if ((!is_newentry && (olladdr || lladdr)) ||	/* (2-5) */
 		    (is_newentry && lladdr)) {			/* (7) */
 			ln->ln_router = 1;
 		}
 		break;
 	}
 
 	if (ln != NULL) {
 		static_route = (ln->la_flags & LLE_STATIC);
 		router = ln->ln_router;
 
 		if (flags & ND6_EXCLUSIVE)
 			LLE_WUNLOCK(ln);
 		else
 			LLE_RUNLOCK(ln);
 		if (static_route)
 			ln = NULL;
 	}
 	if (chain)
 		nd6_output_flush(ifp, ifp, chain, &sin6, NULL);
 	
 	/*
 	 * When the link-layer address of a router changes, select the
 	 * best router again.  In particular, when the neighbor entry is newly
 	 * created, it might affect the selection policy.
 	 * Question: can we restrict the first condition to the "is_newentry"
 	 * case?
 	 * XXX: when we hear an RA from a new router with the link-layer
 	 * address option, defrouter_select() is called twice, since
 	 * defrtrlist_update called the function as well.  However, I believe
 	 * we can compromise the overhead, since it only happens the first
 	 * time.
 	 * XXX: although defrouter_select() should not have a bad effect
 	 * for those are not autoconfigured hosts, we explicitly avoid such
 	 * cases for safety.
 	 */
 	if (do_update && router &&
 	    ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) {
 		/*
 		 * guaranteed recursion
 		 */
 		defrouter_select();
 	}
 	
 	return (ln);
 done:	
 	if (ln != NULL) {
 		if (flags & ND6_EXCLUSIVE)
 			LLE_WUNLOCK(ln);
 		else
 			LLE_RUNLOCK(ln);
 		if (static_route)
 			ln = NULL;
 	}
 	return (ln);
 }
 
 static void
 nd6_slowtimo(void *arg)
 {
 	CURVNET_SET((struct vnet *) arg);
 	struct nd_ifinfo *nd6if;
 	struct ifnet *ifp;
 
 	callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz,
 	    nd6_slowtimo, curvnet);
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_list) {
 		if (ifp->if_afdata[AF_INET6] == NULL)
 			continue;
 		nd6if = ND_IFINFO(ifp);
 		if (nd6if->basereachable && /* already initialized */
 		    (nd6if->recalctm -= ND6_SLOWTIMER_INTERVAL) <= 0) {
 			/*
 			 * Since reachable time rarely changes by router
 			 * advertisements, we SHOULD insure that a new random
 			 * value gets recomputed at least once every few hours.
 			 * (RFC 2461, 6.3.4)
 			 */
 			nd6if->recalctm = V_nd6_recalc_reachtm_interval;
 			nd6if->reachable = ND_COMPUTE_RTIME(nd6if->basereachable);
 		}
 	}
 	IFNET_RUNLOCK_NOSLEEP();
 	CURVNET_RESTORE();
 }
 
 /*
  * IPv6 packet output - light version.
  * Checks if destination LLE exists and is in proper state
  * (e.g no modification required). If not true, fall back to
  * "heavy" version.
  */
 int
 nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m,
     struct sockaddr_in6 *dst, struct rtentry *rt0)
 {
 	struct llentry *ln = NULL;
 	int error = 0;
 
 	/* discard the packet if IPv6 operation is disabled on the interface */
 	if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) {
 		m_freem(m);
 		return (ENETDOWN); /* better error? */
 	}
 
 	if (IN6_IS_ADDR_MULTICAST(&dst->sin6_addr))
 		goto sendpkt;
 
 	if (nd6_need_cache(ifp) == 0)
 		goto sendpkt;
 
 	IF_AFDATA_RLOCK(ifp);
 	ln = nd6_lookup(&dst->sin6_addr, 0, ifp);
 	IF_AFDATA_RUNLOCK(ifp);
 
 	/*
 	 * Perform fast path for the following cases:
 	 * 1) lle state is REACHABLE
 	 * 2) lle state is DELAY (NS message sentNS message sent)
 	 *
 	 * Every other case involves lle modification, so we handle
 	 * them separately.
 	 */
 	if (ln == NULL || (ln->ln_state != ND6_LLINFO_REACHABLE &&
 	    ln->ln_state != ND6_LLINFO_DELAY)) {
 		/* Fall back to slow processing path */
 		if (ln != NULL)
 			LLE_RUNLOCK(ln);
 		return (nd6_output_lle(ifp, origifp, m, dst, rt0, NULL, NULL));
 	}
 
 sendpkt:
 	if (ln != NULL)
 		LLE_RUNLOCK(ln);
 
 #ifdef MAC
 	mac_netinet6_nd6_send(ifp, m);
 #endif
 
 	/*
 	 * If called from nd6_ns_output() (NS), nd6_na_output() (NA),
 	 * icmp6_redirect_output() (REDIRECT) or from rip6_output() (RS, RA
 	 * as handled by rtsol and rtadvd), mbufs will be tagged for SeND
 	 * to be diverted to user space.  When re-injected into the kernel,
 	 * send_output() will directly dispatch them to the outgoing interface.
 	 */
 	if (send_sendso_input_hook != NULL) {
 		struct m_tag *mtag;
 		struct ip6_hdr *ip6;
 		int ip6len;
 		mtag = m_tag_find(m, PACKET_TAG_ND_OUTGOING, NULL);
 		if (mtag != NULL) {
 			ip6 = mtod(m, struct ip6_hdr *);
 			ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen);
 			/* Use the SEND socket */
 			error = send_sendso_input_hook(m, ifp, SND_OUT,
 			    ip6len);
 			/* -1 == no app on SEND socket */
 			if (error == 0 || error != -1)
 			    return (error);
 		}
 	}
 
 	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 	IP_PROBE(send, NULL, NULL, mtod(m, struct ip6_hdr *), ifp, NULL,
 	    mtod(m, struct ip6_hdr *));
 
 	if ((ifp->if_flags & IFF_LOOPBACK) == 0)
 		origifp = ifp;
 	
 	error = (*ifp->if_output)(origifp, m, (struct sockaddr *)dst, NULL);
 	return (error);
 }
 
 
 /*
  * Output IPv6 packet - heavy version.
  * Function assume that either
  * 1) destination LLE does not exist, is invalid or stale, so
  *   ND6_EXCLUSIVE lock needs to be acquired
  * 2) destination lle is provided (with ND6_EXCLUSIVE lock),
  *   in that case packets are queued in &chain.
  *
  */
 int
 nd6_output_lle(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m,
     struct sockaddr_in6 *dst, struct rtentry *rt0, struct llentry *lle,
 	struct mbuf **chain)
 {
 	struct m_tag *mtag;
 	struct ip6_hdr *ip6;
 	int error = 0;
 	int flags = 0;
 	int has_lle = 0;
 	int ip6len;
 
 #ifdef INVARIANTS
 	if (lle != NULL) {
 		
 		LLE_WLOCK_ASSERT(lle);
 
 		KASSERT(chain != NULL, (" lle locked but no mbuf chain pointer passed"));
 	}
 #endif
 	KASSERT(m != NULL, ("NULL mbuf, nothing to send"));
 	/* discard the packet if IPv6 operation is disabled on the interface */
 	if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) {
 		m_freem(m);
 		return (ENETDOWN); /* better error? */
 	}
 
 	if (lle != NULL)
 		has_lle = 1;
 
 	if (IN6_IS_ADDR_MULTICAST(&dst->sin6_addr))
 		goto sendpkt;
 
 	if (nd6_need_cache(ifp) == 0)
 		goto sendpkt;
 
 	/*
 	 * Address resolution or Neighbor Unreachability Detection
 	 * for the next hop.
 	 * At this point, the destination of the packet must be a unicast
 	 * or an anycast address(i.e. not a multicast).
 	 */
 	if (lle == NULL) {
 		IF_AFDATA_RLOCK(ifp);
 		lle = nd6_lookup(&dst->sin6_addr, ND6_EXCLUSIVE, ifp);
 		IF_AFDATA_RUNLOCK(ifp);
 		if ((lle == NULL) && nd6_is_addr_neighbor(dst, ifp))  {
 			/*
 			 * Since nd6_is_addr_neighbor() internally calls nd6_lookup(),
 			 * the condition below is not very efficient.  But we believe
 			 * it is tolerable, because this should be a rare case.
 			 */
 			flags = ND6_CREATE | ND6_EXCLUSIVE;
 			IF_AFDATA_LOCK(ifp);
 			lle = nd6_lookup(&dst->sin6_addr, flags, ifp);
 			IF_AFDATA_UNLOCK(ifp);
 		}
 	} 
 	if (lle == NULL) {
 		if ((ifp->if_flags & IFF_POINTOPOINT) == 0 &&
 		    !(ND_IFINFO(ifp)->flags & ND6_IFF_PERFORMNUD)) {
 			char ip6buf[INET6_ADDRSTRLEN];
 			log(LOG_DEBUG,
 			    "nd6_output: can't allocate llinfo for %s "
 			    "(ln=%p)\n",
 			    ip6_sprintf(ip6buf, &dst->sin6_addr), lle);
 			m_freem(m);
 			return (ENOBUFS);
 		}
 		goto sendpkt;	/* send anyway */
 	}
 
 	LLE_WLOCK_ASSERT(lle);
 
 	/* We don't have to do link-layer address resolution on a p2p link. */
 	if ((ifp->if_flags & IFF_POINTOPOINT) != 0 &&
 	    lle->ln_state < ND6_LLINFO_REACHABLE) {
 		lle->ln_state = ND6_LLINFO_STALE;
 		nd6_llinfo_settimer_locked(lle, (long)V_nd6_gctimer * hz);
 	}
 
 	/*
 	 * The first time we send a packet to a neighbor whose entry is
 	 * STALE, we have to change the state to DELAY and a sets a timer to
 	 * expire in DELAY_FIRST_PROBE_TIME seconds to ensure do
 	 * neighbor unreachability detection on expiration.
 	 * (RFC 2461 7.3.3)
 	 */
 	if (lle->ln_state == ND6_LLINFO_STALE) {
 		lle->la_asked = 0;
 		lle->ln_state = ND6_LLINFO_DELAY;
 		nd6_llinfo_settimer_locked(lle, (long)V_nd6_delay * hz);
 	}
 
 	/*
 	 * If the neighbor cache entry has a state other than INCOMPLETE
 	 * (i.e. its link-layer address is already resolved), just
 	 * send the packet.
 	 */
 	if (lle->ln_state > ND6_LLINFO_INCOMPLETE)
 		goto sendpkt;
 
 	/*
 	 * There is a neighbor cache entry, but no ethernet address
 	 * response yet.  Append this latest packet to the end of the
 	 * packet queue in the mbuf, unless the number of the packet
 	 * does not exceed nd6_maxqueuelen.  When it exceeds nd6_maxqueuelen,
 	 * the oldest packet in the queue will be removed.
 	 */
 	if (lle->ln_state == ND6_LLINFO_NOSTATE)
 		lle->ln_state = ND6_LLINFO_INCOMPLETE;
 
 	if (lle->la_hold != NULL) {
 		struct mbuf *m_hold;
 		int i;
 		
 		i = 0;
 		for (m_hold = lle->la_hold; m_hold; m_hold = m_hold->m_nextpkt){
 			i++;
 			if (m_hold->m_nextpkt == NULL) {
 				m_hold->m_nextpkt = m;
 				break;
 			}
 		}
 		while (i >= V_nd6_maxqueuelen) {
 			m_hold = lle->la_hold;
 			lle->la_hold = lle->la_hold->m_nextpkt;
 			m_freem(m_hold);
 			i--;
 		}
 	} else {
 		lle->la_hold = m;
 	}
 
 	/*
 	 * If there has been no NS for the neighbor after entering the
 	 * INCOMPLETE state, send the first solicitation.
 	 */
 	if (!ND6_LLINFO_PERMANENT(lle) && lle->la_asked == 0) {
 		lle->la_asked++;
 		
 		nd6_llinfo_settimer_locked(lle,
 		    (long)ND_IFINFO(ifp)->retrans * hz / 1000);
 		LLE_WUNLOCK(lle);
 		nd6_ns_output(ifp, NULL, &dst->sin6_addr, lle, 0);
 		if (has_lle != 0)
 			LLE_WLOCK(lle);
 	} else if (has_lle == 0) {
 		/*
 		 * We did the lookup (no lle arg) so we
 		 * need to do the unlock here.
 		 */
 		LLE_WUNLOCK(lle);
 	}
 
 	return (0);
 
   sendpkt:
 	/*
 	 * ln is valid and the caller did not pass in 
 	 * an llentry
 	 */
 	if (lle != NULL && has_lle == 0)
 		LLE_WUNLOCK(lle);
 
 #ifdef MAC
 	mac_netinet6_nd6_send(ifp, m);
 #endif
 
 	/*
 	 * If called from nd6_ns_output() (NS), nd6_na_output() (NA),
 	 * icmp6_redirect_output() (REDIRECT) or from rip6_output() (RS, RA
 	 * as handled by rtsol and rtadvd), mbufs will be tagged for SeND
 	 * to be diverted to user space.  When re-injected into the kernel,
 	 * send_output() will directly dispatch them to the outgoing interface.
 	 */
 	if (send_sendso_input_hook != NULL) {
 		mtag = m_tag_find(m, PACKET_TAG_ND_OUTGOING, NULL);
 		if (mtag != NULL) {
 			ip6 = mtod(m, struct ip6_hdr *);
 			ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen);
 			/* Use the SEND socket */
 			error = send_sendso_input_hook(m, ifp, SND_OUT,
 			    ip6len);
 			/* -1 == no app on SEND socket */
 			if (error == 0 || error != -1)
 			    return (error);
 		}
 	}
 
 	/*
 	 * We were passed in a pointer to an lle with the lock held 
 	 * this means that we can't call if_output as we will
 	 * recurse on the lle lock - so what we do is we create
 	 * a list of mbufs to send and transmit them in the caller
 	 * after the lock is dropped
 	 */
 	if (has_lle != 0) {
 		if (*chain == NULL)
 			*chain = m;
 		else {
 			struct mbuf *mb;
 
 			/*
 			 * append mbuf to end of deferred chain
 			 */
 			mb = *chain;
 			while (mb->m_nextpkt != NULL)
 				mb = mb->m_nextpkt;
 			mb->m_nextpkt = m;
 		}
 		return (error);
 	}
 	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 	IP_PROBE(send, NULL, NULL, mtod(m, struct ip6_hdr *), ifp, NULL,
 	    mtod(m, struct ip6_hdr *));
 
 	if ((ifp->if_flags & IFF_LOOPBACK) == 0)
 		origifp = ifp;
 
 	error = (*ifp->if_output)(origifp, m, (struct sockaddr *)dst, NULL);
 	return (error);
 }
 
 
 int
 nd6_output_flush(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *chain,
     struct sockaddr_in6 *dst, struct route *ro)
 {
 	struct mbuf *m, *m_head;
 	struct ifnet *outifp;
 	int error = 0;
 
 	m_head = chain;
 	if ((ifp->if_flags & IFF_LOOPBACK) != 0)
 		outifp = origifp;
 	else
 		outifp = ifp;
 	
 	while (m_head) {
 		m = m_head;
 		m_head = m_head->m_nextpkt;
 		error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro);			       
 	}
 
 	/*
 	 * XXX
 	 * note that intermediate errors are blindly ignored - but this is 
 	 * the same convention as used with nd6_output when called by
 	 * nd6_cache_lladdr
 	 */
 	return (error);
 }	
 
 
 int
 nd6_need_cache(struct ifnet *ifp)
 {
 	/*
 	 * XXX: we currently do not make neighbor cache on any interface
 	 * other than ARCnet, Ethernet, FDDI and GIF.
 	 *
 	 * RFC2893 says:
 	 * - unidirectional tunnels needs no ND
 	 */
 	switch (ifp->if_type) {
 	case IFT_ARCNET:
 	case IFT_ETHER:
 	case IFT_FDDI:
 	case IFT_IEEE1394:
 #ifdef IFT_L2VLAN
 	case IFT_L2VLAN:
 #endif
 #ifdef IFT_IEEE80211
 	case IFT_IEEE80211:
 #endif
 	case IFT_INFINIBAND:
 	case IFT_GIF:		/* XXX need more cases? */
 	case IFT_PPP:
 	case IFT_TUNNEL:
 	case IFT_BRIDGE:
 	case IFT_PROPVIRTUAL:
 		return (1);
 	default:
 		return (0);
 	}
 }
 
 /*
  * Add pernament ND6 link-layer record for given
  * interface address.
  *
  * Very similar to IPv4 arp_ifinit(), but:
  * 1) IPv6 DAD is performed in different place
  * 2) It is called by IPv6 protocol stack in contrast to
  * arp_ifinit() which is typically called in SIOCSIFADDR
  * driver ioctl handler.
  *
  */
 int
 nd6_add_ifa_lle(struct in6_ifaddr *ia)
 {
 	struct ifnet *ifp;
 	struct llentry *ln;
 
 	ifp = ia->ia_ifa.ifa_ifp;
 	IF_AFDATA_LOCK(ifp);
 	ia->ia_ifa.ifa_rtrequest = nd6_rtrequest;
 	ln = lla_lookup(LLTABLE6(ifp), (LLE_CREATE | LLE_IFADDR |
 	    LLE_EXCLUSIVE), (struct sockaddr *)&ia->ia_addr);
 	IF_AFDATA_UNLOCK(ifp);
 	if (ln != NULL) {
 		ln->la_expire = 0;  /* for IPv6 this means permanent */
 		ln->ln_state = ND6_LLINFO_REACHABLE;
 		LLE_WUNLOCK(ln);
 		in6_newaddrmsg(ia, RTM_ADD);
 		return (0);
 	}
 
 	return (ENOBUFS);
 }
 
 /*
  * Removes ALL lle records for interface address prefix.
  * XXXME: That's probably not we really want to do, we need
  * to remove address record only and keep other records
  * until we determine if given prefix is really going 
  * to be removed.
  */
 void
 nd6_rem_ifa_lle(struct in6_ifaddr *ia)
 {
 	struct sockaddr_in6 mask, addr;
 	struct ifnet *ifp;
 
 	in6_newaddrmsg(ia, RTM_DELETE);
 
 	ifp = ia->ia_ifa.ifa_ifp;
 	memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr));
 	memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask));
 	lltable_prefix_free(AF_INET6, (struct sockaddr *)&addr,
 	            (struct sockaddr *)&mask, LLE_STATIC);
 }
 
 /*
  * the callers of this function need to be re-worked to drop
  * the lle lock, drop here for now
  */
 int
 nd6_storelladdr(struct ifnet *ifp, struct mbuf *m,
     const struct sockaddr *dst, u_char *desten, struct llentry **lle)
 {
 	struct llentry *ln;
 
 	*lle = NULL;
 	IF_AFDATA_UNLOCK_ASSERT(ifp);
 	if (m != NULL && m->m_flags & M_MCAST) {
 		int i;
 
 		switch (ifp->if_type) {
 		case IFT_ETHER:
 		case IFT_FDDI:
 #ifdef IFT_L2VLAN
 		case IFT_L2VLAN:
 #endif
 #ifdef IFT_IEEE80211
 		case IFT_IEEE80211:
 #endif
 		case IFT_BRIDGE:
 		case IFT_ISO88025:
 			ETHER_MAP_IPV6_MULTICAST(&SIN6(dst)->sin6_addr,
 						 desten);
 			return (0);
 		case IFT_IEEE1394:
 			/*
 			 * netbsd can use if_broadcastaddr, but we don't do so
 			 * to reduce # of ifdef.
 			 */
 			for (i = 0; i < ifp->if_addrlen; i++)
 				desten[i] = ~0;
 			return (0);
 		case IFT_ARCNET:
 			*desten = 0;
 			return (0);
 		default:
 			m_freem(m);
 			return (EAFNOSUPPORT);
 		}
 	}
 
 
 	/*
 	 * the entry should have been created in nd6_store_lladdr
 	 */
 	IF_AFDATA_RLOCK(ifp);
 	ln = lla_lookup(LLTABLE6(ifp), 0, dst);
 	IF_AFDATA_RUNLOCK(ifp);
 	if ((ln == NULL) || !(ln->la_flags & LLE_VALID)) {
 		if (ln != NULL)
 			LLE_RUNLOCK(ln);
 		/* this could happen, if we could not allocate memory */
 		m_freem(m);
 		return (1);
 	}
 
 	bcopy(&ln->ll_addr, desten, ifp->if_addrlen);
 	*lle = ln;
 	LLE_RUNLOCK(ln);
 	/*
 	 * A *small* use after free race exists here
 	 */
 	return (0);
 }
 
 static void 
 clear_llinfo_pqueue(struct llentry *ln)
 {
 	struct mbuf *m_hold, *m_hold_next;
 
 	for (m_hold = ln->la_hold; m_hold; m_hold = m_hold_next) {
 		m_hold_next = m_hold->m_nextpkt;
 		m_freem(m_hold);
 	}
 
 	ln->la_hold = NULL;
 	return;
 }
 
 static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS);
 static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS);
 #ifdef SYSCTL_DECL
 SYSCTL_DECL(_net_inet6_icmp6);
 #endif
 SYSCTL_NODE(_net_inet6_icmp6, ICMPV6CTL_ND6_DRLIST, nd6_drlist,
 	CTLFLAG_RD, nd6_sysctl_drlist, "");
 SYSCTL_NODE(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist,
 	CTLFLAG_RD, nd6_sysctl_prlist, "");
-SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen,
-	CTLFLAG_RW, &VNET_NAME(nd6_maxqueuelen), 1, "");
-SYSCTL_VNET_INT(_net_inet6_icmp6, OID_AUTO, nd6_gctimer,
-	CTLFLAG_RW, &VNET_NAME(nd6_gctimer), (60 * 60 * 24), "");
+SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxqueuelen), 1, "");
+SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, nd6_gctimer,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_gctimer), (60 * 60 * 24), "");
 
 static int
 nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS)
 {
 	struct in6_defrouter d;
 	struct nd_defrouter *dr;
 	int error;
 
 	if (req->newptr)
 		return (EPERM);
 
 	bzero(&d, sizeof(d));
 	d.rtaddr.sin6_family = AF_INET6;
 	d.rtaddr.sin6_len = sizeof(d.rtaddr);
 
 	/*
 	 * XXX locking
 	 */
 	TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) {
 		d.rtaddr.sin6_addr = dr->rtaddr;
 		error = sa6_recoverscope(&d.rtaddr);
 		if (error != 0)
 			return (error);
 		d.flags = dr->flags;
 		d.rtlifetime = dr->rtlifetime;
 		d.expire = dr->expire + (time_second - time_uptime);
 		d.if_index = dr->ifp->if_index;
 		error = SYSCTL_OUT(req, &d, sizeof(d));
 		if (error != 0)
 			return (error);
 	}
 	return (0);
 }
 
 static int
 nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS)
 {
 	struct in6_prefix p;
 	struct sockaddr_in6 s6;
 	struct nd_prefix *pr;
 	struct nd_pfxrouter *pfr;
 	time_t maxexpire;
 	int error;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	if (req->newptr)
 		return (EPERM);
 
 	bzero(&p, sizeof(p));
 	p.origin = PR_ORIG_RA;
 	bzero(&s6, sizeof(s6));
 	s6.sin6_family = AF_INET6;
 	s6.sin6_len = sizeof(s6);
 
 	/*
 	 * XXX locking
 	 */
 	LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 		p.prefix = pr->ndpr_prefix;
 		if (sa6_recoverscope(&p.prefix)) {
 			log(LOG_ERR, "scope error in prefix list (%s)\n",
 			    ip6_sprintf(ip6buf, &p.prefix.sin6_addr));
 			/* XXX: press on... */
 		}
 		p.raflags = pr->ndpr_raf;
 		p.prefixlen = pr->ndpr_plen;
 		p.vltime = pr->ndpr_vltime;
 		p.pltime = pr->ndpr_pltime;
 		p.if_index = pr->ndpr_ifp->if_index;
 		if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME)
 			p.expire = 0;
 		else {
 			/* XXX: we assume time_t is signed. */
 			maxexpire = (-1) &
 			    ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1));
 			if (pr->ndpr_vltime < maxexpire - pr->ndpr_lastupdate)
 				p.expire = pr->ndpr_lastupdate +
 				    pr->ndpr_vltime +
 				    (time_second - time_uptime);
 			else
 				p.expire = maxexpire;
 		}
 		p.refcnt = pr->ndpr_refcnt;
 		p.flags = pr->ndpr_stateflags;
 		p.advrtrs = 0;
 		LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry)
 			p.advrtrs++;
 		error = SYSCTL_OUT(req, &p, sizeof(p));
 		if (error != 0)
 			return (error);
 		LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) {
 			s6.sin6_addr = pfr->router->rtaddr;
 			if (sa6_recoverscope(&s6))
 				log(LOG_ERR,
 				    "scope error in prefix list (%s)\n",
 				    ip6_sprintf(ip6buf, &pfr->router->rtaddr));
 			error = SYSCTL_OUT(req, &s6, sizeof(s6));
 			if (error != 0)
 				return (error);
 		}
 	}
 	return (0);
 }
Index: user/dchagin/lemul/sys/netinet6/scope6.c
===================================================================
--- user/dchagin/lemul/sys/netinet6/scope6.c	(revision 274240)
+++ user/dchagin/lemul/sys/netinet6/scope6.c	(revision 274241)
@@ -1,535 +1,535 @@
 /*-
  * Copyright (C) 2000 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: scope6.c,v 1.10 2000/07/24 13:29:31 itojun Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/systm.h>
 #include <sys/queue.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 
 #include <netinet/ip6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 
 #ifdef ENABLE_DEFAULT_SCOPE
 VNET_DEFINE(int, ip6_use_defzone) = 1;
 #else
 VNET_DEFINE(int, ip6_use_defzone) = 0;
 #endif
 VNET_DEFINE(int, deembed_scopeid) = 1;
 SYSCTL_DECL(_net_inet6_ip6);
-SYSCTL_VNET_INT(_net_inet6_ip6, OID_AUTO, deembed_scopeid, CTLFLAG_RW,
+SYSCTL_INT(_net_inet6_ip6, OID_AUTO, deembed_scopeid, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(deembed_scopeid), 0,
     "Extract embedded zone ID and set it to sin6_scope_id in sockaddr_in6.");
 
 /*
  * The scope6_lock protects the global sid default stored in
  * sid_default below.
  */
 static struct mtx scope6_lock;
 #define	SCOPE6_LOCK_INIT()	mtx_init(&scope6_lock, "scope6_lock", NULL, MTX_DEF)
 #define	SCOPE6_LOCK()		mtx_lock(&scope6_lock)
 #define	SCOPE6_UNLOCK()		mtx_unlock(&scope6_lock)
 #define	SCOPE6_LOCK_ASSERT()	mtx_assert(&scope6_lock, MA_OWNED)
 
 static VNET_DEFINE(struct scope6_id, sid_default);
 #define	V_sid_default			VNET(sid_default)
 
 #define SID(ifp) \
 	(((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->scope6_id)
 
 static int	scope6_get(struct ifnet *, struct scope6_id *);
 static int	scope6_set(struct ifnet *, struct scope6_id *);
 
 void
 scope6_init(void)
 {
 
 	bzero(&V_sid_default, sizeof(V_sid_default));
 
 	if (!IS_DEFAULT_VNET(curvnet))
 		return;
 
 	SCOPE6_LOCK_INIT();
 }
 
 struct scope6_id *
 scope6_ifattach(struct ifnet *ifp)
 {
 	struct scope6_id *sid;
 
 	sid = malloc(sizeof(*sid), M_IFADDR, M_WAITOK | M_ZERO);
 	/*
 	 * XXX: IPV6_ADDR_SCOPE_xxx macros are not standard.
 	 * Should we rather hardcode here?
 	 */
 	sid->s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = ifp->if_index;
 	sid->s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = ifp->if_index;
 	return (sid);
 }
 
 void
 scope6_ifdetach(struct scope6_id *sid)
 {
 
 	free(sid, M_IFADDR);
 }
 
 int
 scope6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp)
 {
 	struct in6_ifreq *ifr;
 
 	if (ifp->if_afdata[AF_INET6] == NULL)
 		return (EPFNOSUPPORT);
 
 	ifr = (struct in6_ifreq *)data;
 	switch (cmd) {
 	case SIOCSSCOPE6:
 		return (scope6_set(ifp,
 		    (struct scope6_id *)ifr->ifr_ifru.ifru_scope_id));
 	case SIOCGSCOPE6:
 		return (scope6_get(ifp,
 		    (struct scope6_id *)ifr->ifr_ifru.ifru_scope_id));
 	case SIOCGSCOPE6DEF:
 		return (scope6_get_default(
 		    (struct scope6_id *)ifr->ifr_ifru.ifru_scope_id));
 	default:
 		return (EOPNOTSUPP);
 	}
 }
 
 static int
 scope6_set(struct ifnet *ifp, struct scope6_id *idlist)
 {
 	int i;
 	int error = 0;
 	struct scope6_id *sid = NULL;
 
 	IF_AFDATA_WLOCK(ifp);
 	sid = SID(ifp);
 
 	if (!sid) {	/* paranoid? */
 		IF_AFDATA_WUNLOCK(ifp);
 		return (EINVAL);
 	}
 
 	/*
 	 * XXX: We need more consistency checks of the relationship among
 	 * scopes (e.g. an organization should be larger than a site).
 	 */
 
 	/*
 	 * TODO(XXX): after setting, we should reflect the changes to
 	 * interface addresses, routing table entries, PCB entries...
 	 */
 
 	for (i = 0; i < 16; i++) {
 		if (idlist->s6id_list[i] &&
 		    idlist->s6id_list[i] != sid->s6id_list[i]) {
 			/*
 			 * An interface zone ID must be the corresponding
 			 * interface index by definition.
 			 */
 			if (i == IPV6_ADDR_SCOPE_INTFACELOCAL &&
 			    idlist->s6id_list[i] != ifp->if_index) {
 				IF_AFDATA_WUNLOCK(ifp);
 				return (EINVAL);
 			}
 
 			if (i == IPV6_ADDR_SCOPE_LINKLOCAL &&
 			    idlist->s6id_list[i] > V_if_index) {
 				/*
 				 * XXX: theoretically, there should be no
 				 * relationship between link IDs and interface
 				 * IDs, but we check the consistency for
 				 * safety in later use.
 				 */
 				IF_AFDATA_WUNLOCK(ifp);
 				return (EINVAL);
 			}
 
 			/*
 			 * XXX: we must need lots of work in this case,
 			 * but we simply set the new value in this initial
 			 * implementation.
 			 */
 			sid->s6id_list[i] = idlist->s6id_list[i];
 		}
 	}
 	IF_AFDATA_WUNLOCK(ifp);
 
 	return (error);
 }
 
 static int
 scope6_get(struct ifnet *ifp, struct scope6_id *idlist)
 {
 	struct scope6_id *sid;
 
 	/* We only need to lock the interface's afdata for SID() to work. */
 	IF_AFDATA_RLOCK(ifp);
 	sid = SID(ifp);
 	if (sid == NULL) {	/* paranoid? */
 		IF_AFDATA_RUNLOCK(ifp);
 		return (EINVAL);
 	}
 
 	*idlist = *sid;
 
 	IF_AFDATA_RUNLOCK(ifp);
 	return (0);
 }
 
 /*
  * Get a scope of the address. Node-local, link-local, site-local or global.
  */
 int
 in6_addrscope(const struct in6_addr *addr)
 {
 
 	if (IN6_IS_ADDR_MULTICAST(addr)) {
 		/*
 		 * Addresses with reserved value F must be treated as
 		 * global multicast addresses.
 		 */
 		if (IPV6_ADDR_MC_SCOPE(addr) == 0x0f)
 			return (IPV6_ADDR_SCOPE_GLOBAL);
 		return (IPV6_ADDR_MC_SCOPE(addr));
 	}
 	if (IN6_IS_ADDR_LINKLOCAL(addr) ||
 	    IN6_IS_ADDR_LOOPBACK(addr))
 		return (IPV6_ADDR_SCOPE_LINKLOCAL);
 	if (IN6_IS_ADDR_SITELOCAL(addr))
 		return (IPV6_ADDR_SCOPE_SITELOCAL);
 	return (IPV6_ADDR_SCOPE_GLOBAL);
 }
 
 /*
  * ifp - note that this might be NULL
  */
 
 void
 scope6_setdefault(struct ifnet *ifp)
 {
 
 	/*
 	 * Currently, this function just sets the default "interfaces"
 	 * and "links" according to the given interface.
 	 * We might eventually have to separate the notion of "link" from
 	 * "interface" and provide a user interface to set the default.
 	 */
 	SCOPE6_LOCK();
 	if (ifp) {
 		V_sid_default.s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] =
 			ifp->if_index;
 		V_sid_default.s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] =
 			ifp->if_index;
 	} else {
 		V_sid_default.s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = 0;
 		V_sid_default.s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = 0;
 	}
 	SCOPE6_UNLOCK();
 }
 
 int
 scope6_get_default(struct scope6_id *idlist)
 {
 
 	SCOPE6_LOCK();
 	*idlist = V_sid_default;
 	SCOPE6_UNLOCK();
 
 	return (0);
 }
 
 u_int32_t
 scope6_addr2default(struct in6_addr *addr)
 {
 	u_int32_t id;
 
 	/*
 	 * special case: The loopback address should be considered as
 	 * link-local, but there's no ambiguity in the syntax.
 	 */
 	if (IN6_IS_ADDR_LOOPBACK(addr))
 		return (0);
 
 	/*
 	 * XXX: 32-bit read is atomic on all our platforms, is it OK
 	 * not to lock here?
 	 */
 	SCOPE6_LOCK();
 	id = V_sid_default.s6id_list[in6_addrscope(addr)];
 	SCOPE6_UNLOCK();
 	return (id);
 }
 
 /*
  * Validate the specified scope zone ID in the sin6_scope_id field.  If the ID
  * is unspecified (=0), needs to be specified, and the default zone ID can be
  * used, the default value will be used.
  * This routine then generates the kernel-internal form: if the address scope
  * of is interface-local or link-local, embed the interface index in the
  * address.
  */
 int
 sa6_embedscope(struct sockaddr_in6 *sin6, int defaultok)
 {
 	u_int32_t zoneid;
 
 	if ((zoneid = sin6->sin6_scope_id) == 0 && defaultok)
 		zoneid = scope6_addr2default(&sin6->sin6_addr);
 
 	if (zoneid != 0 &&
 	    (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) ||
 	    IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr))) {
 		/*
 		 * At this moment, we only check interface-local and
 		 * link-local scope IDs, and use interface indices as the
 		 * zone IDs assuming a one-to-one mapping between interfaces
 		 * and links.
 		 */
 		if (V_if_index < zoneid || ifnet_byindex(zoneid) == NULL)
 			return (ENXIO);
 
 		/* XXX assignment to 16bit from 32bit variable */
 		sin6->sin6_addr.s6_addr16[1] = htons(zoneid & 0xffff);
 		sin6->sin6_scope_id = 0;
 	}
 
 	return 0;
 }
 
 /*
  * generate standard sockaddr_in6 from embedded form.
  */
 int
 sa6_recoverscope(struct sockaddr_in6 *sin6)
 {
 	char ip6buf[INET6_ADDRSTRLEN];
 	u_int32_t zoneid;
 
 	if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) ||
 	    IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr)) {
 		/*
 		 * KAME assumption: link id == interface id
 		 */
 		zoneid = ntohs(sin6->sin6_addr.s6_addr16[1]);
 		if (zoneid) {
 			/* sanity check */
 			if (V_if_index < zoneid)
 				return (ENXIO);
 #if 0
 			/* XXX: Disabled due to possible deadlock. */
 			if (!ifnet_byindex(zoneid))
 				return (ENXIO);
 #endif
 			if (sin6->sin6_scope_id != 0 &&
 			    zoneid != sin6->sin6_scope_id) {
 				log(LOG_NOTICE,
 				    "%s: embedded scope mismatch: %s%%%d. "
 				    "sin6_scope_id was overridden.", __func__,
 				    ip6_sprintf(ip6buf, &sin6->sin6_addr),
 				    sin6->sin6_scope_id);
 			}
 			sin6->sin6_addr.s6_addr16[1] = 0;
 			sin6->sin6_scope_id = zoneid;
 		}
 	}
 
 	return 0;
 }
 
 /*
  * Determine the appropriate scope zone ID for in6 and ifp.  If ret_id is
  * non NULL, it is set to the zone ID.  If the zone ID needs to be embedded
  * in the in6_addr structure, in6 will be modified.
  *
  * ret_id - unnecessary?
  */
 int
 in6_setscope(struct in6_addr *in6, struct ifnet *ifp, u_int32_t *ret_id)
 {
 	int scope;
 	u_int32_t zoneid = 0;
 	struct scope6_id *sid;
 
 	/*
 	 * special case: the loopback address can only belong to a loopback
 	 * interface.
 	 */
 	if (IN6_IS_ADDR_LOOPBACK(in6)) {
 		if (!(ifp->if_flags & IFF_LOOPBACK))
 			return (EINVAL);
 	} else {
 		scope = in6_addrscope(in6);
 		if (scope == IPV6_ADDR_SCOPE_INTFACELOCAL ||
 		    scope == IPV6_ADDR_SCOPE_LINKLOCAL) {
 			/*
 			 * Currently we use interface indeces as the
 			 * zone IDs for interface-local and link-local
 			 * scopes.
 			 */
 			zoneid = ifp->if_index;
 			in6->s6_addr16[1] = htons(zoneid & 0xffff); /* XXX */
 		} else if (scope != IPV6_ADDR_SCOPE_GLOBAL) {
 			IF_AFDATA_RLOCK(ifp);
 			sid = SID(ifp);
 			zoneid = sid->s6id_list[scope];
 			IF_AFDATA_RUNLOCK(ifp);
 		}
 	}
 
 	if (ret_id != NULL)
 		*ret_id = zoneid;
 
 	return (0);
 }
 
 /*
  * Just clear the embedded scope identifier.  Return 0 if the original address
  * is intact; return non 0 if the address is modified.
  */
 int
 in6_clearscope(struct in6_addr *in6)
 {
 	int modified = 0;
 
 	if (IN6_IS_SCOPE_LINKLOCAL(in6) || IN6_IS_ADDR_MC_INTFACELOCAL(in6)) {
 		if (in6->s6_addr16[1] != 0)
 			modified = 1;
 		in6->s6_addr16[1] = 0;
 	}
 
 	return (modified);
 }
 
 /*
  * Return the scope identifier or zero.
  */
 uint16_t
 in6_getscope(struct in6_addr *in6)
 {
 
 	if (IN6_IS_SCOPE_LINKLOCAL(in6) || IN6_IS_ADDR_MC_INTFACELOCAL(in6))
 		return (in6->s6_addr16[1]);
 
 	return (0);
 }
 
 /*
  * Return pointer to ifnet structure, corresponding to the zone id of
  * link-local scope.
  */
 struct ifnet*
 in6_getlinkifnet(uint32_t zoneid)
 {
 
 	return (ifnet_byindex((u_short)zoneid));
 }
 
 /*
  * Return zone id for the specified scope.
  */
 uint32_t
 in6_getscopezone(const struct ifnet *ifp, int scope)
 {
 
 	if (scope == IPV6_ADDR_SCOPE_INTFACELOCAL ||
 	    scope == IPV6_ADDR_SCOPE_LINKLOCAL)
 		return (ifp->if_index);
 	if (scope >= 0 && scope < IPV6_ADDR_SCOPES_COUNT)
 		return (SID(ifp)->s6id_list[scope]);
 	return (0);
 }
 
 /*
  * This function is for checking sockaddr_in6 structure passed
  * from the application level (usually).
  *
  * sin6_scope_id should be set for link-local unicast, link-local and
  * interface-local  multicast addresses.
  *
  * If it is zero, then look into default zone ids. If default zone id is
  * not set or disabled, then return error.
  */
 int
 sa6_checkzone(struct sockaddr_in6 *sa6)
 {
 	int scope;
 
 	scope = in6_addrscope(&sa6->sin6_addr);
 	if (scope == IPV6_ADDR_SCOPE_GLOBAL)
 		return (sa6->sin6_scope_id ? EINVAL: 0);
 	if (IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr) &&
 	    scope != IPV6_ADDR_SCOPE_LINKLOCAL &&
 	    scope != IPV6_ADDR_SCOPE_INTFACELOCAL) {
 		if (sa6->sin6_scope_id == 0 && V_ip6_use_defzone != 0)
 			sa6->sin6_scope_id = V_sid_default.s6id_list[scope];
 		return (0);
 	}
 	/*
 	 * Since ::1 address always configured on the lo0, we can
 	 * automatically set its zone id, when it is not specified.
 	 * Return error, when specified zone id doesn't match with
 	 * actual value.
 	 */
 	if (IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr)) {
 		if (sa6->sin6_scope_id == 0)
 			sa6->sin6_scope_id = in6_getscopezone(V_loif, scope);
 		else if (sa6->sin6_scope_id != in6_getscopezone(V_loif, scope))
 			return (EADDRNOTAVAIL);
 	}
 	/* XXX: we can validate sin6_scope_id here */
 	if (sa6->sin6_scope_id != 0)
 		return (0);
 	if (V_ip6_use_defzone != 0)
 		sa6->sin6_scope_id = V_sid_default.s6id_list[scope];
 	/* Return error if we can't determine zone id */
 	return (sa6->sin6_scope_id ? 0: EADDRNOTAVAIL);
 }
 
 
Index: user/dchagin/lemul/sys/netipsec/ipsec.c
===================================================================
--- user/dchagin/lemul/sys/netipsec/ipsec.c	(revision 274240)
+++ user/dchagin/lemul/sys/netipsec/ipsec.c	(revision 274241)
@@ -1,1743 +1,1743 @@
 /*	$FreeBSD$	*/
 /*	$KAME: ipsec.c,v 1.103 2001/05/24 07:14:18 sakane Exp $	*/
 
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * IPsec controller part.
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/domain.h>
 #include <sys/priv.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/errno.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 #include <sys/proc.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/in_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <netinet/tcp.h>
 #include <netinet/udp.h>
 
 #include <netinet/ip6.h>
 #ifdef INET6
 #include <netinet6/ip6_var.h>
 #endif
 #include <netinet/in_pcb.h>
 #ifdef INET6
 #include <netinet/icmp6.h>
 #endif
 
 #include <sys/types.h>
 #include <netipsec/ipsec.h>
 #ifdef INET6
 #include <netipsec/ipsec6.h>
 #endif
 #include <netipsec/ah_var.h>
 #include <netipsec/esp_var.h>
 #include <netipsec/ipcomp.h>		/*XXX*/
 #include <netipsec/ipcomp_var.h>
 
 #include <netipsec/key.h>
 #include <netipsec/keydb.h>
 #include <netipsec/key_debug.h>
 
 #include <netipsec/xform.h>
 
 #include <machine/in_cksum.h>
 
 #include <opencrypto/cryptodev.h>
 
 #ifdef IPSEC_DEBUG
 VNET_DEFINE(int, ipsec_debug) = 1;
 #else
 VNET_DEFINE(int, ipsec_debug) = 0;
 #endif
 
 /* NB: name changed so netstat doesn't use it. */
 VNET_PCPUSTAT_DEFINE(struct ipsecstat, ipsec4stat);
 VNET_PCPUSTAT_SYSINIT(ipsec4stat);
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(ipsec4stat);
 #endif /* VIMAGE */
 
 VNET_DEFINE(int, ip4_ah_offsetmask) = 0;	/* maybe IP_DF? */
 /* DF bit on encap. 0: clear 1: set 2: copy */
 VNET_DEFINE(int, ip4_ipsec_dfbit) = 0;
 VNET_DEFINE(int, ip4_esp_trans_deflev) = IPSEC_LEVEL_USE;
 VNET_DEFINE(int, ip4_esp_net_deflev) = IPSEC_LEVEL_USE;
 VNET_DEFINE(int, ip4_ah_trans_deflev) = IPSEC_LEVEL_USE;
 VNET_DEFINE(int, ip4_ah_net_deflev) = IPSEC_LEVEL_USE;
 VNET_DEFINE(struct secpolicy, ip4_def_policy);
 /* ECN ignore(-1)/forbidden(0)/allowed(1) */
 VNET_DEFINE(int, ip4_ipsec_ecn) = 0;
 VNET_DEFINE(int, ip4_esp_randpad) = -1;
 
 /*
  * Crypto support requirements:
  *
  *  1	require hardware support
  * -1	require software support
  *  0	take anything
  */
 VNET_DEFINE(int, crypto_support) = CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE;
 
 FEATURE(ipsec, "Internet Protocol Security (IPsec)");
 #ifdef IPSEC_NAT_T
 FEATURE(ipsec_natt, "UDP Encapsulation of IPsec ESP Packets ('NAT-T')");
 #endif
 
 SYSCTL_DECL(_net_inet_ipsec);
 
 /* net.inet.ipsec */
-SYSCTL_VNET_INT(_net_inet_ipsec, IPSECCTL_DEF_POLICY, def_policy,
-	CTLFLAG_RW, &VNET_NAME(ip4_def_policy).policy, 0,
+SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_POLICY, def_policy,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_def_policy).policy, 0,
 	"IPsec default policy.");
-SYSCTL_VNET_INT(_net_inet_ipsec, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev,
-	CTLFLAG_RW, &VNET_NAME(ip4_esp_trans_deflev), 0,
+SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_esp_trans_deflev), 0,
 	"Default ESP transport mode level");
-SYSCTL_VNET_INT(_net_inet_ipsec, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev,
-	CTLFLAG_RW, &VNET_NAME(ip4_esp_net_deflev), 0,
+SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_esp_net_deflev), 0,
 	"Default ESP tunnel mode level.");
-SYSCTL_VNET_INT(_net_inet_ipsec, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev,
-	CTLFLAG_RW, &VNET_NAME(ip4_ah_trans_deflev), 0,
+SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_ah_trans_deflev), 0,
 	"AH transfer mode default level.");
-SYSCTL_VNET_INT(_net_inet_ipsec, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev,
-	CTLFLAG_RW, &VNET_NAME(ip4_ah_net_deflev), 0,
+SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_ah_net_deflev), 0,
 	"AH tunnel mode default level.");
-SYSCTL_VNET_INT(_net_inet_ipsec, IPSECCTL_AH_CLEARTOS, ah_cleartos,
-	CTLFLAG_RW, &VNET_NAME(ah_cleartos), 0,
+SYSCTL_INT(_net_inet_ipsec, IPSECCTL_AH_CLEARTOS, ah_cleartos,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ah_cleartos), 0,
 	"If set clear type-of-service field when doing AH computation.");
-SYSCTL_VNET_INT(_net_inet_ipsec, IPSECCTL_AH_OFFSETMASK, ah_offsetmask,
-	CTLFLAG_RW, &VNET_NAME(ip4_ah_offsetmask), 0,
+SYSCTL_INT(_net_inet_ipsec, IPSECCTL_AH_OFFSETMASK, ah_offsetmask,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_ah_offsetmask), 0,
 	"If not set clear offset field mask when doing AH computation.");
-SYSCTL_VNET_INT(_net_inet_ipsec, IPSECCTL_DFBIT, dfbit,
-	CTLFLAG_RW, &VNET_NAME(ip4_ipsec_dfbit), 0,
+SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DFBIT, dfbit,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_ipsec_dfbit), 0,
 	"Do not fragment bit on encap.");
-SYSCTL_VNET_INT(_net_inet_ipsec, IPSECCTL_ECN, ecn,
-	CTLFLAG_RW, &VNET_NAME(ip4_ipsec_ecn), 0,
+SYSCTL_INT(_net_inet_ipsec, IPSECCTL_ECN, ecn,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_ipsec_ecn), 0,
 	"Explicit Congestion Notification handling.");
-SYSCTL_VNET_INT(_net_inet_ipsec, IPSECCTL_DEBUG, debug,
-	CTLFLAG_RW, &VNET_NAME(ipsec_debug), 0,
+SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEBUG, debug,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_debug), 0,
 	"Enable IPsec debugging output when set.");
-SYSCTL_VNET_INT(_net_inet_ipsec, OID_AUTO, crypto_support,
-	CTLFLAG_RW, &VNET_NAME(crypto_support), 0,
+SYSCTL_INT(_net_inet_ipsec, OID_AUTO, crypto_support,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(crypto_support), 0,
 	"Crypto driver selection.");
 SYSCTL_VNET_PCPUSTAT(_net_inet_ipsec, OID_AUTO, ipsecstats, struct ipsecstat,
     ipsec4stat, "IPsec IPv4 statistics.");
 
 #ifdef REGRESSION
 /*
  * When set to 1, IPsec will send packets with the same sequence number.
  * This allows to verify if the other side has proper replay attacks detection.
  */
 VNET_DEFINE(int, ipsec_replay) = 0;
-SYSCTL_VNET_INT(_net_inet_ipsec, OID_AUTO, test_replay,
-	CTLFLAG_RW, &VNET_NAME(ipsec_replay), 0,
+SYSCTL_INT(_net_inet_ipsec, OID_AUTO, test_replay,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_replay), 0,
 	"Emulate replay attack");
 /*
  * When set 1, IPsec will send packets with corrupted HMAC.
  * This allows to verify if the other side properly detects modified packets.
  */
 VNET_DEFINE(int, ipsec_integrity) = 0;
-SYSCTL_VNET_INT(_net_inet_ipsec, OID_AUTO, test_integrity,
-	CTLFLAG_RW, &VNET_NAME(ipsec_integrity), 0,
+SYSCTL_INT(_net_inet_ipsec, OID_AUTO, test_integrity,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_integrity), 0,
 	"Emulate man-in-the-middle attack");
 #endif
 
 #ifdef INET6 
 VNET_PCPUSTAT_DEFINE(struct ipsecstat, ipsec6stat);
 VNET_PCPUSTAT_SYSINIT(ipsec6stat);
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(ipsec6stat);
 #endif /* VIMAGE */
 
 VNET_DEFINE(int, ip6_esp_trans_deflev) = IPSEC_LEVEL_USE;
 VNET_DEFINE(int, ip6_esp_net_deflev) = IPSEC_LEVEL_USE;
 VNET_DEFINE(int, ip6_ah_trans_deflev) = IPSEC_LEVEL_USE;
 VNET_DEFINE(int, ip6_ah_net_deflev) = IPSEC_LEVEL_USE;
 VNET_DEFINE(int, ip6_ipsec_ecn) = 0;	/* ECN ignore(-1)/forbidden(0)/allowed(1) */
 
 SYSCTL_DECL(_net_inet6_ipsec6);
 
 /* net.inet6.ipsec6 */
-SYSCTL_VNET_INT(_net_inet6_ipsec6, IPSECCTL_DEF_POLICY, def_policy, CTLFLAG_RW,
-	&VNET_NAME(ip4_def_policy).policy, 0,
+SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_POLICY, def_policy,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_def_policy).policy, 0,
 	"IPsec default policy.");
-SYSCTL_VNET_INT(_net_inet6_ipsec6, IPSECCTL_DEF_ESP_TRANSLEV, 
-	esp_trans_deflev, CTLFLAG_RW, &VNET_NAME(ip6_esp_trans_deflev),	0,
+SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_esp_trans_deflev), 0,
 	"Default ESP transport mode level.");
-SYSCTL_VNET_INT(_net_inet6_ipsec6, IPSECCTL_DEF_ESP_NETLEV, 
-	esp_net_deflev, CTLFLAG_RW, &VNET_NAME(ip6_esp_net_deflev),	0,
+SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_esp_net_deflev), 0,
 	"Default ESP tunnel mode level.");
-SYSCTL_VNET_INT(_net_inet6_ipsec6, IPSECCTL_DEF_AH_TRANSLEV, 
-	ah_trans_deflev, CTLFLAG_RW, &VNET_NAME(ip6_ah_trans_deflev),	0,
+SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_ah_trans_deflev), 0,
 	"AH transfer mode default level.");
-SYSCTL_VNET_INT(_net_inet6_ipsec6, IPSECCTL_DEF_AH_NETLEV, 
-	ah_net_deflev, CTLFLAG_RW, &VNET_NAME(ip6_ah_net_deflev),	0,
+SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_ah_net_deflev), 0,
 	"AH tunnel mode default level.");
-SYSCTL_VNET_INT(_net_inet6_ipsec6, IPSECCTL_ECN,
-	ecn, CTLFLAG_RW, &VNET_NAME(ip6_ipsec_ecn),	0,
+SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_ECN, ecn,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_ipsec_ecn), 0,
 	"Explicit Congestion Notification handling.");
-SYSCTL_VNET_INT(_net_inet6_ipsec6, IPSECCTL_DEBUG, debug, CTLFLAG_RW,
-	&VNET_NAME(ipsec_debug), 0,
+SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEBUG, debug,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_debug), 0,
 	"Enable IPsec debugging output when set.");
 SYSCTL_VNET_PCPUSTAT(_net_inet6_ipsec6, IPSECCTL_STATS, ipsecstats,
     struct ipsecstat, ipsec6stat, "IPsec IPv6 statistics.");
 #endif /* INET6 */
 
 static int ipsec_setspidx_inpcb __P((struct mbuf *, struct inpcb *));
 static int ipsec_setspidx __P((struct mbuf *, struct secpolicyindex *, int));
 static void ipsec4_get_ulp __P((struct mbuf *m, struct secpolicyindex *, int));
 static int ipsec4_setspidx_ipaddr __P((struct mbuf *, struct secpolicyindex *));
 #ifdef INET6
 static void ipsec6_get_ulp __P((struct mbuf *m, struct secpolicyindex *, int));
 static int ipsec6_setspidx_ipaddr __P((struct mbuf *, struct secpolicyindex *));
 #endif
 static void ipsec_delpcbpolicy __P((struct inpcbpolicy *));
 static struct secpolicy *ipsec_deepcopy_policy __P((struct secpolicy *src));
 static void vshiftl __P((unsigned char *, int, int));
 
 MALLOC_DEFINE(M_IPSEC_INPCB, "inpcbpolicy", "inpcb-resident ipsec policy");
 
 /*
  * Return a held reference to the default SP.
  */
 static struct secpolicy *
 key_allocsp_default(const char* where, int tag)
 {
 	struct secpolicy *sp;
 
 	KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
 		printf("DP key_allocsp_default from %s:%u\n", where, tag));
 
 	sp = &V_ip4_def_policy;
 	if (sp->policy != IPSEC_POLICY_DISCARD &&
 	    sp->policy != IPSEC_POLICY_NONE) {
 		ipseclog((LOG_INFO, "fixed system default policy: %d->%d\n",
 		    sp->policy, IPSEC_POLICY_NONE));
 		sp->policy = IPSEC_POLICY_NONE;
 	}
 	key_addref(sp);
 
 	KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
 		printf("DP key_allocsp_default returns SP:%p (%u)\n",
 			sp, sp->refcnt));
 	return (sp);
 }
 #define	KEY_ALLOCSP_DEFAULT() \
 	key_allocsp_default(__FILE__, __LINE__)
 
 /*
  * For OUTBOUND packet having a socket. Searching SPD for packet,
  * and return a pointer to SP.
  * OUT:	NULL:	no apropreate SP found, the following value is set to error.
  *		0	: bypass
  *		EACCES	: discard packet.
  *		ENOENT	: ipsec_acquire() in progress, maybe.
  *		others	: error occured.
  *	others:	a pointer to SP
  *
  * NOTE: IPv6 mapped adddress concern is implemented here.
  */
 struct secpolicy *
 ipsec_getpolicy(struct tdb_ident *tdbi, u_int dir)
 {
 	struct secpolicy *sp;
 
 	IPSEC_ASSERT(tdbi != NULL, ("null tdbi"));
 	IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND,
 		("invalid direction %u", dir));
 
 	sp = KEY_ALLOCSP2(tdbi->spi, &tdbi->dst, tdbi->proto, dir);
 	if (sp == NULL)			/*XXX????*/
 		sp = KEY_ALLOCSP_DEFAULT();
 	IPSEC_ASSERT(sp != NULL, ("null SP"));
 	return (sp);
 }
 
 /*
  * For OUTBOUND packet having a socket. Searching SPD for packet,
  * and return a pointer to SP.
  * OUT:	NULL:	no apropreate SP found, the following value is set to error.
  *		0	: bypass
  *		EACCES	: discard packet.
  *		ENOENT	: ipsec_acquire() in progress, maybe.
  *		others	: error occured.
  *	others:	a pointer to SP
  *
  * NOTE: IPv6 mapped adddress concern is implemented here.
  */
 static struct secpolicy *
 ipsec_getpolicybysock(struct mbuf *m, u_int dir, struct inpcb *inp, int *error)
 {
 	struct inpcbpolicy *pcbsp;
 	struct secpolicy *currsp = NULL;	/* Policy on socket. */
 	struct secpolicy *sp;
 
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(inp != NULL, ("null inpcb"));
 	IPSEC_ASSERT(error != NULL, ("null error"));
 	IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND,
 		("invalid direction %u", dir));
 
 	/* Set spidx in pcb. */
 	*error = ipsec_setspidx_inpcb(m, inp);
 	if (*error)
 		return (NULL);
 
 	pcbsp = inp->inp_sp;
 	IPSEC_ASSERT(pcbsp != NULL, ("null pcbsp"));
 	switch (dir) {
 	case IPSEC_DIR_INBOUND:
 		currsp = pcbsp->sp_in;
 		break;
 	case IPSEC_DIR_OUTBOUND:
 		currsp = pcbsp->sp_out;
 		break;
 	}
 	IPSEC_ASSERT(currsp != NULL, ("null currsp"));
 
 	if (pcbsp->priv) {			/* When privilieged socket. */
 		switch (currsp->policy) {
 		case IPSEC_POLICY_BYPASS:
 		case IPSEC_POLICY_IPSEC:
 			key_addref(currsp);
 			sp = currsp;
 			break;
 
 		case IPSEC_POLICY_ENTRUST:
 			/* Look for a policy in SPD. */
 			sp = KEY_ALLOCSP(&currsp->spidx, dir);
 			if (sp == NULL)		/* No SP found. */
 				sp = KEY_ALLOCSP_DEFAULT();
 			break;
 
 		default:
 			ipseclog((LOG_ERR, "%s: Invalid policy for PCB %d\n",
 				__func__, currsp->policy));
 			*error = EINVAL;
 			return (NULL);
 		}
 	} else {				/* Unpriv, SPD has policy. */
 		sp = KEY_ALLOCSP(&currsp->spidx, dir);
 		if (sp == NULL) {		/* No SP found. */
 			switch (currsp->policy) {
 			case IPSEC_POLICY_BYPASS:
 				ipseclog((LOG_ERR, "%s: Illegal policy for "
 					"non-priviliged defined %d\n",
 					__func__, currsp->policy));
 				*error = EINVAL;
 				return (NULL);
 
 			case IPSEC_POLICY_ENTRUST:
 				sp = KEY_ALLOCSP_DEFAULT();
 				break;
 
 			case IPSEC_POLICY_IPSEC:
 				key_addref(currsp);
 				sp = currsp;
 				break;
 
 			default:
 				ipseclog((LOG_ERR, "%s: Invalid policy for "
 					"PCB %d\n", __func__, currsp->policy));
 				*error = EINVAL;
 				return (NULL);
 			}
 		}
 	}
 	IPSEC_ASSERT(sp != NULL,
 		("null SP (priv %u policy %u", pcbsp->priv, currsp->policy));
 	KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
 		printf("DP %s (priv %u policy %u) allocate SP:%p (refcnt %u)\n",
 			__func__, pcbsp->priv, currsp->policy, sp, sp->refcnt));
 	return (sp);
 }
 
 /*
  * For FORWADING packet or OUTBOUND without a socket. Searching SPD for packet,
  * and return a pointer to SP.
  * OUT:	positive: a pointer to the entry for security policy leaf matched.
  *	NULL:	no apropreate SP found, the following value is set to error.
  *		0	: bypass
  *		EACCES	: discard packet.
  *		ENOENT	: ipsec_acquire() in progress, maybe.
  *		others	: error occured.
  */
 struct secpolicy *
 ipsec_getpolicybyaddr(struct mbuf *m, u_int dir, int flag, int *error)
 {
 	struct secpolicyindex spidx;
 	struct secpolicy *sp;
 
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(error != NULL, ("null error"));
 	IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND,
 		("invalid direction %u", dir));
 
 	sp = NULL;
 	if (key_havesp(dir)) {
 		/* Make an index to look for a policy. */
 		*error = ipsec_setspidx(m, &spidx,
 					(flag & IP_FORWARDING) ? 0 : 1);
 		if (*error != 0) {
 			DPRINTF(("%s: setpidx failed, dir %u flag %u\n",
 				__func__, dir, flag));
 			return (NULL);
 		}
 		spidx.dir = dir;
 
 		sp = KEY_ALLOCSP(&spidx, dir);
 	}
 	if (sp == NULL)			/* No SP found, use system default. */
 		sp = KEY_ALLOCSP_DEFAULT();
 	IPSEC_ASSERT(sp != NULL, ("null SP"));
 	return (sp);
 }
 
 struct secpolicy *
 ipsec4_checkpolicy(struct mbuf *m, u_int dir, u_int flag, int *error,
     struct inpcb *inp)
 {
 	struct secpolicy *sp;
 
 	*error = 0;
 	if (inp == NULL)
 		sp = ipsec_getpolicybyaddr(m, dir, flag, error);
 	else
 		sp = ipsec_getpolicybysock(m, dir, inp, error);
 	if (sp == NULL) {
 		IPSEC_ASSERT(*error != 0, ("getpolicy failed w/o error"));
 		IPSECSTAT_INC(ips_out_inval);
 		return (NULL);
 	}
 	IPSEC_ASSERT(*error == 0, ("sp w/ error set to %u", *error));
 	switch (sp->policy) {
 	case IPSEC_POLICY_ENTRUST:
 	default:
 		printf("%s: invalid policy %u\n", __func__, sp->policy);
 		/* FALLTHROUGH */
 	case IPSEC_POLICY_DISCARD:
 		IPSECSTAT_INC(ips_out_polvio);
 		*error = -EINVAL;	/* Packet is discarded by caller. */
 		break;
 	case IPSEC_POLICY_BYPASS:
 	case IPSEC_POLICY_NONE:
 		KEY_FREESP(&sp);
 		sp = NULL;		/* NB: force NULL result. */
 		break;
 	case IPSEC_POLICY_IPSEC:
 		if (sp->req == NULL)	/* Acquire a SA. */
 			*error = key_spdacquire(sp);
 		break;
 	}
 	if (*error != 0) {
 		KEY_FREESP(&sp);
 		sp = NULL;
 	}
 	return (sp);
 }
 
 static int
 ipsec_setspidx_inpcb(struct mbuf *m, struct inpcb *inp)
 {
 	int error;
 
 	IPSEC_ASSERT(inp != NULL, ("null inp"));
 	IPSEC_ASSERT(inp->inp_sp != NULL, ("null inp_sp"));
 	IPSEC_ASSERT(inp->inp_sp->sp_out != NULL && inp->inp_sp->sp_in != NULL,
 		("null sp_in || sp_out"));
 
 	error = ipsec_setspidx(m, &inp->inp_sp->sp_in->spidx, 1);
 	if (error == 0) {
 		inp->inp_sp->sp_in->spidx.dir = IPSEC_DIR_INBOUND;
 		inp->inp_sp->sp_out->spidx = inp->inp_sp->sp_in->spidx;
 		inp->inp_sp->sp_out->spidx.dir = IPSEC_DIR_OUTBOUND;
 	} else {
 		bzero(&inp->inp_sp->sp_in->spidx,
 			sizeof (inp->inp_sp->sp_in->spidx));
 		bzero(&inp->inp_sp->sp_out->spidx,
 			sizeof (inp->inp_sp->sp_in->spidx));
 	}
 	return (error);
 }
 
 /*
  * Configure security policy index (src/dst/proto/sport/dport)
  * by looking at the content of mbuf.
  * The caller is responsible for error recovery (like clearing up spidx).
  */
 static int
 ipsec_setspidx(struct mbuf *m, struct secpolicyindex *spidx, int needport)
 {
 	struct ip *ip = NULL;
 	struct ip ipbuf;
 	u_int v;
 	struct mbuf *n;
 	int len;
 	int error;
 
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 
 	/*
 	 * Validate m->m_pkthdr.len.  We see incorrect length if we
 	 * mistakenly call this function with inconsistent mbuf chain
 	 * (like 4.4BSD tcp/udp processing).  XXX Should we panic here?
 	 */
 	len = 0;
 	for (n = m; n; n = n->m_next)
 		len += n->m_len;
 	if (m->m_pkthdr.len != len) {
 		KEYDEBUG(KEYDEBUG_IPSEC_DUMP,
 			printf("%s: pkthdr len(%d) mismatch (%d), ignored.\n",
 				__func__, len, m->m_pkthdr.len));
 		return (EINVAL);
 	}
 
 	if (m->m_pkthdr.len < sizeof(struct ip)) {
 		KEYDEBUG(KEYDEBUG_IPSEC_DUMP,
 			printf("%s: pkthdr len(%d) too small (v4), ignored.\n",
 			    __func__, m->m_pkthdr.len));
 		return (EINVAL);
 	}
 
 	if (m->m_len >= sizeof(*ip))
 		ip = mtod(m, struct ip *);
 	else {
 		m_copydata(m, 0, sizeof(ipbuf), (caddr_t)&ipbuf);
 		ip = &ipbuf;
 	}
 	v = ip->ip_v;
 	switch (v) {
 	case 4:
 		error = ipsec4_setspidx_ipaddr(m, spidx);
 		if (error)
 			return (error);
 		ipsec4_get_ulp(m, spidx, needport);
 		return (0);
 #ifdef INET6
 	case 6:
 		if (m->m_pkthdr.len < sizeof(struct ip6_hdr)) {
 			KEYDEBUG(KEYDEBUG_IPSEC_DUMP,
 				printf("%s: pkthdr len(%d) too small (v6), "
 				"ignored\n", __func__, m->m_pkthdr.len));
 			return (EINVAL);
 		}
 		error = ipsec6_setspidx_ipaddr(m, spidx);
 		if (error)
 			return (error);
 		ipsec6_get_ulp(m, spidx, needport);
 		return (0);
 #endif
 	default:
 		KEYDEBUG(KEYDEBUG_IPSEC_DUMP,
 			printf("%s: " "unknown IP version %u, ignored.\n",
 				__func__, v));
 		return (EINVAL);
 	}
 }
 
 static void
 ipsec4_get_ulp(struct mbuf *m, struct secpolicyindex *spidx, int needport)
 {
 	u_int8_t nxt;
 	int off;
 
 	/* Sanity check. */
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(m->m_pkthdr.len >= sizeof(struct ip),("packet too short"));
 
 	if (m->m_len >= sizeof (struct ip)) {
 		struct ip *ip = mtod(m, struct ip *);
 		if (ip->ip_off & htons(IP_MF | IP_OFFMASK))
 			goto done;
 		off = ip->ip_hl << 2;
 		nxt = ip->ip_p;
 	} else {
 		struct ip ih;
 
 		m_copydata(m, 0, sizeof (struct ip), (caddr_t) &ih);
 		if (ih.ip_off & htons(IP_MF | IP_OFFMASK))
 			goto done;
 		off = ih.ip_hl << 2;
 		nxt = ih.ip_p;
 	}
 
 	while (off < m->m_pkthdr.len) {
 		struct ip6_ext ip6e;
 		struct tcphdr th;
 		struct udphdr uh;
 
 		switch (nxt) {
 		case IPPROTO_TCP:
 			spidx->ul_proto = nxt;
 			if (!needport)
 				goto done_proto;
 			if (off + sizeof(struct tcphdr) > m->m_pkthdr.len)
 				goto done;
 			m_copydata(m, off, sizeof (th), (caddr_t) &th);
 			spidx->src.sin.sin_port = th.th_sport;
 			spidx->dst.sin.sin_port = th.th_dport;
 			return;
 		case IPPROTO_UDP:
 			spidx->ul_proto = nxt;
 			if (!needport)
 				goto done_proto;
 			if (off + sizeof(struct udphdr) > m->m_pkthdr.len)
 				goto done;
 			m_copydata(m, off, sizeof (uh), (caddr_t) &uh);
 			spidx->src.sin.sin_port = uh.uh_sport;
 			spidx->dst.sin.sin_port = uh.uh_dport;
 			return;
 		case IPPROTO_AH:
 			if (off + sizeof(ip6e) > m->m_pkthdr.len)
 				goto done;
 			/* XXX Sigh, this works but is totally bogus. */
 			m_copydata(m, off, sizeof(ip6e), (caddr_t) &ip6e);
 			off += (ip6e.ip6e_len + 2) << 2;
 			nxt = ip6e.ip6e_nxt;
 			break;
 		case IPPROTO_ICMP:
 		default:
 			/* XXX Intermediate headers??? */
 			spidx->ul_proto = nxt;
 			goto done_proto;
 		}
 	}
 done:
 	spidx->ul_proto = IPSEC_ULPROTO_ANY;
 done_proto:
 	spidx->src.sin.sin_port = IPSEC_PORT_ANY;
 	spidx->dst.sin.sin_port = IPSEC_PORT_ANY;
 }
 
 /* Assumes that m is sane. */
 static int
 ipsec4_setspidx_ipaddr(struct mbuf *m, struct secpolicyindex *spidx)
 {
 	static const struct sockaddr_in template = {
 		sizeof (struct sockaddr_in),
 		AF_INET,
 		0, { 0 }, { 0, 0, 0, 0, 0, 0, 0, 0 }
 	};
 
 	spidx->src.sin = template;
 	spidx->dst.sin = template;
 
 	if (m->m_len < sizeof (struct ip)) {
 		m_copydata(m, offsetof(struct ip, ip_src),
 			   sizeof (struct  in_addr),
 			   (caddr_t) &spidx->src.sin.sin_addr);
 		m_copydata(m, offsetof(struct ip, ip_dst),
 			   sizeof (struct  in_addr),
 			   (caddr_t) &spidx->dst.sin.sin_addr);
 	} else {
 		struct ip *ip = mtod(m, struct ip *);
 		spidx->src.sin.sin_addr = ip->ip_src;
 		spidx->dst.sin.sin_addr = ip->ip_dst;
 	}
 
 	spidx->prefs = sizeof(struct in_addr) << 3;
 	spidx->prefd = sizeof(struct in_addr) << 3;
 
 	return (0);
 }
 
 #ifdef INET6
 static void
 ipsec6_get_ulp(struct mbuf *m, struct secpolicyindex *spidx, int needport)
 {
 	int off, nxt;
 	struct tcphdr th;
 	struct udphdr uh;
 	struct icmp6_hdr ih;
 
 	/* Sanity check. */
 	if (m == NULL)
 		panic("%s: NULL pointer was passed.\n", __func__);
 
 	KEYDEBUG(KEYDEBUG_IPSEC_DUMP,
 		printf("%s:\n", __func__); kdebug_mbuf(m));
 
 	/* Set default. */
 	spidx->ul_proto = IPSEC_ULPROTO_ANY;
 	((struct sockaddr_in6 *)&spidx->src)->sin6_port = IPSEC_PORT_ANY;
 	((struct sockaddr_in6 *)&spidx->dst)->sin6_port = IPSEC_PORT_ANY;
 
 	nxt = -1;
 	off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt);
 	if (off < 0 || m->m_pkthdr.len < off)
 		return;
 
 	switch (nxt) {
 	case IPPROTO_TCP:
 		spidx->ul_proto = nxt;
 		if (!needport)
 			break;
 		if (off + sizeof(struct tcphdr) > m->m_pkthdr.len)
 			break;
 		m_copydata(m, off, sizeof(th), (caddr_t)&th);
 		((struct sockaddr_in6 *)&spidx->src)->sin6_port = th.th_sport;
 		((struct sockaddr_in6 *)&spidx->dst)->sin6_port = th.th_dport;
 		break;
 	case IPPROTO_UDP:
 		spidx->ul_proto = nxt;
 		if (!needport)
 			break;
 		if (off + sizeof(struct udphdr) > m->m_pkthdr.len)
 			break;
 		m_copydata(m, off, sizeof(uh), (caddr_t)&uh);
 		((struct sockaddr_in6 *)&spidx->src)->sin6_port = uh.uh_sport;
 		((struct sockaddr_in6 *)&spidx->dst)->sin6_port = uh.uh_dport;
 		break;
 	case IPPROTO_ICMPV6:
 		spidx->ul_proto = nxt;
 		if (off + sizeof(struct icmp6_hdr) > m->m_pkthdr.len)
 			break;
 		m_copydata(m, off, sizeof(ih), (caddr_t)&ih);
 		((struct sockaddr_in6 *)&spidx->src)->sin6_port =
 		    htons((uint16_t)ih.icmp6_type);
 		((struct sockaddr_in6 *)&spidx->dst)->sin6_port =
 		    htons((uint16_t)ih.icmp6_code);
 		break;
 	default:
 		/* XXX Intermediate headers??? */
 		spidx->ul_proto = nxt;
 		break;
 	}
 }
 
 /* Assumes that m is sane. */
 static int
 ipsec6_setspidx_ipaddr(struct mbuf *m, struct secpolicyindex *spidx)
 {
 	struct ip6_hdr *ip6 = NULL;
 	struct ip6_hdr ip6buf;
 	struct sockaddr_in6 *sin6;
 
 	if (m->m_len >= sizeof(*ip6))
 		ip6 = mtod(m, struct ip6_hdr *);
 	else {
 		m_copydata(m, 0, sizeof(ip6buf), (caddr_t)&ip6buf);
 		ip6 = &ip6buf;
 	}
 
 	sin6 = (struct sockaddr_in6 *)&spidx->src;
 	bzero(sin6, sizeof(*sin6));
 	sin6->sin6_family = AF_INET6;
 	sin6->sin6_len = sizeof(struct sockaddr_in6);
 	bcopy(&ip6->ip6_src, &sin6->sin6_addr, sizeof(ip6->ip6_src));
 	if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) {
 		sin6->sin6_addr.s6_addr16[1] = 0;
 		sin6->sin6_scope_id = ntohs(ip6->ip6_src.s6_addr16[1]);
 	}
 	spidx->prefs = sizeof(struct in6_addr) << 3;
 
 	sin6 = (struct sockaddr_in6 *)&spidx->dst;
 	bzero(sin6, sizeof(*sin6));
 	sin6->sin6_family = AF_INET6;
 	sin6->sin6_len = sizeof(struct sockaddr_in6);
 	bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(ip6->ip6_dst));
 	if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) {
 		sin6->sin6_addr.s6_addr16[1] = 0;
 		sin6->sin6_scope_id = ntohs(ip6->ip6_dst.s6_addr16[1]);
 	}
 	spidx->prefd = sizeof(struct in6_addr) << 3;
 
 	return (0);
 }
 #endif
 
 static void
 ipsec_delpcbpolicy(struct inpcbpolicy *p)
 {
 
 	free(p, M_IPSEC_INPCB);
 }
 
 /* Initialize policy in PCB. */
 int
 ipsec_init_policy(struct socket *so, struct inpcbpolicy **pcb_sp)
 {
 	struct inpcbpolicy *new;
 
 	/* Sanity check. */
 	if (so == NULL || pcb_sp == NULL)
 		panic("%s: NULL pointer was passed.\n", __func__);
 
 	new = (struct inpcbpolicy *) malloc(sizeof(struct inpcbpolicy),
 					    M_IPSEC_INPCB, M_NOWAIT|M_ZERO);
 	if (new == NULL) {
 		ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
 		return (ENOBUFS);
 	}
 
 	new->priv = IPSEC_IS_PRIVILEGED_SO(so);
 
 	if ((new->sp_in = KEY_NEWSP()) == NULL) {
 		ipsec_delpcbpolicy(new);
 		return (ENOBUFS);
 	}
 	new->sp_in->state = IPSEC_SPSTATE_ALIVE;
 	new->sp_in->policy = IPSEC_POLICY_ENTRUST;
 
 	if ((new->sp_out = KEY_NEWSP()) == NULL) {
 		KEY_FREESP(&new->sp_in);
 		ipsec_delpcbpolicy(new);
 		return (ENOBUFS);
 	}
 	new->sp_out->state = IPSEC_SPSTATE_ALIVE;
 	new->sp_out->policy = IPSEC_POLICY_ENTRUST;
 
 	*pcb_sp = new;
 
 	return (0);
 }
 
 /* Copy old IPsec policy into new. */
 int
 ipsec_copy_policy(struct inpcbpolicy *old, struct inpcbpolicy *new)
 {
 	struct secpolicy *sp;
 
 	sp = ipsec_deepcopy_policy(old->sp_in);
 	if (sp) {
 		KEY_FREESP(&new->sp_in);
 		new->sp_in = sp;
 	} else
 		return (ENOBUFS);
 
 	sp = ipsec_deepcopy_policy(old->sp_out);
 	if (sp) {
 		KEY_FREESP(&new->sp_out);
 		new->sp_out = sp;
 	} else
 		return (ENOBUFS);
 
 	new->priv = old->priv;
 
 	return (0);
 }
 
 struct ipsecrequest *
 ipsec_newisr(void)
 {
 	struct ipsecrequest *p;
 
 	p = malloc(sizeof(struct ipsecrequest), M_IPSEC_SR, M_NOWAIT|M_ZERO);
 	if (p != NULL)
 		IPSECREQUEST_LOCK_INIT(p);
 	return (p);
 }
 
 void
 ipsec_delisr(struct ipsecrequest *p)
 {
 
 	IPSECREQUEST_LOCK_DESTROY(p);
 	free(p, M_IPSEC_SR);
 }
 
 /* Deep-copy a policy in PCB. */
 static struct secpolicy *
 ipsec_deepcopy_policy(struct secpolicy *src)
 {
 	struct ipsecrequest *newchain = NULL;
 	struct ipsecrequest *p;
 	struct ipsecrequest **q;
 	struct ipsecrequest *r;
 	struct secpolicy *dst;
 
 	if (src == NULL)
 		return (NULL);
 	dst = KEY_NEWSP();
 	if (dst == NULL)
 		return (NULL);
 
 	/*
 	 * Deep-copy IPsec request chain.  This is required since struct
 	 * ipsecrequest is not reference counted.
 	 */
 	q = &newchain;
 	for (p = src->req; p; p = p->next) {
 		*q = ipsec_newisr();
 		if (*q == NULL)
 			goto fail;
 		(*q)->saidx.proto = p->saidx.proto;
 		(*q)->saidx.mode = p->saidx.mode;
 		(*q)->level = p->level;
 		(*q)->saidx.reqid = p->saidx.reqid;
 
 		bcopy(&p->saidx.src, &(*q)->saidx.src, sizeof((*q)->saidx.src));
 		bcopy(&p->saidx.dst, &(*q)->saidx.dst, sizeof((*q)->saidx.dst));
 
 		(*q)->sp = dst;
 
 		q = &((*q)->next);
 	}
 
 	dst->req = newchain;
 	dst->state = src->state;
 	dst->policy = src->policy;
 	/* Do not touch the refcnt fields. */
 
 	return (dst);
 
 fail:
 	for (p = newchain; p; p = r) {
 		r = p->next;
 		ipsec_delisr(p);
 		p = NULL;
 	}
 	return (NULL);
 }
 
 /* Set policy and IPsec request if present. */
 static int
 ipsec_set_policy_internal(struct secpolicy **pcb_sp, int optname,
     caddr_t request, size_t len, struct ucred *cred)
 {
 	struct sadb_x_policy *xpl;
 	struct secpolicy *newsp = NULL;
 	int error;
 
 	/* Sanity check. */
 	if (pcb_sp == NULL || *pcb_sp == NULL || request == NULL)
 		return (EINVAL);
 	if (len < sizeof(*xpl))
 		return (EINVAL);
 	xpl = (struct sadb_x_policy *)request;
 
 	KEYDEBUG(KEYDEBUG_IPSEC_DUMP,
 		printf("%s: passed policy\n", __func__);
 		kdebug_sadb_x_policy((struct sadb_ext *)xpl));
 
 	/* Check policy type. */
 	/* ipsec_set_policy_internal() accepts IPSEC, ENTRUST and BYPASS. */
 	if (xpl->sadb_x_policy_type == IPSEC_POLICY_DISCARD
 	 || xpl->sadb_x_policy_type == IPSEC_POLICY_NONE)
 		return (EINVAL);
 
 	/* Check privileged socket. */
 	if (cred != NULL && xpl->sadb_x_policy_type == IPSEC_POLICY_BYPASS) {
 		error = priv_check_cred(cred, PRIV_NETINET_IPSEC, 0);
 		if (error)
 			return (EACCES);
 	}
 
 	/* Allocating new SP entry. */
 	if ((newsp = key_msg2sp(xpl, len, &error)) == NULL)
 		return (error);
 
 	newsp->state = IPSEC_SPSTATE_ALIVE;
 
 	/* Clear old SP and set new SP. */
 	KEY_FREESP(pcb_sp);
 	*pcb_sp = newsp;
 	KEYDEBUG(KEYDEBUG_IPSEC_DUMP,
 		printf("%s: new policy\n", __func__);
 		kdebug_secpolicy(newsp));
 
 	return (0);
 }
 
 int
 ipsec_set_policy(struct inpcb *inp, int optname, caddr_t request,
     size_t len, struct ucred *cred)
 {
 	struct sadb_x_policy *xpl;
 	struct secpolicy **pcb_sp;
 
 	/* Sanity check. */
 	if (inp == NULL || request == NULL)
 		return (EINVAL);
 	if (len < sizeof(*xpl))
 		return (EINVAL);
 	xpl = (struct sadb_x_policy *)request;
 
 	/* Select direction. */
 	switch (xpl->sadb_x_policy_dir) {
 	case IPSEC_DIR_INBOUND:
 		pcb_sp = &inp->inp_sp->sp_in;
 		break;
 	case IPSEC_DIR_OUTBOUND:
 		pcb_sp = &inp->inp_sp->sp_out;
 		break;
 	default:
 		ipseclog((LOG_ERR, "%s: invalid direction=%u\n", __func__,
 			xpl->sadb_x_policy_dir));
 		return (EINVAL);
 	}
 
 	return (ipsec_set_policy_internal(pcb_sp, optname, request, len, cred));
 }
 
 int
 ipsec_get_policy(struct inpcb *inp, caddr_t request, size_t len,
     struct mbuf **mp)
 {
 	struct sadb_x_policy *xpl;
 	struct secpolicy *pcb_sp;
 
 	/* Sanity check. */
 	if (inp == NULL || request == NULL || mp == NULL)
 		return (EINVAL);
 	IPSEC_ASSERT(inp->inp_sp != NULL, ("null inp_sp"));
 	if (len < sizeof(*xpl))
 		return (EINVAL);
 	xpl = (struct sadb_x_policy *)request;
 
 	/* Select direction. */
 	switch (xpl->sadb_x_policy_dir) {
 	case IPSEC_DIR_INBOUND:
 		pcb_sp = inp->inp_sp->sp_in;
 		break;
 	case IPSEC_DIR_OUTBOUND:
 		pcb_sp = inp->inp_sp->sp_out;
 		break;
 	default:
 		ipseclog((LOG_ERR, "%s: invalid direction=%u\n", __func__,
 			xpl->sadb_x_policy_dir));
 		return (EINVAL);
 	}
 
 	/* Sanity check. Should be an IPSEC_ASSERT. */
 	if (pcb_sp == NULL)
 		return (EINVAL);
 
 	*mp = key_sp2msg(pcb_sp);
 	if (!*mp) {
 		ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
 		return (ENOBUFS);
 	}
 
 	(*mp)->m_type = MT_DATA;
 	KEYDEBUG(KEYDEBUG_IPSEC_DUMP,
 		printf("%s:\n", __func__); kdebug_mbuf(*mp));
 
 	return (0);
 }
 
 /* Delete policy in PCB. */
 int
 ipsec_delete_pcbpolicy(struct inpcb *inp)
 {
 	IPSEC_ASSERT(inp != NULL, ("null inp"));
 
 	if (inp->inp_sp == NULL)
 		return (0);
 
 	if (inp->inp_sp->sp_in != NULL)
 		KEY_FREESP(&inp->inp_sp->sp_in);
 
 	if (inp->inp_sp->sp_out != NULL)
 		KEY_FREESP(&inp->inp_sp->sp_out);
 
 	ipsec_delpcbpolicy(inp->inp_sp);
 	inp->inp_sp = NULL;
 
 	return (0);
 }
 
 /*
  * Return current level.
  * Either IPSEC_LEVEL_USE or IPSEC_LEVEL_REQUIRE are always returned.
  */
 u_int
 ipsec_get_reqlevel(struct ipsecrequest *isr)
 {
 	u_int level = 0;
 	u_int esp_trans_deflev, esp_net_deflev;
 	u_int ah_trans_deflev, ah_net_deflev;
 
 	IPSEC_ASSERT(isr != NULL && isr->sp != NULL, ("null argument"));
 	IPSEC_ASSERT(isr->sp->spidx.src.sa.sa_family == isr->sp->spidx.dst.sa.sa_family,
 		("af family mismatch, src %u, dst %u",
 		 isr->sp->spidx.src.sa.sa_family,
 		 isr->sp->spidx.dst.sa.sa_family));
 
 /* XXX Note that we have ipseclog() expanded here - code sync issue. */
 #define IPSEC_CHECK_DEFAULT(lev) \
 	(((lev) != IPSEC_LEVEL_USE && (lev) != IPSEC_LEVEL_REQUIRE	      \
 			&& (lev) != IPSEC_LEVEL_UNIQUE)			      \
 		? (V_ipsec_debug						      \
 			? log(LOG_INFO, "fixed system default level " #lev ":%d->%d\n",\
 				(lev), IPSEC_LEVEL_REQUIRE)		      \
 			: 0),						      \
 			(lev) = IPSEC_LEVEL_REQUIRE,			      \
 			(lev)						      \
 		: (lev))
 
 	/* Set default level. */
 	switch (((struct sockaddr *)&isr->sp->spidx.src)->sa_family) {
 #ifdef INET
 	case AF_INET:
 		esp_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip4_esp_trans_deflev);
 		esp_net_deflev = IPSEC_CHECK_DEFAULT(V_ip4_esp_net_deflev);
 		ah_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip4_ah_trans_deflev);
 		ah_net_deflev = IPSEC_CHECK_DEFAULT(V_ip4_ah_net_deflev);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		esp_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip6_esp_trans_deflev);
 		esp_net_deflev = IPSEC_CHECK_DEFAULT(V_ip6_esp_net_deflev);
 		ah_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip6_ah_trans_deflev);
 		ah_net_deflev = IPSEC_CHECK_DEFAULT(V_ip6_ah_net_deflev);
 		break;
 #endif /* INET6 */
 	default:
 		panic("%s: unknown af %u",
 			__func__, isr->sp->spidx.src.sa.sa_family);
 	}
 
 #undef IPSEC_CHECK_DEFAULT
 
 	/* Set level. */
 	switch (isr->level) {
 	case IPSEC_LEVEL_DEFAULT:
 		switch (isr->saidx.proto) {
 		case IPPROTO_ESP:
 			if (isr->saidx.mode == IPSEC_MODE_TUNNEL)
 				level = esp_net_deflev;
 			else
 				level = esp_trans_deflev;
 			break;
 		case IPPROTO_AH:
 			if (isr->saidx.mode == IPSEC_MODE_TUNNEL)
 				level = ah_net_deflev;
 			else
 				level = ah_trans_deflev;
 			break;
 		case IPPROTO_IPCOMP:
 			/*
 			 * We don't really care, as IPcomp document says that
 			 * we shouldn't compress small packets.
 			 */
 			level = IPSEC_LEVEL_USE;
 			break;
 		default:
 			panic("%s: Illegal protocol defined %u\n", __func__,
 				isr->saidx.proto);
 		}
 		break;
 
 	case IPSEC_LEVEL_USE:
 	case IPSEC_LEVEL_REQUIRE:
 		level = isr->level;
 		break;
 	case IPSEC_LEVEL_UNIQUE:
 		level = IPSEC_LEVEL_REQUIRE;
 		break;
 
 	default:
 		panic("%s: Illegal IPsec level %u\n", __func__, isr->level);
 	}
 
 	return (level);
 }
 
 /*
  * Check security policy requirements against the actual
  * packet contents.  Return one if the packet should be
  * reject as "invalid"; otherwiser return zero to have the
  * packet treated as "valid".
  *
  * OUT:
  *	0: valid
  *	1: invalid
  */
 int
 ipsec_in_reject(struct secpolicy *sp, struct mbuf *m)
 {
 	struct ipsecrequest *isr;
 	int need_auth;
 
 	KEYDEBUG(KEYDEBUG_IPSEC_DATA,
 		printf("%s: using SP\n", __func__); kdebug_secpolicy(sp));
 
 	/* Check policy. */
 	switch (sp->policy) {
 	case IPSEC_POLICY_DISCARD:
 		return (1);
 	case IPSEC_POLICY_BYPASS:
 	case IPSEC_POLICY_NONE:
 		return (0);
 	}
 
 	IPSEC_ASSERT(sp->policy == IPSEC_POLICY_IPSEC,
 		("invalid policy %u", sp->policy));
 
 	/* XXX Should compare policy against IPsec header history. */
 
 	need_auth = 0;
 	for (isr = sp->req; isr != NULL; isr = isr->next) {
 		if (ipsec_get_reqlevel(isr) != IPSEC_LEVEL_REQUIRE)
 			continue;
 		switch (isr->saidx.proto) {
 		case IPPROTO_ESP:
 			if ((m->m_flags & M_DECRYPTED) == 0) {
 				KEYDEBUG(KEYDEBUG_IPSEC_DUMP,
 				    printf("%s: ESP m_flags:%x\n", __func__,
 					    m->m_flags));
 				return (1);
 			}
 
 			if (!need_auth &&
 			    isr->sav != NULL &&
 			    isr->sav->tdb_authalgxform != NULL &&
 			    (m->m_flags & M_AUTHIPDGM) == 0) {
 				KEYDEBUG(KEYDEBUG_IPSEC_DUMP,
 				    printf("%s: ESP/AH m_flags:%x\n", __func__,
 					    m->m_flags));
 				return (1);
 			}
 			break;
 		case IPPROTO_AH:
 			need_auth = 1;
 			if ((m->m_flags & M_AUTHIPHDR) == 0) {
 				KEYDEBUG(KEYDEBUG_IPSEC_DUMP,
 				    printf("%s: AH m_flags:%x\n", __func__,
 					    m->m_flags));
 				return (1);
 			}
 			break;
 		case IPPROTO_IPCOMP:
 			/*
 			 * We don't really care, as IPcomp document
 			 * says that we shouldn't compress small
 			 * packets.  IPComp policy should always be
 			 * treated as being in "use" level.
 			 */
 			break;
 		}
 	}
 	return (0);		/* Valid. */
 }
 
 static int
 ipsec46_in_reject(struct mbuf *m, struct inpcb *inp)
 {
 	struct secpolicy *sp;
 	int error;
 	int result;
 
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 
 	/*
 	 * Get SP for this packet.
 	 * When we are called from ip_forward(), we call
 	 * ipsec_getpolicybyaddr() with IP_FORWARDING flag.
 	 */
 	if (inp == NULL)
 		sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND, IP_FORWARDING, &error);
 	else
 		sp = ipsec_getpolicybysock(m, IPSEC_DIR_INBOUND, inp, &error);
 
 	if (sp != NULL) {
 		result = ipsec_in_reject(sp, m);
 		KEY_FREESP(&sp);
 	} else {
 		result = 0;	/* XXX Should be panic?
 				 * -> No, there may be error. */
 	}
 	return (result);
 }
 
 /*
  * Check AH/ESP integrity.
  * This function is called from tcp_input(), udp_input(),
  * and {ah,esp}4_input for tunnel mode.
  */
 int
 ipsec4_in_reject(struct mbuf *m, struct inpcb *inp)
 {
 	int result;
 
 	result = ipsec46_in_reject(m, inp);
 	if (result)
 		IPSECSTAT_INC(ips_in_polvio);
 
 	return (result);
 }
 
 #ifdef INET6
 /*
  * Check AH/ESP integrity.
  * This function is called from tcp6_input(), udp6_input(),
  * and {ah,esp}6_input for tunnel mode.
  */
 int
 ipsec6_in_reject(struct mbuf *m, struct inpcb *inp)
 {
 	int result;
 
 	result = ipsec46_in_reject(m, inp);
 	if (result)
 		IPSEC6STAT_INC(ips_in_polvio);
 
 	return (result);
 }
 #endif
 
 /*
  * Compute the byte size to be occupied by IPsec header.
  * In case it is tunnelled, it includes the size of outer IP header.
  * NOTE: SP passed is freed in this function.
  */
 static size_t
 ipsec_hdrsiz_internal(struct secpolicy *sp)
 {
 	struct ipsecrequest *isr;
 	size_t size;
 
 	KEYDEBUG(KEYDEBUG_IPSEC_DATA,
 		printf("%s: using SP\n", __func__); kdebug_secpolicy(sp));
 
 	switch (sp->policy) {
 	case IPSEC_POLICY_DISCARD:
 	case IPSEC_POLICY_BYPASS:
 	case IPSEC_POLICY_NONE:
 		return (0);
 	}
 
 	IPSEC_ASSERT(sp->policy == IPSEC_POLICY_IPSEC,
 		("invalid policy %u", sp->policy));
 
 	size = 0;
 	for (isr = sp->req; isr != NULL; isr = isr->next) {
 		size_t clen = 0;
 
 		switch (isr->saidx.proto) {
 		case IPPROTO_ESP:
 			clen = esp_hdrsiz(isr->sav);
 			break;
 		case IPPROTO_AH:
 			clen = ah_hdrsiz(isr->sav);
 			break;
 		case IPPROTO_IPCOMP:
 			clen = sizeof(struct ipcomp);
 			break;
 		}
 
 		if (isr->saidx.mode == IPSEC_MODE_TUNNEL) {
 			switch (isr->saidx.dst.sa.sa_family) {
 			case AF_INET:
 				clen += sizeof(struct ip);
 				break;
 #ifdef INET6
 			case AF_INET6:
 				clen += sizeof(struct ip6_hdr);
 				break;
 #endif
 			default:
 				ipseclog((LOG_ERR, "%s: unknown AF %d in "
 				    "IPsec tunnel SA\n", __func__,
 				    ((struct sockaddr *)&isr->saidx.dst)->sa_family));
 				break;
 			}
 		}
 		size += clen;
 	}
 
 	return (size);
 }
 
 /* 
  * This function is called from ipsec_hdrsiz_tcp(), ip_ipsec_mtu(),
  * disabled ip6_ipsec_mtu() and ip6_forward().
  */
 size_t
 ipsec_hdrsiz(struct mbuf *m, u_int dir, struct inpcb *inp)
 {
 	struct secpolicy *sp;
 	int error;
 	size_t size;
 
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 
 	/* Get SP for this packet.
 	 * When we are called from ip_forward(), we call
 	 * ipsec_getpolicybyaddr() with IP_FORWARDING flag.
 	 */
 	if (inp == NULL)
 		sp = ipsec_getpolicybyaddr(m, dir, IP_FORWARDING, &error);
 	else
 		sp = ipsec_getpolicybysock(m, dir, inp, &error);
 
 	if (sp != NULL) {
 		size = ipsec_hdrsiz_internal(sp);
 		KEYDEBUG(KEYDEBUG_IPSEC_DATA,
 			printf("%s: size:%lu.\n", __func__,
 				(unsigned long)size));
 
 		KEY_FREESP(&sp);
 	} else {
 		size = 0;	/* XXX Should be panic?
 				 * -> No, we are called w/o knowing if
 				 *    IPsec processing is needed. */
 	}
 	return (size);
 }
 
 /*
  * Check the variable replay window.
  * ipsec_chkreplay() performs replay check before ICV verification.
  * ipsec_updatereplay() updates replay bitmap.  This must be called after
  * ICV verification (it also performs replay check, which is usually done
  * beforehand).
  * 0 (zero) is returned if packet disallowed, 1 if packet permitted.
  *
  * Based on RFC 2401.
  */
 int
 ipsec_chkreplay(u_int32_t seq, struct secasvar *sav)
 {
 	const struct secreplay *replay;
 	u_int32_t diff;
 	int fr;
 	u_int32_t wsizeb;	/* Constant: bits of window size. */
 	int frlast;		/* Constant: last frame. */
 
 	IPSEC_ASSERT(sav != NULL, ("Null SA"));
 	IPSEC_ASSERT(sav->replay != NULL, ("Null replay state"));
 
 	replay = sav->replay;
 
 	if (replay->wsize == 0)
 		return (1);	/* No need to check replay. */
 
 	/* Constant. */
 	frlast = replay->wsize - 1;
 	wsizeb = replay->wsize << 3;
 
 	/* Sequence number of 0 is invalid. */
 	if (seq == 0)
 		return (0);
 
 	/* First time is always okay. */
 	if (replay->count == 0)
 		return (1);
 
 	if (seq > replay->lastseq) {
 		/* Larger sequences are okay. */
 		return (1);
 	} else {
 		/* seq is equal or less than lastseq. */
 		diff = replay->lastseq - seq;
 
 		/* Over range to check, i.e. too old or wrapped. */
 		if (diff >= wsizeb)
 			return (0);
 
 		fr = frlast - diff / 8;
 
 		/* This packet already seen? */
 		if ((replay->bitmap)[fr] & (1 << (diff % 8)))
 			return (0);
 
 		/* Out of order but good. */
 		return (1);
 	}
 }
 
 /*
  * Check replay counter whether to update or not.
  * OUT:	0:	OK
  *	1:	NG
  */
 int
 ipsec_updatereplay(u_int32_t seq, struct secasvar *sav)
 {
 	struct secreplay *replay;
 	u_int32_t diff;
 	int fr;
 	u_int32_t wsizeb;	/* Constant: bits of window size. */
 	int frlast;		/* Constant: last frame. */
 
 	IPSEC_ASSERT(sav != NULL, ("Null SA"));
 	IPSEC_ASSERT(sav->replay != NULL, ("Null replay state"));
 
 	replay = sav->replay;
 
 	if (replay->wsize == 0)
 		goto ok;	/* No need to check replay. */
 
 	/* Constant. */
 	frlast = replay->wsize - 1;
 	wsizeb = replay->wsize << 3;
 
 	/* Sequence number of 0 is invalid. */
 	if (seq == 0)
 		return (1);
 
 	/* First time. */
 	if (replay->count == 0) {
 		replay->lastseq = seq;
 		bzero(replay->bitmap, replay->wsize);
 		(replay->bitmap)[frlast] = 1;
 		goto ok;
 	}
 
 	if (seq > replay->lastseq) {
 		/* seq is larger than lastseq. */
 		diff = seq - replay->lastseq;
 
 		/* New larger sequence number. */
 		if (diff < wsizeb) {
 			/* In window. */
 			/* Set bit for this packet. */
 			vshiftl(replay->bitmap, diff, replay->wsize);
 			(replay->bitmap)[frlast] |= 1;
 		} else {
 			/* This packet has a "way larger". */
 			bzero(replay->bitmap, replay->wsize);
 			(replay->bitmap)[frlast] = 1;
 		}
 		replay->lastseq = seq;
 
 		/* Larger is good. */
 	} else {
 		/* seq is equal or less than lastseq. */
 		diff = replay->lastseq - seq;
 
 		/* Over range to check, i.e. too old or wrapped. */
 		if (diff >= wsizeb)
 			return (1);
 
 		fr = frlast - diff / 8;
 
 		/* This packet already seen? */
 		if ((replay->bitmap)[fr] & (1 << (diff % 8)))
 			return (1);
 
 		/* Mark as seen. */
 		(replay->bitmap)[fr] |= (1 << (diff % 8));
 
 		/* Out of order but good. */
 	}
 
 ok:
 	if (replay->count == ~0) {
 
 		/* Set overflow flag. */
 		replay->overflow++;
 
 		/* Don't increment, no more packets accepted. */
 		if ((sav->flags & SADB_X_EXT_CYCSEQ) == 0)
 			return (1);
 
 		ipseclog((LOG_WARNING, "%s: replay counter made %d cycle. %s\n",
 		    __func__, replay->overflow, ipsec_logsastr(sav)));
 	}
 
 	replay->count++;
 
 	return (0);
 }
 
 /*
  * Shift variable length buffer to left.
  * IN:	bitmap: pointer to the buffer
  * 	nbit:	the number of to shift.
  *	wsize:	buffer size (bytes).
  */
 static void
 vshiftl(unsigned char *bitmap, int nbit, int wsize)
 {
 	int s, j, i;
 	unsigned char over;
 
 	for (j = 0; j < nbit; j += 8) {
 		s = (nbit - j < 8) ? (nbit - j): 8;
 		bitmap[0] <<= s;
 		for (i = 1; i < wsize; i++) {
 			over = (bitmap[i] >> (8 - s));
 			bitmap[i] <<= s;
 			bitmap[i-1] |= over;
 		}
 	}
 }
 
 #ifdef INET
 /* Return a printable string for the IPv4 address. */
 static char *
 inet_ntoa4(struct in_addr ina)
 {
 	static char buf[4][4 * sizeof "123" + 4];
 	unsigned char *ucp = (unsigned char *) &ina;
 	static int i = 3;
 
 	/* XXX-BZ Returns static buffer. */
 	i = (i + 1) % 4;
 	sprintf(buf[i], "%d.%d.%d.%d", ucp[0] & 0xff, ucp[1] & 0xff,
 	    ucp[2] & 0xff, ucp[3] & 0xff);
 	return (buf[i]);
 }
 #endif
 
 /* Return a printable string for the address. */
 char *
 ipsec_address(union sockaddr_union* sa)
 {
 #ifdef INET6
 	char ip6buf[INET6_ADDRSTRLEN];
 #endif
 
 	switch (sa->sa.sa_family) {
 #ifdef INET
 	case AF_INET:
 		return (inet_ntoa4(sa->sin.sin_addr));
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		return (ip6_sprintf(ip6buf, &sa->sin6.sin6_addr));
 #endif /* INET6 */
 	default:
 		return ("(unknown address family)");
 	}
 }
 
 const char *
 ipsec_logsastr(struct secasvar *sav)
 {
 	static char buf[256];
 	char *p;
 	struct secasindex *saidx = &sav->sah->saidx;
 
 	IPSEC_ASSERT(saidx->src.sa.sa_family == saidx->dst.sa.sa_family,
 		("address family mismatch"));
 
 	p = buf;
 	snprintf(buf, sizeof(buf), "SA(SPI=%u ", (u_int32_t)ntohl(sav->spi));
 	while (p && *p)
 		p++;
 	/* NB: only use ipsec_address on one address at a time. */
 	snprintf(p, sizeof (buf) - (p - buf), "src=%s ",
 		ipsec_address(&saidx->src));
 	while (p && *p)
 		p++;
 	snprintf(p, sizeof (buf) - (p - buf), "dst=%s)",
 		ipsec_address(&saidx->dst));
 
 	return (buf);
 }
 
 void
 ipsec_dumpmbuf(struct mbuf *m)
 {
 	int totlen;
 	int i;
 	u_char *p;
 
 	totlen = 0;
 	printf("---\n");
 	while (m) {
 		p = mtod(m, u_char *);
 		for (i = 0; i < m->m_len; i++) {
 			printf("%02x ", p[i]);
 			totlen++;
 			if (totlen % 16 == 0)
 				printf("\n");
 		}
 		m = m->m_next;
 	}
 	if (totlen % 16 != 0)
 		printf("\n");
 	printf("---\n");
 }
 
 static void
 ipsec_init(const void *unused __unused)
 {
 
 	SECPOLICY_LOCK_INIT(&V_ip4_def_policy);
 	V_ip4_def_policy.refcnt = 1;			/* NB: disallow free. */
 }
 VNET_SYSINIT(ipsec_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, ipsec_init,
     NULL);
 
 
 /* XXX This stuff doesn't belong here... */
 
 static	struct xformsw* xforms = NULL;
 
 /*
  * Register a transform; typically at system startup.
  */
 void
 xform_register(struct xformsw* xsp)
 {
 
 	xsp->xf_next = xforms;
 	xforms = xsp;
 }
 
 /*
  * Initialize transform support in an sav.
  */
 int
 xform_init(struct secasvar *sav, int xftype)
 {
 	struct xformsw *xsp;
 
 	if (sav->tdb_xform != NULL)	/* Previously initialized. */
 		return (0);
 	for (xsp = xforms; xsp; xsp = xsp->xf_next)
 		if (xsp->xf_type == xftype)
 			return ((*xsp->xf_init)(sav, xsp));
 	return (EINVAL);
 }
Index: user/dchagin/lemul/sys/netipsec/ipsec_input.c
===================================================================
--- user/dchagin/lemul/sys/netipsec/ipsec_input.c	(revision 274240)
+++ user/dchagin/lemul/sys/netipsec/ipsec_input.c	(revision 274241)
@@ -1,955 +1,944 @@
 /*	$FreeBSD$	*/
 /*	$OpenBSD: ipsec_input.c,v 1.63 2003/02/20 18:35:43 deraadt Exp $	*/
 /*-
  * The authors of this code are John Ioannidis (ji@tla.org),
  * Angelos D. Keromytis (kermit@csd.uch.gr) and
  * Niels Provos (provos@physnet.uni-hamburg.de).
  *
  * This code was written by John Ioannidis for BSD/OS in Athens, Greece,
  * in November 1995.
  *
  * Ported to OpenBSD and NetBSD, with additional transforms, in December 1996,
  * by Angelos D. Keromytis.
  *
  * Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis
  * and Niels Provos.
  *
  * Additional features in 1999 by Angelos D. Keromytis.
  *
  * Copyright (C) 1995, 1996, 1997, 1998, 1999 by John Ioannidis,
  * Angelos D. Keromytis and Niels Provos.
  * Copyright (c) 2001, Angelos D. Keromytis.
  *
  * Permission to use, copy, and modify this software with or without fee
  * is hereby granted, provided that this entire notice is included in
  * all copies of any software which is or includes a copy or
  * modification of this software.
  * You may use this code under the GNU public license if you so wish. Please
  * contribute changes back to the authors under this freer than GPL license
  * so that we may further the use of strong encryption without limitations to
  * all.
  *
  * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
  * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
  * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
  * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
  * PURPOSE.
  */
 
 /*
  * IPsec input processing.
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_enc.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/errno.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/pfil.h>
 #include <net/route.h>
 #include <net/netisr.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/in_var.h>
 
 #include <netinet/ip6.h>
 #ifdef INET6
 #include <netinet6/ip6_var.h>
 #endif
 #include <netinet/in_pcb.h>
 #ifdef INET6
 #include <netinet/icmp6.h>
 #endif
 
 #include <netipsec/ipsec.h>
 #ifdef INET6
 #include <netipsec/ipsec6.h>
 #endif
 #include <netipsec/ah_var.h>
 #include <netipsec/esp.h>
 #include <netipsec/esp_var.h>
 #include <netipsec/ipcomp_var.h>
 
 #include <netipsec/key.h>
 #include <netipsec/keydb.h>
 
 #include <netipsec/xform.h>
 #include <netinet6/ip6protosw.h>
 
 #include <machine/in_cksum.h>
 #include <machine/stdarg.h>
 
 #ifdef DEV_ENC
 #include <net/if_enc.h>
 #endif
 
 
 #define	IPSEC_ISTAT(proto, name)	do {	\
 	if ((proto) == IPPROTO_ESP)		\
 		ESPSTAT_INC(esps_##name);	\
 	else if ((proto) == IPPROTO_AH)		\
 		AHSTAT_INC(ahs_##name);		\
 	else					\
 		IPCOMPSTAT_INC(ipcomps_##name);	\
 } while (0)
 
 #ifdef INET
 static void ipsec4_common_ctlinput(int, struct sockaddr *, void *, int);
 #endif
 
 /*
  * ipsec_common_input gets called when an IPsec-protected packet
  * is received by IPv4 or IPv6.  Its job is to find the right SA
  * and call the appropriate transform.  The transform callback
  * takes care of further processing (like ingress filtering).
  */
 static int
 ipsec_common_input(struct mbuf *m, int skip, int protoff, int af, int sproto)
 {
 	union sockaddr_union dst_address;
 	struct secasvar *sav;
 	u_int32_t spi;
 	int error;
 #ifdef INET
 #ifdef IPSEC_NAT_T
 	struct m_tag *tag;
 #endif
 #endif
 
 	IPSEC_ISTAT(sproto, input);
 
 	IPSEC_ASSERT(m != NULL, ("null packet"));
 
 	IPSEC_ASSERT(sproto == IPPROTO_ESP || sproto == IPPROTO_AH ||
 		sproto == IPPROTO_IPCOMP,
 		("unexpected security protocol %u", sproto));
 
 	if ((sproto == IPPROTO_ESP && !V_esp_enable) ||
 	    (sproto == IPPROTO_AH && !V_ah_enable) ||
 	    (sproto == IPPROTO_IPCOMP && !V_ipcomp_enable)) {
 		m_freem(m);
 		IPSEC_ISTAT(sproto, pdrops);
 		return EOPNOTSUPP;
 	}
 
 	if (m->m_pkthdr.len - skip < 2 * sizeof (u_int32_t)) {
 		m_freem(m);
 		IPSEC_ISTAT(sproto, hdrops);
 		DPRINTF(("%s: packet too small\n", __func__));
 		return EINVAL;
 	}
 
 	/* Retrieve the SPI from the relevant IPsec header */
 	if (sproto == IPPROTO_ESP)
 		m_copydata(m, skip, sizeof(u_int32_t), (caddr_t) &spi);
 	else if (sproto == IPPROTO_AH)
 		m_copydata(m, skip + sizeof(u_int32_t), sizeof(u_int32_t),
 		    (caddr_t) &spi);
 	else if (sproto == IPPROTO_IPCOMP) {
 		u_int16_t cpi;
 		m_copydata(m, skip + sizeof(u_int16_t), sizeof(u_int16_t),
 		    (caddr_t) &cpi);
 		spi = ntohl(htons(cpi));
 	}
 
 	/*
 	 * Find the SA and (indirectly) call the appropriate
 	 * kernel crypto routine. The resulting mbuf chain is a valid
 	 * IP packet ready to go through input processing.
 	 */
 	bzero(&dst_address, sizeof (dst_address));
 	dst_address.sa.sa_family = af;
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		dst_address.sin.sin_len = sizeof(struct sockaddr_in);
 		m_copydata(m, offsetof(struct ip, ip_dst),
 		    sizeof(struct in_addr),
 		    (caddr_t) &dst_address.sin.sin_addr);
 #ifdef IPSEC_NAT_T
 		/* Find the source port for NAT-T; see udp*_espdecap. */
 		tag = m_tag_find(m, PACKET_TAG_IPSEC_NAT_T_PORTS, NULL);
 		if (tag != NULL)
 			dst_address.sin.sin_port = ((u_int16_t *)(tag + 1))[1];
 #endif /* IPSEC_NAT_T */
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		dst_address.sin6.sin6_len = sizeof(struct sockaddr_in6);
 		m_copydata(m, offsetof(struct ip6_hdr, ip6_dst),
 		    sizeof(struct in6_addr),
 		    (caddr_t) &dst_address.sin6.sin6_addr);
 		break;
 #endif /* INET6 */
 	default:
 		DPRINTF(("%s: unsupported protocol family %u\n", __func__, af));
 		m_freem(m);
 		IPSEC_ISTAT(sproto, nopf);
 		return EPFNOSUPPORT;
 	}
 
 	/* NB: only pass dst since key_allocsa follows RFC2401 */
 	sav = KEY_ALLOCSA(&dst_address, sproto, spi);
 	if (sav == NULL) {
 		DPRINTF(("%s: no key association found for SA %s/%08lx/%u\n",
 			  __func__, ipsec_address(&dst_address),
 			  (u_long) ntohl(spi), sproto));
 		IPSEC_ISTAT(sproto, notdb);
 		m_freem(m);
 		return ENOENT;
 	}
 
 	if (sav->tdb_xform == NULL) {
 		DPRINTF(("%s: attempted to use uninitialized SA %s/%08lx/%u\n",
 			 __func__, ipsec_address(&dst_address),
 			 (u_long) ntohl(spi), sproto));
 		IPSEC_ISTAT(sproto, noxform);
 		KEY_FREESAV(&sav);
 		m_freem(m);
 		return ENXIO;
 	}
 
 	/*
 	 * Call appropriate transform and return -- callback takes care of
 	 * everything else.
 	 */
 	error = (*sav->tdb_xform->xf_input)(m, sav, skip, protoff);
 	KEY_FREESAV(&sav);
 	return error;
 }
 
 #ifdef INET
 /*
  * Common input handler for IPv4 AH, ESP, and IPCOMP.
  */
 int
 ipsec4_common_input(struct mbuf *m, ...)
 {
 	va_list ap;
 	int off, nxt;
 
 	va_start(ap, m);
 	off = va_arg(ap, int);
 	nxt = va_arg(ap, int);
 	va_end(ap);
 
 	return ipsec_common_input(m, off, offsetof(struct ip, ip_p),
 				  AF_INET, nxt);
 }
 
 int
 ah4_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m;
 	int off;
 
 	m = *mp;
 	off = *offp;
 	*mp = NULL;
 
 	ipsec4_common_input(m, off, IPPROTO_AH);
 	return (IPPROTO_DONE);
 }
 void
 ah4_ctlinput(int cmd, struct sockaddr *sa, void *v)
 {
 	if (sa->sa_family == AF_INET &&
 	    sa->sa_len == sizeof(struct sockaddr_in))
 		ipsec4_common_ctlinput(cmd, sa, v, IPPROTO_AH);
 }
 
 int
 esp4_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m;
 	int off;
 
 	m = *mp;
 	off = *offp;
 	mp = NULL;
 
 	ipsec4_common_input(m, off, IPPROTO_ESP);
 	return (IPPROTO_DONE);
 }
 
 void
 esp4_ctlinput(int cmd, struct sockaddr *sa, void *v)
 {
 	if (sa->sa_family == AF_INET &&
 	    sa->sa_len == sizeof(struct sockaddr_in))
 		ipsec4_common_ctlinput(cmd, sa, v, IPPROTO_ESP);
 }
 
 int
 ipcomp4_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m;
 	int off;
 
 	m = *mp;
 	off = *offp;
 	mp = NULL;
 
 	ipsec4_common_input(m, off, IPPROTO_IPCOMP);
 	return (IPPROTO_DONE);
 }
 
 /*
  * IPsec input callback for INET protocols.
  * This routine is called as the transform callback.
  * Takes care of filtering and other sanity checks on
  * the processed packet.
  */
 int
 ipsec4_common_input_cb(struct mbuf *m, struct secasvar *sav,
 			int skip, int protoff, struct m_tag *mt)
 {
 	int prot, af, sproto, isr_prot;
 	struct ip *ip;
 	struct m_tag *mtag;
 	struct tdb_ident *tdbi;
 	struct secasindex *saidx;
 	int error;
 #ifdef INET6
 #ifdef notyet
 	char ip6buf[INET6_ADDRSTRLEN];
 #endif
 #endif
 
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(sav != NULL, ("null SA"));
 	IPSEC_ASSERT(sav->sah != NULL, ("null SAH"));
 	saidx = &sav->sah->saidx;
 	af = saidx->dst.sa.sa_family;
 	IPSEC_ASSERT(af == AF_INET, ("unexpected af %u", af));
 	sproto = saidx->proto;
 	IPSEC_ASSERT(sproto == IPPROTO_ESP || sproto == IPPROTO_AH ||
 		sproto == IPPROTO_IPCOMP,
 		("unexpected security protocol %u", sproto));
 
 	/* Sanity check */
 	if (m == NULL) {
 		DPRINTF(("%s: null mbuf", __func__));
 		IPSEC_ISTAT(sproto, badkcr);
 		KEY_FREESAV(&sav);
 		return EINVAL;
 	}
 
 	if (skip != 0) {
 		/*
 		 * Fix IPv4 header
 		 * XXXGL: do we need this entire block?
 		 */
 		if (m->m_len < skip && (m = m_pullup(m, skip)) == NULL) {
 			DPRINTF(("%s: processing failed for SA %s/%08lx\n",
 			    __func__, ipsec_address(&sav->sah->saidx.dst),
 			    (u_long) ntohl(sav->spi)));
 			IPSEC_ISTAT(sproto, hdrops);
 			error = ENOBUFS;
 			goto bad;
 		}
 
 		ip = mtod(m, struct ip *);
 		ip->ip_len = htons(m->m_pkthdr.len);
 		ip->ip_sum = 0;
 		ip->ip_sum = in_cksum(m, ip->ip_hl << 2);
 	} else {
 		ip = mtod(m, struct ip *);
 	}
 	prot = ip->ip_p;
 
 #ifdef DEV_ENC
 	if_inc_counter(encif, IFCOUNTER_IPACKETS, 1);
 	if_inc_counter(encif, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 
-	/*
-	 * Pass the mbuf to enc0 for bpf and pfil. We will filter the IPIP
-	 * packet later after it has been decapsulated.
-	 */
+	/* Pass the mbuf to enc0 for bpf and pfil. */
 	ipsec_bpf(m, sav, AF_INET, ENC_IN|ENC_BEFORE);
-
-	if (prot != IPPROTO_IPIP)
-		if ((error = ipsec_filter(&m, PFIL_IN, ENC_IN|ENC_BEFORE)) != 0)
-			return (error);
+	if ((error = ipsec_filter(&m, PFIL_IN, ENC_IN|ENC_BEFORE)) != 0)
+		return (error);
 #endif /* DEV_ENC */
 
 	/* IP-in-IP encapsulation */
 	if (prot == IPPROTO_IPIP &&
 	    saidx->mode != IPSEC_MODE_TRANSPORT) {
 
 		if (m->m_pkthdr.len - skip < sizeof(struct ip)) {
 			IPSEC_ISTAT(sproto, hdrops);
 			error = EINVAL;
 			goto bad;
 		}
 		/* enc0: strip outer IPv4 header */
 		m_striphdr(m, 0, ip->ip_hl << 2);
 
 #ifdef notyet
 		/* XXX PROXY address isn't recorded in SAH */
 		/*
 		 * Check that the inner source address is the same as
 		 * the proxy address, if available.
 		 */
 		if ((saidx->proxy.sa.sa_family == AF_INET &&
 		    saidx->proxy.sin.sin_addr.s_addr !=
 		    INADDR_ANY &&
 		    ipn.ip_src.s_addr !=
 		    saidx->proxy.sin.sin_addr.s_addr) ||
 		    (saidx->proxy.sa.sa_family != AF_INET &&
 			saidx->proxy.sa.sa_family != 0)) {
 
 			DPRINTF(("%s: inner source address %s doesn't "
 			    "correspond to expected proxy source %s, "
 			    "SA %s/%08lx\n", __func__,
 			    inet_ntoa4(ipn.ip_src),
 			    ipsp_address(saidx->proxy),
 			    ipsp_address(saidx->dst),
 			    (u_long) ntohl(sav->spi)));
 
 			IPSEC_ISTAT(sproto, pdrops);
 			error = EACCES;
 			goto bad;
 		}
 #endif /* notyet */
 	}
 #ifdef INET6
 	/* IPv6-in-IP encapsulation. */
 	else if (prot == IPPROTO_IPV6 &&
 	    saidx->mode != IPSEC_MODE_TRANSPORT) {
 
 		if (m->m_pkthdr.len - skip < sizeof(struct ip6_hdr)) {
 			IPSEC_ISTAT(sproto, hdrops);
 			error = EINVAL;
 			goto bad;
 		}
 		/* enc0: strip IPv4 header, keep IPv6 header only */
 		m_striphdr(m, 0, ip->ip_hl << 2);
 #ifdef notyet 
 		/*
 		 * Check that the inner source address is the same as
 		 * the proxy address, if available.
 		 */
 		if ((saidx->proxy.sa.sa_family == AF_INET6 &&
 		    !IN6_IS_ADDR_UNSPECIFIED(&saidx->proxy.sin6.sin6_addr) &&
 		    !IN6_ARE_ADDR_EQUAL(&ip6n.ip6_src,
 			&saidx->proxy.sin6.sin6_addr)) ||
 		    (saidx->proxy.sa.sa_family != AF_INET6 &&
 			saidx->proxy.sa.sa_family != 0)) {
 
 			DPRINTF(("%s: inner source address %s doesn't "
 			    "correspond to expected proxy source %s, "
 			    "SA %s/%08lx\n", __func__,
 			    ip6_sprintf(ip6buf, &ip6n.ip6_src),
 			    ipsec_address(&saidx->proxy),
 			    ipsec_address(&saidx->dst),
 			    (u_long) ntohl(sav->spi)));
 
 			IPSEC_ISTAT(sproto, pdrops);
 			error = EACCES;
 			goto bad;
 		}
 #endif /* notyet */
 	}
 #endif /* INET6 */
 	else if (prot != IPPROTO_IPV6 && saidx->mode == IPSEC_MODE_ANY) {
 		/*
 		 * When mode is wildcard, inner protocol is IPv6 and
 		 * we have no INET6 support - drop this packet a bit later.
 		 * In other cases we assume transport mode and outer
 		 * header was already stripped in xform_xxx_cb.
 		 */
 		prot = IPPROTO_IPIP;
 	}
 
 	/*
 	 * Record what we've done to the packet (under what SA it was
 	 * processed). If we've been passed an mtag, it means the packet
 	 * was already processed by an ethernet/crypto combo card and
 	 * thus has a tag attached with all the right information, but
 	 * with a PACKET_TAG_IPSEC_IN_CRYPTO_DONE as opposed to
 	 * PACKET_TAG_IPSEC_IN_DONE type; in that case, just change the type.
 	 */
 	if (mt == NULL && sproto != IPPROTO_IPCOMP) {
 		mtag = m_tag_get(PACKET_TAG_IPSEC_IN_DONE,
 		    sizeof(struct tdb_ident), M_NOWAIT);
 		if (mtag == NULL) {
 			DPRINTF(("%s: failed to get tag\n", __func__));
 			IPSEC_ISTAT(sproto, hdrops);
 			error = ENOMEM;
 			goto bad;
 		}
 
 		tdbi = (struct tdb_ident *)(mtag + 1);
 		bcopy(&saidx->dst, &tdbi->dst, saidx->dst.sa.sa_len);
 		tdbi->proto = sproto;
 		tdbi->spi = sav->spi;
 		/* Cache those two for enc(4) in xform_ipip. */
 		tdbi->alg_auth = sav->alg_auth;
 		tdbi->alg_enc = sav->alg_enc;
 
 		m_tag_prepend(m, mtag);
 	} else if (mt != NULL) {
 		mt->m_tag_id = PACKET_TAG_IPSEC_IN_DONE;
 		/* XXX do we need to mark m_flags??? */
 	}
 
 	key_sa_recordxfer(sav, m);		/* record data transfer */
 
 	/*
 	 * In transport mode requeue decrypted mbuf back to IPv4 protocol
 	 * handler. This is necessary to correctly expose rcvif.
 	 */
 	if (saidx->mode == IPSEC_MODE_TRANSPORT)
 		prot = IPPROTO_IPIP;
 #ifdef DEV_ENC
 	/*
 	 * Pass the mbuf to enc0 for bpf and pfil.
 	 */
 	if (prot == IPPROTO_IPIP)
 		ipsec_bpf(m, sav, AF_INET, ENC_IN|ENC_AFTER);
 #ifdef INET6
 	if (prot == IPPROTO_IPV6)
 		ipsec_bpf(m, sav, AF_INET6, ENC_IN|ENC_AFTER);
 #endif
 
 	if ((error = ipsec_filter(&m, PFIL_IN, ENC_IN|ENC_AFTER)) != 0)
 		return (error);
 #endif /* DEV_ENC */
 
 	/*
 	 * Re-dispatch via software interrupt.
 	 */
 
 	switch (prot) {
 	case IPPROTO_IPIP:
 		isr_prot = NETISR_IP;
 		break;
 #ifdef INET6
 	case IPPROTO_IPV6:
 		isr_prot = NETISR_IPV6;
 		break;
 #endif
 	default:
 		DPRINTF(("%s: cannot handle inner ip proto %d\n",
 			    __func__, prot));
 		IPSEC_ISTAT(sproto, nopf);
 		error = EPFNOSUPPORT;
 		goto bad;
 	}
 
 	error = netisr_queue_src(isr_prot, (uintptr_t)sav->spi, m);
 	if (error) {
 		IPSEC_ISTAT(sproto, qfull);
 		DPRINTF(("%s: queue full; proto %u packet dropped\n",
 			__func__, sproto));
 		return error;
 	}
 	return 0;
 bad:
 	m_freem(m);
 	return error;
 }
 
 void
 ipsec4_common_ctlinput(int cmd, struct sockaddr *sa, void *v, int proto)
 {
 	/* XXX nothing just yet */
 }
 #endif /* INET */
 
 #ifdef INET6
 /* IPv6 AH wrapper. */
 int
 ipsec6_common_input(struct mbuf **mp, int *offp, int proto)
 {
 	int l = 0;
 	int protoff;
 	struct ip6_ext ip6e;
 
 	if (*offp < sizeof(struct ip6_hdr)) {
 		DPRINTF(("%s: bad offset %u\n", __func__, *offp));
 		return IPPROTO_DONE;
 	} else if (*offp == sizeof(struct ip6_hdr)) {
 		protoff = offsetof(struct ip6_hdr, ip6_nxt);
 	} else {
 		/* Chase down the header chain... */
 		protoff = sizeof(struct ip6_hdr);
 
 		do {
 			protoff += l;
 			m_copydata(*mp, protoff, sizeof(ip6e),
 			    (caddr_t) &ip6e);
 
 			if (ip6e.ip6e_nxt == IPPROTO_AH)
 				l = (ip6e.ip6e_len + 2) << 2;
 			else
 				l = (ip6e.ip6e_len + 1) << 3;
 			IPSEC_ASSERT(l > 0, ("l went zero or negative"));
 		} while (protoff + l < *offp);
 
 		/* Malformed packet check */
 		if (protoff + l != *offp) {
 			DPRINTF(("%s: bad packet header chain, protoff %u, "
 				"l %u, off %u\n", __func__, protoff, l, *offp));
 			IPSEC_ISTAT(proto, hdrops);
 			m_freem(*mp);
 			*mp = NULL;
 			return IPPROTO_DONE;
 		}
 		protoff += offsetof(struct ip6_ext, ip6e_nxt);
 	}
 	(void) ipsec_common_input(*mp, *offp, protoff, AF_INET6, proto);
 	return IPPROTO_DONE;
 }
 
 /*
  * IPsec input callback, called by the transform callback. Takes care of
  * filtering and other sanity checks on the processed packet.
  */
 int
 ipsec6_common_input_cb(struct mbuf *m, struct secasvar *sav, int skip, int protoff,
     struct m_tag *mt)
 {
 	int prot, af, sproto;
 	struct ip6_hdr *ip6;
 	struct m_tag *mtag;
 	struct tdb_ident *tdbi;
 	struct secasindex *saidx;
 	int nxt;
 	u_int8_t nxt8;
 	int error, nest;
 #ifdef notyet
 	char ip6buf[INET6_ADDRSTRLEN];
 #endif
 
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(sav != NULL, ("null SA"));
 	IPSEC_ASSERT(sav->sah != NULL, ("null SAH"));
 	saidx = &sav->sah->saidx;
 	af = saidx->dst.sa.sa_family;
 	IPSEC_ASSERT(af == AF_INET6, ("unexpected af %u", af));
 	sproto = saidx->proto;
 	IPSEC_ASSERT(sproto == IPPROTO_ESP || sproto == IPPROTO_AH ||
 		sproto == IPPROTO_IPCOMP,
 		("unexpected security protocol %u", sproto));
 
 	/* Sanity check */
 	if (m == NULL) {
 		DPRINTF(("%s: null mbuf", __func__));
 		IPSEC_ISTAT(sproto, badkcr);
 		error = EINVAL;
 		goto bad;
 	}
 
 	/* Fix IPv6 header */
 	if (m->m_len < sizeof(struct ip6_hdr) &&
 	    (m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
 
 		DPRINTF(("%s: processing failed for SA %s/%08lx\n",
 		    __func__, ipsec_address(&sav->sah->saidx.dst),
 		    (u_long) ntohl(sav->spi)));
 
 		IPSEC_ISTAT(sproto, hdrops);
 		error = EACCES;
 		goto bad;
 	}
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr));
 
 	/* Save protocol */
 	prot = 0;
 	m_copydata(m, protoff, 1, (unsigned char *) &prot);
 
 #ifdef DEV_ENC
 	if_inc_counter(encif, IFCOUNTER_IPACKETS, 1);
 	if_inc_counter(encif, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 
-	/*
-	 * Pass the mbuf to enc0 for bpf and pfil. We will filter the IPIP
-	 * packet later after it has been decapsulated.
-	 */
+	/* Pass the mbuf to enc0 for bpf and pfil. */
 	ipsec_bpf(m, sav, AF_INET6, ENC_IN|ENC_BEFORE);
-
-	/* XXX-BZ does not make sense. */
-	if (prot != IPPROTO_IPIP)
-		if ((error = ipsec_filter(&m, PFIL_IN, ENC_IN|ENC_BEFORE)) != 0)
-			return (error);
+	if ((error = ipsec_filter(&m, PFIL_IN, ENC_IN|ENC_BEFORE)) != 0)
+		return (error);
 #endif /* DEV_ENC */
 
 #ifdef INET
 	/* IP-in-IP encapsulation */
 	if (prot == IPPROTO_IPIP) {
 		if (m->m_pkthdr.len - skip < sizeof(struct ip)) {
 			IPSEC_ISTAT(sproto, hdrops);
 			error = EINVAL;
 			goto bad;
 		}
 		/* ipn will now contain the inner IPv4 header */
 	 	m_striphdr(m, 0, skip);
 		skip = 0;
 #ifdef notyet
 		/*
 		 * Check that the inner source address is the same as
 		 * the proxy address, if available.
 		 */
 		if ((saidx->proxy.sa.sa_family == AF_INET &&
 		    saidx->proxy.sin.sin_addr.s_addr != INADDR_ANY &&
 		    ipn.ip_src.s_addr != saidx->proxy.sin.sin_addr.s_addr) ||
 		    (saidx->proxy.sa.sa_family != AF_INET &&
 			saidx->proxy.sa.sa_family != 0)) {
 
 			DPRINTF(("%s: inner source address %s doesn't "
 			    "correspond to expected proxy source %s, "
 			    "SA %s/%08lx\n", __func__,
 			    inet_ntoa4(ipn.ip_src),
 			    ipsec_address(&saidx->proxy),
 			    ipsec_address(&saidx->dst),
 			    (u_long) ntohl(sav->spi)));
 
 			IPSEC_ISTAT(sproto, pdrops);
 			error = EACCES;
 			goto bad;
 		}
 #endif /* notyet */
 	}
 #endif /* INET */
 	/* IPv6-in-IP encapsulation */
 	if (prot == IPPROTO_IPV6) {
 		if (m->m_pkthdr.len - skip < sizeof(struct ip6_hdr)) {
 			IPSEC_ISTAT(sproto, hdrops);
 			error = EINVAL;
 			goto bad;
 		}
 		/* ip6n will now contain the inner IPv6 header. */
 		m_striphdr(m, 0, skip);
 		skip = 0;
 #ifdef notyet
 		/*
 		 * Check that the inner source address is the same as
 		 * the proxy address, if available.
 		 */
 		if ((saidx->proxy.sa.sa_family == AF_INET6 &&
 		    !IN6_IS_ADDR_UNSPECIFIED(&saidx->proxy.sin6.sin6_addr) &&
 		    !IN6_ARE_ADDR_EQUAL(&ip6n.ip6_src,
 			&saidx->proxy.sin6.sin6_addr)) ||
 		    (saidx->proxy.sa.sa_family != AF_INET6 &&
 			saidx->proxy.sa.sa_family != 0)) {
 
 			DPRINTF(("%s: inner source address %s doesn't "
 			    "correspond to expected proxy source %s, "
 			    "SA %s/%08lx\n", __func__,
 			    ip6_sprintf(ip6buf, &ip6n.ip6_src),
 			    ipsec_address(&saidx->proxy),
 			    ipsec_address(&saidx->dst),
 			    (u_long) ntohl(sav->spi)));
 
 			IPSEC_ISTAT(sproto, pdrops);
 			error = EACCES;
 			goto bad;
 		}
 #endif /* notyet */
 	}
 
 	/*
 	 * Record what we've done to the packet (under what SA it was
 	 * processed). If we've been passed an mtag, it means the packet
 	 * was already processed by an ethernet/crypto combo card and
 	 * thus has a tag attached with all the right information, but
 	 * with a PACKET_TAG_IPSEC_IN_CRYPTO_DONE as opposed to
 	 * PACKET_TAG_IPSEC_IN_DONE type; in that case, just change the type.
 	 */
 	if (mt == NULL && sproto != IPPROTO_IPCOMP) {
 		mtag = m_tag_get(PACKET_TAG_IPSEC_IN_DONE,
 		    sizeof(struct tdb_ident), M_NOWAIT);
 		if (mtag == NULL) {
 			DPRINTF(("%s: failed to get tag\n", __func__));
 			IPSEC_ISTAT(sproto, hdrops);
 			error = ENOMEM;
 			goto bad;
 		}
 
 		tdbi = (struct tdb_ident *)(mtag + 1);
 		bcopy(&saidx->dst, &tdbi->dst, sizeof(union sockaddr_union));
 		tdbi->proto = sproto;
 		tdbi->spi = sav->spi;
 		/* Cache those two for enc(4) in xform_ipip. */
 		tdbi->alg_auth = sav->alg_auth;
 		tdbi->alg_enc = sav->alg_enc;
 
 		m_tag_prepend(m, mtag);
 	} else {
 		if (mt != NULL)
 			mt->m_tag_id = PACKET_TAG_IPSEC_IN_DONE;
 		/* XXX do we need to mark m_flags??? */
 	}
 
 	key_sa_recordxfer(sav, m);
 
 #ifdef DEV_ENC
 	/*
 	 * Pass the mbuf to enc0 for bpf and pfil.
 	 */
 #ifdef INET
 	if (prot == IPPROTO_IPIP)
 		ipsec_bpf(m, sav, AF_INET, ENC_IN|ENC_AFTER);
 #endif
 	if (prot == IPPROTO_IPV6)
 		ipsec_bpf(m, sav, AF_INET6, ENC_IN|ENC_AFTER);
 
 	if ((error = ipsec_filter(&m, PFIL_IN, ENC_IN|ENC_AFTER)) != 0)
 		return (error);
 #endif /* DEV_ENC */
 	/* Retrieve new protocol */
 	/* We have stripped the IP6 header from the mbuf, we have to use the backuped proto value instead */
 	nxt8 = prot;
 
 	/*
 	 * See the end of ip6_input for this logic.
 	 * IPPROTO_IPV[46] case will be processed just like other ones
 	 */
 	nest = 0;
 	nxt = nxt8;
 	while (nxt != IPPROTO_DONE) {
 		if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) {
 			IP6STAT_INC(ip6s_toomanyhdr);
 			error = EINVAL;
 			goto bad;
 		}
 
 		/*
 		 * Protection against faulty packet - there should be
 		 * more sanity checks in header chain processing.
 		 */
 		if (m->m_pkthdr.len < skip) {
 			IP6STAT_INC(ip6s_tooshort);
 			in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated);
 			error = EINVAL;
 			goto bad;
 		}
 		/*
 		 * Enforce IPsec policy checking if we are seeing last header.
 		 * note that we do not visit this with protocols with pcb layer
 		 * code - like udp/tcp/raw ip.
 		 */
 		if ((inet6sw[ip6_protox[nxt]].pr_flags & PR_LASTHDR) != 0 &&
 		    ipsec6_in_reject(m, NULL)) {
 			error = EINVAL;
 			goto bad;
 		}
 		nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &skip, nxt);
 	}
 	return 0;
 bad:
 	if (m)
 		m_freem(m);
 	return error;
 }
 
 void
 esp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
 {
 	struct ip6ctlparam *ip6cp = NULL;
 	struct mbuf *m = NULL;
 	struct ip6_hdr *ip6;
 	int off;
 
 	if (sa->sa_family != AF_INET6 ||
 	    sa->sa_len != sizeof(struct sockaddr_in6))
 		return;
 	if ((unsigned)cmd >= PRC_NCMDS)
 		return;
 
 	/* if the parameter is from icmp6, decode it. */
 	if (d != NULL) {
 		ip6cp = (struct ip6ctlparam *)d;
 		m = ip6cp->ip6c_m;
 		ip6 = ip6cp->ip6c_ip6;
 		off = ip6cp->ip6c_off;
 	} else {
 		m = NULL;
 		ip6 = NULL;
 		off = 0;	/* calm gcc */
 	}
 
 	if (ip6 != NULL) {
 
 		struct ip6ctlparam ip6cp1;
 
 		/*
 		 * Notify the error to all possible sockets via pfctlinput2.
 		 * Since the upper layer information (such as protocol type,
 		 * source and destination ports) is embedded in the encrypted
 		 * data and might have been cut, we can't directly call
 		 * an upper layer ctlinput function. However, the pcbnotify
 		 * function will consider source and destination addresses
 		 * as well as the flow info value, and may be able to find
 		 * some PCB that should be notified.
 		 * Although pfctlinput2 will call esp6_ctlinput(), there is
 		 * no possibility of an infinite loop of function calls,
 		 * because we don't pass the inner IPv6 header.
 		 */
 		bzero(&ip6cp1, sizeof(ip6cp1));
 		ip6cp1.ip6c_src = ip6cp->ip6c_src;
 		pfctlinput2(cmd, sa, (void *)&ip6cp1);
 
 		/*
 		 * Then go to special cases that need ESP header information.
 		 * XXX: We assume that when ip6 is non NULL,
 		 * M and OFF are valid.
 		 */
 
 		if (cmd == PRC_MSGSIZE) {
 			struct secasvar *sav;
 			u_int32_t spi;
 			int valid;
 
 			/* check header length before using m_copydata */
 			if (m->m_pkthdr.len < off + sizeof (struct esp))
 				return;
 			m_copydata(m, off + offsetof(struct esp, esp_spi),
 				sizeof(u_int32_t), (caddr_t) &spi);
 			/*
 			 * Check to see if we have a valid SA corresponding to
 			 * the address in the ICMP message payload.
 			 */
 			sav = KEY_ALLOCSA((union sockaddr_union *)sa,
 					IPPROTO_ESP, spi);
 			valid = (sav != NULL);
 			if (sav)
 				KEY_FREESAV(&sav);
 
 			/* XXX Further validation? */
 
 			/*
 			 * Depending on whether the SA is "valid" and
 			 * routing table size (mtudisc_{hi,lo}wat), we will:
 			 * - recalcurate the new MTU and create the
 			 *   corresponding routing entry, or
 			 * - ignore the MTU change notification.
 			 */
 			icmp6_mtudisc_update(ip6cp, valid);
 		}
 	} else {
 		/* we normally notify any pcb here */
 	}
 }
 #endif /* INET6 */
Index: user/dchagin/lemul/sys/netipsec/key.c
===================================================================
--- user/dchagin/lemul/sys/netipsec/key.c	(revision 274240)
+++ user/dchagin/lemul/sys/netipsec/key.c	(revision 274241)
@@ -1,8024 +1,8024 @@
 /*	$FreeBSD$	*/
 /*	$KAME: key.c,v 1.191 2001/06/27 10:46:49 sakane Exp $	*/
 
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * This code is referd to RFC 2367
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/mbuf.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/malloc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/errno.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/refcount.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 #include <net/raw_cb.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_var.h>
 
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #endif /* INET6 */
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in_pcb.h>
 #endif
 #ifdef INET6
 #include <netinet6/in6_pcb.h>
 #endif /* INET6 */
 
 #include <net/pfkeyv2.h>
 #include <netipsec/keydb.h>
 #include <netipsec/key.h>
 #include <netipsec/keysock.h>
 #include <netipsec/key_debug.h>
 
 #include <netipsec/ipsec.h>
 #ifdef INET6
 #include <netipsec/ipsec6.h>
 #endif
 
 #include <netipsec/xform.h>
 
 #include <machine/stdarg.h>
 
 /* randomness */
 #include <sys/random.h>
 
 #define FULLMASK	0xff
 #define	_BITS(bytes)	((bytes) << 3)
 
 /*
  * Note on SA reference counting:
  * - SAs that are not in DEAD state will have (total external reference + 1)
  *   following value in reference count field.  they cannot be freed and are
  *   referenced from SA header.
  * - SAs that are in DEAD state will have (total external reference)
  *   in reference count field.  they are ready to be freed.  reference from
  *   SA header will be removed in key_delsav(), when the reference count
  *   field hits 0 (= no external reference other than from SA header.
  */
 
 VNET_DEFINE(u_int32_t, key_debug_level) = 0;
 static VNET_DEFINE(u_int, key_spi_trycnt) = 1000;
 static VNET_DEFINE(u_int32_t, key_spi_minval) = 0x100;
 static VNET_DEFINE(u_int32_t, key_spi_maxval) = 0x0fffffff;	/* XXX */
 static VNET_DEFINE(u_int32_t, policy_id) = 0;
 /*interval to initialize randseed,1(m)*/
 static VNET_DEFINE(u_int, key_int_random) = 60;
 /* interval to expire acquiring, 30(s)*/
 static VNET_DEFINE(u_int, key_larval_lifetime) = 30;
 /* counter for blocking SADB_ACQUIRE.*/
 static VNET_DEFINE(int, key_blockacq_count) = 10;
 /* lifetime for blocking SADB_ACQUIRE.*/
 static VNET_DEFINE(int, key_blockacq_lifetime) = 20;
 /* preferred old sa rather than new sa.*/
 static VNET_DEFINE(int, key_preferred_oldsa) = 1;
 #define	V_key_spi_trycnt	VNET(key_spi_trycnt)
 #define	V_key_spi_minval	VNET(key_spi_minval)
 #define	V_key_spi_maxval	VNET(key_spi_maxval)
 #define	V_policy_id		VNET(policy_id)
 #define	V_key_int_random	VNET(key_int_random)
 #define	V_key_larval_lifetime	VNET(key_larval_lifetime)
 #define	V_key_blockacq_count	VNET(key_blockacq_count)
 #define	V_key_blockacq_lifetime	VNET(key_blockacq_lifetime)
 #define	V_key_preferred_oldsa	VNET(key_preferred_oldsa)
 
 static VNET_DEFINE(u_int32_t, acq_seq) = 0;
 #define	V_acq_seq		VNET(acq_seq)
 
 								/* SPD */
 static VNET_DEFINE(LIST_HEAD(_sptree, secpolicy), sptree[IPSEC_DIR_MAX]);
 #define	V_sptree		VNET(sptree)
 static struct mtx sptree_lock;
 #define	SPTREE_LOCK_INIT() \
 	mtx_init(&sptree_lock, "sptree", \
 		"fast ipsec security policy database", MTX_DEF)
 #define	SPTREE_LOCK_DESTROY()	mtx_destroy(&sptree_lock)
 #define	SPTREE_LOCK()		mtx_lock(&sptree_lock)
 #define	SPTREE_UNLOCK()	mtx_unlock(&sptree_lock)
 #define	SPTREE_LOCK_ASSERT()	mtx_assert(&sptree_lock, MA_OWNED)
 
 static VNET_DEFINE(LIST_HEAD(_sahtree, secashead), sahtree);	/* SAD */
 #define	V_sahtree		VNET(sahtree)
 static struct mtx sahtree_lock;
 #define	SAHTREE_LOCK_INIT() \
 	mtx_init(&sahtree_lock, "sahtree", \
 		"fast ipsec security association database", MTX_DEF)
 #define	SAHTREE_LOCK_DESTROY()	mtx_destroy(&sahtree_lock)
 #define	SAHTREE_LOCK()		mtx_lock(&sahtree_lock)
 #define	SAHTREE_UNLOCK()	mtx_unlock(&sahtree_lock)
 #define	SAHTREE_LOCK_ASSERT()	mtx_assert(&sahtree_lock, MA_OWNED)
 
 							/* registed list */
 static VNET_DEFINE(LIST_HEAD(_regtree, secreg), regtree[SADB_SATYPE_MAX + 1]);
 #define	V_regtree		VNET(regtree)
 static struct mtx regtree_lock;
 #define	REGTREE_LOCK_INIT() \
 	mtx_init(&regtree_lock, "regtree", "fast ipsec regtree", MTX_DEF)
 #define	REGTREE_LOCK_DESTROY()	mtx_destroy(&regtree_lock)
 #define	REGTREE_LOCK()		mtx_lock(&regtree_lock)
 #define	REGTREE_UNLOCK()	mtx_unlock(&regtree_lock)
 #define	REGTREE_LOCK_ASSERT()	mtx_assert(&regtree_lock, MA_OWNED)
 
 static VNET_DEFINE(LIST_HEAD(_acqtree, secacq), acqtree); /* acquiring list */
 #define	V_acqtree		VNET(acqtree)
 static struct mtx acq_lock;
 #define	ACQ_LOCK_INIT() \
 	mtx_init(&acq_lock, "acqtree", "fast ipsec acquire list", MTX_DEF)
 #define	ACQ_LOCK_DESTROY()	mtx_destroy(&acq_lock)
 #define	ACQ_LOCK()		mtx_lock(&acq_lock)
 #define	ACQ_UNLOCK()		mtx_unlock(&acq_lock)
 #define	ACQ_LOCK_ASSERT()	mtx_assert(&acq_lock, MA_OWNED)
 
 							/* SP acquiring list */
 static VNET_DEFINE(LIST_HEAD(_spacqtree, secspacq), spacqtree);
 #define	V_spacqtree		VNET(spacqtree)
 static struct mtx spacq_lock;
 #define	SPACQ_LOCK_INIT() \
 	mtx_init(&spacq_lock, "spacqtree", \
 		"fast ipsec security policy acquire list", MTX_DEF)
 #define	SPACQ_LOCK_DESTROY()	mtx_destroy(&spacq_lock)
 #define	SPACQ_LOCK()		mtx_lock(&spacq_lock)
 #define	SPACQ_UNLOCK()		mtx_unlock(&spacq_lock)
 #define	SPACQ_LOCK_ASSERT()	mtx_assert(&spacq_lock, MA_OWNED)
 
 /* search order for SAs */
 static const u_int saorder_state_valid_prefer_old[] = {
 	SADB_SASTATE_DYING, SADB_SASTATE_MATURE,
 };
 static const u_int saorder_state_valid_prefer_new[] = {
 	SADB_SASTATE_MATURE, SADB_SASTATE_DYING,
 };
 static const u_int saorder_state_alive[] = {
 	/* except DEAD */
 	SADB_SASTATE_MATURE, SADB_SASTATE_DYING, SADB_SASTATE_LARVAL
 };
 static const u_int saorder_state_any[] = {
 	SADB_SASTATE_MATURE, SADB_SASTATE_DYING,
 	SADB_SASTATE_LARVAL, SADB_SASTATE_DEAD
 };
 
 static const int minsize[] = {
 	sizeof(struct sadb_msg),	/* SADB_EXT_RESERVED */
 	sizeof(struct sadb_sa),		/* SADB_EXT_SA */
 	sizeof(struct sadb_lifetime),	/* SADB_EXT_LIFETIME_CURRENT */
 	sizeof(struct sadb_lifetime),	/* SADB_EXT_LIFETIME_HARD */
 	sizeof(struct sadb_lifetime),	/* SADB_EXT_LIFETIME_SOFT */
 	sizeof(struct sadb_address),	/* SADB_EXT_ADDRESS_SRC */
 	sizeof(struct sadb_address),	/* SADB_EXT_ADDRESS_DST */
 	sizeof(struct sadb_address),	/* SADB_EXT_ADDRESS_PROXY */
 	sizeof(struct sadb_key),	/* SADB_EXT_KEY_AUTH */
 	sizeof(struct sadb_key),	/* SADB_EXT_KEY_ENCRYPT */
 	sizeof(struct sadb_ident),	/* SADB_EXT_IDENTITY_SRC */
 	sizeof(struct sadb_ident),	/* SADB_EXT_IDENTITY_DST */
 	sizeof(struct sadb_sens),	/* SADB_EXT_SENSITIVITY */
 	sizeof(struct sadb_prop),	/* SADB_EXT_PROPOSAL */
 	sizeof(struct sadb_supported),	/* SADB_EXT_SUPPORTED_AUTH */
 	sizeof(struct sadb_supported),	/* SADB_EXT_SUPPORTED_ENCRYPT */
 	sizeof(struct sadb_spirange),	/* SADB_EXT_SPIRANGE */
 	0,				/* SADB_X_EXT_KMPRIVATE */
 	sizeof(struct sadb_x_policy),	/* SADB_X_EXT_POLICY */
 	sizeof(struct sadb_x_sa2),	/* SADB_X_SA2 */
 	sizeof(struct sadb_x_nat_t_type),/* SADB_X_EXT_NAT_T_TYPE */
 	sizeof(struct sadb_x_nat_t_port),/* SADB_X_EXT_NAT_T_SPORT */
 	sizeof(struct sadb_x_nat_t_port),/* SADB_X_EXT_NAT_T_DPORT */
 	sizeof(struct sadb_address),	/* SADB_X_EXT_NAT_T_OAI */
 	sizeof(struct sadb_address),	/* SADB_X_EXT_NAT_T_OAR */
 	sizeof(struct sadb_x_nat_t_frag),/* SADB_X_EXT_NAT_T_FRAG */
 };
 static const int maxsize[] = {
 	sizeof(struct sadb_msg),	/* SADB_EXT_RESERVED */
 	sizeof(struct sadb_sa),		/* SADB_EXT_SA */
 	sizeof(struct sadb_lifetime),	/* SADB_EXT_LIFETIME_CURRENT */
 	sizeof(struct sadb_lifetime),	/* SADB_EXT_LIFETIME_HARD */
 	sizeof(struct sadb_lifetime),	/* SADB_EXT_LIFETIME_SOFT */
 	0,				/* SADB_EXT_ADDRESS_SRC */
 	0,				/* SADB_EXT_ADDRESS_DST */
 	0,				/* SADB_EXT_ADDRESS_PROXY */
 	0,				/* SADB_EXT_KEY_AUTH */
 	0,				/* SADB_EXT_KEY_ENCRYPT */
 	0,				/* SADB_EXT_IDENTITY_SRC */
 	0,				/* SADB_EXT_IDENTITY_DST */
 	0,				/* SADB_EXT_SENSITIVITY */
 	0,				/* SADB_EXT_PROPOSAL */
 	0,				/* SADB_EXT_SUPPORTED_AUTH */
 	0,				/* SADB_EXT_SUPPORTED_ENCRYPT */
 	sizeof(struct sadb_spirange),	/* SADB_EXT_SPIRANGE */
 	0,				/* SADB_X_EXT_KMPRIVATE */
 	0,				/* SADB_X_EXT_POLICY */
 	sizeof(struct sadb_x_sa2),	/* SADB_X_SA2 */
 	sizeof(struct sadb_x_nat_t_type),/* SADB_X_EXT_NAT_T_TYPE */
 	sizeof(struct sadb_x_nat_t_port),/* SADB_X_EXT_NAT_T_SPORT */
 	sizeof(struct sadb_x_nat_t_port),/* SADB_X_EXT_NAT_T_DPORT */
 	0,				/* SADB_X_EXT_NAT_T_OAI */
 	0,				/* SADB_X_EXT_NAT_T_OAR */
 	sizeof(struct sadb_x_nat_t_frag),/* SADB_X_EXT_NAT_T_FRAG */
 };
 
 static VNET_DEFINE(int, ipsec_esp_keymin) = 256;
 static VNET_DEFINE(int, ipsec_esp_auth) = 0;
 static VNET_DEFINE(int, ipsec_ah_keymin) = 128;
 
 #define	V_ipsec_esp_keymin	VNET(ipsec_esp_keymin)
 #define	V_ipsec_esp_auth	VNET(ipsec_esp_auth)
 #define	V_ipsec_ah_keymin	VNET(ipsec_ah_keymin)
 
 #ifdef SYSCTL_DECL
 SYSCTL_DECL(_net_key);
 #endif
 
-SYSCTL_VNET_INT(_net_key, KEYCTL_DEBUG_LEVEL,	debug,
-	CTLFLAG_RW, &VNET_NAME(key_debug_level),	0,	"");
+SYSCTL_INT(_net_key, KEYCTL_DEBUG_LEVEL,	debug,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_debug_level), 0, "");
 
 /* max count of trial for the decision of spi value */
-SYSCTL_VNET_INT(_net_key, KEYCTL_SPI_TRY, spi_trycnt,
-	CTLFLAG_RW, &VNET_NAME(key_spi_trycnt),	0,	"");
+SYSCTL_INT(_net_key, KEYCTL_SPI_TRY, spi_trycnt,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_spi_trycnt), 0, "");
 
 /* minimum spi value to allocate automatically. */
-SYSCTL_VNET_INT(_net_key, KEYCTL_SPI_MIN_VALUE,
-	spi_minval,	CTLFLAG_RW, &VNET_NAME(key_spi_minval),	0,	"");
+SYSCTL_INT(_net_key, KEYCTL_SPI_MIN_VALUE, spi_minval,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_spi_minval), 0, "");
 
 /* maximun spi value to allocate automatically. */
-SYSCTL_VNET_INT(_net_key, KEYCTL_SPI_MAX_VALUE,
-	spi_maxval,	CTLFLAG_RW, &VNET_NAME(key_spi_maxval),	0,	"");
+SYSCTL_INT(_net_key, KEYCTL_SPI_MAX_VALUE, spi_maxval,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_spi_maxval), 0, "");
 
 /* interval to initialize randseed */
-SYSCTL_VNET_INT(_net_key, KEYCTL_RANDOM_INT,
-	int_random,	CTLFLAG_RW, &VNET_NAME(key_int_random),	0,	"");
+SYSCTL_INT(_net_key, KEYCTL_RANDOM_INT, int_random,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_int_random), 0, "");
 
 /* lifetime for larval SA */
-SYSCTL_VNET_INT(_net_key, KEYCTL_LARVAL_LIFETIME,
-	larval_lifetime, CTLFLAG_RW, &VNET_NAME(key_larval_lifetime),	0, "");
+SYSCTL_INT(_net_key, KEYCTL_LARVAL_LIFETIME, larval_lifetime,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_larval_lifetime), 0, "");
 
 /* counter for blocking to send SADB_ACQUIRE to IKEd */
-SYSCTL_VNET_INT(_net_key, KEYCTL_BLOCKACQ_COUNT,
-	blockacq_count,	CTLFLAG_RW, &VNET_NAME(key_blockacq_count),	0, "");
+SYSCTL_INT(_net_key, KEYCTL_BLOCKACQ_COUNT, blockacq_count,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_blockacq_count), 0, "");
 
 /* lifetime for blocking to send SADB_ACQUIRE to IKEd */
-SYSCTL_VNET_INT(_net_key, KEYCTL_BLOCKACQ_LIFETIME,
-	blockacq_lifetime, CTLFLAG_RW, &VNET_NAME(key_blockacq_lifetime), 0, "");
+SYSCTL_INT(_net_key, KEYCTL_BLOCKACQ_LIFETIME, blockacq_lifetime,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_blockacq_lifetime), 0, "");
 
 /* ESP auth */
-SYSCTL_VNET_INT(_net_key, KEYCTL_ESP_AUTH,	esp_auth,
-	CTLFLAG_RW, &VNET_NAME(ipsec_esp_auth),	0,	"");
+SYSCTL_INT(_net_key, KEYCTL_ESP_AUTH, esp_auth,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_esp_auth), 0, "");
 
 /* minimum ESP key length */
-SYSCTL_VNET_INT(_net_key, KEYCTL_ESP_KEYMIN,
-	esp_keymin, CTLFLAG_RW, &VNET_NAME(ipsec_esp_keymin),	0,	"");
+SYSCTL_INT(_net_key, KEYCTL_ESP_KEYMIN, esp_keymin,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_esp_keymin), 0, "");
 
 /* minimum AH key length */
-SYSCTL_VNET_INT(_net_key, KEYCTL_AH_KEYMIN,	ah_keymin,
-	CTLFLAG_RW, &VNET_NAME(ipsec_ah_keymin),	0,	"");
+SYSCTL_INT(_net_key, KEYCTL_AH_KEYMIN, ah_keymin,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_ah_keymin), 0, "");
 
 /* perfered old SA rather than new SA */
-SYSCTL_VNET_INT(_net_key, KEYCTL_PREFERED_OLDSA,
-	preferred_oldsa, CTLFLAG_RW, &VNET_NAME(key_preferred_oldsa),	0, "");
+SYSCTL_INT(_net_key, KEYCTL_PREFERED_OLDSA, preferred_oldsa,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_preferred_oldsa), 0, "");
 
 #define __LIST_CHAINED(elm) \
 	(!((elm)->chain.le_next == NULL && (elm)->chain.le_prev == NULL))
 #define LIST_INSERT_TAIL(head, elm, type, field) \
 do {\
 	struct type *curelm = LIST_FIRST(head); \
 	if (curelm == NULL) {\
 		LIST_INSERT_HEAD(head, elm, field); \
 	} else { \
 		while (LIST_NEXT(curelm, field)) \
 			curelm = LIST_NEXT(curelm, field);\
 		LIST_INSERT_AFTER(curelm, elm, field);\
 	}\
 } while (0)
 
 #define KEY_CHKSASTATE(head, sav, name) \
 do { \
 	if ((head) != (sav)) {						\
 		ipseclog((LOG_DEBUG, "%s: state mismatched (TREE=%d SA=%d)\n", \
 			(name), (head), (sav)));			\
 		continue;						\
 	}								\
 } while (0)
 
 #define KEY_CHKSPDIR(head, sp, name) \
 do { \
 	if ((head) != (sp)) {						\
 		ipseclog((LOG_DEBUG, "%s: direction mismatched (TREE=%d SP=%d), " \
 			"anyway continue.\n",				\
 			(name), (head), (sp)));				\
 	}								\
 } while (0)
 
 MALLOC_DEFINE(M_IPSEC_SA, "secasvar", "ipsec security association");
 MALLOC_DEFINE(M_IPSEC_SAH, "sahead", "ipsec sa head");
 MALLOC_DEFINE(M_IPSEC_SP, "ipsecpolicy", "ipsec security policy");
 MALLOC_DEFINE(M_IPSEC_SR, "ipsecrequest", "ipsec security request");
 MALLOC_DEFINE(M_IPSEC_MISC, "ipsec-misc", "ipsec miscellaneous");
 MALLOC_DEFINE(M_IPSEC_SAQ, "ipsec-saq", "ipsec sa acquire");
 MALLOC_DEFINE(M_IPSEC_SAR, "ipsec-reg", "ipsec sa acquire");
 
 /*
  * set parameters into secpolicyindex buffer.
  * Must allocate secpolicyindex buffer passed to this function.
  */
 #define KEY_SETSECSPIDX(_dir, s, d, ps, pd, ulp, idx) \
 do { \
 	bzero((idx), sizeof(struct secpolicyindex));                         \
 	(idx)->dir = (_dir);                                                 \
 	(idx)->prefs = (ps);                                                 \
 	(idx)->prefd = (pd);                                                 \
 	(idx)->ul_proto = (ulp);                                             \
 	bcopy((s), &(idx)->src, ((const struct sockaddr *)(s))->sa_len);     \
 	bcopy((d), &(idx)->dst, ((const struct sockaddr *)(d))->sa_len);     \
 } while (0)
 
 /*
  * set parameters into secasindex buffer.
  * Must allocate secasindex buffer before calling this function.
  */
 #define KEY_SETSECASIDX(p, m, r, s, d, idx) \
 do { \
 	bzero((idx), sizeof(struct secasindex));                             \
 	(idx)->proto = (p);                                                  \
 	(idx)->mode = (m);                                                   \
 	(idx)->reqid = (r);                                                  \
 	bcopy((s), &(idx)->src, ((const struct sockaddr *)(s))->sa_len);     \
 	bcopy((d), &(idx)->dst, ((const struct sockaddr *)(d))->sa_len);     \
 } while (0)
 
 /* key statistics */
 struct _keystat {
 	u_long getspi_count; /* the avarage of count to try to get new SPI */
 } keystat;
 
 struct sadb_msghdr {
 	struct sadb_msg *msg;
 	struct sadb_ext *ext[SADB_EXT_MAX + 1];
 	int extoff[SADB_EXT_MAX + 1];
 	int extlen[SADB_EXT_MAX + 1];
 };
 
 #ifndef IPSEC_DEBUG2
 static struct callout key_timer;
 #endif
 
 static struct secasvar *key_allocsa_policy __P((const struct secasindex *));
 static void key_freesp_so __P((struct secpolicy **));
 static struct secasvar *key_do_allocsa_policy __P((struct secashead *, u_int));
 static void key_delsp __P((struct secpolicy *));
 static struct secpolicy *key_getsp __P((struct secpolicyindex *));
 static void _key_delsp(struct secpolicy *sp);
 static struct secpolicy *key_getspbyid __P((u_int32_t));
 static u_int32_t key_newreqid __P((void));
 static struct mbuf *key_gather_mbuf __P((struct mbuf *,
 	const struct sadb_msghdr *, int, int, ...));
 static int key_spdadd __P((struct socket *, struct mbuf *,
 	const struct sadb_msghdr *));
 static u_int32_t key_getnewspid __P((void));
 static int key_spddelete __P((struct socket *, struct mbuf *,
 	const struct sadb_msghdr *));
 static int key_spddelete2 __P((struct socket *, struct mbuf *,
 	const struct sadb_msghdr *));
 static int key_spdget __P((struct socket *, struct mbuf *,
 	const struct sadb_msghdr *));
 static int key_spdflush __P((struct socket *, struct mbuf *,
 	const struct sadb_msghdr *));
 static int key_spddump __P((struct socket *, struct mbuf *,
 	const struct sadb_msghdr *));
 static struct mbuf *key_setdumpsp __P((struct secpolicy *,
 	u_int8_t, u_int32_t, u_int32_t));
 static u_int key_getspreqmsglen __P((struct secpolicy *));
 static int key_spdexpire __P((struct secpolicy *));
 static struct secashead *key_newsah __P((struct secasindex *));
 static void key_delsah __P((struct secashead *));
 static struct secasvar *key_newsav __P((struct mbuf *,
 	const struct sadb_msghdr *, struct secashead *, int *,
 	const char*, int));
 #define	KEY_NEWSAV(m, sadb, sah, e)				\
 	key_newsav(m, sadb, sah, e, __FILE__, __LINE__)
 static void key_delsav __P((struct secasvar *));
 static struct secashead *key_getsah __P((struct secasindex *));
 static struct secasvar *key_checkspidup __P((struct secasindex *, u_int32_t));
 static struct secasvar *key_getsavbyspi __P((struct secashead *, u_int32_t));
 static int key_setsaval __P((struct secasvar *, struct mbuf *,
 	const struct sadb_msghdr *));
 static int key_mature __P((struct secasvar *));
 static struct mbuf *key_setdumpsa __P((struct secasvar *, u_int8_t,
 	u_int8_t, u_int32_t, u_int32_t));
 static struct mbuf *key_setsadbmsg __P((u_int8_t, u_int16_t, u_int8_t,
 	u_int32_t, pid_t, u_int16_t));
 static struct mbuf *key_setsadbsa __P((struct secasvar *));
 static struct mbuf *key_setsadbaddr __P((u_int16_t,
 	const struct sockaddr *, u_int8_t, u_int16_t));
 #ifdef IPSEC_NAT_T
 static struct mbuf *key_setsadbxport(u_int16_t, u_int16_t);
 static struct mbuf *key_setsadbxtype(u_int16_t);
 #endif
 static void key_porttosaddr(struct sockaddr *, u_int16_t);
 #define	KEY_PORTTOSADDR(saddr, port)				\
 	key_porttosaddr((struct sockaddr *)(saddr), (port))
 static struct mbuf *key_setsadbxsa2 __P((u_int8_t, u_int32_t, u_int32_t));
 static struct mbuf *key_setsadbxpolicy __P((u_int16_t, u_int8_t,
 	u_int32_t));
 static struct seckey *key_dup_keymsg(const struct sadb_key *, u_int, 
 				     struct malloc_type *);
 static struct seclifetime *key_dup_lifemsg(const struct sadb_lifetime *src,
 					    struct malloc_type *type);
 #ifdef INET6
 static int key_ismyaddr6 __P((struct sockaddr_in6 *));
 #endif
 
 /* flags for key_cmpsaidx() */
 #define CMP_HEAD	1	/* protocol, addresses. */
 #define CMP_MODE_REQID	2	/* additionally HEAD, reqid, mode. */
 #define CMP_REQID	3	/* additionally HEAD, reaid. */
 #define CMP_EXACTLY	4	/* all elements. */
 static int key_cmpsaidx
 	__P((const struct secasindex *, const struct secasindex *, int));
 
 static int key_cmpspidx_exactly
 	__P((struct secpolicyindex *, struct secpolicyindex *));
 static int key_cmpspidx_withmask
 	__P((struct secpolicyindex *, struct secpolicyindex *));
 static int key_sockaddrcmp __P((const struct sockaddr *, const struct sockaddr *, int));
 static int key_bbcmp __P((const void *, const void *, u_int));
 static u_int16_t key_satype2proto __P((u_int8_t));
 static u_int8_t key_proto2satype __P((u_int16_t));
 
 static int key_getspi __P((struct socket *, struct mbuf *,
 	const struct sadb_msghdr *));
 static u_int32_t key_do_getnewspi __P((struct sadb_spirange *,
 					struct secasindex *));
 static int key_update __P((struct socket *, struct mbuf *,
 	const struct sadb_msghdr *));
 #ifdef IPSEC_DOSEQCHECK
 static struct secasvar *key_getsavbyseq __P((struct secashead *, u_int32_t));
 #endif
 static int key_add __P((struct socket *, struct mbuf *,
 	const struct sadb_msghdr *));
 static int key_setident __P((struct secashead *, struct mbuf *,
 	const struct sadb_msghdr *));
 static struct mbuf *key_getmsgbuf_x1 __P((struct mbuf *,
 	const struct sadb_msghdr *));
 static int key_delete __P((struct socket *, struct mbuf *,
 	const struct sadb_msghdr *));
 static int key_get __P((struct socket *, struct mbuf *,
 	const struct sadb_msghdr *));
 
 static void key_getcomb_setlifetime __P((struct sadb_comb *));
 static struct mbuf *key_getcomb_esp __P((void));
 static struct mbuf *key_getcomb_ah __P((void));
 static struct mbuf *key_getcomb_ipcomp __P((void));
 static struct mbuf *key_getprop __P((const struct secasindex *));
 
 static int key_acquire __P((const struct secasindex *, struct secpolicy *));
 static struct secacq *key_newacq __P((const struct secasindex *));
 static struct secacq *key_getacq __P((const struct secasindex *));
 static struct secacq *key_getacqbyseq __P((u_int32_t));
 static struct secspacq *key_newspacq __P((struct secpolicyindex *));
 static struct secspacq *key_getspacq __P((struct secpolicyindex *));
 static int key_acquire2 __P((struct socket *, struct mbuf *,
 	const struct sadb_msghdr *));
 static int key_register __P((struct socket *, struct mbuf *,
 	const struct sadb_msghdr *));
 static int key_expire __P((struct secasvar *));
 static int key_flush __P((struct socket *, struct mbuf *,
 	const struct sadb_msghdr *));
 static int key_dump __P((struct socket *, struct mbuf *,
 	const struct sadb_msghdr *));
 static int key_promisc __P((struct socket *, struct mbuf *,
 	const struct sadb_msghdr *));
 static int key_senderror __P((struct socket *, struct mbuf *, int));
 static int key_validate_ext __P((const struct sadb_ext *, int));
 static int key_align __P((struct mbuf *, struct sadb_msghdr *));
 static struct mbuf *key_setlifetime(struct seclifetime *src, 
 				     u_int16_t exttype);
 static struct mbuf *key_setkey(struct seckey *src, u_int16_t exttype);
 
 #if 0
 static const char *key_getfqdn __P((void));
 static const char *key_getuserfqdn __P((void));
 #endif
 static void key_sa_chgstate __P((struct secasvar *, u_int8_t));
 
 static __inline void
 sa_initref(struct secasvar *sav)
 {
 
 	refcount_init(&sav->refcnt, 1);
 }
 static __inline void
 sa_addref(struct secasvar *sav)
 {
 
 	refcount_acquire(&sav->refcnt);
 	IPSEC_ASSERT(sav->refcnt != 0, ("SA refcnt overflow"));
 }
 static __inline int
 sa_delref(struct secasvar *sav)
 {
 
 	IPSEC_ASSERT(sav->refcnt > 0, ("SA refcnt underflow"));
 	return (refcount_release(&sav->refcnt));
 }
 
 #define	SP_ADDREF(p) do {						\
 	(p)->refcnt++;							\
 	IPSEC_ASSERT((p)->refcnt != 0, ("SP refcnt overflow"));		\
 } while (0)
 #define	SP_DELREF(p) do {						\
 	IPSEC_ASSERT((p)->refcnt > 0, ("SP refcnt underflow"));		\
 	(p)->refcnt--;							\
 } while (0)
  
 
 /*
  * Update the refcnt while holding the SPTREE lock.
  */
 void
 key_addref(struct secpolicy *sp)
 {
 	SPTREE_LOCK();
 	SP_ADDREF(sp);
 	SPTREE_UNLOCK();
 }
 
 /*
  * Return 0 when there are known to be no SP's for the specified
  * direction.  Otherwise return 1.  This is used by IPsec code
  * to optimize performance.
  */
 int
 key_havesp(u_int dir)
 {
 
 	return (dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND ?
 		LIST_FIRST(&V_sptree[dir]) != NULL : 1);
 }
 
 /* %%% IPsec policy management */
 /*
  * allocating a SP for OUTBOUND or INBOUND packet.
  * Must call key_freesp() later.
  * OUT:	NULL:	not found
  *	others:	found and return the pointer.
  */
 struct secpolicy *
 key_allocsp(struct secpolicyindex *spidx, u_int dir, const char* where, int tag)
 {
 	struct secpolicy *sp;
 
 	IPSEC_ASSERT(spidx != NULL, ("null spidx"));
 	IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND,
 		("invalid direction %u", dir));
 
 	KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
 		printf("DP %s from %s:%u\n", __func__, where, tag));
 
 	/* get a SP entry */
 	KEYDEBUG(KEYDEBUG_IPSEC_DATA,
 		printf("*** objects\n");
 		kdebug_secpolicyindex(spidx));
 
 	SPTREE_LOCK();
 	LIST_FOREACH(sp, &V_sptree[dir], chain) {
 		KEYDEBUG(KEYDEBUG_IPSEC_DATA,
 			printf("*** in SPD\n");
 			kdebug_secpolicyindex(&sp->spidx));
 
 		if (sp->state == IPSEC_SPSTATE_DEAD)
 			continue;
 		if (key_cmpspidx_withmask(&sp->spidx, spidx))
 			goto found;
 	}
 	sp = NULL;
 found:
 	if (sp) {
 		/* sanity check */
 		KEY_CHKSPDIR(sp->spidx.dir, dir, __func__);
 
 		/* found a SPD entry */
 		sp->lastused = time_second;
 		SP_ADDREF(sp);
 	}
 	SPTREE_UNLOCK();
 
 	KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
 		printf("DP %s return SP:%p (ID=%u) refcnt %u\n", __func__,
 			sp, sp ? sp->id : 0, sp ? sp->refcnt : 0));
 	return sp;
 }
 
 /*
  * allocating a SP for OUTBOUND or INBOUND packet.
  * Must call key_freesp() later.
  * OUT:	NULL:	not found
  *	others:	found and return the pointer.
  */
 struct secpolicy *
 key_allocsp2(u_int32_t spi,
 	     union sockaddr_union *dst,
 	     u_int8_t proto,
 	     u_int dir,
 	     const char* where, int tag)
 {
 	struct secpolicy *sp;
 
 	IPSEC_ASSERT(dst != NULL, ("null dst"));
 	IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND,
 		("invalid direction %u", dir));
 
 	KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
 		printf("DP %s from %s:%u\n", __func__, where, tag));
 
 	/* get a SP entry */
 	KEYDEBUG(KEYDEBUG_IPSEC_DATA,
 		printf("*** objects\n");
 		printf("spi %u proto %u dir %u\n", spi, proto, dir);
 		kdebug_sockaddr(&dst->sa));
 
 	SPTREE_LOCK();
 	LIST_FOREACH(sp, &V_sptree[dir], chain) {
 		KEYDEBUG(KEYDEBUG_IPSEC_DATA,
 			printf("*** in SPD\n");
 			kdebug_secpolicyindex(&sp->spidx));
 
 		if (sp->state == IPSEC_SPSTATE_DEAD)
 			continue;
 		/* compare simple values, then dst address */
 		if (sp->spidx.ul_proto != proto)
 			continue;
 		/* NB: spi's must exist and match */
 		if (!sp->req || !sp->req->sav || sp->req->sav->spi != spi)
 			continue;
 		if (key_sockaddrcmp(&sp->spidx.dst.sa, &dst->sa, 1) == 0)
 			goto found;
 	}
 	sp = NULL;
 found:
 	if (sp) {
 		/* sanity check */
 		KEY_CHKSPDIR(sp->spidx.dir, dir, __func__);
 
 		/* found a SPD entry */
 		sp->lastused = time_second;
 		SP_ADDREF(sp);
 	}
 	SPTREE_UNLOCK();
 
 	KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
 		printf("DP %s return SP:%p (ID=%u) refcnt %u\n", __func__,
 			sp, sp ? sp->id : 0, sp ? sp->refcnt : 0));
 	return sp;
 }
 
 #if 0
 /*
  * return a policy that matches this particular inbound packet.
  * XXX slow
  */
 struct secpolicy *
 key_gettunnel(const struct sockaddr *osrc,
 	      const struct sockaddr *odst,
 	      const struct sockaddr *isrc,
 	      const struct sockaddr *idst,
 	      const char* where, int tag)
 {
 	struct secpolicy *sp;
 	const int dir = IPSEC_DIR_INBOUND;
 	struct ipsecrequest *r1, *r2, *p;
 	struct secpolicyindex spidx;
 
 	KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
 		printf("DP %s from %s:%u\n", __func__, where, tag));
 
 	if (isrc->sa_family != idst->sa_family) {
 		ipseclog((LOG_ERR, "%s: protocol family mismatched %d != %d\n.",
 			__func__, isrc->sa_family, idst->sa_family));
 		sp = NULL;
 		goto done;
 	}
 
 	SPTREE_LOCK();
 	LIST_FOREACH(sp, &V_sptree[dir], chain) {
 		if (sp->state == IPSEC_SPSTATE_DEAD)
 			continue;
 
 		r1 = r2 = NULL;
 		for (p = sp->req; p; p = p->next) {
 			if (p->saidx.mode != IPSEC_MODE_TUNNEL)
 				continue;
 
 			r1 = r2;
 			r2 = p;
 
 			if (!r1) {
 				/* here we look at address matches only */
 				spidx = sp->spidx;
 				if (isrc->sa_len > sizeof(spidx.src) ||
 				    idst->sa_len > sizeof(spidx.dst))
 					continue;
 				bcopy(isrc, &spidx.src, isrc->sa_len);
 				bcopy(idst, &spidx.dst, idst->sa_len);
 				if (!key_cmpspidx_withmask(&sp->spidx, &spidx))
 					continue;
 			} else {
 				if (key_sockaddrcmp(&r1->saidx.src.sa, isrc, 0) ||
 				    key_sockaddrcmp(&r1->saidx.dst.sa, idst, 0))
 					continue;
 			}
 
 			if (key_sockaddrcmp(&r2->saidx.src.sa, osrc, 0) ||
 			    key_sockaddrcmp(&r2->saidx.dst.sa, odst, 0))
 				continue;
 
 			goto found;
 		}
 	}
 	sp = NULL;
 found:
 	if (sp) {
 		sp->lastused = time_second;
 		SP_ADDREF(sp);
 	}
 	SPTREE_UNLOCK();
 done:
 	KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
 		printf("DP %s return SP:%p (ID=%u) refcnt %u\n", __func__,
 			sp, sp ? sp->id : 0, sp ? sp->refcnt : 0));
 	return sp;
 }
 #endif
 
 /*
  * allocating an SA entry for an *OUTBOUND* packet.
  * checking each request entries in SP, and acquire an SA if need.
  * OUT:	0: there are valid requests.
  *	ENOENT: policy may be valid, but SA with REQUIRE is on acquiring.
  */
 int
 key_checkrequest(struct ipsecrequest *isr, const struct secasindex *saidx)
 {
 	u_int level;
 	int error;
 	struct secasvar *sav;
 
 	IPSEC_ASSERT(isr != NULL, ("null isr"));
 	IPSEC_ASSERT(saidx != NULL, ("null saidx"));
 	IPSEC_ASSERT(saidx->mode == IPSEC_MODE_TRANSPORT ||
 		saidx->mode == IPSEC_MODE_TUNNEL,
 		("unexpected policy %u", saidx->mode));
 
 	/*
 	 * XXX guard against protocol callbacks from the crypto
 	 * thread as they reference ipsecrequest.sav which we
 	 * temporarily null out below.  Need to rethink how we
 	 * handle bundled SA's in the callback thread.
 	 */
 	IPSECREQUEST_LOCK_ASSERT(isr);
 
 	/* get current level */
 	level = ipsec_get_reqlevel(isr);
 
 	/*
 	 * We check new SA in the IPsec request because a different
 	 * SA may be involved each time this request is checked, either
 	 * because new SAs are being configured, or this request is
 	 * associated with an unconnected datagram socket, or this request
 	 * is associated with a system default policy.
 	 *
 	 * key_allocsa_policy should allocate the oldest SA available.
 	 * See key_do_allocsa_policy(), and draft-jenkins-ipsec-rekeying-03.txt.
 	 */
 	sav = key_allocsa_policy(saidx);
 	if (sav != isr->sav) {
 		/* SA need to be updated. */
 		if (!IPSECREQUEST_UPGRADE(isr)) {
 			/* Kick everyone off. */
 			IPSECREQUEST_UNLOCK(isr);
 			IPSECREQUEST_WLOCK(isr);
 		}
 		if (isr->sav != NULL)
 			KEY_FREESAV(&isr->sav);
 		isr->sav = sav;
 		IPSECREQUEST_DOWNGRADE(isr);
 	} else if (sav != NULL)
 		KEY_FREESAV(&sav);
 
 	/* When there is SA. */
 	if (isr->sav != NULL) {
 		if (isr->sav->state != SADB_SASTATE_MATURE &&
 		    isr->sav->state != SADB_SASTATE_DYING)
 			return EINVAL;
 		return 0;
 	}
 
 	/* there is no SA */
 	error = key_acquire(saidx, isr->sp);
 	if (error != 0) {
 		/* XXX What should I do ? */
 		ipseclog((LOG_DEBUG, "%s: error %d returned from key_acquire\n",
 			__func__, error));
 		return error;
 	}
 
 	if (level != IPSEC_LEVEL_REQUIRE) {
 		/* XXX sigh, the interface to this routine is botched */
 		IPSEC_ASSERT(isr->sav == NULL, ("unexpected SA"));
 		return 0;
 	} else {
 		return ENOENT;
 	}
 }
 
 /*
  * allocating a SA for policy entry from SAD.
  * NOTE: searching SAD of aliving state.
  * OUT:	NULL:	not found.
  *	others:	found and return the pointer.
  */
 static struct secasvar *
 key_allocsa_policy(const struct secasindex *saidx)
 {
 #define	N(a)	_ARRAYLEN(a)
 	struct secashead *sah;
 	struct secasvar *sav;
 	u_int stateidx, arraysize;
 	const u_int *state_valid;
 
 	state_valid = NULL;	/* silence gcc */
 	arraysize = 0;		/* silence gcc */
 
 	SAHTREE_LOCK();
 	LIST_FOREACH(sah, &V_sahtree, chain) {
 		if (sah->state == SADB_SASTATE_DEAD)
 			continue;
 		if (key_cmpsaidx(&sah->saidx, saidx, CMP_MODE_REQID)) {
 			if (V_key_preferred_oldsa) {
 				state_valid = saorder_state_valid_prefer_old;
 				arraysize = N(saorder_state_valid_prefer_old);
 			} else {
 				state_valid = saorder_state_valid_prefer_new;
 				arraysize = N(saorder_state_valid_prefer_new);
 			}
 			break;
 		}
 	}
 	SAHTREE_UNLOCK();
 	if (sah == NULL)
 		return NULL;
 
 	/* search valid state */
 	for (stateidx = 0; stateidx < arraysize; stateidx++) {
 		sav = key_do_allocsa_policy(sah, state_valid[stateidx]);
 		if (sav != NULL)
 			return sav;
 	}
 
 	return NULL;
 #undef N
 }
 
 /*
  * searching SAD with direction, protocol, mode and state.
  * called by key_allocsa_policy().
  * OUT:
  *	NULL	: not found
  *	others	: found, pointer to a SA.
  */
 static struct secasvar *
 key_do_allocsa_policy(struct secashead *sah, u_int state)
 {
 	struct secasvar *sav, *nextsav, *candidate, *d;
 
 	/* initilize */
 	candidate = NULL;
 
 	SAHTREE_LOCK();
 	for (sav = LIST_FIRST(&sah->savtree[state]);
 	     sav != NULL;
 	     sav = nextsav) {
 
 		nextsav = LIST_NEXT(sav, chain);
 
 		/* sanity check */
 		KEY_CHKSASTATE(sav->state, state, __func__);
 
 		/* initialize */
 		if (candidate == NULL) {
 			candidate = sav;
 			continue;
 		}
 
 		/* Which SA is the better ? */
 
 		IPSEC_ASSERT(candidate->lft_c != NULL,
 			("null candidate lifetime"));
 		IPSEC_ASSERT(sav->lft_c != NULL, ("null sav lifetime"));
 
 		/* What the best method is to compare ? */
 		if (V_key_preferred_oldsa) {
 			if (candidate->lft_c->addtime >
 					sav->lft_c->addtime) {
 				candidate = sav;
 			}
 			continue;
 			/*NOTREACHED*/
 		}
 
 		/* preferred new sa rather than old sa */
 		if (candidate->lft_c->addtime <
 				sav->lft_c->addtime) {
 			d = candidate;
 			candidate = sav;
 		} else
 			d = sav;
 
 		/*
 		 * prepared to delete the SA when there is more
 		 * suitable candidate and the lifetime of the SA is not
 		 * permanent.
 		 */
 		if (d->lft_h->addtime != 0) {
 			struct mbuf *m, *result;
 			u_int8_t satype;
 
 			key_sa_chgstate(d, SADB_SASTATE_DEAD);
 
 			IPSEC_ASSERT(d->refcnt > 0, ("bogus ref count"));
 
 			satype = key_proto2satype(d->sah->saidx.proto);
 			if (satype == 0)
 				goto msgfail;
 
 			m = key_setsadbmsg(SADB_DELETE, 0,
 			    satype, 0, 0, d->refcnt - 1);
 			if (!m)
 				goto msgfail;
 			result = m;
 
 			/* set sadb_address for saidx's. */
 			m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
 				&d->sah->saidx.src.sa,
 				d->sah->saidx.src.sa.sa_len << 3,
 				IPSEC_ULPROTO_ANY);
 			if (!m)
 				goto msgfail;
 			m_cat(result, m);
 
 			/* set sadb_address for saidx's. */
 			m = key_setsadbaddr(SADB_EXT_ADDRESS_DST,
 				&d->sah->saidx.dst.sa,
 				d->sah->saidx.dst.sa.sa_len << 3,
 				IPSEC_ULPROTO_ANY);
 			if (!m)
 				goto msgfail;
 			m_cat(result, m);
 
 			/* create SA extension */
 			m = key_setsadbsa(d);
 			if (!m)
 				goto msgfail;
 			m_cat(result, m);
 
 			if (result->m_len < sizeof(struct sadb_msg)) {
 				result = m_pullup(result,
 						sizeof(struct sadb_msg));
 				if (result == NULL)
 					goto msgfail;
 			}
 
 			result->m_pkthdr.len = 0;
 			for (m = result; m; m = m->m_next)
 				result->m_pkthdr.len += m->m_len;
 			mtod(result, struct sadb_msg *)->sadb_msg_len =
 				PFKEY_UNIT64(result->m_pkthdr.len);
 
 			if (key_sendup_mbuf(NULL, result,
 					KEY_SENDUP_REGISTERED))
 				goto msgfail;
 		 msgfail:
 			KEY_FREESAV(&d);
 		}
 	}
 	if (candidate) {
 		sa_addref(candidate);
 		KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
 			printf("DP %s cause refcnt++:%d SA:%p\n",
 				__func__, candidate->refcnt, candidate));
 	}
 	SAHTREE_UNLOCK();
 
 	return candidate;
 }
 
 /*
  * allocating a usable SA entry for a *INBOUND* packet.
  * Must call key_freesav() later.
  * OUT: positive:	pointer to a usable sav (i.e. MATURE or DYING state).
  *	NULL:		not found, or error occured.
  *
  * In the comparison, no source address is used--for RFC2401 conformance.
  * To quote, from section 4.1:
  *	A security association is uniquely identified by a triple consisting
  *	of a Security Parameter Index (SPI), an IP Destination Address, and a
  *	security protocol (AH or ESP) identifier.
  * Note that, however, we do need to keep source address in IPsec SA.
  * IKE specification and PF_KEY specification do assume that we
  * keep source address in IPsec SA.  We see a tricky situation here.
  */
 struct secasvar *
 key_allocsa(
 	union sockaddr_union *dst,
 	u_int proto,
 	u_int32_t spi,
 	const char* where, int tag)
 {
 	struct secashead *sah;
 	struct secasvar *sav;
 	u_int stateidx, arraysize, state;
 	const u_int *saorder_state_valid;
 #ifdef IPSEC_NAT_T
 	int natt_chkport;
 #endif
 
 	IPSEC_ASSERT(dst != NULL, ("null dst address"));
 
 	KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
 		printf("DP %s from %s:%u\n", __func__, where, tag));
 
 #ifdef IPSEC_NAT_T
         natt_chkport = (dst->sa.sa_family == AF_INET &&
 	    dst->sa.sa_len == sizeof(struct sockaddr_in) &&
 	    dst->sin.sin_port != 0);
 #endif
 
 	/*
 	 * searching SAD.
 	 * XXX: to be checked internal IP header somewhere.  Also when
 	 * IPsec tunnel packet is received.  But ESP tunnel mode is
 	 * encrypted so we can't check internal IP header.
 	 */
 	SAHTREE_LOCK();
 	if (V_key_preferred_oldsa) {
 		saorder_state_valid = saorder_state_valid_prefer_old;
 		arraysize = _ARRAYLEN(saorder_state_valid_prefer_old);
 	} else {
 		saorder_state_valid = saorder_state_valid_prefer_new;
 		arraysize = _ARRAYLEN(saorder_state_valid_prefer_new);
 	}
 	LIST_FOREACH(sah, &V_sahtree, chain) {
 		int checkport;
 
 		/* search valid state */
 		for (stateidx = 0; stateidx < arraysize; stateidx++) {
 			state = saorder_state_valid[stateidx];
 			LIST_FOREACH(sav, &sah->savtree[state], chain) {
 				/* sanity check */
 				KEY_CHKSASTATE(sav->state, state, __func__);
 				/* do not return entries w/ unusable state */
 				if (sav->state != SADB_SASTATE_MATURE &&
 				    sav->state != SADB_SASTATE_DYING)
 					continue;
 				if (proto != sav->sah->saidx.proto)
 					continue;
 				if (spi != sav->spi)
 					continue;
 				checkport = 0;
 #ifdef IPSEC_NAT_T
 				/*
 				 * Really only check ports when this is a NAT-T
 				 * SA.  Otherwise other lookups providing ports
 				 * might suffer.
 				 */
 				if (sav->natt_type && natt_chkport)
 					checkport = 1;
 #endif
 #if 0	/* don't check src */
 				/* check src address */
 				if (key_sockaddrcmp(&src->sa,	
 				    &sav->sah->saidx.src.sa, checkport) != 0)
 					continue;
 #endif
 				/* check dst address */
 				if (key_sockaddrcmp(&dst->sa,
 				    &sav->sah->saidx.dst.sa, checkport) != 0)
 					continue;
 				sa_addref(sav);
 				goto done;
 			}
 		}
 	}
 	sav = NULL;
 done:
 	SAHTREE_UNLOCK();
 
 	KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
 		printf("DP %s return SA:%p; refcnt %u\n", __func__,
 			sav, sav ? sav->refcnt : 0));
 	return sav;
 }
 
 /*
  * Must be called after calling key_allocsp().
  * For both the packet without socket and key_freeso().
  */
 void
 _key_freesp(struct secpolicy **spp, const char* where, int tag)
 {
 	struct secpolicy *sp = *spp;
 
 	IPSEC_ASSERT(sp != NULL, ("null sp"));
 
 	SPTREE_LOCK();
 	SP_DELREF(sp);
 
 	KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
 		printf("DP %s SP:%p (ID=%u) from %s:%u; refcnt now %u\n",
 			__func__, sp, sp->id, where, tag, sp->refcnt));
 
 	if (sp->refcnt == 0) {
 		*spp = NULL;
 		key_delsp(sp);
 	}
 	SPTREE_UNLOCK();
 }
 
 /*
  * Must be called after calling key_allocsp().
  * For the packet with socket.
  */
 void
 key_freeso(struct socket *so)
 {
 	IPSEC_ASSERT(so != NULL, ("null so"));
 
 	switch (so->so_proto->pr_domain->dom_family) {
 #if defined(INET) || defined(INET6)
 #ifdef INET
 	case PF_INET:
 #endif
 #ifdef INET6
 	case PF_INET6:
 #endif
 	    {
 		struct inpcb *pcb = sotoinpcb(so);
 
 		/* Does it have a PCB ? */
 		if (pcb == NULL)
 			return;
 		key_freesp_so(&pcb->inp_sp->sp_in);
 		key_freesp_so(&pcb->inp_sp->sp_out);
 	    }
 		break;
 #endif /* INET || INET6 */
 	default:
 		ipseclog((LOG_DEBUG, "%s: unknown address family=%d.\n",
 		    __func__, so->so_proto->pr_domain->dom_family));
 		return;
 	}
 }
 
 static void
 key_freesp_so(struct secpolicy **sp)
 {
 	IPSEC_ASSERT(sp != NULL && *sp != NULL, ("null sp"));
 
 	if ((*sp)->policy == IPSEC_POLICY_ENTRUST ||
 	    (*sp)->policy == IPSEC_POLICY_BYPASS)
 		return;
 
 	IPSEC_ASSERT((*sp)->policy == IPSEC_POLICY_IPSEC,
 		("invalid policy %u", (*sp)->policy));
 	KEY_FREESP(sp);
 }
 
 void
 key_addrefsa(struct secasvar *sav, const char* where, int tag)
 {
 
 	IPSEC_ASSERT(sav != NULL, ("null sav"));
 	IPSEC_ASSERT(sav->refcnt > 0, ("refcount must exist"));
 
 	sa_addref(sav);
 }
 
 /*
  * Must be called after calling key_allocsa().
  * This function is called by key_freesp() to free some SA allocated
  * for a policy.
  */
 void
 key_freesav(struct secasvar **psav, const char* where, int tag)
 {
 	struct secasvar *sav = *psav;
 
 	IPSEC_ASSERT(sav != NULL, ("null sav"));
 
 	if (sa_delref(sav)) {
 		KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
 			printf("DP %s SA:%p (SPI %u) from %s:%u; refcnt now %u\n",
 				__func__, sav, ntohl(sav->spi), where, tag, sav->refcnt));
 		*psav = NULL;
 		key_delsav(sav);
 	} else {
 		KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
 			printf("DP %s SA:%p (SPI %u) from %s:%u; refcnt now %u\n",
 				__func__, sav, ntohl(sav->spi), where, tag, sav->refcnt));
 	}
 }
 
 /* %%% SPD management */
 /*
  * free security policy entry.
  */
 static void
 key_delsp(struct secpolicy *sp)
 {
 	struct ipsecrequest *isr, *nextisr;
 
 	IPSEC_ASSERT(sp != NULL, ("null sp"));
 	SPTREE_LOCK_ASSERT();
 
 	sp->state = IPSEC_SPSTATE_DEAD;
 
 	IPSEC_ASSERT(sp->refcnt == 0,
 		("SP with references deleted (refcnt %u)", sp->refcnt));
 
 	/* remove from SP index */
 	if (__LIST_CHAINED(sp))
 		LIST_REMOVE(sp, chain);
 
 	for (isr = sp->req; isr != NULL; isr = nextisr) {
 		if (isr->sav != NULL) {
 			KEY_FREESAV(&isr->sav);
 			isr->sav = NULL;
 		}
 
 		nextisr = isr->next;
 		ipsec_delisr(isr);
 	}
 	_key_delsp(sp);
 }
 
 /*
  * search SPD
  * OUT:	NULL	: not found
  *	others	: found, pointer to a SP.
  */
 static struct secpolicy *
 key_getsp(struct secpolicyindex *spidx)
 {
 	struct secpolicy *sp;
 
 	IPSEC_ASSERT(spidx != NULL, ("null spidx"));
 
 	SPTREE_LOCK();
 	LIST_FOREACH(sp, &V_sptree[spidx->dir], chain) {
 		if (sp->state == IPSEC_SPSTATE_DEAD)
 			continue;
 		if (key_cmpspidx_exactly(spidx, &sp->spidx)) {
 			SP_ADDREF(sp);
 			break;
 		}
 	}
 	SPTREE_UNLOCK();
 
 	return sp;
 }
 
 /*
  * get SP by index.
  * OUT:	NULL	: not found
  *	others	: found, pointer to a SP.
  */
 static struct secpolicy *
 key_getspbyid(u_int32_t id)
 {
 	struct secpolicy *sp;
 
 	SPTREE_LOCK();
 	LIST_FOREACH(sp, &V_sptree[IPSEC_DIR_INBOUND], chain) {
 		if (sp->state == IPSEC_SPSTATE_DEAD)
 			continue;
 		if (sp->id == id) {
 			SP_ADDREF(sp);
 			goto done;
 		}
 	}
 
 	LIST_FOREACH(sp, &V_sptree[IPSEC_DIR_OUTBOUND], chain) {
 		if (sp->state == IPSEC_SPSTATE_DEAD)
 			continue;
 		if (sp->id == id) {
 			SP_ADDREF(sp);
 			goto done;
 		}
 	}
 done:
 	SPTREE_UNLOCK();
 
 	return sp;
 }
 
 struct secpolicy *
 key_newsp(const char* where, int tag)
 {
 	struct secpolicy *newsp = NULL;
 
 	newsp = (struct secpolicy *)
 		malloc(sizeof(struct secpolicy), M_IPSEC_SP, M_NOWAIT|M_ZERO);
 	if (newsp) {
 		SECPOLICY_LOCK_INIT(newsp);
 		newsp->refcnt = 1;
 		newsp->req = NULL;
 	}
 
 	KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
 		printf("DP %s from %s:%u return SP:%p\n", __func__,
 			where, tag, newsp));
 	return newsp;
 }
 
 static void
 _key_delsp(struct secpolicy *sp)
 {
 	SECPOLICY_LOCK_DESTROY(sp);
 	free(sp, M_IPSEC_SP);
 }
 
 /*
  * create secpolicy structure from sadb_x_policy structure.
  * NOTE: `state', `secpolicyindex' in secpolicy structure are not set,
  * so must be set properly later.
  */
 struct secpolicy *
 key_msg2sp(xpl0, len, error)
 	struct sadb_x_policy *xpl0;
 	size_t len;
 	int *error;
 {
 	struct secpolicy *newsp;
 
 	IPSEC_ASSERT(xpl0 != NULL, ("null xpl0"));
 	IPSEC_ASSERT(len >= sizeof(*xpl0), ("policy too short: %zu", len));
 
 	if (len != PFKEY_EXTLEN(xpl0)) {
 		ipseclog((LOG_DEBUG, "%s: Invalid msg length.\n", __func__));
 		*error = EINVAL;
 		return NULL;
 	}
 
 	if ((newsp = KEY_NEWSP()) == NULL) {
 		*error = ENOBUFS;
 		return NULL;
 	}
 
 	newsp->spidx.dir = xpl0->sadb_x_policy_dir;
 	newsp->policy = xpl0->sadb_x_policy_type;
 
 	/* check policy */
 	switch (xpl0->sadb_x_policy_type) {
 	case IPSEC_POLICY_DISCARD:
 	case IPSEC_POLICY_NONE:
 	case IPSEC_POLICY_ENTRUST:
 	case IPSEC_POLICY_BYPASS:
 		newsp->req = NULL;
 		break;
 
 	case IPSEC_POLICY_IPSEC:
 	    {
 		int tlen;
 		struct sadb_x_ipsecrequest *xisr;
 		struct ipsecrequest **p_isr = &newsp->req;
 
 		/* validity check */
 		if (PFKEY_EXTLEN(xpl0) < sizeof(*xpl0)) {
 			ipseclog((LOG_DEBUG, "%s: Invalid msg length.\n",
 				__func__));
 			KEY_FREESP(&newsp);
 			*error = EINVAL;
 			return NULL;
 		}
 
 		tlen = PFKEY_EXTLEN(xpl0) - sizeof(*xpl0);
 		xisr = (struct sadb_x_ipsecrequest *)(xpl0 + 1);
 
 		while (tlen > 0) {
 			/* length check */
 			if (xisr->sadb_x_ipsecrequest_len < sizeof(*xisr)) {
 				ipseclog((LOG_DEBUG, "%s: invalid ipsecrequest "
 					"length.\n", __func__));
 				KEY_FREESP(&newsp);
 				*error = EINVAL;
 				return NULL;
 			}
 
 			/* allocate request buffer */
 			/* NB: data structure is zero'd */
 			*p_isr = ipsec_newisr();
 			if ((*p_isr) == NULL) {
 				ipseclog((LOG_DEBUG,
 				    "%s: No more memory.\n", __func__));
 				KEY_FREESP(&newsp);
 				*error = ENOBUFS;
 				return NULL;
 			}
 
 			/* set values */
 			switch (xisr->sadb_x_ipsecrequest_proto) {
 			case IPPROTO_ESP:
 			case IPPROTO_AH:
 			case IPPROTO_IPCOMP:
 				break;
 			default:
 				ipseclog((LOG_DEBUG,
 				    "%s: invalid proto type=%u\n", __func__,
 				    xisr->sadb_x_ipsecrequest_proto));
 				KEY_FREESP(&newsp);
 				*error = EPROTONOSUPPORT;
 				return NULL;
 			}
 			(*p_isr)->saidx.proto = xisr->sadb_x_ipsecrequest_proto;
 
 			switch (xisr->sadb_x_ipsecrequest_mode) {
 			case IPSEC_MODE_TRANSPORT:
 			case IPSEC_MODE_TUNNEL:
 				break;
 			case IPSEC_MODE_ANY:
 			default:
 				ipseclog((LOG_DEBUG,
 				    "%s: invalid mode=%u\n", __func__,
 				    xisr->sadb_x_ipsecrequest_mode));
 				KEY_FREESP(&newsp);
 				*error = EINVAL;
 				return NULL;
 			}
 			(*p_isr)->saidx.mode = xisr->sadb_x_ipsecrequest_mode;
 
 			switch (xisr->sadb_x_ipsecrequest_level) {
 			case IPSEC_LEVEL_DEFAULT:
 			case IPSEC_LEVEL_USE:
 			case IPSEC_LEVEL_REQUIRE:
 				break;
 			case IPSEC_LEVEL_UNIQUE:
 				/* validity check */
 				/*
 				 * If range violation of reqid, kernel will
 				 * update it, don't refuse it.
 				 */
 				if (xisr->sadb_x_ipsecrequest_reqid
 						> IPSEC_MANUAL_REQID_MAX) {
 					ipseclog((LOG_DEBUG,
 					    "%s: reqid=%d range "
 					    "violation, updated by kernel.\n",
 					    __func__,
 					    xisr->sadb_x_ipsecrequest_reqid));
 					xisr->sadb_x_ipsecrequest_reqid = 0;
 				}
 
 				/* allocate new reqid id if reqid is zero. */
 				if (xisr->sadb_x_ipsecrequest_reqid == 0) {
 					u_int32_t reqid;
 					if ((reqid = key_newreqid()) == 0) {
 						KEY_FREESP(&newsp);
 						*error = ENOBUFS;
 						return NULL;
 					}
 					(*p_isr)->saidx.reqid = reqid;
 					xisr->sadb_x_ipsecrequest_reqid = reqid;
 				} else {
 				/* set it for manual keying. */
 					(*p_isr)->saidx.reqid =
 						xisr->sadb_x_ipsecrequest_reqid;
 				}
 				break;
 
 			default:
 				ipseclog((LOG_DEBUG, "%s: invalid level=%u\n",
 					__func__,
 					xisr->sadb_x_ipsecrequest_level));
 				KEY_FREESP(&newsp);
 				*error = EINVAL;
 				return NULL;
 			}
 			(*p_isr)->level = xisr->sadb_x_ipsecrequest_level;
 
 			/* set IP addresses if there */
 			if (xisr->sadb_x_ipsecrequest_len > sizeof(*xisr)) {
 				struct sockaddr *paddr;
 
 				paddr = (struct sockaddr *)(xisr + 1);
 
 				/* validity check */
 				if (paddr->sa_len
 				    > sizeof((*p_isr)->saidx.src)) {
 					ipseclog((LOG_DEBUG, "%s: invalid "
 						"request address length.\n",
 						__func__));
 					KEY_FREESP(&newsp);
 					*error = EINVAL;
 					return NULL;
 				}
 				bcopy(paddr, &(*p_isr)->saidx.src,
 					paddr->sa_len);
 
 				paddr = (struct sockaddr *)((caddr_t)paddr
 							+ paddr->sa_len);
 
 				/* validity check */
 				if (paddr->sa_len
 				    > sizeof((*p_isr)->saidx.dst)) {
 					ipseclog((LOG_DEBUG, "%s: invalid "
 						"request address length.\n",
 						__func__));
 					KEY_FREESP(&newsp);
 					*error = EINVAL;
 					return NULL;
 				}
 				bcopy(paddr, &(*p_isr)->saidx.dst,
 					paddr->sa_len);
 			}
 
 			(*p_isr)->sp = newsp;
 
 			/* initialization for the next. */
 			p_isr = &(*p_isr)->next;
 			tlen -= xisr->sadb_x_ipsecrequest_len;
 
 			/* validity check */
 			if (tlen < 0) {
 				ipseclog((LOG_DEBUG, "%s: becoming tlen < 0.\n",
 					__func__));
 				KEY_FREESP(&newsp);
 				*error = EINVAL;
 				return NULL;
 			}
 
 			xisr = (struct sadb_x_ipsecrequest *)((caddr_t)xisr
 			                 + xisr->sadb_x_ipsecrequest_len);
 		}
 	    }
 		break;
 	default:
 		ipseclog((LOG_DEBUG, "%s: invalid policy type.\n", __func__));
 		KEY_FREESP(&newsp);
 		*error = EINVAL;
 		return NULL;
 	}
 
 	*error = 0;
 	return newsp;
 }
 
 static u_int32_t
 key_newreqid()
 {
 	static u_int32_t auto_reqid = IPSEC_MANUAL_REQID_MAX + 1;
 
 	auto_reqid = (auto_reqid == ~0
 			? IPSEC_MANUAL_REQID_MAX + 1 : auto_reqid + 1);
 
 	/* XXX should be unique check */
 
 	return auto_reqid;
 }
 
 /*
  * copy secpolicy struct to sadb_x_policy structure indicated.
  */
 struct mbuf *
 key_sp2msg(sp)
 	struct secpolicy *sp;
 {
 	struct sadb_x_policy *xpl;
 	int tlen;
 	caddr_t p;
 	struct mbuf *m;
 
 	IPSEC_ASSERT(sp != NULL, ("null policy"));
 
 	tlen = key_getspreqmsglen(sp);
 
 	m = m_get2(tlen, M_NOWAIT, MT_DATA, 0);
 	if (m == NULL)
 		return (NULL);
 	m_align(m, tlen);
 	m->m_len = tlen;
 	xpl = mtod(m, struct sadb_x_policy *);
 	bzero(xpl, tlen);
 
 	xpl->sadb_x_policy_len = PFKEY_UNIT64(tlen);
 	xpl->sadb_x_policy_exttype = SADB_X_EXT_POLICY;
 	xpl->sadb_x_policy_type = sp->policy;
 	xpl->sadb_x_policy_dir = sp->spidx.dir;
 	xpl->sadb_x_policy_id = sp->id;
 	p = (caddr_t)xpl + sizeof(*xpl);
 
 	/* if is the policy for ipsec ? */
 	if (sp->policy == IPSEC_POLICY_IPSEC) {
 		struct sadb_x_ipsecrequest *xisr;
 		struct ipsecrequest *isr;
 
 		for (isr = sp->req; isr != NULL; isr = isr->next) {
 
 			xisr = (struct sadb_x_ipsecrequest *)p;
 
 			xisr->sadb_x_ipsecrequest_proto = isr->saidx.proto;
 			xisr->sadb_x_ipsecrequest_mode = isr->saidx.mode;
 			xisr->sadb_x_ipsecrequest_level = isr->level;
 			xisr->sadb_x_ipsecrequest_reqid = isr->saidx.reqid;
 
 			p += sizeof(*xisr);
 			bcopy(&isr->saidx.src, p, isr->saidx.src.sa.sa_len);
 			p += isr->saidx.src.sa.sa_len;
 			bcopy(&isr->saidx.dst, p, isr->saidx.dst.sa.sa_len);
 			p += isr->saidx.src.sa.sa_len;
 
 			xisr->sadb_x_ipsecrequest_len =
 				PFKEY_ALIGN8(sizeof(*xisr)
 					+ isr->saidx.src.sa.sa_len
 					+ isr->saidx.dst.sa.sa_len);
 		}
 	}
 
 	return m;
 }
 
 /* m will not be freed nor modified */
 static struct mbuf *
 #ifdef __STDC__
 key_gather_mbuf(struct mbuf *m, const struct sadb_msghdr *mhp,
 	int ndeep, int nitem, ...)
 #else
 key_gather_mbuf(m, mhp, ndeep, nitem, va_alist)
 	struct mbuf *m;
 	const struct sadb_msghdr *mhp;
 	int ndeep;
 	int nitem;
 	va_dcl
 #endif
 {
 	va_list ap;
 	int idx;
 	int i;
 	struct mbuf *result = NULL, *n;
 	int len;
 
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
 
 	va_start(ap, nitem);
 	for (i = 0; i < nitem; i++) {
 		idx = va_arg(ap, int);
 		if (idx < 0 || idx > SADB_EXT_MAX)
 			goto fail;
 		/* don't attempt to pull empty extension */
 		if (idx == SADB_EXT_RESERVED && mhp->msg == NULL)
 			continue;
 		if (idx != SADB_EXT_RESERVED  &&
 		    (mhp->ext[idx] == NULL || mhp->extlen[idx] == 0))
 			continue;
 
 		if (idx == SADB_EXT_RESERVED) {
 			len = PFKEY_ALIGN8(sizeof(struct sadb_msg));
 
 			IPSEC_ASSERT(len <= MHLEN, ("header too big %u", len));
 
 			MGETHDR(n, M_NOWAIT, MT_DATA);
 			if (!n)
 				goto fail;
 			n->m_len = len;
 			n->m_next = NULL;
 			m_copydata(m, 0, sizeof(struct sadb_msg),
 			    mtod(n, caddr_t));
 		} else if (i < ndeep) {
 			len = mhp->extlen[idx];
 			n = m_get2(len, M_NOWAIT, MT_DATA, 0);
 			if (n == NULL)
 				goto fail;
 			m_align(n, len);
 			n->m_len = len;
 			m_copydata(m, mhp->extoff[idx], mhp->extlen[idx],
 			    mtod(n, caddr_t));
 		} else {
 			n = m_copym(m, mhp->extoff[idx], mhp->extlen[idx],
 			    M_NOWAIT);
 		}
 		if (n == NULL)
 			goto fail;
 
 		if (result)
 			m_cat(result, n);
 		else
 			result = n;
 	}
 	va_end(ap);
 
 	if ((result->m_flags & M_PKTHDR) != 0) {
 		result->m_pkthdr.len = 0;
 		for (n = result; n; n = n->m_next)
 			result->m_pkthdr.len += n->m_len;
 	}
 
 	return result;
 
 fail:
 	m_freem(result);
 	va_end(ap);
 	return NULL;
 }
 
 /*
  * SADB_X_SPDADD, SADB_X_SPDSETIDX or SADB_X_SPDUPDATE processing
  * add an entry to SP database, when received
  *   <base, address(SD), (lifetime(H),) policy>
  * from the user(?).
  * Adding to SP database,
  * and send
  *   <base, address(SD), (lifetime(H),) policy>
  * to the socket which was send.
  *
  * SPDADD set a unique policy entry.
  * SPDSETIDX like SPDADD without a part of policy requests.
  * SPDUPDATE replace a unique policy entry.
  *
  * m will always be freed.
  */
 static int
 key_spdadd(so, m, mhp)
 	struct socket *so;
 	struct mbuf *m;
 	const struct sadb_msghdr *mhp;
 {
 	struct sadb_address *src0, *dst0;
 	struct sadb_x_policy *xpl0, *xpl;
 	struct sadb_lifetime *lft = NULL;
 	struct secpolicyindex spidx;
 	struct secpolicy *newsp;
 	int error;
 
 	IPSEC_ASSERT(so != NULL, ("null socket"));
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
 	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
 
 	if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL ||
 	    mhp->ext[SADB_EXT_ADDRESS_DST] == NULL ||
 	    mhp->ext[SADB_X_EXT_POLICY] == NULL) {
 		ipseclog((LOG_DEBUG, "key_spdadd: invalid message is passed.\n"));
 		return key_senderror(so, m, EINVAL);
 	}
 	if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) ||
 	    mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address) ||
 	    mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) {
 		ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 	if (mhp->ext[SADB_EXT_LIFETIME_HARD] != NULL) {
 		if (mhp->extlen[SADB_EXT_LIFETIME_HARD]
 			< sizeof(struct sadb_lifetime)) {
 			ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
 				__func__));
 			return key_senderror(so, m, EINVAL);
 		}
 		lft = (struct sadb_lifetime *)mhp->ext[SADB_EXT_LIFETIME_HARD];
 	}
 
 	src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC];
 	dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST];
 	xpl0 = (struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY];
 
 	/* 
 	 * Note: do not parse SADB_X_EXT_NAT_T_* here:
 	 * we are processing traffic endpoints.
 	 */
 
 	/* make secindex */
 	/* XXX boundary check against sa_len */
 	KEY_SETSECSPIDX(xpl0->sadb_x_policy_dir,
 	                src0 + 1,
 	                dst0 + 1,
 	                src0->sadb_address_prefixlen,
 	                dst0->sadb_address_prefixlen,
 	                src0->sadb_address_proto,
 	                &spidx);
 
 	/* checking the direciton. */
 	switch (xpl0->sadb_x_policy_dir) {
 	case IPSEC_DIR_INBOUND:
 	case IPSEC_DIR_OUTBOUND:
 		break;
 	default:
 		ipseclog((LOG_DEBUG, "%s: Invalid SP direction.\n", __func__));
 		mhp->msg->sadb_msg_errno = EINVAL;
 		return 0;
 	}
 
 	/* check policy */
 	/* key_spdadd() accepts DISCARD, NONE and IPSEC. */
 	if (xpl0->sadb_x_policy_type == IPSEC_POLICY_ENTRUST
 	 || xpl0->sadb_x_policy_type == IPSEC_POLICY_BYPASS) {
 		ipseclog((LOG_DEBUG, "%s: Invalid policy type.\n", __func__));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	/* policy requests are mandatory when action is ipsec. */
         if (mhp->msg->sadb_msg_type != SADB_X_SPDSETIDX
 	 && xpl0->sadb_x_policy_type == IPSEC_POLICY_IPSEC
 	 && mhp->extlen[SADB_X_EXT_POLICY] <= sizeof(*xpl0)) {
 		ipseclog((LOG_DEBUG, "%s: some policy requests part required\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	/*
 	 * checking there is SP already or not.
 	 * SPDUPDATE doesn't depend on whether there is a SP or not.
 	 * If the type is either SPDADD or SPDSETIDX AND a SP is found,
 	 * then error.
 	 */
 	newsp = key_getsp(&spidx);
 	if (mhp->msg->sadb_msg_type == SADB_X_SPDUPDATE) {
 		if (newsp) {
 			SPTREE_LOCK();
 			newsp->state = IPSEC_SPSTATE_DEAD;
 			SPTREE_UNLOCK();
 			KEY_FREESP(&newsp);
 		}
 	} else {
 		if (newsp != NULL) {
 			KEY_FREESP(&newsp);
 			ipseclog((LOG_DEBUG, "%s: a SP entry exists already.\n",
 				__func__));
 			return key_senderror(so, m, EEXIST);
 		}
 	}
 
 	/* allocation new SP entry */
 	if ((newsp = key_msg2sp(xpl0, PFKEY_EXTLEN(xpl0), &error)) == NULL) {
 		return key_senderror(so, m, error);
 	}
 
 	if ((newsp->id = key_getnewspid()) == 0) {
 		_key_delsp(newsp);
 		return key_senderror(so, m, ENOBUFS);
 	}
 
 	/* XXX boundary check against sa_len */
 	KEY_SETSECSPIDX(xpl0->sadb_x_policy_dir,
 	                src0 + 1,
 	                dst0 + 1,
 	                src0->sadb_address_prefixlen,
 	                dst0->sadb_address_prefixlen,
 	                src0->sadb_address_proto,
 	                &newsp->spidx);
 
 	/* sanity check on addr pair */
 	if (((struct sockaddr *)(src0 + 1))->sa_family !=
 			((struct sockaddr *)(dst0+ 1))->sa_family) {
 		_key_delsp(newsp);
 		return key_senderror(so, m, EINVAL);
 	}
 	if (((struct sockaddr *)(src0 + 1))->sa_len !=
 			((struct sockaddr *)(dst0+ 1))->sa_len) {
 		_key_delsp(newsp);
 		return key_senderror(so, m, EINVAL);
 	}
 #if 1
 	if (newsp->req && newsp->req->saidx.src.sa.sa_family && newsp->req->saidx.dst.sa.sa_family) {
 		if (newsp->req->saidx.src.sa.sa_family != newsp->req->saidx.dst.sa.sa_family) {
 			_key_delsp(newsp);
 			return key_senderror(so, m, EINVAL);
 		}
 	}
 #endif
 
 	newsp->created = time_second;
 	newsp->lastused = newsp->created;
 	newsp->lifetime = lft ? lft->sadb_lifetime_addtime : 0;
 	newsp->validtime = lft ? lft->sadb_lifetime_usetime : 0;
 
 	newsp->refcnt = 1;	/* do not reclaim until I say I do */
 	newsp->state = IPSEC_SPSTATE_ALIVE;
 	LIST_INSERT_TAIL(&V_sptree[newsp->spidx.dir], newsp, secpolicy, chain);
 
 	/* delete the entry in spacqtree */
 	if (mhp->msg->sadb_msg_type == SADB_X_SPDUPDATE) {
 		struct secspacq *spacq = key_getspacq(&spidx);
 		if (spacq != NULL) {
 			/* reset counter in order to deletion by timehandler. */
 			spacq->created = time_second;
 			spacq->count = 0;
 			SPACQ_UNLOCK();
 		}
     	}
 
     {
 	struct mbuf *n, *mpolicy;
 	struct sadb_msg *newmsg;
 	int off;
 
 	/*
 	 * Note: do not send SADB_X_EXT_NAT_T_* here:
 	 * we are sending traffic endpoints.
 	 */
 
 	/* create new sadb_msg to reply. */
 	if (lft) {
 		n = key_gather_mbuf(m, mhp, 2, 5, SADB_EXT_RESERVED,
 		    SADB_X_EXT_POLICY, SADB_EXT_LIFETIME_HARD,
 		    SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
 	} else {
 		n = key_gather_mbuf(m, mhp, 2, 4, SADB_EXT_RESERVED,
 		    SADB_X_EXT_POLICY,
 		    SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
 	}
 	if (!n)
 		return key_senderror(so, m, ENOBUFS);
 
 	if (n->m_len < sizeof(*newmsg)) {
 		n = m_pullup(n, sizeof(*newmsg));
 		if (!n)
 			return key_senderror(so, m, ENOBUFS);
 	}
 	newmsg = mtod(n, struct sadb_msg *);
 	newmsg->sadb_msg_errno = 0;
 	newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
 
 	off = 0;
 	mpolicy = m_pulldown(n, PFKEY_ALIGN8(sizeof(struct sadb_msg)),
 	    sizeof(*xpl), &off);
 	if (mpolicy == NULL) {
 		/* n is already freed */
 		return key_senderror(so, m, ENOBUFS);
 	}
 	xpl = (struct sadb_x_policy *)(mtod(mpolicy, caddr_t) + off);
 	if (xpl->sadb_x_policy_exttype != SADB_X_EXT_POLICY) {
 		m_freem(n);
 		return key_senderror(so, m, EINVAL);
 	}
 	xpl->sadb_x_policy_id = newsp->id;
 
 	m_freem(m);
 	return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
     }
 }
 
 /*
  * get new policy id.
  * OUT:
  *	0:	failure.
  *	others: success.
  */
 static u_int32_t
 key_getnewspid()
 {
 	u_int32_t newid = 0;
 	int count = V_key_spi_trycnt;	/* XXX */
 	struct secpolicy *sp;
 
 	/* when requesting to allocate spi ranged */
 	while (count--) {
 		newid = (V_policy_id = (V_policy_id == ~0 ? 1 : V_policy_id + 1));
 
 		if ((sp = key_getspbyid(newid)) == NULL)
 			break;
 
 		KEY_FREESP(&sp);
 	}
 
 	if (count == 0 || newid == 0) {
 		ipseclog((LOG_DEBUG, "%s: to allocate policy id is failed.\n",
 			__func__));
 		return 0;
 	}
 
 	return newid;
 }
 
 /*
  * SADB_SPDDELETE processing
  * receive
  *   <base, address(SD), policy(*)>
  * from the user(?), and set SADB_SASTATE_DEAD,
  * and send,
  *   <base, address(SD), policy(*)>
  * to the ikmpd.
  * policy(*) including direction of policy.
  *
  * m will always be freed.
  */
 static int
 key_spddelete(so, m, mhp)
 	struct socket *so;
 	struct mbuf *m;
 	const struct sadb_msghdr *mhp;
 {
 	struct sadb_address *src0, *dst0;
 	struct sadb_x_policy *xpl0;
 	struct secpolicyindex spidx;
 	struct secpolicy *sp;
 
 	IPSEC_ASSERT(so != NULL, ("null so"));
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
 	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
 
 	if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL ||
 	    mhp->ext[SADB_EXT_ADDRESS_DST] == NULL ||
 	    mhp->ext[SADB_X_EXT_POLICY] == NULL) {
 		ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 	if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) ||
 	    mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address) ||
 	    mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) {
 		ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC];
 	dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST];
 	xpl0 = (struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY];
 
 	/*
 	 * Note: do not parse SADB_X_EXT_NAT_T_* here:
 	 * we are processing traffic endpoints.
 	 */
 
 	/* make secindex */
 	/* XXX boundary check against sa_len */
 	KEY_SETSECSPIDX(xpl0->sadb_x_policy_dir,
 	                src0 + 1,
 	                dst0 + 1,
 	                src0->sadb_address_prefixlen,
 	                dst0->sadb_address_prefixlen,
 	                src0->sadb_address_proto,
 	                &spidx);
 
 	/* checking the direciton. */
 	switch (xpl0->sadb_x_policy_dir) {
 	case IPSEC_DIR_INBOUND:
 	case IPSEC_DIR_OUTBOUND:
 		break;
 	default:
 		ipseclog((LOG_DEBUG, "%s: Invalid SP direction.\n", __func__));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	/* Is there SP in SPD ? */
 	if ((sp = key_getsp(&spidx)) == NULL) {
 		ipseclog((LOG_DEBUG, "%s: no SP found.\n", __func__));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	/* save policy id to buffer to be returned. */
 	xpl0->sadb_x_policy_id = sp->id;
 
 	SPTREE_LOCK();
 	sp->state = IPSEC_SPSTATE_DEAD;
 	SPTREE_UNLOCK();
 	KEY_FREESP(&sp);
 
     {
 	struct mbuf *n;
 	struct sadb_msg *newmsg;
 
 	/*
 	 * Note: do not send SADB_X_EXT_NAT_T_* here:
 	 * we are sending traffic endpoints.
 	 */
 
 	/* create new sadb_msg to reply. */
 	n = key_gather_mbuf(m, mhp, 1, 4, SADB_EXT_RESERVED,
 	    SADB_X_EXT_POLICY, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
 	if (!n)
 		return key_senderror(so, m, ENOBUFS);
 
 	newmsg = mtod(n, struct sadb_msg *);
 	newmsg->sadb_msg_errno = 0;
 	newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
 
 	m_freem(m);
 	return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
     }
 }
 
 /*
  * SADB_SPDDELETE2 processing
  * receive
  *   <base, policy(*)>
  * from the user(?), and set SADB_SASTATE_DEAD,
  * and send,
  *   <base, policy(*)>
  * to the ikmpd.
  * policy(*) including direction of policy.
  *
  * m will always be freed.
  */
 static int
 key_spddelete2(so, m, mhp)
 	struct socket *so;
 	struct mbuf *m;
 	const struct sadb_msghdr *mhp;
 {
 	u_int32_t id;
 	struct secpolicy *sp;
 
 	IPSEC_ASSERT(so != NULL, ("null socket"));
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
 	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
 
 	if (mhp->ext[SADB_X_EXT_POLICY] == NULL ||
 	    mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) {
 		ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	id = ((struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY])->sadb_x_policy_id;
 
 	/* Is there SP in SPD ? */
 	if ((sp = key_getspbyid(id)) == NULL) {
 		ipseclog((LOG_DEBUG, "%s: no SP found id:%u.\n", __func__, id));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	SPTREE_LOCK();
 	sp->state = IPSEC_SPSTATE_DEAD;
 	SPTREE_UNLOCK();
 	KEY_FREESP(&sp);
 
     {
 	struct mbuf *n, *nn;
 	struct sadb_msg *newmsg;
 	int off, len;
 
 	/* create new sadb_msg to reply. */
 	len = PFKEY_ALIGN8(sizeof(struct sadb_msg));
 
 	MGETHDR(n, M_NOWAIT, MT_DATA);
 	if (n && len > MHLEN) {
 		MCLGET(n, M_NOWAIT);
 		if ((n->m_flags & M_EXT) == 0) {
 			m_freem(n);
 			n = NULL;
 		}
 	}
 	if (!n)
 		return key_senderror(so, m, ENOBUFS);
 
 	n->m_len = len;
 	n->m_next = NULL;
 	off = 0;
 
 	m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t) + off);
 	off += PFKEY_ALIGN8(sizeof(struct sadb_msg));
 
 	IPSEC_ASSERT(off == len, ("length inconsistency (off %u len %u)",
 		off, len));
 
 	n->m_next = m_copym(m, mhp->extoff[SADB_X_EXT_POLICY],
 	    mhp->extlen[SADB_X_EXT_POLICY], M_NOWAIT);
 	if (!n->m_next) {
 		m_freem(n);
 		return key_senderror(so, m, ENOBUFS);
 	}
 
 	n->m_pkthdr.len = 0;
 	for (nn = n; nn; nn = nn->m_next)
 		n->m_pkthdr.len += nn->m_len;
 
 	newmsg = mtod(n, struct sadb_msg *);
 	newmsg->sadb_msg_errno = 0;
 	newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
 
 	m_freem(m);
 	return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
     }
 }
 
 /*
  * SADB_X_GET processing
  * receive
  *   <base, policy(*)>
  * from the user(?),
  * and send,
  *   <base, address(SD), policy>
  * to the ikmpd.
  * policy(*) including direction of policy.
  *
  * m will always be freed.
  */
 static int
 key_spdget(so, m, mhp)
 	struct socket *so;
 	struct mbuf *m;
 	const struct sadb_msghdr *mhp;
 {
 	u_int32_t id;
 	struct secpolicy *sp;
 	struct mbuf *n;
 
 	IPSEC_ASSERT(so != NULL, ("null socket"));
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
 	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
 
 	if (mhp->ext[SADB_X_EXT_POLICY] == NULL ||
 	    mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) {
 		ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	id = ((struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY])->sadb_x_policy_id;
 
 	/* Is there SP in SPD ? */
 	if ((sp = key_getspbyid(id)) == NULL) {
 		ipseclog((LOG_DEBUG, "%s: no SP found id:%u.\n", __func__, id));
 		return key_senderror(so, m, ENOENT);
 	}
 
 	n = key_setdumpsp(sp, SADB_X_SPDGET, 0, mhp->msg->sadb_msg_pid);
 	KEY_FREESP(&sp);
 	if (n != NULL) {
 		m_freem(m);
 		return key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
 	} else
 		return key_senderror(so, m, ENOBUFS);
 }
 
 /*
  * SADB_X_SPDACQUIRE processing.
  * Acquire policy and SA(s) for a *OUTBOUND* packet.
  * send
  *   <base, policy(*)>
  * to KMD, and expect to receive
  *   <base> with SADB_X_SPDACQUIRE if error occured,
  * or
  *   <base, policy>
  * with SADB_X_SPDUPDATE from KMD by PF_KEY.
  * policy(*) is without policy requests.
  *
  *    0     : succeed
  *    others: error number
  */
 int
 key_spdacquire(sp)
 	struct secpolicy *sp;
 {
 	struct mbuf *result = NULL, *m;
 	struct secspacq *newspacq;
 
 	IPSEC_ASSERT(sp != NULL, ("null secpolicy"));
 	IPSEC_ASSERT(sp->req == NULL, ("policy exists"));
 	IPSEC_ASSERT(sp->policy == IPSEC_POLICY_IPSEC,
 		("policy not IPSEC %u", sp->policy));
 
 	/* Get an entry to check whether sent message or not. */
 	newspacq = key_getspacq(&sp->spidx);
 	if (newspacq != NULL) {
 		if (V_key_blockacq_count < newspacq->count) {
 			/* reset counter and do send message. */
 			newspacq->count = 0;
 		} else {
 			/* increment counter and do nothing. */
 			newspacq->count++;
 			return 0;
 		}
 		SPACQ_UNLOCK();
 	} else {
 		/* make new entry for blocking to send SADB_ACQUIRE. */
 		newspacq = key_newspacq(&sp->spidx);
 		if (newspacq == NULL)
 			return ENOBUFS;
 	}
 
 	/* create new sadb_msg to reply. */
 	m = key_setsadbmsg(SADB_X_SPDACQUIRE, 0, 0, 0, 0, 0);
 	if (!m)
 		return ENOBUFS;
 
 	result = m;
 
 	result->m_pkthdr.len = 0;
 	for (m = result; m; m = m->m_next)
 		result->m_pkthdr.len += m->m_len;
 
 	mtod(result, struct sadb_msg *)->sadb_msg_len =
 	    PFKEY_UNIT64(result->m_pkthdr.len);
 
 	return key_sendup_mbuf(NULL, m, KEY_SENDUP_REGISTERED);
 }
 
 /*
  * SADB_SPDFLUSH processing
  * receive
  *   <base>
  * from the user, and free all entries in secpctree.
  * and send,
  *   <base>
  * to the user.
  * NOTE: what to do is only marking SADB_SASTATE_DEAD.
  *
  * m will always be freed.
  */
 static int
 key_spdflush(so, m, mhp)
 	struct socket *so;
 	struct mbuf *m;
 	const struct sadb_msghdr *mhp;
 {
 	struct sadb_msg *newmsg;
 	struct secpolicy *sp;
 	u_int dir;
 
 	IPSEC_ASSERT(so != NULL, ("null socket"));
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
 	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
 
 	if (m->m_len != PFKEY_ALIGN8(sizeof(struct sadb_msg)))
 		return key_senderror(so, m, EINVAL);
 
 	for (dir = 0; dir < IPSEC_DIR_MAX; dir++) {
 		SPTREE_LOCK();
 		LIST_FOREACH(sp, &V_sptree[dir], chain)
 			sp->state = IPSEC_SPSTATE_DEAD;
 		SPTREE_UNLOCK();
 	}
 
 	if (sizeof(struct sadb_msg) > m->m_len + M_TRAILINGSPACE(m)) {
 		ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
 		return key_senderror(so, m, ENOBUFS);
 	}
 
 	if (m->m_next)
 		m_freem(m->m_next);
 	m->m_next = NULL;
 	m->m_pkthdr.len = m->m_len = PFKEY_ALIGN8(sizeof(struct sadb_msg));
 	newmsg = mtod(m, struct sadb_msg *);
 	newmsg->sadb_msg_errno = 0;
 	newmsg->sadb_msg_len = PFKEY_UNIT64(m->m_pkthdr.len);
 
 	return key_sendup_mbuf(so, m, KEY_SENDUP_ALL);
 }
 
 /*
  * SADB_SPDDUMP processing
  * receive
  *   <base>
  * from the user, and dump all SP leaves
  * and send,
  *   <base> .....
  * to the ikmpd.
  *
  * m will always be freed.
  */
 static int
 key_spddump(so, m, mhp)
 	struct socket *so;
 	struct mbuf *m;
 	const struct sadb_msghdr *mhp;
 {
 	struct secpolicy *sp;
 	int cnt;
 	u_int dir;
 	struct mbuf *n;
 
 	IPSEC_ASSERT(so != NULL, ("null socket"));
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
 	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
 
 	/* search SPD entry and get buffer size. */
 	cnt = 0;
 	SPTREE_LOCK();
 	for (dir = 0; dir < IPSEC_DIR_MAX; dir++) {
 		LIST_FOREACH(sp, &V_sptree[dir], chain) {
 			cnt++;
 		}
 	}
 
 	if (cnt == 0) {
 		SPTREE_UNLOCK();
 		return key_senderror(so, m, ENOENT);
 	}
 
 	for (dir = 0; dir < IPSEC_DIR_MAX; dir++) {
 		LIST_FOREACH(sp, &V_sptree[dir], chain) {
 			--cnt;
 			n = key_setdumpsp(sp, SADB_X_SPDDUMP, cnt,
 			    mhp->msg->sadb_msg_pid);
 
 			if (n)
 				key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
 		}
 	}
 
 	SPTREE_UNLOCK();
 	m_freem(m);
 	return 0;
 }
 
 static struct mbuf *
 key_setdumpsp(struct secpolicy *sp, u_int8_t type, u_int32_t seq, u_int32_t pid)
 {
 	struct mbuf *result = NULL, *m;
 	struct seclifetime lt;
 
 	m = key_setsadbmsg(type, 0, SADB_SATYPE_UNSPEC, seq, pid, sp->refcnt);
 	if (!m)
 		goto fail;
 	result = m;
 
 	/*
 	 * Note: do not send SADB_X_EXT_NAT_T_* here:
 	 * we are sending traffic endpoints.
 	 */
 	m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
 	    &sp->spidx.src.sa, sp->spidx.prefs,
 	    sp->spidx.ul_proto);
 	if (!m)
 		goto fail;
 	m_cat(result, m);
 
 	m = key_setsadbaddr(SADB_EXT_ADDRESS_DST,
 	    &sp->spidx.dst.sa, sp->spidx.prefd,
 	    sp->spidx.ul_proto);
 	if (!m)
 		goto fail;
 	m_cat(result, m);
 
 	m = key_sp2msg(sp);
 	if (!m)
 		goto fail;
 	m_cat(result, m);
 
 	if(sp->lifetime){
 		lt.addtime=sp->created;
 		lt.usetime= sp->lastused;
 		m = key_setlifetime(&lt, SADB_EXT_LIFETIME_CURRENT);
 		if (!m)
 			goto fail;
 		m_cat(result, m);
 		
 		lt.addtime=sp->lifetime;
 		lt.usetime= sp->validtime;
 		m = key_setlifetime(&lt, SADB_EXT_LIFETIME_HARD);
 		if (!m)
 			goto fail;
 		m_cat(result, m);
 	}
 
 	if ((result->m_flags & M_PKTHDR) == 0)
 		goto fail;
 
 	if (result->m_len < sizeof(struct sadb_msg)) {
 		result = m_pullup(result, sizeof(struct sadb_msg));
 		if (result == NULL)
 			goto fail;
 	}
 
 	result->m_pkthdr.len = 0;
 	for (m = result; m; m = m->m_next)
 		result->m_pkthdr.len += m->m_len;
 
 	mtod(result, struct sadb_msg *)->sadb_msg_len =
 	    PFKEY_UNIT64(result->m_pkthdr.len);
 
 	return result;
 
 fail:
 	m_freem(result);
 	return NULL;
 }
 
 /*
  * get PFKEY message length for security policy and request.
  */
 static u_int
 key_getspreqmsglen(sp)
 	struct secpolicy *sp;
 {
 	u_int tlen;
 
 	tlen = sizeof(struct sadb_x_policy);
 
 	/* if is the policy for ipsec ? */
 	if (sp->policy != IPSEC_POLICY_IPSEC)
 		return tlen;
 
 	/* get length of ipsec requests */
     {
 	struct ipsecrequest *isr;
 	int len;
 
 	for (isr = sp->req; isr != NULL; isr = isr->next) {
 		len = sizeof(struct sadb_x_ipsecrequest)
 			+ isr->saidx.src.sa.sa_len
 			+ isr->saidx.dst.sa.sa_len;
 
 		tlen += PFKEY_ALIGN8(len);
 	}
     }
 
 	return tlen;
 }
 
 /*
  * SADB_SPDEXPIRE processing
  * send
  *   <base, address(SD), lifetime(CH), policy>
  * to KMD by PF_KEY.
  *
  * OUT:	0	: succeed
  *	others	: error number
  */
 static int
 key_spdexpire(sp)
 	struct secpolicy *sp;
 {
 	struct mbuf *result = NULL, *m;
 	int len;
 	int error = -1;
 	struct sadb_lifetime *lt;
 
 	/* XXX: Why do we lock ? */
 
 	IPSEC_ASSERT(sp != NULL, ("null secpolicy"));
 
 	/* set msg header */
 	m = key_setsadbmsg(SADB_X_SPDEXPIRE, 0, 0, 0, 0, 0);
 	if (!m) {
 		error = ENOBUFS;
 		goto fail;
 	}
 	result = m;
 
 	/* create lifetime extension (current and hard) */
 	len = PFKEY_ALIGN8(sizeof(*lt)) * 2;
 	m = m_get2(len, M_NOWAIT, MT_DATA, 0);
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto fail;
 	}
 	m_align(m, len);
 	m->m_len = len;
 	bzero(mtod(m, caddr_t), len);
 	lt = mtod(m, struct sadb_lifetime *);
 	lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime));
 	lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT;
 	lt->sadb_lifetime_allocations = 0;
 	lt->sadb_lifetime_bytes = 0;
 	lt->sadb_lifetime_addtime = sp->created;
 	lt->sadb_lifetime_usetime = sp->lastused;
 	lt = (struct sadb_lifetime *)(mtod(m, caddr_t) + len / 2);
 	lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime));
 	lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD;
 	lt->sadb_lifetime_allocations = 0;
 	lt->sadb_lifetime_bytes = 0;
 	lt->sadb_lifetime_addtime = sp->lifetime;
 	lt->sadb_lifetime_usetime = sp->validtime;
 	m_cat(result, m);
 
 	/*
 	 * Note: do not send SADB_X_EXT_NAT_T_* here:
 	 * we are sending traffic endpoints.
 	 */
 
 	/* set sadb_address for source */
 	m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
 	    &sp->spidx.src.sa,
 	    sp->spidx.prefs, sp->spidx.ul_proto);
 	if (!m) {
 		error = ENOBUFS;
 		goto fail;
 	}
 	m_cat(result, m);
 
 	/* set sadb_address for destination */
 	m = key_setsadbaddr(SADB_EXT_ADDRESS_DST,
 	    &sp->spidx.dst.sa,
 	    sp->spidx.prefd, sp->spidx.ul_proto);
 	if (!m) {
 		error = ENOBUFS;
 		goto fail;
 	}
 	m_cat(result, m);
 
 	/* set secpolicy */
 	m = key_sp2msg(sp);
 	if (!m) {
 		error = ENOBUFS;
 		goto fail;
 	}
 	m_cat(result, m);
 
 	if ((result->m_flags & M_PKTHDR) == 0) {
 		error = EINVAL;
 		goto fail;
 	}
 
 	if (result->m_len < sizeof(struct sadb_msg)) {
 		result = m_pullup(result, sizeof(struct sadb_msg));
 		if (result == NULL) {
 			error = ENOBUFS;
 			goto fail;
 		}
 	}
 
 	result->m_pkthdr.len = 0;
 	for (m = result; m; m = m->m_next)
 		result->m_pkthdr.len += m->m_len;
 
 	mtod(result, struct sadb_msg *)->sadb_msg_len =
 	    PFKEY_UNIT64(result->m_pkthdr.len);
 
 	return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED);
 
  fail:
 	if (result)
 		m_freem(result);
 	return error;
 }
 
 /* %%% SAD management */
 /*
  * allocating a memory for new SA head, and copy from the values of mhp.
  * OUT:	NULL	: failure due to the lack of memory.
  *	others	: pointer to new SA head.
  */
 static struct secashead *
 key_newsah(saidx)
 	struct secasindex *saidx;
 {
 	struct secashead *newsah;
 
 	IPSEC_ASSERT(saidx != NULL, ("null saidx"));
 
 	newsah = malloc(sizeof(struct secashead), M_IPSEC_SAH, M_NOWAIT|M_ZERO);
 	if (newsah != NULL) {
 		int i;
 		for (i = 0; i < sizeof(newsah->savtree)/sizeof(newsah->savtree[0]); i++)
 			LIST_INIT(&newsah->savtree[i]);
 		newsah->saidx = *saidx;
 
 		/* add to saidxtree */
 		newsah->state = SADB_SASTATE_MATURE;
 
 		SAHTREE_LOCK();
 		LIST_INSERT_HEAD(&V_sahtree, newsah, chain);
 		SAHTREE_UNLOCK();
 	}
 	return(newsah);
 }
 
 /*
  * delete SA index and all SA registerd.
  */
 static void
 key_delsah(sah)
 	struct secashead *sah;
 {
 	struct secasvar *sav, *nextsav;
 	u_int stateidx;
 	int zombie = 0;
 
 	IPSEC_ASSERT(sah != NULL, ("NULL sah"));
 	SAHTREE_LOCK_ASSERT();
 
 	/* searching all SA registerd in the secindex. */
 	for (stateidx = 0;
 	     stateidx < _ARRAYLEN(saorder_state_any);
 	     stateidx++) {
 		u_int state = saorder_state_any[stateidx];
 		LIST_FOREACH_SAFE(sav, &sah->savtree[state], chain, nextsav) {
 			if (sav->refcnt == 0) {
 				/* sanity check */
 				KEY_CHKSASTATE(state, sav->state, __func__);
 				/* 
 				 * do NOT call KEY_FREESAV here:
 				 * it will only delete the sav if refcnt == 1,
 				 * where we already know that refcnt == 0
 				 */
 				key_delsav(sav);
 			} else {
 				/* give up to delete this sa */
 				zombie++;
 			}
 		}
 	}
 	if (!zombie) {		/* delete only if there are savs */
 		/* remove from tree of SA index */
 		if (__LIST_CHAINED(sah))
 			LIST_REMOVE(sah, chain);
 		if (sah->route_cache.sa_route.ro_rt) {
 			RTFREE(sah->route_cache.sa_route.ro_rt);
 			sah->route_cache.sa_route.ro_rt = (struct rtentry *)NULL;
 		}
 		free(sah, M_IPSEC_SAH);
 	}
 }
 
 /*
  * allocating a new SA with LARVAL state.  key_add() and key_getspi() call,
  * and copy the values of mhp into new buffer.
  * When SAD message type is GETSPI:
  *	to set sequence number from acq_seq++,
  *	to set zero to SPI.
  *	not to call key_setsava().
  * OUT:	NULL	: fail
  *	others	: pointer to new secasvar.
  *
  * does not modify mbuf.  does not free mbuf on error.
  */
 static struct secasvar *
 key_newsav(m, mhp, sah, errp, where, tag)
 	struct mbuf *m;
 	const struct sadb_msghdr *mhp;
 	struct secashead *sah;
 	int *errp;
 	const char* where;
 	int tag;
 {
 	struct secasvar *newsav;
 	const struct sadb_sa *xsa;
 
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
 	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
 	IPSEC_ASSERT(sah != NULL, ("null secashead"));
 
 	newsav = malloc(sizeof(struct secasvar), M_IPSEC_SA, M_NOWAIT|M_ZERO);
 	if (newsav == NULL) {
 		ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
 		*errp = ENOBUFS;
 		goto done;
 	}
 
 	switch (mhp->msg->sadb_msg_type) {
 	case SADB_GETSPI:
 		newsav->spi = 0;
 
 #ifdef IPSEC_DOSEQCHECK
 		/* sync sequence number */
 		if (mhp->msg->sadb_msg_seq == 0)
 			newsav->seq =
 				(V_acq_seq = (V_acq_seq == ~0 ? 1 : ++V_acq_seq));
 		else
 #endif
 			newsav->seq = mhp->msg->sadb_msg_seq;
 		break;
 
 	case SADB_ADD:
 		/* sanity check */
 		if (mhp->ext[SADB_EXT_SA] == NULL) {
 			free(newsav, M_IPSEC_SA);
 			newsav = NULL;
 			ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
 				__func__));
 			*errp = EINVAL;
 			goto done;
 		}
 		xsa = (const struct sadb_sa *)mhp->ext[SADB_EXT_SA];
 		newsav->spi = xsa->sadb_sa_spi;
 		newsav->seq = mhp->msg->sadb_msg_seq;
 		break;
 	default:
 		free(newsav, M_IPSEC_SA);
 		newsav = NULL;
 		*errp = EINVAL;
 		goto done;
 	}
 
 
 	/* copy sav values */
 	if (mhp->msg->sadb_msg_type != SADB_GETSPI) {
 		*errp = key_setsaval(newsav, m, mhp);
 		if (*errp) {
 			free(newsav, M_IPSEC_SA);
 			newsav = NULL;
 			goto done;
 		}
 	}
 
 	SECASVAR_LOCK_INIT(newsav);
 
 	/* reset created */
 	newsav->created = time_second;
 	newsav->pid = mhp->msg->sadb_msg_pid;
 
 	/* add to satree */
 	newsav->sah = sah;
 	sa_initref(newsav);
 	newsav->state = SADB_SASTATE_LARVAL;
 
 	SAHTREE_LOCK();
 	LIST_INSERT_TAIL(&sah->savtree[SADB_SASTATE_LARVAL], newsav,
 			secasvar, chain);
 	SAHTREE_UNLOCK();
 done:
 	KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
 		printf("DP %s from %s:%u return SP:%p\n", __func__,
 			where, tag, newsav));
 
 	return newsav;
 }
 
 /*
  * free() SA variable entry.
  */
 static void
 key_cleansav(struct secasvar *sav)
 {
 	/*
 	 * Cleanup xform state.  Note that zeroize'ing causes the
 	 * keys to be cleared; otherwise we must do it ourself.
 	 */
 	if (sav->tdb_xform != NULL) {
 		sav->tdb_xform->xf_zeroize(sav);
 		sav->tdb_xform = NULL;
 	} else {
 		KASSERT(sav->iv == NULL, ("iv but no xform"));
 		if (sav->key_auth != NULL)
 			bzero(sav->key_auth->key_data, _KEYLEN(sav->key_auth));
 		if (sav->key_enc != NULL)
 			bzero(sav->key_enc->key_data, _KEYLEN(sav->key_enc));
 	}
 	if (sav->key_auth != NULL) {
 		if (sav->key_auth->key_data != NULL)
 			free(sav->key_auth->key_data, M_IPSEC_MISC);
 		free(sav->key_auth, M_IPSEC_MISC);
 		sav->key_auth = NULL;
 	}
 	if (sav->key_enc != NULL) {
 		if (sav->key_enc->key_data != NULL)
 			free(sav->key_enc->key_data, M_IPSEC_MISC);
 		free(sav->key_enc, M_IPSEC_MISC);
 		sav->key_enc = NULL;
 	}
 	if (sav->sched) {
 		bzero(sav->sched, sav->schedlen);
 		free(sav->sched, M_IPSEC_MISC);
 		sav->sched = NULL;
 	}
 	if (sav->replay != NULL) {
 		free(sav->replay, M_IPSEC_MISC);
 		sav->replay = NULL;
 	}
 	if (sav->lft_c != NULL) {
 		free(sav->lft_c, M_IPSEC_MISC);
 		sav->lft_c = NULL;
 	}
 	if (sav->lft_h != NULL) {
 		free(sav->lft_h, M_IPSEC_MISC);
 		sav->lft_h = NULL;
 	}
 	if (sav->lft_s != NULL) {
 		free(sav->lft_s, M_IPSEC_MISC);
 		sav->lft_s = NULL;
 	}
 }
 
 /*
  * free() SA variable entry.
  */
 static void
 key_delsav(sav)
 	struct secasvar *sav;
 {
 	IPSEC_ASSERT(sav != NULL, ("null sav"));
 	IPSEC_ASSERT(sav->refcnt == 0, ("reference count %u > 0", sav->refcnt));
 
 	/* remove from SA header */
 	if (__LIST_CHAINED(sav))
 		LIST_REMOVE(sav, chain);
 	key_cleansav(sav);
 	SECASVAR_LOCK_DESTROY(sav);
 	free(sav, M_IPSEC_SA);
 }
 
 /*
  * search SAD.
  * OUT:
  *	NULL	: not found
  *	others	: found, pointer to a SA.
  */
 static struct secashead *
 key_getsah(saidx)
 	struct secasindex *saidx;
 {
 	struct secashead *sah;
 
 	SAHTREE_LOCK();
 	LIST_FOREACH(sah, &V_sahtree, chain) {
 		if (sah->state == SADB_SASTATE_DEAD)
 			continue;
 		if (key_cmpsaidx(&sah->saidx, saidx, CMP_REQID))
 			break;
 	}
 	SAHTREE_UNLOCK();
 
 	return sah;
 }
 
 /*
  * check not to be duplicated SPI.
  * NOTE: this function is too slow due to searching all SAD.
  * OUT:
  *	NULL	: not found
  *	others	: found, pointer to a SA.
  */
 static struct secasvar *
 key_checkspidup(saidx, spi)
 	struct secasindex *saidx;
 	u_int32_t spi;
 {
 	struct secashead *sah;
 	struct secasvar *sav;
 
 	/* check address family */
 	if (saidx->src.sa.sa_family != saidx->dst.sa.sa_family) {
 		ipseclog((LOG_DEBUG, "%s: address family mismatched.\n",
 			__func__));
 		return NULL;
 	}
 
 	sav = NULL;
 	/* check all SAD */
 	SAHTREE_LOCK();
 	LIST_FOREACH(sah, &V_sahtree, chain) {
 		if (!key_ismyaddr((struct sockaddr *)&sah->saidx.dst))
 			continue;
 		sav = key_getsavbyspi(sah, spi);
 		if (sav != NULL)
 			break;
 	}
 	SAHTREE_UNLOCK();
 
 	return sav;
 }
 
 /*
  * search SAD litmited alive SA, protocol, SPI.
  * OUT:
  *	NULL	: not found
  *	others	: found, pointer to a SA.
  */
 static struct secasvar *
 key_getsavbyspi(sah, spi)
 	struct secashead *sah;
 	u_int32_t spi;
 {
 	struct secasvar *sav;
 	u_int stateidx, state;
 
 	sav = NULL;
 	SAHTREE_LOCK_ASSERT();
 	/* search all status */
 	for (stateidx = 0;
 	     stateidx < _ARRAYLEN(saorder_state_alive);
 	     stateidx++) {
 
 		state = saorder_state_alive[stateidx];
 		LIST_FOREACH(sav, &sah->savtree[state], chain) {
 
 			/* sanity check */
 			if (sav->state != state) {
 				ipseclog((LOG_DEBUG, "%s: "
 				    "invalid sav->state (queue: %d SA: %d)\n",
 				    __func__, state, sav->state));
 				continue;
 			}
 
 			if (sav->spi == spi)
 				return sav;
 		}
 	}
 
 	return NULL;
 }
 
 /*
  * copy SA values from PF_KEY message except *SPI, SEQ, PID, STATE and TYPE*.
  * You must update these if need.
  * OUT:	0:	success.
  *	!0:	failure.
  *
  * does not modify mbuf.  does not free mbuf on error.
  */
 static int
 key_setsaval(sav, m, mhp)
 	struct secasvar *sav;
 	struct mbuf *m;
 	const struct sadb_msghdr *mhp;
 {
 	int error = 0;
 
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
 	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
 
 	/* initialization */
 	sav->replay = NULL;
 	sav->key_auth = NULL;
 	sav->key_enc = NULL;
 	sav->sched = NULL;
 	sav->schedlen = 0;
 	sav->iv = NULL;
 	sav->lft_c = NULL;
 	sav->lft_h = NULL;
 	sav->lft_s = NULL;
 	sav->tdb_xform = NULL;		/* transform */
 	sav->tdb_encalgxform = NULL;	/* encoding algorithm */
 	sav->tdb_authalgxform = NULL;	/* authentication algorithm */
 	sav->tdb_compalgxform = NULL;	/* compression algorithm */
 	/*  Initialize even if NAT-T not compiled in: */
 	sav->natt_type = 0;
 	sav->natt_esp_frag_len = 0;
 
 	/* SA */
 	if (mhp->ext[SADB_EXT_SA] != NULL) {
 		const struct sadb_sa *sa0;
 
 		sa0 = (const struct sadb_sa *)mhp->ext[SADB_EXT_SA];
 		if (mhp->extlen[SADB_EXT_SA] < sizeof(*sa0)) {
 			error = EINVAL;
 			goto fail;
 		}
 
 		sav->alg_auth = sa0->sadb_sa_auth;
 		sav->alg_enc = sa0->sadb_sa_encrypt;
 		sav->flags = sa0->sadb_sa_flags;
 
 		/* replay window */
 		if ((sa0->sadb_sa_flags & SADB_X_EXT_OLD) == 0) {
 			sav->replay = (struct secreplay *)
 				malloc(sizeof(struct secreplay)+sa0->sadb_sa_replay, M_IPSEC_MISC, M_NOWAIT|M_ZERO);
 			if (sav->replay == NULL) {
 				ipseclog((LOG_DEBUG, "%s: No more memory.\n",
 					__func__));
 				error = ENOBUFS;
 				goto fail;
 			}
 			if (sa0->sadb_sa_replay != 0)
 				sav->replay->bitmap = (caddr_t)(sav->replay+1);
 			sav->replay->wsize = sa0->sadb_sa_replay;
 		}
 	}
 
 	/* Authentication keys */
 	if (mhp->ext[SADB_EXT_KEY_AUTH] != NULL) {
 		const struct sadb_key *key0;
 		int len;
 
 		key0 = (const struct sadb_key *)mhp->ext[SADB_EXT_KEY_AUTH];
 		len = mhp->extlen[SADB_EXT_KEY_AUTH];
 
 		error = 0;
 		if (len < sizeof(*key0)) {
 			error = EINVAL;
 			goto fail;
 		}
 		switch (mhp->msg->sadb_msg_satype) {
 		case SADB_SATYPE_AH:
 		case SADB_SATYPE_ESP:
 		case SADB_X_SATYPE_TCPSIGNATURE:
 			if (len == PFKEY_ALIGN8(sizeof(struct sadb_key)) &&
 			    sav->alg_auth != SADB_X_AALG_NULL)
 				error = EINVAL;
 			break;
 		case SADB_X_SATYPE_IPCOMP:
 		default:
 			error = EINVAL;
 			break;
 		}
 		if (error) {
 			ipseclog((LOG_DEBUG, "%s: invalid key_auth values.\n",
 				__func__));
 			goto fail;
 		}
 
 		sav->key_auth = (struct seckey *)key_dup_keymsg(key0, len,
 								M_IPSEC_MISC);
 		if (sav->key_auth == NULL ) {
 			ipseclog((LOG_DEBUG, "%s: No more memory.\n",
 				  __func__));
 			error = ENOBUFS;
 			goto fail;
 		}
 	}
 
 	/* Encryption key */
 	if (mhp->ext[SADB_EXT_KEY_ENCRYPT] != NULL) {
 		const struct sadb_key *key0;
 		int len;
 
 		key0 = (const struct sadb_key *)mhp->ext[SADB_EXT_KEY_ENCRYPT];
 		len = mhp->extlen[SADB_EXT_KEY_ENCRYPT];
 
 		error = 0;
 		if (len < sizeof(*key0)) {
 			error = EINVAL;
 			goto fail;
 		}
 		switch (mhp->msg->sadb_msg_satype) {
 		case SADB_SATYPE_ESP:
 			if (len == PFKEY_ALIGN8(sizeof(struct sadb_key)) &&
 			    sav->alg_enc != SADB_EALG_NULL) {
 				error = EINVAL;
 				break;
 			}
 			sav->key_enc = (struct seckey *)key_dup_keymsg(key0,
 								       len,
 								       M_IPSEC_MISC);
 			if (sav->key_enc == NULL) {
 				ipseclog((LOG_DEBUG, "%s: No more memory.\n",
 					__func__));
 				error = ENOBUFS;
 				goto fail;
 			}
 			break;
 		case SADB_X_SATYPE_IPCOMP:
 			if (len != PFKEY_ALIGN8(sizeof(struct sadb_key)))
 				error = EINVAL;
 			sav->key_enc = NULL;	/*just in case*/
 			break;
 		case SADB_SATYPE_AH:
 		case SADB_X_SATYPE_TCPSIGNATURE:
 		default:
 			error = EINVAL;
 			break;
 		}
 		if (error) {
 			ipseclog((LOG_DEBUG, "%s: invalid key_enc value.\n",
 				__func__));
 			goto fail;
 		}
 	}
 
 	/* set iv */
 	sav->ivlen = 0;
 
 	switch (mhp->msg->sadb_msg_satype) {
 	case SADB_SATYPE_AH:
 		error = xform_init(sav, XF_AH);
 		break;
 	case SADB_SATYPE_ESP:
 		error = xform_init(sav, XF_ESP);
 		break;
 	case SADB_X_SATYPE_IPCOMP:
 		error = xform_init(sav, XF_IPCOMP);
 		break;
 	case SADB_X_SATYPE_TCPSIGNATURE:
 		error = xform_init(sav, XF_TCPSIGNATURE);
 		break;
 	}
 	if (error) {
 		ipseclog((LOG_DEBUG, "%s: unable to initialize SA type %u.\n",
 		        __func__, mhp->msg->sadb_msg_satype));
 		goto fail;
 	}
 
 	/* reset created */
 	sav->created = time_second;
 
 	/* make lifetime for CURRENT */
 	sav->lft_c = malloc(sizeof(struct seclifetime), M_IPSEC_MISC, M_NOWAIT);
 	if (sav->lft_c == NULL) {
 		ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
 		error = ENOBUFS;
 		goto fail;
 	}
 
 	sav->lft_c->allocations = 0;
 	sav->lft_c->bytes = 0;
 	sav->lft_c->addtime = time_second;
 	sav->lft_c->usetime = 0;
 
 	/* lifetimes for HARD and SOFT */
     {
 	const struct sadb_lifetime *lft0;
 
 	lft0 = (struct sadb_lifetime *)mhp->ext[SADB_EXT_LIFETIME_HARD];
 	if (lft0 != NULL) {
 		if (mhp->extlen[SADB_EXT_LIFETIME_HARD] < sizeof(*lft0)) {
 			error = EINVAL;
 			goto fail;
 		}
 		sav->lft_h = key_dup_lifemsg(lft0, M_IPSEC_MISC);
 		if (sav->lft_h == NULL) {
 			ipseclog((LOG_DEBUG, "%s: No more memory.\n",__func__));
 			error = ENOBUFS;
 			goto fail;
 		}
 		/* to be initialize ? */
 	}
 
 	lft0 = (struct sadb_lifetime *)mhp->ext[SADB_EXT_LIFETIME_SOFT];
 	if (lft0 != NULL) {
 		if (mhp->extlen[SADB_EXT_LIFETIME_SOFT] < sizeof(*lft0)) {
 			error = EINVAL;
 			goto fail;
 		}
 		sav->lft_s = key_dup_lifemsg(lft0, M_IPSEC_MISC);
 		if (sav->lft_s == NULL) {
 			ipseclog((LOG_DEBUG, "%s: No more memory.\n",__func__));
 			error = ENOBUFS;
 			goto fail;
 		}
 		/* to be initialize ? */
 	}
     }
 
 	return 0;
 
  fail:
 	/* initialization */
 	key_cleansav(sav);
 
 	return error;
 }
 
 /*
  * validation with a secasvar entry, and set SADB_SATYPE_MATURE.
  * OUT:	0:	valid
  *	other:	errno
  */
 static int
 key_mature(struct secasvar *sav)
 {
 	int error;
 
 	/* check SPI value */
 	switch (sav->sah->saidx.proto) {
 	case IPPROTO_ESP:
 	case IPPROTO_AH:
 		/*
 		 * RFC 4302, 2.4. Security Parameters Index (SPI), SPI values
 		 * 1-255 reserved by IANA for future use,
 		 * 0 for implementation specific, local use.
 		 */
 		if (ntohl(sav->spi) <= 255) {
 			ipseclog((LOG_DEBUG, "%s: illegal range of SPI %u.\n",
 			    __func__, (u_int32_t)ntohl(sav->spi)));
 			return EINVAL;
 		}
 		break;
 	}
 
 	/* check satype */
 	switch (sav->sah->saidx.proto) {
 	case IPPROTO_ESP:
 		/* check flags */
 		if ((sav->flags & (SADB_X_EXT_OLD|SADB_X_EXT_DERIV)) ==
 		    (SADB_X_EXT_OLD|SADB_X_EXT_DERIV)) {
 			ipseclog((LOG_DEBUG, "%s: invalid flag (derived) "
 				"given to old-esp.\n", __func__));
 			return EINVAL;
 		}
 		error = xform_init(sav, XF_ESP);
 		break;
 	case IPPROTO_AH:
 		/* check flags */
 		if (sav->flags & SADB_X_EXT_DERIV) {
 			ipseclog((LOG_DEBUG, "%s: invalid flag (derived) "
 				"given to AH SA.\n", __func__));
 			return EINVAL;
 		}
 		if (sav->alg_enc != SADB_EALG_NONE) {
 			ipseclog((LOG_DEBUG, "%s: protocol and algorithm "
 				"mismated.\n", __func__));
 			return(EINVAL);
 		}
 		error = xform_init(sav, XF_AH);
 		break;
 	case IPPROTO_IPCOMP:
 		if (sav->alg_auth != SADB_AALG_NONE) {
 			ipseclog((LOG_DEBUG, "%s: protocol and algorithm "
 				"mismated.\n", __func__));
 			return(EINVAL);
 		}
 		if ((sav->flags & SADB_X_EXT_RAWCPI) == 0
 		 && ntohl(sav->spi) >= 0x10000) {
 			ipseclog((LOG_DEBUG, "%s: invalid cpi for IPComp.\n",
 				__func__));
 			return(EINVAL);
 		}
 		error = xform_init(sav, XF_IPCOMP);
 		break;
 	case IPPROTO_TCP:
 		if (sav->alg_enc != SADB_EALG_NONE) {
 			ipseclog((LOG_DEBUG, "%s: protocol and algorithm "
 				"mismated.\n", __func__));
 			return(EINVAL);
 		}
 		error = xform_init(sav, XF_TCPSIGNATURE);
 		break;
 	default:
 		ipseclog((LOG_DEBUG, "%s: Invalid satype.\n", __func__));
 		error = EPROTONOSUPPORT;
 		break;
 	}
 	if (error == 0) {
 		SAHTREE_LOCK();
 		key_sa_chgstate(sav, SADB_SASTATE_MATURE);
 		SAHTREE_UNLOCK();
 	}
 	return (error);
 }
 
 /*
  * subroutine for SADB_GET and SADB_DUMP.
  */
 static struct mbuf *
 key_setdumpsa(struct secasvar *sav, u_int8_t type, u_int8_t satype,
     u_int32_t seq, u_int32_t pid)
 {
 	struct mbuf *result = NULL, *tres = NULL, *m;
 	int i;
 	int dumporder[] = {
 		SADB_EXT_SA, SADB_X_EXT_SA2,
 		SADB_EXT_LIFETIME_HARD, SADB_EXT_LIFETIME_SOFT,
 		SADB_EXT_LIFETIME_CURRENT, SADB_EXT_ADDRESS_SRC,
 		SADB_EXT_ADDRESS_DST, SADB_EXT_ADDRESS_PROXY, SADB_EXT_KEY_AUTH,
 		SADB_EXT_KEY_ENCRYPT, SADB_EXT_IDENTITY_SRC,
 		SADB_EXT_IDENTITY_DST, SADB_EXT_SENSITIVITY,
 #ifdef IPSEC_NAT_T
 		SADB_X_EXT_NAT_T_TYPE,
 		SADB_X_EXT_NAT_T_SPORT, SADB_X_EXT_NAT_T_DPORT,
 		SADB_X_EXT_NAT_T_OAI, SADB_X_EXT_NAT_T_OAR,
 		SADB_X_EXT_NAT_T_FRAG,
 #endif
 	};
 
 	m = key_setsadbmsg(type, 0, satype, seq, pid, sav->refcnt);
 	if (m == NULL)
 		goto fail;
 	result = m;
 
 	for (i = sizeof(dumporder)/sizeof(dumporder[0]) - 1; i >= 0; i--) {
 		m = NULL;
 		switch (dumporder[i]) {
 		case SADB_EXT_SA:
 			m = key_setsadbsa(sav);
 			if (!m)
 				goto fail;
 			break;
 
 		case SADB_X_EXT_SA2:
 			m = key_setsadbxsa2(sav->sah->saidx.mode,
 					sav->replay ? sav->replay->count : 0,
 					sav->sah->saidx.reqid);
 			if (!m)
 				goto fail;
 			break;
 
 		case SADB_EXT_ADDRESS_SRC:
 			m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
 			    &sav->sah->saidx.src.sa,
 			    FULLMASK, IPSEC_ULPROTO_ANY);
 			if (!m)
 				goto fail;
 			break;
 
 		case SADB_EXT_ADDRESS_DST:
 			m = key_setsadbaddr(SADB_EXT_ADDRESS_DST,
 			    &sav->sah->saidx.dst.sa,
 			    FULLMASK, IPSEC_ULPROTO_ANY);
 			if (!m)
 				goto fail;
 			break;
 
 		case SADB_EXT_KEY_AUTH:
 			if (!sav->key_auth)
 				continue;
 			m = key_setkey(sav->key_auth, SADB_EXT_KEY_AUTH);
 			if (!m)
 				goto fail;
 			break;
 
 		case SADB_EXT_KEY_ENCRYPT:
 			if (!sav->key_enc)
 				continue;
 			m = key_setkey(sav->key_enc, SADB_EXT_KEY_ENCRYPT);
 			if (!m)
 				goto fail;
 			break;
 
 		case SADB_EXT_LIFETIME_CURRENT:
 			if (!sav->lft_c)
 				continue;
 			m = key_setlifetime(sav->lft_c, 
 					    SADB_EXT_LIFETIME_CURRENT);
 			if (!m)
 				goto fail;
 			break;
 
 		case SADB_EXT_LIFETIME_HARD:
 			if (!sav->lft_h)
 				continue;
 			m = key_setlifetime(sav->lft_h, 
 					    SADB_EXT_LIFETIME_HARD);
 			if (!m)
 				goto fail;
 			break;
 
 		case SADB_EXT_LIFETIME_SOFT:
 			if (!sav->lft_s)
 				continue;
 			m = key_setlifetime(sav->lft_s, 
 					    SADB_EXT_LIFETIME_SOFT);
 
 			if (!m)
 				goto fail;
 			break;
 
 #ifdef IPSEC_NAT_T
 		case SADB_X_EXT_NAT_T_TYPE:
 			m = key_setsadbxtype(sav->natt_type);
 			if (!m)
 				goto fail;
 			break;
 		
 		case SADB_X_EXT_NAT_T_DPORT:
 			m = key_setsadbxport(
 			    KEY_PORTFROMSADDR(&sav->sah->saidx.dst),
 			    SADB_X_EXT_NAT_T_DPORT);
 			if (!m)
 				goto fail;
 			break;
 
 		case SADB_X_EXT_NAT_T_SPORT:
 			m = key_setsadbxport(
 			    KEY_PORTFROMSADDR(&sav->sah->saidx.src),
 			    SADB_X_EXT_NAT_T_SPORT);
 			if (!m)
 				goto fail;
 			break;
 
 		case SADB_X_EXT_NAT_T_OAI:
 		case SADB_X_EXT_NAT_T_OAR:
 		case SADB_X_EXT_NAT_T_FRAG:
 			/* We do not (yet) support those. */
 			continue;
 #endif
 
 		case SADB_EXT_ADDRESS_PROXY:
 		case SADB_EXT_IDENTITY_SRC:
 		case SADB_EXT_IDENTITY_DST:
 			/* XXX: should we brought from SPD ? */
 		case SADB_EXT_SENSITIVITY:
 		default:
 			continue;
 		}
 
 		if (!m)
 			goto fail;
 		if (tres)
 			m_cat(m, tres);
 		tres = m;
 		  
 	}
 
 	m_cat(result, tres);
 	if (result->m_len < sizeof(struct sadb_msg)) {
 		result = m_pullup(result, sizeof(struct sadb_msg));
 		if (result == NULL)
 			goto fail;
 	}
 
 	result->m_pkthdr.len = 0;
 	for (m = result; m; m = m->m_next)
 		result->m_pkthdr.len += m->m_len;
 
 	mtod(result, struct sadb_msg *)->sadb_msg_len =
 	    PFKEY_UNIT64(result->m_pkthdr.len);
 
 	return result;
 
 fail:
 	m_freem(result);
 	m_freem(tres);
 	return NULL;
 }
 
 /*
  * set data into sadb_msg.
  */
 static struct mbuf *
 key_setsadbmsg(u_int8_t type, u_int16_t tlen, u_int8_t satype, u_int32_t seq,
     pid_t pid, u_int16_t reserved)
 {
 	struct mbuf *m;
 	struct sadb_msg *p;
 	int len;
 
 	len = PFKEY_ALIGN8(sizeof(struct sadb_msg));
 	if (len > MCLBYTES)
 		return NULL;
 	MGETHDR(m, M_NOWAIT, MT_DATA);
 	if (m && len > MHLEN) {
 		MCLGET(m, M_NOWAIT);
 		if ((m->m_flags & M_EXT) == 0) {
 			m_freem(m);
 			m = NULL;
 		}
 	}
 	if (!m)
 		return NULL;
 	m->m_pkthdr.len = m->m_len = len;
 	m->m_next = NULL;
 
 	p = mtod(m, struct sadb_msg *);
 
 	bzero(p, len);
 	p->sadb_msg_version = PF_KEY_V2;
 	p->sadb_msg_type = type;
 	p->sadb_msg_errno = 0;
 	p->sadb_msg_satype = satype;
 	p->sadb_msg_len = PFKEY_UNIT64(tlen);
 	p->sadb_msg_reserved = reserved;
 	p->sadb_msg_seq = seq;
 	p->sadb_msg_pid = (u_int32_t)pid;
 
 	return m;
 }
 
 /*
  * copy secasvar data into sadb_address.
  */
 static struct mbuf *
 key_setsadbsa(sav)
 	struct secasvar *sav;
 {
 	struct mbuf *m;
 	struct sadb_sa *p;
 	int len;
 
 	len = PFKEY_ALIGN8(sizeof(struct sadb_sa));
 	m = m_get2(len, M_NOWAIT, MT_DATA, 0);
 	if (m == NULL)
 		return (NULL);
 	m_align(m, len);
 	m->m_len = len;
 	p = mtod(m, struct sadb_sa *);
 	bzero(p, len);
 	p->sadb_sa_len = PFKEY_UNIT64(len);
 	p->sadb_sa_exttype = SADB_EXT_SA;
 	p->sadb_sa_spi = sav->spi;
 	p->sadb_sa_replay = (sav->replay != NULL ? sav->replay->wsize : 0);
 	p->sadb_sa_state = sav->state;
 	p->sadb_sa_auth = sav->alg_auth;
 	p->sadb_sa_encrypt = sav->alg_enc;
 	p->sadb_sa_flags = sav->flags;
 
 	return m;
 }
 
 /*
  * set data into sadb_address.
  */
 static struct mbuf *
 key_setsadbaddr(u_int16_t exttype, const struct sockaddr *saddr, u_int8_t prefixlen, u_int16_t ul_proto)
 {
 	struct mbuf *m;
 	struct sadb_address *p;
 	size_t len;
 
 	len = PFKEY_ALIGN8(sizeof(struct sadb_address)) +
 	    PFKEY_ALIGN8(saddr->sa_len);
 	m = m_get2(len, M_NOWAIT, MT_DATA, 0);
 	if (m == NULL)
 		return (NULL);
 	m_align(m, len);
 	m->m_len = len;
 	p = mtod(m, struct sadb_address *);
 
 	bzero(p, len);
 	p->sadb_address_len = PFKEY_UNIT64(len);
 	p->sadb_address_exttype = exttype;
 	p->sadb_address_proto = ul_proto;
 	if (prefixlen == FULLMASK) {
 		switch (saddr->sa_family) {
 		case AF_INET:
 			prefixlen = sizeof(struct in_addr) << 3;
 			break;
 		case AF_INET6:
 			prefixlen = sizeof(struct in6_addr) << 3;
 			break;
 		default:
 			; /*XXX*/
 		}
 	}
 	p->sadb_address_prefixlen = prefixlen;
 	p->sadb_address_reserved = 0;
 
 	bcopy(saddr,
 	    mtod(m, caddr_t) + PFKEY_ALIGN8(sizeof(struct sadb_address)),
 	    saddr->sa_len);
 
 	return m;
 }
 
 /*
  * set data into sadb_x_sa2.
  */
 static struct mbuf *
 key_setsadbxsa2(u_int8_t mode, u_int32_t seq, u_int32_t reqid)
 {
 	struct mbuf *m;
 	struct sadb_x_sa2 *p;
 	size_t len;
 
 	len = PFKEY_ALIGN8(sizeof(struct sadb_x_sa2));
 	m = m_get2(len, M_NOWAIT, MT_DATA, 0);
 	if (m == NULL)
 		return (NULL);
 	m_align(m, len);
 	m->m_len = len;
 	p = mtod(m, struct sadb_x_sa2 *);
 
 	bzero(p, len);
 	p->sadb_x_sa2_len = PFKEY_UNIT64(len);
 	p->sadb_x_sa2_exttype = SADB_X_EXT_SA2;
 	p->sadb_x_sa2_mode = mode;
 	p->sadb_x_sa2_reserved1 = 0;
 	p->sadb_x_sa2_reserved2 = 0;
 	p->sadb_x_sa2_sequence = seq;
 	p->sadb_x_sa2_reqid = reqid;
 
 	return m;
 }
 
 #ifdef IPSEC_NAT_T
 /*
  * Set a type in sadb_x_nat_t_type.
  */
 static struct mbuf *
 key_setsadbxtype(u_int16_t type)
 {
 	struct mbuf *m;
 	size_t len;
 	struct sadb_x_nat_t_type *p;
 
 	len = PFKEY_ALIGN8(sizeof(struct sadb_x_nat_t_type));
 
 	m = m_get2(len, M_NOWAIT, MT_DATA, 0);
 	if (m == NULL)
 		return (NULL);
 	m_align(m, len);
 	m->m_len = len;
 	p = mtod(m, struct sadb_x_nat_t_type *);
 
 	bzero(p, len);
 	p->sadb_x_nat_t_type_len = PFKEY_UNIT64(len);
 	p->sadb_x_nat_t_type_exttype = SADB_X_EXT_NAT_T_TYPE;
 	p->sadb_x_nat_t_type_type = type;
 
 	return (m);
 }
 /*
  * Set a port in sadb_x_nat_t_port.
  * In contrast to default RFC 2367 behaviour, port is in network byte order.
  */
 static struct mbuf *
 key_setsadbxport(u_int16_t port, u_int16_t type)
 {
 	struct mbuf *m;
 	size_t len;
 	struct sadb_x_nat_t_port *p;
 
 	len = PFKEY_ALIGN8(sizeof(struct sadb_x_nat_t_port));
 
 	m = m_get2(len, M_NOWAIT, MT_DATA, 0);
 	if (m == NULL)
 		return (NULL);
 	m_align(m, len);
 	m->m_len = len;
 	p = mtod(m, struct sadb_x_nat_t_port *);
 
 	bzero(p, len);
 	p->sadb_x_nat_t_port_len = PFKEY_UNIT64(len);
 	p->sadb_x_nat_t_port_exttype = type;
 	p->sadb_x_nat_t_port_port = port;
 
 	return (m);
 }
 
 /* 
  * Get port from sockaddr. Port is in network byte order.
  */
 u_int16_t 
 key_portfromsaddr(struct sockaddr *sa)
 {
 
 	switch (sa->sa_family) {
 #ifdef INET
 	case AF_INET:
 		return ((struct sockaddr_in *)sa)->sin_port;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		return ((struct sockaddr_in6 *)sa)->sin6_port;
 #endif
 	}
 	KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
 		printf("DP %s unexpected address family %d\n",
 			__func__, sa->sa_family));
 	return (0);
 }
 #endif /* IPSEC_NAT_T */
 
 /*
  * Set port in struct sockaddr. Port is in network byte order.
  */
 static void
 key_porttosaddr(struct sockaddr *sa, u_int16_t port)
 {
 
 	switch (sa->sa_family) {
 #ifdef INET
 	case AF_INET:
 		((struct sockaddr_in *)sa)->sin_port = port;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		((struct sockaddr_in6 *)sa)->sin6_port = port;
 		break;
 #endif
 	default:
 		ipseclog((LOG_DEBUG, "%s: unexpected address family %d.\n",
 			__func__, sa->sa_family));
 		break;
 	}
 }
 
 /*
  * set data into sadb_x_policy
  */
 static struct mbuf *
 key_setsadbxpolicy(u_int16_t type, u_int8_t dir, u_int32_t id)
 {
 	struct mbuf *m;
 	struct sadb_x_policy *p;
 	size_t len;
 
 	len = PFKEY_ALIGN8(sizeof(struct sadb_x_policy));
 	m = m_get2(len, M_NOWAIT, MT_DATA, 0);
 	if (m == NULL)
 		return (NULL);
 	m_align(m, len);
 	m->m_len = len;
 	p = mtod(m, struct sadb_x_policy *);
 
 	bzero(p, len);
 	p->sadb_x_policy_len = PFKEY_UNIT64(len);
 	p->sadb_x_policy_exttype = SADB_X_EXT_POLICY;
 	p->sadb_x_policy_type = type;
 	p->sadb_x_policy_dir = dir;
 	p->sadb_x_policy_id = id;
 
 	return m;
 }
 
 /* %%% utilities */
 /* Take a key message (sadb_key) from the socket and turn it into one
  * of the kernel's key structures (seckey).
  *
  * IN: pointer to the src
  * OUT: NULL no more memory
  */
 struct seckey *
 key_dup_keymsg(const struct sadb_key *src, u_int len,
 	       struct malloc_type *type)
 {
 	struct seckey *dst;
 	dst = (struct seckey *)malloc(sizeof(struct seckey), type, M_NOWAIT);
 	if (dst != NULL) {
 		dst->bits = src->sadb_key_bits;
 		dst->key_data = (char *)malloc(len, type, M_NOWAIT);
 		if (dst->key_data != NULL) {
 			bcopy((const char *)src + sizeof(struct sadb_key), 
 			      dst->key_data, len);
 		} else {
 			ipseclog((LOG_DEBUG, "%s: No more memory.\n", 
 				  __func__));
 			free(dst, type);
 			dst = NULL;
 		}
 	} else {
 		ipseclog((LOG_DEBUG, "%s: No more memory.\n", 
 			  __func__));
 
 	}
 	return dst;
 }
 
 /* Take a lifetime message (sadb_lifetime) passed in on a socket and
  * turn it into one of the kernel's lifetime structures (seclifetime).
  *
  * IN: pointer to the destination, source and malloc type
  * OUT: NULL, no more memory
  */
 
 static struct seclifetime *
 key_dup_lifemsg(const struct sadb_lifetime *src,
 		 struct malloc_type *type)
 {
 	struct seclifetime *dst = NULL;
 
 	dst = (struct seclifetime *)malloc(sizeof(struct seclifetime), 
 					   type, M_NOWAIT);
 	if (dst == NULL) {
 		/* XXX counter */
 		ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
 	} else {
 		dst->allocations = src->sadb_lifetime_allocations;
 		dst->bytes = src->sadb_lifetime_bytes;
 		dst->addtime = src->sadb_lifetime_addtime;
 		dst->usetime = src->sadb_lifetime_usetime;
 	}
 	return dst;
 }
 
 /* compare my own address
  * OUT:	1: true, i.e. my address.
  *	0: false
  */
 int
 key_ismyaddr(struct sockaddr *sa)
 {
 
 	IPSEC_ASSERT(sa != NULL, ("null sockaddr"));
 	switch (sa->sa_family) {
 #ifdef INET
 	case AF_INET:
 		return (in_localip(satosin(sa)->sin_addr));
 #endif
 #ifdef INET6
 	case AF_INET6:
 		return key_ismyaddr6((struct sockaddr_in6 *)sa);
 #endif
 	}
 
 	return 0;
 }
 
 #ifdef INET6
 /*
  * compare my own address for IPv6.
  * 1: ours
  * 0: other
  * NOTE: derived ip6_input() in KAME. This is necessary to modify more.
  */
 #include <netinet6/in6_var.h>
 
 static int
 key_ismyaddr6(sin6)
 	struct sockaddr_in6 *sin6;
 {
 	struct in6_ifaddr *ia;
 #if 0
 	struct in6_multi *in6m;
 #endif
 
 	IN6_IFADDR_RLOCK();
 	TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
 		if (key_sockaddrcmp((struct sockaddr *)&sin6,
 		    (struct sockaddr *)&ia->ia_addr, 0) == 0) {
 			IN6_IFADDR_RUNLOCK();
 			return 1;
 		}
 
 #if 0
 		/*
 		 * XXX Multicast
 		 * XXX why do we care about multlicast here while we don't care
 		 * about IPv4 multicast??
 		 * XXX scope
 		 */
 		in6m = NULL;
 		IN6_LOOKUP_MULTI(sin6->sin6_addr, ia->ia_ifp, in6m);
 		if (in6m) {
 			IN6_IFADDR_RUNLOCK();
 			return 1;
 		}
 #endif
 	}
 	IN6_IFADDR_RUNLOCK();
 
 	/* loopback, just for safety */
 	if (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr))
 		return 1;
 
 	return 0;
 }
 #endif /*INET6*/
 
 /*
  * compare two secasindex structure.
  * flag can specify to compare 2 saidxes.
  * compare two secasindex structure without both mode and reqid.
  * don't compare port.
  * IN:  
  *      saidx0: source, it can be in SAD.
  *      saidx1: object.
  * OUT: 
  *      1 : equal
  *      0 : not equal
  */
 static int
 key_cmpsaidx(
 	const struct secasindex *saidx0,
 	const struct secasindex *saidx1,
 	int flag)
 {
 	int chkport = 0;
 
 	/* sanity */
 	if (saidx0 == NULL && saidx1 == NULL)
 		return 1;
 
 	if (saidx0 == NULL || saidx1 == NULL)
 		return 0;
 
 	if (saidx0->proto != saidx1->proto)
 		return 0;
 
 	if (flag == CMP_EXACTLY) {
 		if (saidx0->mode != saidx1->mode)
 			return 0;
 		if (saidx0->reqid != saidx1->reqid)
 			return 0;
 		if (bcmp(&saidx0->src, &saidx1->src, saidx0->src.sa.sa_len) != 0 ||
 		    bcmp(&saidx0->dst, &saidx1->dst, saidx0->dst.sa.sa_len) != 0)
 			return 0;
 	} else {
 
 		/* CMP_MODE_REQID, CMP_REQID, CMP_HEAD */
 		if (flag == CMP_MODE_REQID
 		  ||flag == CMP_REQID) {
 			/*
 			 * If reqid of SPD is non-zero, unique SA is required.
 			 * The result must be of same reqid in this case.
 			 */
 			if (saidx1->reqid != 0 && saidx0->reqid != saidx1->reqid)
 				return 0;
 		}
 
 		if (flag == CMP_MODE_REQID) {
 			if (saidx0->mode != IPSEC_MODE_ANY
 			 && saidx0->mode != saidx1->mode)
 				return 0;
 		}
 
 #ifdef IPSEC_NAT_T
 		/*
 		 * If NAT-T is enabled, check ports for tunnel mode.
 		 * Do not check ports if they are set to zero in the SPD.
 		 * Also do not do it for native transport mode, as there
 		 * is no port information available in the SP.
 		 */
 		if ((saidx1->mode == IPSEC_MODE_TUNNEL ||
 		     (saidx1->mode == IPSEC_MODE_TRANSPORT &&
 		      saidx1->proto == IPPROTO_ESP)) &&
 		    saidx1->src.sa.sa_family == AF_INET &&
 		    saidx1->dst.sa.sa_family == AF_INET &&
 		    ((const struct sockaddr_in *)(&saidx1->src))->sin_port &&
 		    ((const struct sockaddr_in *)(&saidx1->dst))->sin_port)
 			chkport = 1;
 #endif /* IPSEC_NAT_T */
 
 		if (key_sockaddrcmp(&saidx0->src.sa, &saidx1->src.sa, chkport) != 0) {
 			return 0;
 		}
 		if (key_sockaddrcmp(&saidx0->dst.sa, &saidx1->dst.sa, chkport) != 0) {
 			return 0;
 		}
 	}
 
 	return 1;
 }
 
 /*
  * compare two secindex structure exactly.
  * IN:
  *	spidx0: source, it is often in SPD.
  *	spidx1: object, it is often from PFKEY message.
  * OUT:
  *	1 : equal
  *	0 : not equal
  */
 static int
 key_cmpspidx_exactly(
 	struct secpolicyindex *spidx0,
 	struct secpolicyindex *spidx1)
 {
 	/* sanity */
 	if (spidx0 == NULL && spidx1 == NULL)
 		return 1;
 
 	if (spidx0 == NULL || spidx1 == NULL)
 		return 0;
 
 	if (spidx0->prefs != spidx1->prefs
 	 || spidx0->prefd != spidx1->prefd
 	 || spidx0->ul_proto != spidx1->ul_proto)
 		return 0;
 
 	return key_sockaddrcmp(&spidx0->src.sa, &spidx1->src.sa, 1) == 0 &&
 	       key_sockaddrcmp(&spidx0->dst.sa, &spidx1->dst.sa, 1) == 0;
 }
 
 /*
  * compare two secindex structure with mask.
  * IN:
  *	spidx0: source, it is often in SPD.
  *	spidx1: object, it is often from IP header.
  * OUT:
  *	1 : equal
  *	0 : not equal
  */
 static int
 key_cmpspidx_withmask(
 	struct secpolicyindex *spidx0,
 	struct secpolicyindex *spidx1)
 {
 	/* sanity */
 	if (spidx0 == NULL && spidx1 == NULL)
 		return 1;
 
 	if (spidx0 == NULL || spidx1 == NULL)
 		return 0;
 
 	if (spidx0->src.sa.sa_family != spidx1->src.sa.sa_family ||
 	    spidx0->dst.sa.sa_family != spidx1->dst.sa.sa_family ||
 	    spidx0->src.sa.sa_len != spidx1->src.sa.sa_len ||
 	    spidx0->dst.sa.sa_len != spidx1->dst.sa.sa_len)
 		return 0;
 
 	/* if spidx.ul_proto == IPSEC_ULPROTO_ANY, ignore. */
 	if (spidx0->ul_proto != (u_int16_t)IPSEC_ULPROTO_ANY
 	 && spidx0->ul_proto != spidx1->ul_proto)
 		return 0;
 
 	switch (spidx0->src.sa.sa_family) {
 	case AF_INET:
 		if (spidx0->src.sin.sin_port != IPSEC_PORT_ANY
 		 && spidx0->src.sin.sin_port != spidx1->src.sin.sin_port)
 			return 0;
 		if (!key_bbcmp(&spidx0->src.sin.sin_addr,
 		    &spidx1->src.sin.sin_addr, spidx0->prefs))
 			return 0;
 		break;
 	case AF_INET6:
 		if (spidx0->src.sin6.sin6_port != IPSEC_PORT_ANY
 		 && spidx0->src.sin6.sin6_port != spidx1->src.sin6.sin6_port)
 			return 0;
 		/*
 		 * scope_id check. if sin6_scope_id is 0, we regard it
 		 * as a wildcard scope, which matches any scope zone ID. 
 		 */
 		if (spidx0->src.sin6.sin6_scope_id &&
 		    spidx1->src.sin6.sin6_scope_id &&
 		    spidx0->src.sin6.sin6_scope_id != spidx1->src.sin6.sin6_scope_id)
 			return 0;
 		if (!key_bbcmp(&spidx0->src.sin6.sin6_addr,
 		    &spidx1->src.sin6.sin6_addr, spidx0->prefs))
 			return 0;
 		break;
 	default:
 		/* XXX */
 		if (bcmp(&spidx0->src, &spidx1->src, spidx0->src.sa.sa_len) != 0)
 			return 0;
 		break;
 	}
 
 	switch (spidx0->dst.sa.sa_family) {
 	case AF_INET:
 		if (spidx0->dst.sin.sin_port != IPSEC_PORT_ANY
 		 && spidx0->dst.sin.sin_port != spidx1->dst.sin.sin_port)
 			return 0;
 		if (!key_bbcmp(&spidx0->dst.sin.sin_addr,
 		    &spidx1->dst.sin.sin_addr, spidx0->prefd))
 			return 0;
 		break;
 	case AF_INET6:
 		if (spidx0->dst.sin6.sin6_port != IPSEC_PORT_ANY
 		 && spidx0->dst.sin6.sin6_port != spidx1->dst.sin6.sin6_port)
 			return 0;
 		/*
 		 * scope_id check. if sin6_scope_id is 0, we regard it
 		 * as a wildcard scope, which matches any scope zone ID. 
 		 */
 		if (spidx0->dst.sin6.sin6_scope_id &&
 		    spidx1->dst.sin6.sin6_scope_id &&
 		    spidx0->dst.sin6.sin6_scope_id != spidx1->dst.sin6.sin6_scope_id)
 			return 0;
 		if (!key_bbcmp(&spidx0->dst.sin6.sin6_addr,
 		    &spidx1->dst.sin6.sin6_addr, spidx0->prefd))
 			return 0;
 		break;
 	default:
 		/* XXX */
 		if (bcmp(&spidx0->dst, &spidx1->dst, spidx0->dst.sa.sa_len) != 0)
 			return 0;
 		break;
 	}
 
 	/* XXX Do we check other field ?  e.g. flowinfo */
 
 	return 1;
 }
 
 /* returns 0 on match */
 static int
 key_sockaddrcmp(
 	const struct sockaddr *sa1,
 	const struct sockaddr *sa2,
 	int port)
 {
 #ifdef satosin
 #undef satosin
 #endif
 #define satosin(s) ((const struct sockaddr_in *)s)
 #ifdef satosin6
 #undef satosin6
 #endif
 #define satosin6(s) ((const struct sockaddr_in6 *)s)
 	if (sa1->sa_family != sa2->sa_family || sa1->sa_len != sa2->sa_len)
 		return 1;
 
 	switch (sa1->sa_family) {
 	case AF_INET:
 		if (sa1->sa_len != sizeof(struct sockaddr_in))
 			return 1;
 		if (satosin(sa1)->sin_addr.s_addr !=
 		    satosin(sa2)->sin_addr.s_addr) {
 			return 1;
 		}
 		if (port && satosin(sa1)->sin_port != satosin(sa2)->sin_port)
 			return 1;
 		break;
 	case AF_INET6:
 		if (sa1->sa_len != sizeof(struct sockaddr_in6))
 			return 1;	/*EINVAL*/
 		if (satosin6(sa1)->sin6_scope_id !=
 		    satosin6(sa2)->sin6_scope_id) {
 			return 1;
 		}
 		if (!IN6_ARE_ADDR_EQUAL(&satosin6(sa1)->sin6_addr,
 		    &satosin6(sa2)->sin6_addr)) {
 			return 1;
 		}
 		if (port &&
 		    satosin6(sa1)->sin6_port != satosin6(sa2)->sin6_port) {
 			return 1;
 		}
 		break;
 	default:
 		if (bcmp(sa1, sa2, sa1->sa_len) != 0)
 			return 1;
 		break;
 	}
 
 	return 0;
 #undef satosin
 #undef satosin6
 }
 
 /*
  * compare two buffers with mask.
  * IN:
  *	addr1: source
  *	addr2: object
  *	bits:  Number of bits to compare
  * OUT:
  *	1 : equal
  *	0 : not equal
  */
 static int
 key_bbcmp(const void *a1, const void *a2, u_int bits)
 {
 	const unsigned char *p1 = a1;
 	const unsigned char *p2 = a2;
 
 	/* XXX: This could be considerably faster if we compare a word
 	 * at a time, but it is complicated on LSB Endian machines */
 
 	/* Handle null pointers */
 	if (p1 == NULL || p2 == NULL)
 		return (p1 == p2);
 
 	while (bits >= 8) {
 		if (*p1++ != *p2++)
 			return 0;
 		bits -= 8;
 	}
 
 	if (bits > 0) {
 		u_int8_t mask = ~((1<<(8-bits))-1);
 		if ((*p1 & mask) != (*p2 & mask))
 			return 0;
 	}
 	return 1;	/* Match! */
 }
 
 static void
 key_flush_spd(time_t now)
 {
 	static u_int16_t sptree_scangen = 0;
 	u_int16_t gen = sptree_scangen++;
 	struct secpolicy *sp;
 	u_int dir;
 
 	/* SPD */
 	for (dir = 0; dir < IPSEC_DIR_MAX; dir++) {
 restart:
 		SPTREE_LOCK();
 		LIST_FOREACH(sp, &V_sptree[dir], chain) {
 			if (sp->scangen == gen)		/* previously handled */
 				continue;
 			sp->scangen = gen;
 			if (sp->state == IPSEC_SPSTATE_DEAD &&
 			    sp->refcnt == 1) {
 				/*
 				 * Ensure that we only decrease refcnt once,
 				 * when we're the last consumer.
 				 * Directly call SP_DELREF/key_delsp instead
 				 * of KEY_FREESP to avoid unlocking/relocking
 				 * SPTREE_LOCK before key_delsp: may refcnt
 				 * be increased again during that time ?
 				 * NB: also clean entries created by
 				 * key_spdflush
 				 */
 				SP_DELREF(sp);
 				key_delsp(sp);
 				SPTREE_UNLOCK();
 				goto restart;
 			}
 			if (sp->lifetime == 0 && sp->validtime == 0)
 				continue;
 			if ((sp->lifetime && now - sp->created > sp->lifetime)
 			 || (sp->validtime && now - sp->lastused > sp->validtime)) {
 				sp->state = IPSEC_SPSTATE_DEAD;
 				SPTREE_UNLOCK();
 				key_spdexpire(sp);
 				goto restart;
 			}
 		}
 		SPTREE_UNLOCK();
 	}
 }
 
 static void
 key_flush_sad(time_t now)
 {
 	struct secashead *sah, *nextsah;
 	struct secasvar *sav, *nextsav;
 
 	/* SAD */
 	SAHTREE_LOCK();
 	LIST_FOREACH_SAFE(sah, &V_sahtree, chain, nextsah) {
 		/* if sah has been dead, then delete it and process next sah. */
 		if (sah->state == SADB_SASTATE_DEAD) {
 			key_delsah(sah);
 			continue;
 		}
 
 		/* if LARVAL entry doesn't become MATURE, delete it. */
 		LIST_FOREACH_SAFE(sav, &sah->savtree[SADB_SASTATE_LARVAL], chain, nextsav) {
 			/* Need to also check refcnt for a larval SA ??? */
 			if (now - sav->created > V_key_larval_lifetime)
 				KEY_FREESAV(&sav);
 		}
 
 		/*
 		 * check MATURE entry to start to send expire message
 		 * whether or not.
 		 */
 		LIST_FOREACH_SAFE(sav, &sah->savtree[SADB_SASTATE_MATURE], chain, nextsav) {
 			/* we don't need to check. */
 			if (sav->lft_s == NULL)
 				continue;
 
 			/* sanity check */
 			if (sav->lft_c == NULL) {
 				ipseclog((LOG_DEBUG,"%s: there is no CURRENT "
 					"time, why?\n", __func__));
 				continue;
 			}
 
 			/* check SOFT lifetime */
 			if (sav->lft_s->addtime != 0 &&
 			    now - sav->created > sav->lft_s->addtime) {
 				key_sa_chgstate(sav, SADB_SASTATE_DYING);
 				/* 
 				 * Actually, only send expire message if
 				 * SA has been used, as it was done before,
 				 * but should we always send such message,
 				 * and let IKE daemon decide if it should be
 				 * renegotiated or not ?
 				 * XXX expire message will actually NOT be
 				 * sent if SA is only used after soft
 				 * lifetime has been reached, see below
 				 * (DYING state)
 				 */
 				if (sav->lft_c->usetime != 0)
 					key_expire(sav);
 			}
 			/* check SOFT lifetime by bytes */
 			/*
 			 * XXX I don't know the way to delete this SA
 			 * when new SA is installed.  Caution when it's
 			 * installed too big lifetime by time.
 			 */
 			else if (sav->lft_s->bytes != 0 &&
 			    sav->lft_s->bytes < sav->lft_c->bytes) {
 
 				key_sa_chgstate(sav, SADB_SASTATE_DYING);
 				/*
 				 * XXX If we keep to send expire
 				 * message in the status of
 				 * DYING. Do remove below code.
 				 */
 				key_expire(sav);
 			}
 		}
 
 		/* check DYING entry to change status to DEAD. */
 		LIST_FOREACH_SAFE(sav, &sah->savtree[SADB_SASTATE_DYING], chain, nextsav) {
 			/* we don't need to check. */
 			if (sav->lft_h == NULL)
 				continue;
 
 			/* sanity check */
 			if (sav->lft_c == NULL) {
 				ipseclog((LOG_DEBUG, "%s: there is no CURRENT "
 					"time, why?\n", __func__));
 				continue;
 			}
 
 			if (sav->lft_h->addtime != 0 &&
 			    now - sav->created > sav->lft_h->addtime) {
 				key_sa_chgstate(sav, SADB_SASTATE_DEAD);
 				KEY_FREESAV(&sav);
 			}
 #if 0	/* XXX Should we keep to send expire message until HARD lifetime ? */
 			else if (sav->lft_s != NULL
 			      && sav->lft_s->addtime != 0
 			      && now - sav->created > sav->lft_s->addtime) {
 				/*
 				 * XXX: should be checked to be
 				 * installed the valid SA.
 				 */
 
 				/*
 				 * If there is no SA then sending
 				 * expire message.
 				 */
 				key_expire(sav);
 			}
 #endif
 			/* check HARD lifetime by bytes */
 			else if (sav->lft_h->bytes != 0 &&
 			    sav->lft_h->bytes < sav->lft_c->bytes) {
 				key_sa_chgstate(sav, SADB_SASTATE_DEAD);
 				KEY_FREESAV(&sav);
 			}
 		}
 
 		/* delete entry in DEAD */
 		LIST_FOREACH_SAFE(sav, &sah->savtree[SADB_SASTATE_DEAD], chain, nextsav) {
 			/* sanity check */
 			if (sav->state != SADB_SASTATE_DEAD) {
 				ipseclog((LOG_DEBUG, "%s: invalid sav->state "
 					"(queue: %d SA: %d): kill it anyway\n",
 					__func__,
 					SADB_SASTATE_DEAD, sav->state));
 			}
 			/*
 			 * do not call key_freesav() here.
 			 * sav should already be freed, and sav->refcnt
 			 * shows other references to sav
 			 * (such as from SPD).
 			 */
 		}
 	}
 	SAHTREE_UNLOCK();
 }
 
 static void
 key_flush_acq(time_t now)
 {
 	struct secacq *acq, *nextacq;
 
 	/* ACQ tree */
 	ACQ_LOCK();
 	for (acq = LIST_FIRST(&V_acqtree); acq != NULL; acq = nextacq) {
 		nextacq = LIST_NEXT(acq, chain);
 		if (now - acq->created > V_key_blockacq_lifetime
 		 && __LIST_CHAINED(acq)) {
 			LIST_REMOVE(acq, chain);
 			free(acq, M_IPSEC_SAQ);
 		}
 	}
 	ACQ_UNLOCK();
 }
 
 static void
 key_flush_spacq(time_t now)
 {
 	struct secspacq *acq, *nextacq;
 
 	/* SP ACQ tree */
 	SPACQ_LOCK();
 	for (acq = LIST_FIRST(&V_spacqtree); acq != NULL; acq = nextacq) {
 		nextacq = LIST_NEXT(acq, chain);
 		if (now - acq->created > V_key_blockacq_lifetime
 		 && __LIST_CHAINED(acq)) {
 			LIST_REMOVE(acq, chain);
 			free(acq, M_IPSEC_SAQ);
 		}
 	}
 	SPACQ_UNLOCK();
 }
 
 /*
  * time handler.
  * scanning SPD and SAD to check status for each entries,
  * and do to remove or to expire.
  * XXX: year 2038 problem may remain.
  */
 static void
 key_timehandler(void *arg)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 	time_t now = time_second;
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		key_flush_spd(now);
 		key_flush_sad(now);
 		key_flush_acq(now);
 		key_flush_spacq(now);
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 
 #ifndef IPSEC_DEBUG2
 	/* do exchange to tick time !! */
 	callout_schedule(&key_timer, hz);
 #endif /* IPSEC_DEBUG2 */
 }
 
 u_long
 key_random()
 {
 	u_long value;
 
 	key_randomfill(&value, sizeof(value));
 	return value;
 }
 
 void
 key_randomfill(p, l)
 	void *p;
 	size_t l;
 {
 	size_t n;
 	u_long v;
 	static int warn = 1;
 
 	n = 0;
 	n = (size_t)read_random(p, (u_int)l);
 	/* last resort */
 	while (n < l) {
 		v = random();
 		bcopy(&v, (u_int8_t *)p + n,
 		    l - n < sizeof(v) ? l - n : sizeof(v));
 		n += sizeof(v);
 
 		if (warn) {
 			printf("WARNING: pseudo-random number generator "
 			    "used for IPsec processing\n");
 			warn = 0;
 		}
 	}
 }
 
 /*
  * map SADB_SATYPE_* to IPPROTO_*.
  * if satype == SADB_SATYPE then satype is mapped to ~0.
  * OUT:
  *	0: invalid satype.
  */
 static u_int16_t
 key_satype2proto(u_int8_t satype)
 {
 	switch (satype) {
 	case SADB_SATYPE_UNSPEC:
 		return IPSEC_PROTO_ANY;
 	case SADB_SATYPE_AH:
 		return IPPROTO_AH;
 	case SADB_SATYPE_ESP:
 		return IPPROTO_ESP;
 	case SADB_X_SATYPE_IPCOMP:
 		return IPPROTO_IPCOMP;
 	case SADB_X_SATYPE_TCPSIGNATURE:
 		return IPPROTO_TCP;
 	default:
 		return 0;
 	}
 	/* NOTREACHED */
 }
 
 /*
  * map IPPROTO_* to SADB_SATYPE_*
  * OUT:
  *	0: invalid protocol type.
  */
 static u_int8_t
 key_proto2satype(u_int16_t proto)
 {
 	switch (proto) {
 	case IPPROTO_AH:
 		return SADB_SATYPE_AH;
 	case IPPROTO_ESP:
 		return SADB_SATYPE_ESP;
 	case IPPROTO_IPCOMP:
 		return SADB_X_SATYPE_IPCOMP;
 	case IPPROTO_TCP:
 		return SADB_X_SATYPE_TCPSIGNATURE;
 	default:
 		return 0;
 	}
 	/* NOTREACHED */
 }
 
 /* %%% PF_KEY */
 /*
  * SADB_GETSPI processing is to receive
  *	<base, (SA2), src address, dst address, (SPI range)>
  * from the IKMPd, to assign a unique spi value, to hang on the INBOUND
  * tree with the status of LARVAL, and send
  *	<base, SA(*), address(SD)>
  * to the IKMPd.
  *
  * IN:	mhp: pointer to the pointer to each header.
  * OUT:	NULL if fail.
  *	other if success, return pointer to the message to send.
  */
 static int
 key_getspi(so, m, mhp)
 	struct socket *so;
 	struct mbuf *m;
 	const struct sadb_msghdr *mhp;
 {
 	struct sadb_address *src0, *dst0;
 	struct secasindex saidx;
 	struct secashead *newsah;
 	struct secasvar *newsav;
 	u_int8_t proto;
 	u_int32_t spi;
 	u_int8_t mode;
 	u_int32_t reqid;
 	int error;
 
 	IPSEC_ASSERT(so != NULL, ("null socket"));
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
 	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
 
 	if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL ||
 	    mhp->ext[SADB_EXT_ADDRESS_DST] == NULL) {
 		ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 	if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) ||
 	    mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) {
 		ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 	if (mhp->ext[SADB_X_EXT_SA2] != NULL) {
 		mode = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode;
 		reqid = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid;
 	} else {
 		mode = IPSEC_MODE_ANY;
 		reqid = 0;
 	}
 
 	src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]);
 	dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]);
 
 	/* map satype to proto */
 	if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
 		ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	/*
 	 * Make sure the port numbers are zero.
 	 * In case of NAT-T we will update them later if needed.
 	 */
 	switch (((struct sockaddr *)(src0 + 1))->sa_family) {
 	case AF_INET:
 		if (((struct sockaddr *)(src0 + 1))->sa_len !=
 		    sizeof(struct sockaddr_in))
 			return key_senderror(so, m, EINVAL);
 		((struct sockaddr_in *)(src0 + 1))->sin_port = 0;
 		break;
 	case AF_INET6:
 		if (((struct sockaddr *)(src0 + 1))->sa_len !=
 		    sizeof(struct sockaddr_in6))
 			return key_senderror(so, m, EINVAL);
 		((struct sockaddr_in6 *)(src0 + 1))->sin6_port = 0;
 		break;
 	default:
 		; /*???*/
 	}
 	switch (((struct sockaddr *)(dst0 + 1))->sa_family) {
 	case AF_INET:
 		if (((struct sockaddr *)(dst0 + 1))->sa_len !=
 		    sizeof(struct sockaddr_in))
 			return key_senderror(so, m, EINVAL);
 		((struct sockaddr_in *)(dst0 + 1))->sin_port = 0;
 		break;
 	case AF_INET6:
 		if (((struct sockaddr *)(dst0 + 1))->sa_len !=
 		    sizeof(struct sockaddr_in6))
 			return key_senderror(so, m, EINVAL);
 		((struct sockaddr_in6 *)(dst0 + 1))->sin6_port = 0;
 		break;
 	default:
 		; /*???*/
 	}
 
 	/* XXX boundary check against sa_len */
 	KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx);
 
 #ifdef IPSEC_NAT_T
 	/*
 	 * Handle NAT-T info if present.
 	 * We made sure the port numbers are zero above, so we do
 	 * not have to worry in case we do not update them.
 	 */
 	if (mhp->ext[SADB_X_EXT_NAT_T_OAI] != NULL)
 		ipseclog((LOG_DEBUG, "%s: NAT-T OAi present\n", __func__));
 	if (mhp->ext[SADB_X_EXT_NAT_T_OAR] != NULL)
 		ipseclog((LOG_DEBUG, "%s: NAT-T OAr present\n", __func__));
 
 	if (mhp->ext[SADB_X_EXT_NAT_T_TYPE] != NULL &&
 	    mhp->ext[SADB_X_EXT_NAT_T_SPORT] != NULL &&
 	    mhp->ext[SADB_X_EXT_NAT_T_DPORT] != NULL) {
 		struct sadb_x_nat_t_type *type;
 		struct sadb_x_nat_t_port *sport, *dport;
 
 		if (mhp->extlen[SADB_X_EXT_NAT_T_TYPE] < sizeof(*type) ||
 		    mhp->extlen[SADB_X_EXT_NAT_T_SPORT] < sizeof(*sport) ||
 		    mhp->extlen[SADB_X_EXT_NAT_T_DPORT] < sizeof(*dport)) {
 			ipseclog((LOG_DEBUG, "%s: invalid nat-t message "
 			    "passed.\n", __func__));
 			return key_senderror(so, m, EINVAL);
 		}
 
 		sport = (struct sadb_x_nat_t_port *)
 		    mhp->ext[SADB_X_EXT_NAT_T_SPORT];
 		dport = (struct sadb_x_nat_t_port *)
 		    mhp->ext[SADB_X_EXT_NAT_T_DPORT];
 
 		if (sport)
 			KEY_PORTTOSADDR(&saidx.src, sport->sadb_x_nat_t_port_port);
 		if (dport)
 			KEY_PORTTOSADDR(&saidx.dst, dport->sadb_x_nat_t_port_port);
 	}
 #endif
 
 	/* SPI allocation */
 	spi = key_do_getnewspi((struct sadb_spirange *)mhp->ext[SADB_EXT_SPIRANGE],
 	                       &saidx);
 	if (spi == 0)
 		return key_senderror(so, m, EINVAL);
 
 	/* get a SA index */
 	if ((newsah = key_getsah(&saidx)) == NULL) {
 		/* create a new SA index */
 		if ((newsah = key_newsah(&saidx)) == NULL) {
 			ipseclog((LOG_DEBUG, "%s: No more memory.\n",__func__));
 			return key_senderror(so, m, ENOBUFS);
 		}
 	}
 
 	/* get a new SA */
 	/* XXX rewrite */
 	newsav = KEY_NEWSAV(m, mhp, newsah, &error);
 	if (newsav == NULL) {
 		/* XXX don't free new SA index allocated in above. */
 		return key_senderror(so, m, error);
 	}
 
 	/* set spi */
 	newsav->spi = htonl(spi);
 
 	/* delete the entry in acqtree */
 	if (mhp->msg->sadb_msg_seq != 0) {
 		struct secacq *acq;
 		if ((acq = key_getacqbyseq(mhp->msg->sadb_msg_seq)) != NULL) {
 			/* reset counter in order to deletion by timehandler. */
 			acq->created = time_second;
 			acq->count = 0;
 		}
     	}
 
     {
 	struct mbuf *n, *nn;
 	struct sadb_sa *m_sa;
 	struct sadb_msg *newmsg;
 	int off, len;
 
 	/* create new sadb_msg to reply. */
 	len = PFKEY_ALIGN8(sizeof(struct sadb_msg)) +
 	    PFKEY_ALIGN8(sizeof(struct sadb_sa));
 
 	MGETHDR(n, M_NOWAIT, MT_DATA);
 	if (len > MHLEN) {
 		MCLGET(n, M_NOWAIT);
 		if ((n->m_flags & M_EXT) == 0) {
 			m_freem(n);
 			n = NULL;
 		}
 	}
 	if (!n)
 		return key_senderror(so, m, ENOBUFS);
 
 	n->m_len = len;
 	n->m_next = NULL;
 	off = 0;
 
 	m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t) + off);
 	off += PFKEY_ALIGN8(sizeof(struct sadb_msg));
 
 	m_sa = (struct sadb_sa *)(mtod(n, caddr_t) + off);
 	m_sa->sadb_sa_len = PFKEY_UNIT64(sizeof(struct sadb_sa));
 	m_sa->sadb_sa_exttype = SADB_EXT_SA;
 	m_sa->sadb_sa_spi = htonl(spi);
 	off += PFKEY_ALIGN8(sizeof(struct sadb_sa));
 
 	IPSEC_ASSERT(off == len,
 		("length inconsistency (off %u len %u)", off, len));
 
 	n->m_next = key_gather_mbuf(m, mhp, 0, 2, SADB_EXT_ADDRESS_SRC,
 	    SADB_EXT_ADDRESS_DST);
 	if (!n->m_next) {
 		m_freem(n);
 		return key_senderror(so, m, ENOBUFS);
 	}
 
 	if (n->m_len < sizeof(struct sadb_msg)) {
 		n = m_pullup(n, sizeof(struct sadb_msg));
 		if (n == NULL)
 			return key_sendup_mbuf(so, m, KEY_SENDUP_ONE);
 	}
 
 	n->m_pkthdr.len = 0;
 	for (nn = n; nn; nn = nn->m_next)
 		n->m_pkthdr.len += nn->m_len;
 
 	newmsg = mtod(n, struct sadb_msg *);
 	newmsg->sadb_msg_seq = newsav->seq;
 	newmsg->sadb_msg_errno = 0;
 	newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
 
 	m_freem(m);
 	return key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
     }
 }
 
 /*
  * allocating new SPI
  * called by key_getspi().
  * OUT:
  *	0:	failure.
  *	others: success.
  */
 static u_int32_t
 key_do_getnewspi(spirange, saidx)
 	struct sadb_spirange *spirange;
 	struct secasindex *saidx;
 {
 	u_int32_t newspi;
 	u_int32_t min, max;
 	int count = V_key_spi_trycnt;
 
 	/* set spi range to allocate */
 	if (spirange != NULL) {
 		min = spirange->sadb_spirange_min;
 		max = spirange->sadb_spirange_max;
 	} else {
 		min = V_key_spi_minval;
 		max = V_key_spi_maxval;
 	}
 	/* IPCOMP needs 2-byte SPI */
 	if (saidx->proto == IPPROTO_IPCOMP) {
 		u_int32_t t;
 		if (min >= 0x10000)
 			min = 0xffff;
 		if (max >= 0x10000)
 			max = 0xffff;
 		if (min > max) {
 			t = min; min = max; max = t;
 		}
 	}
 
 	if (min == max) {
 		if (key_checkspidup(saidx, min) != NULL) {
 			ipseclog((LOG_DEBUG, "%s: SPI %u exists already.\n",
 				__func__, min));
 			return 0;
 		}
 
 		count--; /* taking one cost. */
 		newspi = min;
 
 	} else {
 
 		/* init SPI */
 		newspi = 0;
 
 		/* when requesting to allocate spi ranged */
 		while (count--) {
 			/* generate pseudo-random SPI value ranged. */
 			newspi = min + (key_random() % (max - min + 1));
 
 			if (key_checkspidup(saidx, newspi) == NULL)
 				break;
 		}
 
 		if (count == 0 || newspi == 0) {
 			ipseclog((LOG_DEBUG, "%s: to allocate spi is failed.\n",
 				__func__));
 			return 0;
 		}
 	}
 
 	/* statistics */
 	keystat.getspi_count =
 		(keystat.getspi_count + V_key_spi_trycnt - count) / 2;
 
 	return newspi;
 }
 
 /*
  * SADB_UPDATE processing
  * receive
  *   <base, SA, (SA2), (lifetime(HSC),) address(SD), (address(P),)
  *       key(AE), (identity(SD),) (sensitivity)>
  * from the ikmpd, and update a secasvar entry whose status is SADB_SASTATE_LARVAL.
  * and send
  *   <base, SA, (SA2), (lifetime(HSC),) address(SD), (address(P),)
  *       (identity(SD),) (sensitivity)>
  * to the ikmpd.
  *
  * m will always be freed.
  */
 static int
 key_update(so, m, mhp)
 	struct socket *so;
 	struct mbuf *m;
 	const struct sadb_msghdr *mhp;
 {
 	struct sadb_sa *sa0;
 	struct sadb_address *src0, *dst0;
 #ifdef IPSEC_NAT_T
 	struct sadb_x_nat_t_type *type;
 	struct sadb_x_nat_t_port *sport, *dport;
 	struct sadb_address *iaddr, *raddr;
 	struct sadb_x_nat_t_frag *frag;
 #endif
 	struct secasindex saidx;
 	struct secashead *sah;
 	struct secasvar *sav;
 	u_int16_t proto;
 	u_int8_t mode;
 	u_int32_t reqid;
 	int error;
 
 	IPSEC_ASSERT(so != NULL, ("null socket"));
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
 	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
 
 	/* map satype to proto */
 	if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
 		ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	if (mhp->ext[SADB_EXT_SA] == NULL ||
 	    mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL ||
 	    mhp->ext[SADB_EXT_ADDRESS_DST] == NULL ||
 	    (mhp->msg->sadb_msg_satype == SADB_SATYPE_ESP &&
 	     mhp->ext[SADB_EXT_KEY_ENCRYPT] == NULL) ||
 	    (mhp->msg->sadb_msg_satype == SADB_SATYPE_AH &&
 	     mhp->ext[SADB_EXT_KEY_AUTH] == NULL) ||
 	    (mhp->ext[SADB_EXT_LIFETIME_HARD] != NULL &&
 	     mhp->ext[SADB_EXT_LIFETIME_SOFT] == NULL) ||
 	    (mhp->ext[SADB_EXT_LIFETIME_HARD] == NULL &&
 	     mhp->ext[SADB_EXT_LIFETIME_SOFT] != NULL)) {
 		ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 	if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa) ||
 	    mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) ||
 	    mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) {
 		ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 	if (mhp->ext[SADB_X_EXT_SA2] != NULL) {
 		mode = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode;
 		reqid = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid;
 	} else {
 		mode = IPSEC_MODE_ANY;
 		reqid = 0;
 	}
 	/* XXX boundary checking for other extensions */
 
 	sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA];
 	src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]);
 	dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]);
 
 	/* XXX boundary check against sa_len */
 	KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx);
 
 	/*
 	 * Make sure the port numbers are zero.
 	 * In case of NAT-T we will update them later if needed.
 	 */
 	KEY_PORTTOSADDR(&saidx.src, 0);
 	KEY_PORTTOSADDR(&saidx.dst, 0);
 
 #ifdef IPSEC_NAT_T
 	/*
 	 * Handle NAT-T info if present.
 	 */
 	if (mhp->ext[SADB_X_EXT_NAT_T_TYPE] != NULL &&
 	    mhp->ext[SADB_X_EXT_NAT_T_SPORT] != NULL &&
 	    mhp->ext[SADB_X_EXT_NAT_T_DPORT] != NULL) {
 
 		if (mhp->extlen[SADB_X_EXT_NAT_T_TYPE] < sizeof(*type) ||
 		    mhp->extlen[SADB_X_EXT_NAT_T_SPORT] < sizeof(*sport) ||
 		    mhp->extlen[SADB_X_EXT_NAT_T_DPORT] < sizeof(*dport)) {
 			ipseclog((LOG_DEBUG, "%s: invalid message.\n",
 			    __func__));
 			return key_senderror(so, m, EINVAL);
 		}
 
 		type = (struct sadb_x_nat_t_type *)
 		    mhp->ext[SADB_X_EXT_NAT_T_TYPE];
 		sport = (struct sadb_x_nat_t_port *)
 		    mhp->ext[SADB_X_EXT_NAT_T_SPORT];
 		dport = (struct sadb_x_nat_t_port *)
 		    mhp->ext[SADB_X_EXT_NAT_T_DPORT];
 	} else {
 		type = 0;
 		sport = dport = 0;
 	}
 	if (mhp->ext[SADB_X_EXT_NAT_T_OAI] != NULL &&
 	    mhp->ext[SADB_X_EXT_NAT_T_OAR] != NULL) {
 		if (mhp->extlen[SADB_X_EXT_NAT_T_OAI] < sizeof(*iaddr) ||
 		    mhp->extlen[SADB_X_EXT_NAT_T_OAR] < sizeof(*raddr)) {
 			ipseclog((LOG_DEBUG, "%s: invalid message\n",
 			    __func__));
 			return key_senderror(so, m, EINVAL);
 		}
 		iaddr = (struct sadb_address *)mhp->ext[SADB_X_EXT_NAT_T_OAI];
 		raddr = (struct sadb_address *)mhp->ext[SADB_X_EXT_NAT_T_OAR];
 		ipseclog((LOG_DEBUG, "%s: NAT-T OAi/r present\n", __func__));
 	} else {
 		iaddr = raddr = NULL;
 	}
 	if (mhp->ext[SADB_X_EXT_NAT_T_FRAG] != NULL) {
 		if (mhp->extlen[SADB_X_EXT_NAT_T_FRAG] < sizeof(*frag)) {
 			ipseclog((LOG_DEBUG, "%s: invalid message\n",
 			    __func__));
 			return key_senderror(so, m, EINVAL);
 		}
 		frag = (struct sadb_x_nat_t_frag *)
 		    mhp->ext[SADB_X_EXT_NAT_T_FRAG];
 	} else {
 		frag = 0;
 	}
 #endif
 
 	/* get a SA header */
 	if ((sah = key_getsah(&saidx)) == NULL) {
 		ipseclog((LOG_DEBUG, "%s: no SA index found.\n", __func__));
 		return key_senderror(so, m, ENOENT);
 	}
 
 	/* set spidx if there */
 	/* XXX rewrite */
 	error = key_setident(sah, m, mhp);
 	if (error)
 		return key_senderror(so, m, error);
 
 	/* find a SA with sequence number. */
 #ifdef IPSEC_DOSEQCHECK
 	if (mhp->msg->sadb_msg_seq != 0
 	 && (sav = key_getsavbyseq(sah, mhp->msg->sadb_msg_seq)) == NULL) {
 		ipseclog((LOG_DEBUG, "%s: no larval SA with sequence %u "
 			"exists.\n", __func__, mhp->msg->sadb_msg_seq));
 		return key_senderror(so, m, ENOENT);
 	}
 #else
 	SAHTREE_LOCK();
 	sav = key_getsavbyspi(sah, sa0->sadb_sa_spi);
 	SAHTREE_UNLOCK();
 	if (sav == NULL) {
 		ipseclog((LOG_DEBUG, "%s: no such a SA found (spi:%u)\n",
 			__func__, (u_int32_t)ntohl(sa0->sadb_sa_spi)));
 		return key_senderror(so, m, EINVAL);
 	}
 #endif
 
 	/* validity check */
 	if (sav->sah->saidx.proto != proto) {
 		ipseclog((LOG_DEBUG, "%s: protocol mismatched "
 			"(DB=%u param=%u)\n", __func__,
 			sav->sah->saidx.proto, proto));
 		return key_senderror(so, m, EINVAL);
 	}
 #ifdef IPSEC_DOSEQCHECK
 	if (sav->spi != sa0->sadb_sa_spi) {
 		ipseclog((LOG_DEBUG, "%s: SPI mismatched (DB:%u param:%u)\n",
 		    __func__,
 		    (u_int32_t)ntohl(sav->spi),
 		    (u_int32_t)ntohl(sa0->sadb_sa_spi)));
 		return key_senderror(so, m, EINVAL);
 	}
 #endif
 	if (sav->pid != mhp->msg->sadb_msg_pid) {
 		ipseclog((LOG_DEBUG, "%s: pid mismatched (DB:%u param:%u)\n",
 		    __func__, sav->pid, mhp->msg->sadb_msg_pid));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	/* copy sav values */
 	error = key_setsaval(sav, m, mhp);
 	if (error) {
 		KEY_FREESAV(&sav);
 		return key_senderror(so, m, error);
 	}
 
 #ifdef IPSEC_NAT_T
 	/*
 	 * Handle more NAT-T info if present,
 	 * now that we have a sav to fill.
 	 */
 	if (type)
 		sav->natt_type = type->sadb_x_nat_t_type_type;
 
 	if (sport)
 		KEY_PORTTOSADDR(&sav->sah->saidx.src,
 		    sport->sadb_x_nat_t_port_port);
 	if (dport)
 		KEY_PORTTOSADDR(&sav->sah->saidx.dst,
 		    dport->sadb_x_nat_t_port_port);
 
 #if 0
 	/*
 	 * In case SADB_X_EXT_NAT_T_FRAG was not given, leave it at 0.
 	 * We should actually check for a minimum MTU here, if we
 	 * want to support it in ip_output.
 	 */
 	if (frag)
 		sav->natt_esp_frag_len = frag->sadb_x_nat_t_frag_fraglen;
 #endif
 #endif
 
 	/* check SA values to be mature. */
 	if ((mhp->msg->sadb_msg_errno = key_mature(sav)) != 0) {
 		KEY_FREESAV(&sav);
 		return key_senderror(so, m, 0);
 	}
 
     {
 	struct mbuf *n;
 
 	/* set msg buf from mhp */
 	n = key_getmsgbuf_x1(m, mhp);
 	if (n == NULL) {
 		ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
 		return key_senderror(so, m, ENOBUFS);
 	}
 
 	m_freem(m);
 	return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
     }
 }
 
 /*
  * search SAD with sequence for a SA which state is SADB_SASTATE_LARVAL.
  * only called by key_update().
  * OUT:
  *	NULL	: not found
  *	others	: found, pointer to a SA.
  */
 #ifdef IPSEC_DOSEQCHECK
 static struct secasvar *
 key_getsavbyseq(sah, seq)
 	struct secashead *sah;
 	u_int32_t seq;
 {
 	struct secasvar *sav;
 	u_int state;
 
 	state = SADB_SASTATE_LARVAL;
 
 	/* search SAD with sequence number ? */
 	LIST_FOREACH(sav, &sah->savtree[state], chain) {
 
 		KEY_CHKSASTATE(state, sav->state, __func__);
 
 		if (sav->seq == seq) {
 			sa_addref(sav);
 			KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
 				printf("DP %s cause refcnt++:%d SA:%p\n",
 					__func__, sav->refcnt, sav));
 			return sav;
 		}
 	}
 
 	return NULL;
 }
 #endif
 
 /*
  * SADB_ADD processing
  * add an entry to SA database, when received
  *   <base, SA, (SA2), (lifetime(HSC),) address(SD), (address(P),)
  *       key(AE), (identity(SD),) (sensitivity)>
  * from the ikmpd,
  * and send
  *   <base, SA, (SA2), (lifetime(HSC),) address(SD), (address(P),)
  *       (identity(SD),) (sensitivity)>
  * to the ikmpd.
  *
  * IGNORE identity and sensitivity messages.
  *
  * m will always be freed.
  */
 static int
 key_add(so, m, mhp)
 	struct socket *so;
 	struct mbuf *m;
 	const struct sadb_msghdr *mhp;
 {
 	struct sadb_sa *sa0;
 	struct sadb_address *src0, *dst0;
 #ifdef IPSEC_NAT_T
 	struct sadb_x_nat_t_type *type;
 	struct sadb_address *iaddr, *raddr;
 	struct sadb_x_nat_t_frag *frag;
 #endif
 	struct secasindex saidx;
 	struct secashead *newsah;
 	struct secasvar *newsav;
 	u_int16_t proto;
 	u_int8_t mode;
 	u_int32_t reqid;
 	int error;
 
 	IPSEC_ASSERT(so != NULL, ("null socket"));
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
 	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
 
 	/* map satype to proto */
 	if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
 		ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	if (mhp->ext[SADB_EXT_SA] == NULL ||
 	    mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL ||
 	    mhp->ext[SADB_EXT_ADDRESS_DST] == NULL ||
 	    (mhp->msg->sadb_msg_satype == SADB_SATYPE_ESP &&
 	     mhp->ext[SADB_EXT_KEY_ENCRYPT] == NULL) ||
 	    (mhp->msg->sadb_msg_satype == SADB_SATYPE_AH &&
 	     mhp->ext[SADB_EXT_KEY_AUTH] == NULL) ||
 	    (mhp->ext[SADB_EXT_LIFETIME_HARD] != NULL &&
 	     mhp->ext[SADB_EXT_LIFETIME_SOFT] == NULL) ||
 	    (mhp->ext[SADB_EXT_LIFETIME_HARD] == NULL &&
 	     mhp->ext[SADB_EXT_LIFETIME_SOFT] != NULL)) {
 		ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 	if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa) ||
 	    mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) ||
 	    mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) {
 		/* XXX need more */
 		ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 	if (mhp->ext[SADB_X_EXT_SA2] != NULL) {
 		mode = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode;
 		reqid = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid;
 	} else {
 		mode = IPSEC_MODE_ANY;
 		reqid = 0;
 	}
 
 	sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA];
 	src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC];
 	dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST];
 
 	/* XXX boundary check against sa_len */
 	KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx);
 
 	/*
 	 * Make sure the port numbers are zero.
 	 * In case of NAT-T we will update them later if needed.
 	 */
 	KEY_PORTTOSADDR(&saidx.src, 0);
 	KEY_PORTTOSADDR(&saidx.dst, 0);
 
 #ifdef IPSEC_NAT_T
 	/*
 	 * Handle NAT-T info if present.
 	 */
 	if (mhp->ext[SADB_X_EXT_NAT_T_TYPE] != NULL &&
 	    mhp->ext[SADB_X_EXT_NAT_T_SPORT] != NULL &&
 	    mhp->ext[SADB_X_EXT_NAT_T_DPORT] != NULL) {
 		struct sadb_x_nat_t_port *sport, *dport;
 
 		if (mhp->extlen[SADB_X_EXT_NAT_T_TYPE] < sizeof(*type) ||
 		    mhp->extlen[SADB_X_EXT_NAT_T_SPORT] < sizeof(*sport) ||
 		    mhp->extlen[SADB_X_EXT_NAT_T_DPORT] < sizeof(*dport)) {
 			ipseclog((LOG_DEBUG, "%s: invalid message.\n",
 			    __func__));
 			return key_senderror(so, m, EINVAL);
 		}
 
 		type = (struct sadb_x_nat_t_type *)
 		    mhp->ext[SADB_X_EXT_NAT_T_TYPE];
 		sport = (struct sadb_x_nat_t_port *)
 		    mhp->ext[SADB_X_EXT_NAT_T_SPORT];
 		dport = (struct sadb_x_nat_t_port *)
 		    mhp->ext[SADB_X_EXT_NAT_T_DPORT];
 
 		if (sport)
 			KEY_PORTTOSADDR(&saidx.src,
 			    sport->sadb_x_nat_t_port_port);
 		if (dport)
 			KEY_PORTTOSADDR(&saidx.dst,
 			    dport->sadb_x_nat_t_port_port);
 	} else {
 		type = 0;
 	}
 	if (mhp->ext[SADB_X_EXT_NAT_T_OAI] != NULL &&
 	    mhp->ext[SADB_X_EXT_NAT_T_OAR] != NULL) {
 		if (mhp->extlen[SADB_X_EXT_NAT_T_OAI] < sizeof(*iaddr) ||
 		    mhp->extlen[SADB_X_EXT_NAT_T_OAR] < sizeof(*raddr)) {
 			ipseclog((LOG_DEBUG, "%s: invalid message\n",
 			    __func__));
 			return key_senderror(so, m, EINVAL);
 		}
 		iaddr = (struct sadb_address *)mhp->ext[SADB_X_EXT_NAT_T_OAI];
 		raddr = (struct sadb_address *)mhp->ext[SADB_X_EXT_NAT_T_OAR];
 		ipseclog((LOG_DEBUG, "%s: NAT-T OAi/r present\n", __func__));
 	} else {
 		iaddr = raddr = NULL;
 	}
 	if (mhp->ext[SADB_X_EXT_NAT_T_FRAG] != NULL) {
 		if (mhp->extlen[SADB_X_EXT_NAT_T_FRAG] < sizeof(*frag)) {
 			ipseclog((LOG_DEBUG, "%s: invalid message\n",
 			    __func__));
 			return key_senderror(so, m, EINVAL);
 		}
 		frag = (struct sadb_x_nat_t_frag *)
 		    mhp->ext[SADB_X_EXT_NAT_T_FRAG];
 	} else {
 		frag = 0;
 	}
 #endif
 
 	/* get a SA header */
 	if ((newsah = key_getsah(&saidx)) == NULL) {
 		/* create a new SA header */
 		if ((newsah = key_newsah(&saidx)) == NULL) {
 			ipseclog((LOG_DEBUG, "%s: No more memory.\n",__func__));
 			return key_senderror(so, m, ENOBUFS);
 		}
 	}
 
 	/* set spidx if there */
 	/* XXX rewrite */
 	error = key_setident(newsah, m, mhp);
 	if (error) {
 		return key_senderror(so, m, error);
 	}
 
 	/* create new SA entry. */
 	/* We can create new SA only if SPI is differenct. */
 	SAHTREE_LOCK();
 	newsav = key_getsavbyspi(newsah, sa0->sadb_sa_spi);
 	SAHTREE_UNLOCK();
 	if (newsav != NULL) {
 		ipseclog((LOG_DEBUG, "%s: SA already exists.\n", __func__));
 		return key_senderror(so, m, EEXIST);
 	}
 	newsav = KEY_NEWSAV(m, mhp, newsah, &error);
 	if (newsav == NULL) {
 		return key_senderror(so, m, error);
 	}
 
 #ifdef IPSEC_NAT_T
 	/*
 	 * Handle more NAT-T info if present,
 	 * now that we have a sav to fill.
 	 */
 	if (type)
 		newsav->natt_type = type->sadb_x_nat_t_type_type;
 
 #if 0
 	/*
 	 * In case SADB_X_EXT_NAT_T_FRAG was not given, leave it at 0.
 	 * We should actually check for a minimum MTU here, if we
 	 * want to support it in ip_output.
 	 */
 	if (frag)
 		newsav->natt_esp_frag_len = frag->sadb_x_nat_t_frag_fraglen;
 #endif
 #endif
 
 	/* check SA values to be mature. */
 	if ((error = key_mature(newsav)) != 0) {
 		KEY_FREESAV(&newsav);
 		return key_senderror(so, m, error);
 	}
 
 	/*
 	 * don't call key_freesav() here, as we would like to keep the SA
 	 * in the database on success.
 	 */
 
     {
 	struct mbuf *n;
 
 	/* set msg buf from mhp */
 	n = key_getmsgbuf_x1(m, mhp);
 	if (n == NULL) {
 		ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
 		return key_senderror(so, m, ENOBUFS);
 	}
 
 	m_freem(m);
 	return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
     }
 }
 
 /* m is retained */
 static int
 key_setident(sah, m, mhp)
 	struct secashead *sah;
 	struct mbuf *m;
 	const struct sadb_msghdr *mhp;
 {
 	const struct sadb_ident *idsrc, *iddst;
 	int idsrclen, iddstlen;
 
 	IPSEC_ASSERT(sah != NULL, ("null secashead"));
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
 	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
 
 	/* don't make buffer if not there */
 	if (mhp->ext[SADB_EXT_IDENTITY_SRC] == NULL &&
 	    mhp->ext[SADB_EXT_IDENTITY_DST] == NULL) {
 		sah->idents = NULL;
 		sah->identd = NULL;
 		return 0;
 	}
 	
 	if (mhp->ext[SADB_EXT_IDENTITY_SRC] == NULL ||
 	    mhp->ext[SADB_EXT_IDENTITY_DST] == NULL) {
 		ipseclog((LOG_DEBUG, "%s: invalid identity.\n", __func__));
 		return EINVAL;
 	}
 
 	idsrc = (const struct sadb_ident *)mhp->ext[SADB_EXT_IDENTITY_SRC];
 	iddst = (const struct sadb_ident *)mhp->ext[SADB_EXT_IDENTITY_DST];
 	idsrclen = mhp->extlen[SADB_EXT_IDENTITY_SRC];
 	iddstlen = mhp->extlen[SADB_EXT_IDENTITY_DST];
 
 	/* validity check */
 	if (idsrc->sadb_ident_type != iddst->sadb_ident_type) {
 		ipseclog((LOG_DEBUG, "%s: ident type mismatch.\n", __func__));
 		return EINVAL;
 	}
 
 	switch (idsrc->sadb_ident_type) {
 	case SADB_IDENTTYPE_PREFIX:
 	case SADB_IDENTTYPE_FQDN:
 	case SADB_IDENTTYPE_USERFQDN:
 	default:
 		/* XXX do nothing */
 		sah->idents = NULL;
 		sah->identd = NULL;
 	 	return 0;
 	}
 
 	/* make structure */
 	sah->idents = malloc(sizeof(struct secident), M_IPSEC_MISC, M_NOWAIT);
 	if (sah->idents == NULL) {
 		ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
 		return ENOBUFS;
 	}
 	sah->identd = malloc(sizeof(struct secident), M_IPSEC_MISC, M_NOWAIT);
 	if (sah->identd == NULL) {
 		free(sah->idents, M_IPSEC_MISC);
 		sah->idents = NULL;
 		ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
 		return ENOBUFS;
 	}
 	sah->idents->type = idsrc->sadb_ident_type;
 	sah->idents->id = idsrc->sadb_ident_id;
 
 	sah->identd->type = iddst->sadb_ident_type;
 	sah->identd->id = iddst->sadb_ident_id;
 
 	return 0;
 }
 
 /*
  * m will not be freed on return.
  * it is caller's responsibility to free the result. 
  */
 static struct mbuf *
 key_getmsgbuf_x1(m, mhp)
 	struct mbuf *m;
 	const struct sadb_msghdr *mhp;
 {
 	struct mbuf *n;
 
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
 	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
 
 	/* create new sadb_msg to reply. */
 	n = key_gather_mbuf(m, mhp, 1, 9, SADB_EXT_RESERVED,
 	    SADB_EXT_SA, SADB_X_EXT_SA2,
 	    SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST,
 	    SADB_EXT_LIFETIME_HARD, SADB_EXT_LIFETIME_SOFT,
 	    SADB_EXT_IDENTITY_SRC, SADB_EXT_IDENTITY_DST);
 	if (!n)
 		return NULL;
 
 	if (n->m_len < sizeof(struct sadb_msg)) {
 		n = m_pullup(n, sizeof(struct sadb_msg));
 		if (n == NULL)
 			return NULL;
 	}
 	mtod(n, struct sadb_msg *)->sadb_msg_errno = 0;
 	mtod(n, struct sadb_msg *)->sadb_msg_len =
 	    PFKEY_UNIT64(n->m_pkthdr.len);
 
 	return n;
 }
 
 static int key_delete_all __P((struct socket *, struct mbuf *,
 	const struct sadb_msghdr *, u_int16_t));
 
 /*
  * SADB_DELETE processing
  * receive
  *   <base, SA(*), address(SD)>
  * from the ikmpd, and set SADB_SASTATE_DEAD,
  * and send,
  *   <base, SA(*), address(SD)>
  * to the ikmpd.
  *
  * m will always be freed.
  */
 static int
 key_delete(so, m, mhp)
 	struct socket *so;
 	struct mbuf *m;
 	const struct sadb_msghdr *mhp;
 {
 	struct sadb_sa *sa0;
 	struct sadb_address *src0, *dst0;
 	struct secasindex saidx;
 	struct secashead *sah;
 	struct secasvar *sav = NULL;
 	u_int16_t proto;
 
 	IPSEC_ASSERT(so != NULL, ("null socket"));
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
 	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
 
 	/* map satype to proto */
 	if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
 		ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL ||
 	    mhp->ext[SADB_EXT_ADDRESS_DST] == NULL) {
 		ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) ||
 	    mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) {
 		ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	if (mhp->ext[SADB_EXT_SA] == NULL) {
 		/*
 		 * Caller wants us to delete all non-LARVAL SAs
 		 * that match the src/dst.  This is used during
 		 * IKE INITIAL-CONTACT.
 		 */
 		ipseclog((LOG_DEBUG, "%s: doing delete all.\n", __func__));
 		return key_delete_all(so, m, mhp, proto);
 	} else if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa)) {
 		ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA];
 	src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]);
 	dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]);
 
 	/* XXX boundary check against sa_len */
 	KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx);
 
 	/*
 	 * Make sure the port numbers are zero.
 	 * In case of NAT-T we will update them later if needed.
 	 */
 	KEY_PORTTOSADDR(&saidx.src, 0);
 	KEY_PORTTOSADDR(&saidx.dst, 0);
 
 #ifdef IPSEC_NAT_T
 	/*
 	 * Handle NAT-T info if present.
 	 */
 	if (mhp->ext[SADB_X_EXT_NAT_T_SPORT] != NULL &&
 	    mhp->ext[SADB_X_EXT_NAT_T_DPORT] != NULL) {
 		struct sadb_x_nat_t_port *sport, *dport;
 
 		if (mhp->extlen[SADB_X_EXT_NAT_T_SPORT] < sizeof(*sport) ||
 		    mhp->extlen[SADB_X_EXT_NAT_T_DPORT] < sizeof(*dport)) {
 			ipseclog((LOG_DEBUG, "%s: invalid message.\n",
 			    __func__));
 			return key_senderror(so, m, EINVAL);
 		}
 
 		sport = (struct sadb_x_nat_t_port *)
 		    mhp->ext[SADB_X_EXT_NAT_T_SPORT];
 		dport = (struct sadb_x_nat_t_port *)
 		    mhp->ext[SADB_X_EXT_NAT_T_DPORT];
 
 		if (sport)
 			KEY_PORTTOSADDR(&saidx.src,
 			    sport->sadb_x_nat_t_port_port);
 		if (dport)
 			KEY_PORTTOSADDR(&saidx.dst,
 			    dport->sadb_x_nat_t_port_port);
 	}
 #endif
 
 	/* get a SA header */
 	SAHTREE_LOCK();
 	LIST_FOREACH(sah, &V_sahtree, chain) {
 		if (sah->state == SADB_SASTATE_DEAD)
 			continue;
 		if (key_cmpsaidx(&sah->saidx, &saidx, CMP_HEAD) == 0)
 			continue;
 
 		/* get a SA with SPI. */
 		sav = key_getsavbyspi(sah, sa0->sadb_sa_spi);
 		if (sav)
 			break;
 	}
 	if (sah == NULL) {
 		SAHTREE_UNLOCK();
 		ipseclog((LOG_DEBUG, "%s: no SA found.\n", __func__));
 		return key_senderror(so, m, ENOENT);
 	}
 
 	key_sa_chgstate(sav, SADB_SASTATE_DEAD);
 	KEY_FREESAV(&sav);
 	SAHTREE_UNLOCK();
 
     {
 	struct mbuf *n;
 	struct sadb_msg *newmsg;
 
 	/* create new sadb_msg to reply. */
 	/* XXX-BZ NAT-T extensions? */
 	n = key_gather_mbuf(m, mhp, 1, 4, SADB_EXT_RESERVED,
 	    SADB_EXT_SA, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
 	if (!n)
 		return key_senderror(so, m, ENOBUFS);
 
 	if (n->m_len < sizeof(struct sadb_msg)) {
 		n = m_pullup(n, sizeof(struct sadb_msg));
 		if (n == NULL)
 			return key_senderror(so, m, ENOBUFS);
 	}
 	newmsg = mtod(n, struct sadb_msg *);
 	newmsg->sadb_msg_errno = 0;
 	newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
 
 	m_freem(m);
 	return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
     }
 }
 
 /*
  * delete all SAs for src/dst.  Called from key_delete().
  */
 static int
 key_delete_all(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp,
     u_int16_t proto)
 {
 	struct sadb_address *src0, *dst0;
 	struct secasindex saidx;
 	struct secashead *sah;
 	struct secasvar *sav, *nextsav;
 	u_int stateidx, state;
 
 	src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]);
 	dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]);
 
 	/* XXX boundary check against sa_len */
 	KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx);
 
 	/*
 	 * Make sure the port numbers are zero.
 	 * In case of NAT-T we will update them later if needed.
 	 */
 	KEY_PORTTOSADDR(&saidx.src, 0);
 	KEY_PORTTOSADDR(&saidx.dst, 0);
 
 #ifdef IPSEC_NAT_T
 	/*
 	 * Handle NAT-T info if present.
 	 */
 
 	if (mhp->ext[SADB_X_EXT_NAT_T_SPORT] != NULL &&
 	    mhp->ext[SADB_X_EXT_NAT_T_DPORT] != NULL) {
 		struct sadb_x_nat_t_port *sport, *dport;
 
 		if (mhp->extlen[SADB_X_EXT_NAT_T_SPORT] < sizeof(*sport) ||
 		    mhp->extlen[SADB_X_EXT_NAT_T_DPORT] < sizeof(*dport)) {
 			ipseclog((LOG_DEBUG, "%s: invalid message.\n",
 			    __func__));
 			return key_senderror(so, m, EINVAL);
 		}
 
 		sport = (struct sadb_x_nat_t_port *)
 		    mhp->ext[SADB_X_EXT_NAT_T_SPORT];
 		dport = (struct sadb_x_nat_t_port *)
 		    mhp->ext[SADB_X_EXT_NAT_T_DPORT];
 
 		if (sport)
 			KEY_PORTTOSADDR(&saidx.src,
 			    sport->sadb_x_nat_t_port_port);
 		if (dport)
 			KEY_PORTTOSADDR(&saidx.dst,
 			    dport->sadb_x_nat_t_port_port);
 	}
 #endif
 
 	SAHTREE_LOCK();
 	LIST_FOREACH(sah, &V_sahtree, chain) {
 		if (sah->state == SADB_SASTATE_DEAD)
 			continue;
 		if (key_cmpsaidx(&sah->saidx, &saidx, CMP_HEAD) == 0)
 			continue;
 
 		/* Delete all non-LARVAL SAs. */
 		for (stateidx = 0;
 		     stateidx < _ARRAYLEN(saorder_state_alive);
 		     stateidx++) {
 			state = saorder_state_alive[stateidx];
 			if (state == SADB_SASTATE_LARVAL)
 				continue;
 			for (sav = LIST_FIRST(&sah->savtree[state]);
 			     sav != NULL; sav = nextsav) {
 				nextsav = LIST_NEXT(sav, chain);
 				/* sanity check */
 				if (sav->state != state) {
 					ipseclog((LOG_DEBUG, "%s: invalid "
 						"sav->state (queue %d SA %d)\n",
 						__func__, state, sav->state));
 					continue;
 				}
 				
 				key_sa_chgstate(sav, SADB_SASTATE_DEAD);
 				KEY_FREESAV(&sav);
 			}
 		}
 	}
 	SAHTREE_UNLOCK();
     {
 	struct mbuf *n;
 	struct sadb_msg *newmsg;
 
 	/* create new sadb_msg to reply. */
 	/* XXX-BZ NAT-T extensions? */
 	n = key_gather_mbuf(m, mhp, 1, 3, SADB_EXT_RESERVED,
 	    SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
 	if (!n)
 		return key_senderror(so, m, ENOBUFS);
 
 	if (n->m_len < sizeof(struct sadb_msg)) {
 		n = m_pullup(n, sizeof(struct sadb_msg));
 		if (n == NULL)
 			return key_senderror(so, m, ENOBUFS);
 	}
 	newmsg = mtod(n, struct sadb_msg *);
 	newmsg->sadb_msg_errno = 0;
 	newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
 
 	m_freem(m);
 	return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
     }
 }
 
 /*
  * SADB_GET processing
  * receive
  *   <base, SA(*), address(SD)>
  * from the ikmpd, and get a SP and a SA to respond,
  * and send,
  *   <base, SA, (lifetime(HSC),) address(SD), (address(P),) key(AE),
  *       (identity(SD),) (sensitivity)>
  * to the ikmpd.
  *
  * m will always be freed.
  */
 static int
 key_get(so, m, mhp)
 	struct socket *so;
 	struct mbuf *m;
 	const struct sadb_msghdr *mhp;
 {
 	struct sadb_sa *sa0;
 	struct sadb_address *src0, *dst0;
 	struct secasindex saidx;
 	struct secashead *sah;
 	struct secasvar *sav = NULL;
 	u_int16_t proto;
 
 	IPSEC_ASSERT(so != NULL, ("null socket"));
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
 	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
 
 	/* map satype to proto */
 	if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
 		ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	if (mhp->ext[SADB_EXT_SA] == NULL ||
 	    mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL ||
 	    mhp->ext[SADB_EXT_ADDRESS_DST] == NULL) {
 		ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 	if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa) ||
 	    mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) ||
 	    mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) {
 		ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA];
 	src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC];
 	dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST];
 
 	/* XXX boundary check against sa_len */
 	KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx);
 
 	/*
 	 * Make sure the port numbers are zero.
 	 * In case of NAT-T we will update them later if needed.
 	 */
 	KEY_PORTTOSADDR(&saidx.src, 0);
 	KEY_PORTTOSADDR(&saidx.dst, 0);
 
 #ifdef IPSEC_NAT_T
 	/*
 	 * Handle NAT-T info if present.
 	 */
 
 	if (mhp->ext[SADB_X_EXT_NAT_T_SPORT] != NULL &&
 	    mhp->ext[SADB_X_EXT_NAT_T_DPORT] != NULL) {
 		struct sadb_x_nat_t_port *sport, *dport;
 
 		if (mhp->extlen[SADB_X_EXT_NAT_T_SPORT] < sizeof(*sport) ||
 		    mhp->extlen[SADB_X_EXT_NAT_T_DPORT] < sizeof(*dport)) {
 			ipseclog((LOG_DEBUG, "%s: invalid message.\n",
 			    __func__));
 			return key_senderror(so, m, EINVAL);
 		}
 
 		sport = (struct sadb_x_nat_t_port *)
 		    mhp->ext[SADB_X_EXT_NAT_T_SPORT];
 		dport = (struct sadb_x_nat_t_port *)
 		    mhp->ext[SADB_X_EXT_NAT_T_DPORT];
 
 		if (sport)
 			KEY_PORTTOSADDR(&saidx.src,
 			    sport->sadb_x_nat_t_port_port);
 		if (dport)
 			KEY_PORTTOSADDR(&saidx.dst,
 			    dport->sadb_x_nat_t_port_port);
 	}
 #endif
 
 	/* get a SA header */
 	SAHTREE_LOCK();
 	LIST_FOREACH(sah, &V_sahtree, chain) {
 		if (sah->state == SADB_SASTATE_DEAD)
 			continue;
 		if (key_cmpsaidx(&sah->saidx, &saidx, CMP_HEAD) == 0)
 			continue;
 
 		/* get a SA with SPI. */
 		sav = key_getsavbyspi(sah, sa0->sadb_sa_spi);
 		if (sav)
 			break;
 	}
 	SAHTREE_UNLOCK();
 	if (sah == NULL) {
 		ipseclog((LOG_DEBUG, "%s: no SA found.\n", __func__));
 		return key_senderror(so, m, ENOENT);
 	}
 
     {
 	struct mbuf *n;
 	u_int8_t satype;
 
 	/* map proto to satype */
 	if ((satype = key_proto2satype(sah->saidx.proto)) == 0) {
 		ipseclog((LOG_DEBUG, "%s: there was invalid proto in SAD.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	/* create new sadb_msg to reply. */
 	n = key_setdumpsa(sav, SADB_GET, satype, mhp->msg->sadb_msg_seq,
 	    mhp->msg->sadb_msg_pid);
 	if (!n)
 		return key_senderror(so, m, ENOBUFS);
 
 	m_freem(m);
 	return key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
     }
 }
 
 /* XXX make it sysctl-configurable? */
 static void
 key_getcomb_setlifetime(comb)
 	struct sadb_comb *comb;
 {
 
 	comb->sadb_comb_soft_allocations = 1;
 	comb->sadb_comb_hard_allocations = 1;
 	comb->sadb_comb_soft_bytes = 0;
 	comb->sadb_comb_hard_bytes = 0;
 	comb->sadb_comb_hard_addtime = 86400;	/* 1 day */
 	comb->sadb_comb_soft_addtime = comb->sadb_comb_soft_addtime * 80 / 100;
 	comb->sadb_comb_soft_usetime = 28800;	/* 8 hours */
 	comb->sadb_comb_hard_usetime = comb->sadb_comb_hard_usetime * 80 / 100;
 }
 
 /*
  * XXX reorder combinations by preference
  * XXX no idea if the user wants ESP authentication or not
  */
 static struct mbuf *
 key_getcomb_esp()
 {
 	struct sadb_comb *comb;
 	struct enc_xform *algo;
 	struct mbuf *result = NULL, *m, *n;
 	int encmin;
 	int i, off, o;
 	int totlen;
 	const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb));
 
 	m = NULL;
 	for (i = 1; i <= SADB_EALG_MAX; i++) {
 		algo = esp_algorithm_lookup(i);
 		if (algo == NULL)
 			continue;
 
 		/* discard algorithms with key size smaller than system min */
 		if (_BITS(algo->maxkey) < V_ipsec_esp_keymin)
 			continue;
 		if (_BITS(algo->minkey) < V_ipsec_esp_keymin)
 			encmin = V_ipsec_esp_keymin;
 		else
 			encmin = _BITS(algo->minkey);
 
 		if (V_ipsec_esp_auth)
 			m = key_getcomb_ah();
 		else {
 			IPSEC_ASSERT(l <= MLEN,
 				("l=%u > MLEN=%lu", l, (u_long) MLEN));
 			MGET(m, M_NOWAIT, MT_DATA);
 			if (m) {
 				M_ALIGN(m, l);
 				m->m_len = l;
 				m->m_next = NULL;
 				bzero(mtod(m, caddr_t), m->m_len);
 			}
 		}
 		if (!m)
 			goto fail;
 
 		totlen = 0;
 		for (n = m; n; n = n->m_next)
 			totlen += n->m_len;
 		IPSEC_ASSERT((totlen % l) == 0, ("totlen=%u, l=%u", totlen, l));
 
 		for (off = 0; off < totlen; off += l) {
 			n = m_pulldown(m, off, l, &o);
 			if (!n) {
 				/* m is already freed */
 				goto fail;
 			}
 			comb = (struct sadb_comb *)(mtod(n, caddr_t) + o);
 			bzero(comb, sizeof(*comb));
 			key_getcomb_setlifetime(comb);
 			comb->sadb_comb_encrypt = i;
 			comb->sadb_comb_encrypt_minbits = encmin;
 			comb->sadb_comb_encrypt_maxbits = _BITS(algo->maxkey);
 		}
 
 		if (!result)
 			result = m;
 		else
 			m_cat(result, m);
 	}
 
 	return result;
 
  fail:
 	if (result)
 		m_freem(result);
 	return NULL;
 }
 
 static void
 key_getsizes_ah(
 	const struct auth_hash *ah,
 	int alg,
 	u_int16_t* min,
 	u_int16_t* max)
 {
 
 	*min = *max = ah->keysize;
 	if (ah->keysize == 0) {
 		/*
 		 * Transform takes arbitrary key size but algorithm
 		 * key size is restricted.  Enforce this here.
 		 */
 		switch (alg) {
 		case SADB_X_AALG_MD5:	*min = *max = 16; break;
 		case SADB_X_AALG_SHA:	*min = *max = 20; break;
 		case SADB_X_AALG_NULL:	*min = 1; *max = 256; break;
 		case SADB_X_AALG_SHA2_256: *min = *max = 32; break;
 		case SADB_X_AALG_SHA2_384: *min = *max = 48; break;
 		case SADB_X_AALG_SHA2_512: *min = *max = 64; break;
 		default:
 			DPRINTF(("%s: unknown AH algorithm %u\n",
 				__func__, alg));
 			break;
 		}
 	}
 }
 
 /*
  * XXX reorder combinations by preference
  */
 static struct mbuf *
 key_getcomb_ah()
 {
 	struct sadb_comb *comb;
 	struct auth_hash *algo;
 	struct mbuf *m;
 	u_int16_t minkeysize, maxkeysize;
 	int i;
 	const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb));
 
 	m = NULL;
 	for (i = 1; i <= SADB_AALG_MAX; i++) {
 #if 1
 		/* we prefer HMAC algorithms, not old algorithms */
 		if (i != SADB_AALG_SHA1HMAC &&
 		    i != SADB_AALG_MD5HMAC  &&
 		    i != SADB_X_AALG_SHA2_256 &&
 		    i != SADB_X_AALG_SHA2_384 &&
 		    i != SADB_X_AALG_SHA2_512)
 			continue;
 #endif
 		algo = ah_algorithm_lookup(i);
 		if (!algo)
 			continue;
 		key_getsizes_ah(algo, i, &minkeysize, &maxkeysize);
 		/* discard algorithms with key size smaller than system min */
 		if (_BITS(minkeysize) < V_ipsec_ah_keymin)
 			continue;
 
 		if (!m) {
 			IPSEC_ASSERT(l <= MLEN,
 				("l=%u > MLEN=%lu", l, (u_long) MLEN));
 			MGET(m, M_NOWAIT, MT_DATA);
 			if (m) {
 				M_ALIGN(m, l);
 				m->m_len = l;
 				m->m_next = NULL;
 			}
 		} else
 			M_PREPEND(m, l, M_NOWAIT);
 		if (!m)
 			return NULL;
 
 		comb = mtod(m, struct sadb_comb *);
 		bzero(comb, sizeof(*comb));
 		key_getcomb_setlifetime(comb);
 		comb->sadb_comb_auth = i;
 		comb->sadb_comb_auth_minbits = _BITS(minkeysize);
 		comb->sadb_comb_auth_maxbits = _BITS(maxkeysize);
 	}
 
 	return m;
 }
 
 /*
  * not really an official behavior.  discussed in pf_key@inner.net in Sep2000.
  * XXX reorder combinations by preference
  */
 static struct mbuf *
 key_getcomb_ipcomp()
 {
 	struct sadb_comb *comb;
 	struct comp_algo *algo;
 	struct mbuf *m;
 	int i;
 	const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb));
 
 	m = NULL;
 	for (i = 1; i <= SADB_X_CALG_MAX; i++) {
 		algo = ipcomp_algorithm_lookup(i);
 		if (!algo)
 			continue;
 
 		if (!m) {
 			IPSEC_ASSERT(l <= MLEN,
 				("l=%u > MLEN=%lu", l, (u_long) MLEN));
 			MGET(m, M_NOWAIT, MT_DATA);
 			if (m) {
 				M_ALIGN(m, l);
 				m->m_len = l;
 				m->m_next = NULL;
 			}
 		} else
 			M_PREPEND(m, l, M_NOWAIT);
 		if (!m)
 			return NULL;
 
 		comb = mtod(m, struct sadb_comb *);
 		bzero(comb, sizeof(*comb));
 		key_getcomb_setlifetime(comb);
 		comb->sadb_comb_encrypt = i;
 		/* what should we set into sadb_comb_*_{min,max}bits? */
 	}
 
 	return m;
 }
 
 /*
  * XXX no way to pass mode (transport/tunnel) to userland
  * XXX replay checking?
  * XXX sysctl interface to ipsec_{ah,esp}_keymin
  */
 static struct mbuf *
 key_getprop(saidx)
 	const struct secasindex *saidx;
 {
 	struct sadb_prop *prop;
 	struct mbuf *m, *n;
 	const int l = PFKEY_ALIGN8(sizeof(struct sadb_prop));
 	int totlen;
 
 	switch (saidx->proto)  {
 	case IPPROTO_ESP:
 		m = key_getcomb_esp();
 		break;
 	case IPPROTO_AH:
 		m = key_getcomb_ah();
 		break;
 	case IPPROTO_IPCOMP:
 		m = key_getcomb_ipcomp();
 		break;
 	default:
 		return NULL;
 	}
 
 	if (!m)
 		return NULL;
 	M_PREPEND(m, l, M_NOWAIT);
 	if (!m)
 		return NULL;
 
 	totlen = 0;
 	for (n = m; n; n = n->m_next)
 		totlen += n->m_len;
 
 	prop = mtod(m, struct sadb_prop *);
 	bzero(prop, sizeof(*prop));
 	prop->sadb_prop_len = PFKEY_UNIT64(totlen);
 	prop->sadb_prop_exttype = SADB_EXT_PROPOSAL;
 	prop->sadb_prop_replay = 32;	/* XXX */
 
 	return m;
 }
 
 /*
  * SADB_ACQUIRE processing called by key_checkrequest() and key_acquire2().
  * send
  *   <base, SA, address(SD), (address(P)), x_policy,
  *       (identity(SD),) (sensitivity,) proposal>
  * to KMD, and expect to receive
  *   <base> with SADB_ACQUIRE if error occured,
  * or
  *   <base, src address, dst address, (SPI range)> with SADB_GETSPI
  * from KMD by PF_KEY.
  *
  * XXX x_policy is outside of RFC2367 (KAME extension).
  * XXX sensitivity is not supported.
  * XXX for ipcomp, RFC2367 does not define how to fill in proposal.
  * see comment for key_getcomb_ipcomp().
  *
  * OUT:
  *    0     : succeed
  *    others: error number
  */
 static int
 key_acquire(const struct secasindex *saidx, struct secpolicy *sp)
 {
 	struct mbuf *result = NULL, *m;
 	struct secacq *newacq;
 	u_int8_t satype;
 	int error = -1;
 	u_int32_t seq;
 
 	IPSEC_ASSERT(saidx != NULL, ("null saidx"));
 	satype = key_proto2satype(saidx->proto);
 	IPSEC_ASSERT(satype != 0, ("null satype, protocol %u", saidx->proto));
 
 	/*
 	 * We never do anything about acquirng SA.  There is anather
 	 * solution that kernel blocks to send SADB_ACQUIRE message until
 	 * getting something message from IKEd.  In later case, to be
 	 * managed with ACQUIRING list.
 	 */
 	/* Get an entry to check whether sending message or not. */
 	if ((newacq = key_getacq(saidx)) != NULL) {
 		if (V_key_blockacq_count < newacq->count) {
 			/* reset counter and do send message. */
 			newacq->count = 0;
 		} else {
 			/* increment counter and do nothing. */
 			newacq->count++;
 			return 0;
 		}
 	} else {
 		/* make new entry for blocking to send SADB_ACQUIRE. */
 		if ((newacq = key_newacq(saidx)) == NULL)
 			return ENOBUFS;
 	}
 
 
 	seq = newacq->seq;
 	m = key_setsadbmsg(SADB_ACQUIRE, 0, satype, seq, 0, 0);
 	if (!m) {
 		error = ENOBUFS;
 		goto fail;
 	}
 	result = m;
 
 	/*
 	 * No SADB_X_EXT_NAT_T_* here: we do not know
 	 * anything related to NAT-T at this time.
 	 */
 
 	/* set sadb_address for saidx's. */
 	m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
 	    &saidx->src.sa, FULLMASK, IPSEC_ULPROTO_ANY);
 	if (!m) {
 		error = ENOBUFS;
 		goto fail;
 	}
 	m_cat(result, m);
 
 	m = key_setsadbaddr(SADB_EXT_ADDRESS_DST,
 	    &saidx->dst.sa, FULLMASK, IPSEC_ULPROTO_ANY);
 	if (!m) {
 		error = ENOBUFS;
 		goto fail;
 	}
 	m_cat(result, m);
 
 	/* XXX proxy address (optional) */
 
 	/* set sadb_x_policy */
 	if (sp) {
 		m = key_setsadbxpolicy(sp->policy, sp->spidx.dir, sp->id);
 		if (!m) {
 			error = ENOBUFS;
 			goto fail;
 		}
 		m_cat(result, m);
 	}
 
 	/* XXX identity (optional) */
 #if 0
 	if (idexttype && fqdn) {
 		/* create identity extension (FQDN) */
 		struct sadb_ident *id;
 		int fqdnlen;
 
 		fqdnlen = strlen(fqdn) + 1;	/* +1 for terminating-NUL */
 		id = (struct sadb_ident *)p;
 		bzero(id, sizeof(*id) + PFKEY_ALIGN8(fqdnlen));
 		id->sadb_ident_len = PFKEY_UNIT64(sizeof(*id) + PFKEY_ALIGN8(fqdnlen));
 		id->sadb_ident_exttype = idexttype;
 		id->sadb_ident_type = SADB_IDENTTYPE_FQDN;
 		bcopy(fqdn, id + 1, fqdnlen);
 		p += sizeof(struct sadb_ident) + PFKEY_ALIGN8(fqdnlen);
 	}
 
 	if (idexttype) {
 		/* create identity extension (USERFQDN) */
 		struct sadb_ident *id;
 		int userfqdnlen;
 
 		if (userfqdn) {
 			/* +1 for terminating-NUL */
 			userfqdnlen = strlen(userfqdn) + 1;
 		} else
 			userfqdnlen = 0;
 		id = (struct sadb_ident *)p;
 		bzero(id, sizeof(*id) + PFKEY_ALIGN8(userfqdnlen));
 		id->sadb_ident_len = PFKEY_UNIT64(sizeof(*id) + PFKEY_ALIGN8(userfqdnlen));
 		id->sadb_ident_exttype = idexttype;
 		id->sadb_ident_type = SADB_IDENTTYPE_USERFQDN;
 		/* XXX is it correct? */
 		if (curproc && curproc->p_cred)
 			id->sadb_ident_id = curproc->p_cred->p_ruid;
 		if (userfqdn && userfqdnlen)
 			bcopy(userfqdn, id + 1, userfqdnlen);
 		p += sizeof(struct sadb_ident) + PFKEY_ALIGN8(userfqdnlen);
 	}
 #endif
 
 	/* XXX sensitivity (optional) */
 
 	/* create proposal/combination extension */
 	m = key_getprop(saidx);
 #if 0
 	/*
 	 * spec conformant: always attach proposal/combination extension,
 	 * the problem is that we have no way to attach it for ipcomp,
 	 * due to the way sadb_comb is declared in RFC2367.
 	 */
 	if (!m) {
 		error = ENOBUFS;
 		goto fail;
 	}
 	m_cat(result, m);
 #else
 	/*
 	 * outside of spec; make proposal/combination extension optional.
 	 */
 	if (m)
 		m_cat(result, m);
 #endif
 
 	if ((result->m_flags & M_PKTHDR) == 0) {
 		error = EINVAL;
 		goto fail;
 	}
 
 	if (result->m_len < sizeof(struct sadb_msg)) {
 		result = m_pullup(result, sizeof(struct sadb_msg));
 		if (result == NULL) {
 			error = ENOBUFS;
 			goto fail;
 		}
 	}
 
 	result->m_pkthdr.len = 0;
 	for (m = result; m; m = m->m_next)
 		result->m_pkthdr.len += m->m_len;
 
 	mtod(result, struct sadb_msg *)->sadb_msg_len =
 	    PFKEY_UNIT64(result->m_pkthdr.len);
 
 	return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED);
 
  fail:
 	if (result)
 		m_freem(result);
 	return error;
 }
 
 static struct secacq *
 key_newacq(const struct secasindex *saidx)
 {
 	struct secacq *newacq;
 
 	/* get new entry */
 	newacq = malloc(sizeof(struct secacq), M_IPSEC_SAQ, M_NOWAIT|M_ZERO);
 	if (newacq == NULL) {
 		ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
 		return NULL;
 	}
 
 	/* copy secindex */
 	bcopy(saidx, &newacq->saidx, sizeof(newacq->saidx));
 	newacq->seq = (V_acq_seq == ~0 ? 1 : ++V_acq_seq);
 	newacq->created = time_second;
 	newacq->count = 0;
 
 	/* add to acqtree */
 	ACQ_LOCK();
 	LIST_INSERT_HEAD(&V_acqtree, newacq, chain);
 	ACQ_UNLOCK();
 
 	return newacq;
 }
 
 static struct secacq *
 key_getacq(const struct secasindex *saidx)
 {
 	struct secacq *acq;
 
 	ACQ_LOCK();
 	LIST_FOREACH(acq, &V_acqtree, chain) {
 		if (key_cmpsaidx(saidx, &acq->saidx, CMP_EXACTLY))
 			break;
 	}
 	ACQ_UNLOCK();
 
 	return acq;
 }
 
 static struct secacq *
 key_getacqbyseq(seq)
 	u_int32_t seq;
 {
 	struct secacq *acq;
 
 	ACQ_LOCK();
 	LIST_FOREACH(acq, &V_acqtree, chain) {
 		if (acq->seq == seq)
 			break;
 	}
 	ACQ_UNLOCK();
 
 	return acq;
 }
 
 static struct secspacq *
 key_newspacq(spidx)
 	struct secpolicyindex *spidx;
 {
 	struct secspacq *acq;
 
 	/* get new entry */
 	acq = malloc(sizeof(struct secspacq), M_IPSEC_SAQ, M_NOWAIT|M_ZERO);
 	if (acq == NULL) {
 		ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
 		return NULL;
 	}
 
 	/* copy secindex */
 	bcopy(spidx, &acq->spidx, sizeof(acq->spidx));
 	acq->created = time_second;
 	acq->count = 0;
 
 	/* add to spacqtree */
 	SPACQ_LOCK();
 	LIST_INSERT_HEAD(&V_spacqtree, acq, chain);
 	SPACQ_UNLOCK();
 
 	return acq;
 }
 
 static struct secspacq *
 key_getspacq(spidx)
 	struct secpolicyindex *spidx;
 {
 	struct secspacq *acq;
 
 	SPACQ_LOCK();
 	LIST_FOREACH(acq, &V_spacqtree, chain) {
 		if (key_cmpspidx_exactly(spidx, &acq->spidx)) {
 			/* NB: return holding spacq_lock */
 			return acq;
 		}
 	}
 	SPACQ_UNLOCK();
 
 	return NULL;
 }
 
 /*
  * SADB_ACQUIRE processing,
  * in first situation, is receiving
  *   <base>
  * from the ikmpd, and clear sequence of its secasvar entry.
  *
  * In second situation, is receiving
  *   <base, address(SD), (address(P),) (identity(SD),) (sensitivity,) proposal>
  * from a user land process, and return
  *   <base, address(SD), (address(P),) (identity(SD),) (sensitivity,) proposal>
  * to the socket.
  *
  * m will always be freed.
  */
 static int
 key_acquire2(so, m, mhp)
 	struct socket *so;
 	struct mbuf *m;
 	const struct sadb_msghdr *mhp;
 {
 	const struct sadb_address *src0, *dst0;
 	struct secasindex saidx;
 	struct secashead *sah;
 	u_int16_t proto;
 	int error;
 
 	IPSEC_ASSERT(so != NULL, ("null socket"));
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
 	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
 
 	/*
 	 * Error message from KMd.
 	 * We assume that if error was occured in IKEd, the length of PFKEY
 	 * message is equal to the size of sadb_msg structure.
 	 * We do not raise error even if error occured in this function.
 	 */
 	if (mhp->msg->sadb_msg_len == PFKEY_UNIT64(sizeof(struct sadb_msg))) {
 		struct secacq *acq;
 
 		/* check sequence number */
 		if (mhp->msg->sadb_msg_seq == 0) {
 			ipseclog((LOG_DEBUG, "%s: must specify sequence "
 				"number.\n", __func__));
 			m_freem(m);
 			return 0;
 		}
 
 		if ((acq = key_getacqbyseq(mhp->msg->sadb_msg_seq)) == NULL) {
 			/*
 			 * the specified larval SA is already gone, or we got
 			 * a bogus sequence number.  we can silently ignore it.
 			 */
 			m_freem(m);
 			return 0;
 		}
 
 		/* reset acq counter in order to deletion by timehander. */
 		acq->created = time_second;
 		acq->count = 0;
 		m_freem(m);
 		return 0;
 	}
 
 	/*
 	 * This message is from user land.
 	 */
 
 	/* map satype to proto */
 	if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
 		ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL ||
 	    mhp->ext[SADB_EXT_ADDRESS_DST] == NULL ||
 	    mhp->ext[SADB_EXT_PROPOSAL] == NULL) {
 		/* error */
 		ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 	if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) ||
 	    mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address) ||
 	    mhp->extlen[SADB_EXT_PROPOSAL] < sizeof(struct sadb_prop)) {
 		/* error */
 		ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",	
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC];
 	dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST];
 
 	/* XXX boundary check against sa_len */
 	KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx);
 
 	/*
 	 * Make sure the port numbers are zero.
 	 * In case of NAT-T we will update them later if needed.
 	 */
 	KEY_PORTTOSADDR(&saidx.src, 0);
 	KEY_PORTTOSADDR(&saidx.dst, 0);
 
 #ifndef IPSEC_NAT_T
 	/*
 	 * Handle NAT-T info if present.
 	 */
 
 	if (mhp->ext[SADB_X_EXT_NAT_T_SPORT] != NULL &&
 	    mhp->ext[SADB_X_EXT_NAT_T_DPORT] != NULL) {
 		struct sadb_x_nat_t_port *sport, *dport;
 
 		if (mhp->extlen[SADB_X_EXT_NAT_T_SPORT] < sizeof(*sport) ||
 		    mhp->extlen[SADB_X_EXT_NAT_T_DPORT] < sizeof(*dport)) {
 			ipseclog((LOG_DEBUG, "%s: invalid message.\n",
 			    __func__));
 			return key_senderror(so, m, EINVAL);
 		}
 
 		sport = (struct sadb_x_nat_t_port *)
 		    mhp->ext[SADB_X_EXT_NAT_T_SPORT];
 		dport = (struct sadb_x_nat_t_port *)
 		    mhp->ext[SADB_X_EXT_NAT_T_DPORT];
 
 		if (sport)
 			KEY_PORTTOSADDR(&saidx.src,
 			    sport->sadb_x_nat_t_port_port);
 		if (dport)
 			KEY_PORTTOSADDR(&saidx.dst,
 			    dport->sadb_x_nat_t_port_port);
 	}
 #endif
 
 	/* get a SA index */
 	SAHTREE_LOCK();
 	LIST_FOREACH(sah, &V_sahtree, chain) {
 		if (sah->state == SADB_SASTATE_DEAD)
 			continue;
 		if (key_cmpsaidx(&sah->saidx, &saidx, CMP_MODE_REQID))
 			break;
 	}
 	SAHTREE_UNLOCK();
 	if (sah != NULL) {
 		ipseclog((LOG_DEBUG, "%s: a SA exists already.\n", __func__));
 		return key_senderror(so, m, EEXIST);
 	}
 
 	error = key_acquire(&saidx, NULL);
 	if (error != 0) {
 		ipseclog((LOG_DEBUG, "%s: error %d returned from key_acquire\n",
 			__func__, mhp->msg->sadb_msg_errno));
 		return key_senderror(so, m, error);
 	}
 
 	return key_sendup_mbuf(so, m, KEY_SENDUP_REGISTERED);
 }
 
 /*
  * SADB_REGISTER processing.
  * If SATYPE_UNSPEC has been passed as satype, only return sabd_supported.
  * receive
  *   <base>
  * from the ikmpd, and register a socket to send PF_KEY messages,
  * and send
  *   <base, supported>
  * to KMD by PF_KEY.
  * If socket is detached, must free from regnode.
  *
  * m will always be freed.
  */
 static int
 key_register(so, m, mhp)
 	struct socket *so;
 	struct mbuf *m;
 	const struct sadb_msghdr *mhp;
 {
 	struct secreg *reg, *newreg = 0;
 
 	IPSEC_ASSERT(so != NULL, ("null socket"));
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
 	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
 
 	/* check for invalid register message */
 	if (mhp->msg->sadb_msg_satype >= sizeof(V_regtree)/sizeof(V_regtree[0]))
 		return key_senderror(so, m, EINVAL);
 
 	/* When SATYPE_UNSPEC is specified, only return sabd_supported. */
 	if (mhp->msg->sadb_msg_satype == SADB_SATYPE_UNSPEC)
 		goto setmsg;
 
 	/* check whether existing or not */
 	REGTREE_LOCK();
 	LIST_FOREACH(reg, &V_regtree[mhp->msg->sadb_msg_satype], chain) {
 		if (reg->so == so) {
 			REGTREE_UNLOCK();
 			ipseclog((LOG_DEBUG, "%s: socket exists already.\n",
 				__func__));
 			return key_senderror(so, m, EEXIST);
 		}
 	}
 
 	/* create regnode */
 	newreg =  malloc(sizeof(struct secreg), M_IPSEC_SAR, M_NOWAIT|M_ZERO);
 	if (newreg == NULL) {
 		REGTREE_UNLOCK();
 		ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
 		return key_senderror(so, m, ENOBUFS);
 	}
 
 	newreg->so = so;
 	((struct keycb *)sotorawcb(so))->kp_registered++;
 
 	/* add regnode to regtree. */
 	LIST_INSERT_HEAD(&V_regtree[mhp->msg->sadb_msg_satype], newreg, chain);
 	REGTREE_UNLOCK();
 
   setmsg:
     {
 	struct mbuf *n;
 	struct sadb_msg *newmsg;
 	struct sadb_supported *sup;
 	u_int len, alen, elen;
 	int off;
 	int i;
 	struct sadb_alg *alg;
 
 	/* create new sadb_msg to reply. */
 	alen = 0;
 	for (i = 1; i <= SADB_AALG_MAX; i++) {
 		if (ah_algorithm_lookup(i))
 			alen += sizeof(struct sadb_alg);
 	}
 	if (alen)
 		alen += sizeof(struct sadb_supported);
 	elen = 0;
 	for (i = 1; i <= SADB_EALG_MAX; i++) {
 		if (esp_algorithm_lookup(i))
 			elen += sizeof(struct sadb_alg);
 	}
 	if (elen)
 		elen += sizeof(struct sadb_supported);
 
 	len = sizeof(struct sadb_msg) + alen + elen;
 
 	if (len > MCLBYTES)
 		return key_senderror(so, m, ENOBUFS);
 
 	MGETHDR(n, M_NOWAIT, MT_DATA);
 	if (len > MHLEN) {
 		MCLGET(n, M_NOWAIT);
 		if ((n->m_flags & M_EXT) == 0) {
 			m_freem(n);
 			n = NULL;
 		}
 	}
 	if (!n)
 		return key_senderror(so, m, ENOBUFS);
 
 	n->m_pkthdr.len = n->m_len = len;
 	n->m_next = NULL;
 	off = 0;
 
 	m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t) + off);
 	newmsg = mtod(n, struct sadb_msg *);
 	newmsg->sadb_msg_errno = 0;
 	newmsg->sadb_msg_len = PFKEY_UNIT64(len);
 	off += PFKEY_ALIGN8(sizeof(struct sadb_msg));
 
 	/* for authentication algorithm */
 	if (alen) {
 		sup = (struct sadb_supported *)(mtod(n, caddr_t) + off);
 		sup->sadb_supported_len = PFKEY_UNIT64(alen);
 		sup->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH;
 		off += PFKEY_ALIGN8(sizeof(*sup));
 
 		for (i = 1; i <= SADB_AALG_MAX; i++) {
 			struct auth_hash *aalgo;
 			u_int16_t minkeysize, maxkeysize;
 
 			aalgo = ah_algorithm_lookup(i);
 			if (!aalgo)
 				continue;
 			alg = (struct sadb_alg *)(mtod(n, caddr_t) + off);
 			alg->sadb_alg_id = i;
 			alg->sadb_alg_ivlen = 0;
 			key_getsizes_ah(aalgo, i, &minkeysize, &maxkeysize);
 			alg->sadb_alg_minbits = _BITS(minkeysize);
 			alg->sadb_alg_maxbits = _BITS(maxkeysize);
 			off += PFKEY_ALIGN8(sizeof(*alg));
 		}
 	}
 
 	/* for encryption algorithm */
 	if (elen) {
 		sup = (struct sadb_supported *)(mtod(n, caddr_t) + off);
 		sup->sadb_supported_len = PFKEY_UNIT64(elen);
 		sup->sadb_supported_exttype = SADB_EXT_SUPPORTED_ENCRYPT;
 		off += PFKEY_ALIGN8(sizeof(*sup));
 
 		for (i = 1; i <= SADB_EALG_MAX; i++) {
 			struct enc_xform *ealgo;
 
 			ealgo = esp_algorithm_lookup(i);
 			if (!ealgo)
 				continue;
 			alg = (struct sadb_alg *)(mtod(n, caddr_t) + off);
 			alg->sadb_alg_id = i;
 			alg->sadb_alg_ivlen = ealgo->blocksize;
 			alg->sadb_alg_minbits = _BITS(ealgo->minkey);
 			alg->sadb_alg_maxbits = _BITS(ealgo->maxkey);
 			off += PFKEY_ALIGN8(sizeof(struct sadb_alg));
 		}
 	}
 
 	IPSEC_ASSERT(off == len,
 		("length assumption failed (off %u len %u)", off, len));
 
 	m_freem(m);
 	return key_sendup_mbuf(so, n, KEY_SENDUP_REGISTERED);
     }
 }
 
 /*
  * free secreg entry registered.
  * XXX: I want to do free a socket marked done SADB_RESIGER to socket.
  */
 void
 key_freereg(struct socket *so)
 {
 	struct secreg *reg;
 	int i;
 
 	IPSEC_ASSERT(so != NULL, ("NULL so"));
 
 	/*
 	 * check whether existing or not.
 	 * check all type of SA, because there is a potential that
 	 * one socket is registered to multiple type of SA.
 	 */
 	REGTREE_LOCK();
 	for (i = 0; i <= SADB_SATYPE_MAX; i++) {
 		LIST_FOREACH(reg, &V_regtree[i], chain) {
 			if (reg->so == so && __LIST_CHAINED(reg)) {
 				LIST_REMOVE(reg, chain);
 				free(reg, M_IPSEC_SAR);
 				break;
 			}
 		}
 	}
 	REGTREE_UNLOCK();
 }
 
 /*
  * SADB_EXPIRE processing
  * send
  *   <base, SA, SA2, lifetime(C and one of HS), address(SD)>
  * to KMD by PF_KEY.
  * NOTE: We send only soft lifetime extension.
  *
  * OUT:	0	: succeed
  *	others	: error number
  */
 static int
 key_expire(struct secasvar *sav)
 {
 	int satype;
 	struct mbuf *result = NULL, *m;
 	int len;
 	int error = -1;
 	struct sadb_lifetime *lt;
 
 	IPSEC_ASSERT (sav != NULL, ("null sav"));
 	IPSEC_ASSERT (sav->sah != NULL, ("null sa header"));
 
 	/* set msg header */
 	satype = key_proto2satype(sav->sah->saidx.proto);
 	IPSEC_ASSERT(satype != 0, ("invalid proto, satype %u", satype));
 	m = key_setsadbmsg(SADB_EXPIRE, 0, satype, sav->seq, 0, sav->refcnt);
 	if (!m) {
 		error = ENOBUFS;
 		goto fail;
 	}
 	result = m;
 
 	/* create SA extension */
 	m = key_setsadbsa(sav);
 	if (!m) {
 		error = ENOBUFS;
 		goto fail;
 	}
 	m_cat(result, m);
 
 	/* create SA extension */
 	m = key_setsadbxsa2(sav->sah->saidx.mode,
 			sav->replay ? sav->replay->count : 0,
 			sav->sah->saidx.reqid);
 	if (!m) {
 		error = ENOBUFS;
 		goto fail;
 	}
 	m_cat(result, m);
 
 	/* create lifetime extension (current and soft) */
 	len = PFKEY_ALIGN8(sizeof(*lt)) * 2;
 	m = m_get2(len, M_NOWAIT, MT_DATA, 0);
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto fail;
 	}
 	m_align(m, len);
 	m->m_len = len;
 	bzero(mtod(m, caddr_t), len);
 	lt = mtod(m, struct sadb_lifetime *);
 	lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime));
 	lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT;
 	lt->sadb_lifetime_allocations = sav->lft_c->allocations;
 	lt->sadb_lifetime_bytes = sav->lft_c->bytes;
 	lt->sadb_lifetime_addtime = sav->lft_c->addtime;
 	lt->sadb_lifetime_usetime = sav->lft_c->usetime;
 	lt = (struct sadb_lifetime *)(mtod(m, caddr_t) + len / 2);
 	lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime));
 	lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT;
 	lt->sadb_lifetime_allocations = sav->lft_s->allocations;
 	lt->sadb_lifetime_bytes = sav->lft_s->bytes;
 	lt->sadb_lifetime_addtime = sav->lft_s->addtime;
 	lt->sadb_lifetime_usetime = sav->lft_s->usetime;
 	m_cat(result, m);
 
 	/* set sadb_address for source */
 	m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
 	    &sav->sah->saidx.src.sa,
 	    FULLMASK, IPSEC_ULPROTO_ANY);
 	if (!m) {
 		error = ENOBUFS;
 		goto fail;
 	}
 	m_cat(result, m);
 
 	/* set sadb_address for destination */
 	m = key_setsadbaddr(SADB_EXT_ADDRESS_DST,
 	    &sav->sah->saidx.dst.sa,
 	    FULLMASK, IPSEC_ULPROTO_ANY);
 	if (!m) {
 		error = ENOBUFS;
 		goto fail;
 	}
 	m_cat(result, m);
 
 	/*
 	 * XXX-BZ Handle NAT-T extensions here.
 	 */
 
 	if ((result->m_flags & M_PKTHDR) == 0) {
 		error = EINVAL;
 		goto fail;
 	}
 
 	if (result->m_len < sizeof(struct sadb_msg)) {
 		result = m_pullup(result, sizeof(struct sadb_msg));
 		if (result == NULL) {
 			error = ENOBUFS;
 			goto fail;
 		}
 	}
 
 	result->m_pkthdr.len = 0;
 	for (m = result; m; m = m->m_next)
 		result->m_pkthdr.len += m->m_len;
 
 	mtod(result, struct sadb_msg *)->sadb_msg_len =
 	    PFKEY_UNIT64(result->m_pkthdr.len);
 
 	return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED);
 
  fail:
 	if (result)
 		m_freem(result);
 	return error;
 }
 
 /*
  * SADB_FLUSH processing
  * receive
  *   <base>
  * from the ikmpd, and free all entries in secastree.
  * and send,
  *   <base>
  * to the ikmpd.
  * NOTE: to do is only marking SADB_SASTATE_DEAD.
  *
  * m will always be freed.
  */
 static int
 key_flush(so, m, mhp)
 	struct socket *so;
 	struct mbuf *m;
 	const struct sadb_msghdr *mhp;
 {
 	struct sadb_msg *newmsg;
 	struct secashead *sah, *nextsah;
 	struct secasvar *sav, *nextsav;
 	u_int16_t proto;
 	u_int8_t state;
 	u_int stateidx;
 
 	IPSEC_ASSERT(so != NULL, ("null socket"));
 	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
 	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
 
 	/* map satype to proto */
 	if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
 		ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	/* no SATYPE specified, i.e. flushing all SA. */
 	SAHTREE_LOCK();
 	for (sah = LIST_FIRST(&V_sahtree);
 	     sah != NULL;
 	     sah = nextsah) {
 		nextsah = LIST_NEXT(sah, chain);
 
 		if (mhp->msg->sadb_msg_satype != SADB_SATYPE_UNSPEC
 		 && proto != sah->saidx.proto)
 			continue;
 
 		for (stateidx = 0;
 		     stateidx < _ARRAYLEN(saorder_state_alive);
 		     stateidx++) {
 			state = saorder_state_any[stateidx];
 			for (sav = LIST_FIRST(&sah->savtree[state]);
 			     sav != NULL;
 			     sav = nextsav) {
 
 				nextsav = LIST_NEXT(sav, chain);
 
 				key_sa_chgstate(sav, SADB_SASTATE_DEAD);
 				KEY_FREESAV(&sav);
 			}
 		}
 
 		sah->state = SADB_SASTATE_DEAD;
 	}
 	SAHTREE_UNLOCK();
 
 	if (m->m_len < sizeof(struct sadb_msg) ||
 	    sizeof(struct sadb_msg) > m->m_len + M_TRAILINGSPACE(m)) {
 		ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
 		return key_senderror(so, m, ENOBUFS);
 	}
 
 	if (m->m_next)
 		m_freem(m->m_next);
 	m->m_next = NULL;
 	m->m_pkthdr.len = m->m_len = sizeof(struct sadb_msg);
 	newmsg = mtod(m, struct sadb_msg *);
 	newmsg->sadb_msg_errno = 0;
 	newmsg->sadb_msg_len = PFKEY_UNIT64(m->m_pkthdr.len);
 
 	return key_sendup_mbuf(so, m, KEY_SENDUP_ALL);
 }
 
 /*
  * SADB_DUMP processing
  * dump all entries including status of DEAD in SAD.
  * receive
  *   <base>
  * from the ikmpd, and dump all secasvar leaves
  * and send,
  *   <base> .....
  * to the ikmpd.
  *
  * m will always be freed.
  */
 static int
 key_dump(so, m, mhp)
 	struct socket *so;
 	struct mbuf *m;
 	const struct sadb_msghdr *mhp;
 {
 	struct secashead *sah;
 	struct secasvar *sav;
 	u_int16_t proto;
 	u_int stateidx;
 	u_int8_t satype;
 	u_int8_t state;
 	int cnt;
 	struct sadb_msg *newmsg;
 	struct mbuf *n;
 
 	IPSEC_ASSERT(so != NULL, ("null socket"));
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
 	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
 
 	/* map satype to proto */
 	if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
 		ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
 			__func__));
 		return key_senderror(so, m, EINVAL);
 	}
 
 	/* count sav entries to be sent to the userland. */
 	cnt = 0;
 	SAHTREE_LOCK();
 	LIST_FOREACH(sah, &V_sahtree, chain) {
 		if (mhp->msg->sadb_msg_satype != SADB_SATYPE_UNSPEC
 		 && proto != sah->saidx.proto)
 			continue;
 
 		for (stateidx = 0;
 		     stateidx < _ARRAYLEN(saorder_state_any);
 		     stateidx++) {
 			state = saorder_state_any[stateidx];
 			LIST_FOREACH(sav, &sah->savtree[state], chain) {
 				cnt++;
 			}
 		}
 	}
 
 	if (cnt == 0) {
 		SAHTREE_UNLOCK();
 		return key_senderror(so, m, ENOENT);
 	}
 
 	/* send this to the userland, one at a time. */
 	newmsg = NULL;
 	LIST_FOREACH(sah, &V_sahtree, chain) {
 		if (mhp->msg->sadb_msg_satype != SADB_SATYPE_UNSPEC
 		 && proto != sah->saidx.proto)
 			continue;
 
 		/* map proto to satype */
 		if ((satype = key_proto2satype(sah->saidx.proto)) == 0) {
 			SAHTREE_UNLOCK();
 			ipseclog((LOG_DEBUG, "%s: there was invalid proto in "
 				"SAD.\n", __func__));
 			return key_senderror(so, m, EINVAL);
 		}
 
 		for (stateidx = 0;
 		     stateidx < _ARRAYLEN(saorder_state_any);
 		     stateidx++) {
 			state = saorder_state_any[stateidx];
 			LIST_FOREACH(sav, &sah->savtree[state], chain) {
 				n = key_setdumpsa(sav, SADB_DUMP, satype,
 				    --cnt, mhp->msg->sadb_msg_pid);
 				if (!n) {
 					SAHTREE_UNLOCK();
 					return key_senderror(so, m, ENOBUFS);
 				}
 				key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
 			}
 		}
 	}
 	SAHTREE_UNLOCK();
 
 	m_freem(m);
 	return 0;
 }
 
 /*
  * SADB_X_PROMISC processing
  *
  * m will always be freed.
  */
 static int
 key_promisc(so, m, mhp)
 	struct socket *so;
 	struct mbuf *m;
 	const struct sadb_msghdr *mhp;
 {
 	int olen;
 
 	IPSEC_ASSERT(so != NULL, ("null socket"));
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
 	IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
 
 	olen = PFKEY_UNUNIT64(mhp->msg->sadb_msg_len);
 
 	if (olen < sizeof(struct sadb_msg)) {
 #if 1
 		return key_senderror(so, m, EINVAL);
 #else
 		m_freem(m);
 		return 0;
 #endif
 	} else if (olen == sizeof(struct sadb_msg)) {
 		/* enable/disable promisc mode */
 		struct keycb *kp;
 
 		if ((kp = (struct keycb *)sotorawcb(so)) == NULL)
 			return key_senderror(so, m, EINVAL);
 		mhp->msg->sadb_msg_errno = 0;
 		switch (mhp->msg->sadb_msg_satype) {
 		case 0:
 		case 1:
 			kp->kp_promisc = mhp->msg->sadb_msg_satype;
 			break;
 		default:
 			return key_senderror(so, m, EINVAL);
 		}
 
 		/* send the original message back to everyone */
 		mhp->msg->sadb_msg_errno = 0;
 		return key_sendup_mbuf(so, m, KEY_SENDUP_ALL);
 	} else {
 		/* send packet as is */
 
 		m_adj(m, PFKEY_ALIGN8(sizeof(struct sadb_msg)));
 
 		/* TODO: if sadb_msg_seq is specified, send to specific pid */
 		return key_sendup_mbuf(so, m, KEY_SENDUP_ALL);
 	}
 }
 
 static int (*key_typesw[]) __P((struct socket *, struct mbuf *,
 		const struct sadb_msghdr *)) = {
 	NULL,		/* SADB_RESERVED */
 	key_getspi,	/* SADB_GETSPI */
 	key_update,	/* SADB_UPDATE */
 	key_add,	/* SADB_ADD */
 	key_delete,	/* SADB_DELETE */
 	key_get,	/* SADB_GET */
 	key_acquire2,	/* SADB_ACQUIRE */
 	key_register,	/* SADB_REGISTER */
 	NULL,		/* SADB_EXPIRE */
 	key_flush,	/* SADB_FLUSH */
 	key_dump,	/* SADB_DUMP */
 	key_promisc,	/* SADB_X_PROMISC */
 	NULL,		/* SADB_X_PCHANGE */
 	key_spdadd,	/* SADB_X_SPDUPDATE */
 	key_spdadd,	/* SADB_X_SPDADD */
 	key_spddelete,	/* SADB_X_SPDDELETE */
 	key_spdget,	/* SADB_X_SPDGET */
 	NULL,		/* SADB_X_SPDACQUIRE */
 	key_spddump,	/* SADB_X_SPDDUMP */
 	key_spdflush,	/* SADB_X_SPDFLUSH */
 	key_spdadd,	/* SADB_X_SPDSETIDX */
 	NULL,		/* SADB_X_SPDEXPIRE */
 	key_spddelete2,	/* SADB_X_SPDDELETE2 */
 };
 
 /*
  * parse sadb_msg buffer to process PFKEYv2,
  * and create a data to response if needed.
  * I think to be dealed with mbuf directly.
  * IN:
  *     msgp  : pointer to pointer to a received buffer pulluped.
  *             This is rewrited to response.
  *     so    : pointer to socket.
  * OUT:
  *    length for buffer to send to user process.
  */
 int
 key_parse(m, so)
 	struct mbuf *m;
 	struct socket *so;
 {
 	struct sadb_msg *msg;
 	struct sadb_msghdr mh;
 	u_int orglen;
 	int error;
 	int target;
 
 	IPSEC_ASSERT(so != NULL, ("null socket"));
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 
 #if 0	/*kdebug_sadb assumes msg in linear buffer*/
 	KEYDEBUG(KEYDEBUG_KEY_DUMP,
 		ipseclog((LOG_DEBUG, "%s: passed sadb_msg\n", __func__));
 		kdebug_sadb(msg));
 #endif
 
 	if (m->m_len < sizeof(struct sadb_msg)) {
 		m = m_pullup(m, sizeof(struct sadb_msg));
 		if (!m)
 			return ENOBUFS;
 	}
 	msg = mtod(m, struct sadb_msg *);
 	orglen = PFKEY_UNUNIT64(msg->sadb_msg_len);
 	target = KEY_SENDUP_ONE;
 
 	if ((m->m_flags & M_PKTHDR) == 0 ||
 	    m->m_pkthdr.len != m->m_pkthdr.len) {
 		ipseclog((LOG_DEBUG, "%s: invalid message length.\n",__func__));
 		PFKEYSTAT_INC(out_invlen);
 		error = EINVAL;
 		goto senderror;
 	}
 
 	if (msg->sadb_msg_version != PF_KEY_V2) {
 		ipseclog((LOG_DEBUG, "%s: PF_KEY version %u is mismatched.\n",
 		    __func__, msg->sadb_msg_version));
 		PFKEYSTAT_INC(out_invver);
 		error = EINVAL;
 		goto senderror;
 	}
 
 	if (msg->sadb_msg_type > SADB_MAX) {
 		ipseclog((LOG_DEBUG, "%s: invalid type %u is passed.\n",
 		    __func__, msg->sadb_msg_type));
 		PFKEYSTAT_INC(out_invmsgtype);
 		error = EINVAL;
 		goto senderror;
 	}
 
 	/* for old-fashioned code - should be nuked */
 	if (m->m_pkthdr.len > MCLBYTES) {
 		m_freem(m);
 		return ENOBUFS;
 	}
 	if (m->m_next) {
 		struct mbuf *n;
 
 		MGETHDR(n, M_NOWAIT, MT_DATA);
 		if (n && m->m_pkthdr.len > MHLEN) {
 			MCLGET(n, M_NOWAIT);
 			if ((n->m_flags & M_EXT) == 0) {
 				m_free(n);
 				n = NULL;
 			}
 		}
 		if (!n) {
 			m_freem(m);
 			return ENOBUFS;
 		}
 		m_copydata(m, 0, m->m_pkthdr.len, mtod(n, caddr_t));
 		n->m_pkthdr.len = n->m_len = m->m_pkthdr.len;
 		n->m_next = NULL;
 		m_freem(m);
 		m = n;
 	}
 
 	/* align the mbuf chain so that extensions are in contiguous region. */
 	error = key_align(m, &mh);
 	if (error)
 		return error;
 
 	msg = mh.msg;
 
 	/* check SA type */
 	switch (msg->sadb_msg_satype) {
 	case SADB_SATYPE_UNSPEC:
 		switch (msg->sadb_msg_type) {
 		case SADB_GETSPI:
 		case SADB_UPDATE:
 		case SADB_ADD:
 		case SADB_DELETE:
 		case SADB_GET:
 		case SADB_ACQUIRE:
 		case SADB_EXPIRE:
 			ipseclog((LOG_DEBUG, "%s: must specify satype "
 			    "when msg type=%u.\n", __func__,
 			    msg->sadb_msg_type));
 			PFKEYSTAT_INC(out_invsatype);
 			error = EINVAL;
 			goto senderror;
 		}
 		break;
 	case SADB_SATYPE_AH:
 	case SADB_SATYPE_ESP:
 	case SADB_X_SATYPE_IPCOMP:
 	case SADB_X_SATYPE_TCPSIGNATURE:
 		switch (msg->sadb_msg_type) {
 		case SADB_X_SPDADD:
 		case SADB_X_SPDDELETE:
 		case SADB_X_SPDGET:
 		case SADB_X_SPDDUMP:
 		case SADB_X_SPDFLUSH:
 		case SADB_X_SPDSETIDX:
 		case SADB_X_SPDUPDATE:
 		case SADB_X_SPDDELETE2:
 			ipseclog((LOG_DEBUG, "%s: illegal satype=%u\n",
 				__func__, msg->sadb_msg_type));
 			PFKEYSTAT_INC(out_invsatype);
 			error = EINVAL;
 			goto senderror;
 		}
 		break;
 	case SADB_SATYPE_RSVP:
 	case SADB_SATYPE_OSPFV2:
 	case SADB_SATYPE_RIPV2:
 	case SADB_SATYPE_MIP:
 		ipseclog((LOG_DEBUG, "%s: type %u isn't supported.\n",
 			__func__, msg->sadb_msg_satype));
 		PFKEYSTAT_INC(out_invsatype);
 		error = EOPNOTSUPP;
 		goto senderror;
 	case 1:	/* XXX: What does it do? */
 		if (msg->sadb_msg_type == SADB_X_PROMISC)
 			break;
 		/*FALLTHROUGH*/
 	default:
 		ipseclog((LOG_DEBUG, "%s: invalid type %u is passed.\n",
 			__func__, msg->sadb_msg_satype));
 		PFKEYSTAT_INC(out_invsatype);
 		error = EINVAL;
 		goto senderror;
 	}
 
 	/* check field of upper layer protocol and address family */
 	if (mh.ext[SADB_EXT_ADDRESS_SRC] != NULL
 	 && mh.ext[SADB_EXT_ADDRESS_DST] != NULL) {
 		struct sadb_address *src0, *dst0;
 		u_int plen;
 
 		src0 = (struct sadb_address *)(mh.ext[SADB_EXT_ADDRESS_SRC]);
 		dst0 = (struct sadb_address *)(mh.ext[SADB_EXT_ADDRESS_DST]);
 
 		/* check upper layer protocol */
 		if (src0->sadb_address_proto != dst0->sadb_address_proto) {
 			ipseclog((LOG_DEBUG, "%s: upper layer protocol "
 				"mismatched.\n", __func__));
 			PFKEYSTAT_INC(out_invaddr);
 			error = EINVAL;
 			goto senderror;
 		}
 
 		/* check family */
 		if (PFKEY_ADDR_SADDR(src0)->sa_family !=
 		    PFKEY_ADDR_SADDR(dst0)->sa_family) {
 			ipseclog((LOG_DEBUG, "%s: address family mismatched.\n",
 				__func__));
 			PFKEYSTAT_INC(out_invaddr);
 			error = EINVAL;
 			goto senderror;
 		}
 		if (PFKEY_ADDR_SADDR(src0)->sa_len !=
 		    PFKEY_ADDR_SADDR(dst0)->sa_len) {
 			ipseclog((LOG_DEBUG, "%s: address struct size "
 				"mismatched.\n", __func__));
 			PFKEYSTAT_INC(out_invaddr);
 			error = EINVAL;
 			goto senderror;
 		}
 
 		switch (PFKEY_ADDR_SADDR(src0)->sa_family) {
 		case AF_INET:
 			if (PFKEY_ADDR_SADDR(src0)->sa_len !=
 			    sizeof(struct sockaddr_in)) {
 				PFKEYSTAT_INC(out_invaddr);
 				error = EINVAL;
 				goto senderror;
 			}
 			break;
 		case AF_INET6:
 			if (PFKEY_ADDR_SADDR(src0)->sa_len !=
 			    sizeof(struct sockaddr_in6)) {
 				PFKEYSTAT_INC(out_invaddr);
 				error = EINVAL;
 				goto senderror;
 			}
 			break;
 		default:
 			ipseclog((LOG_DEBUG, "%s: unsupported address family\n",
 				__func__));
 			PFKEYSTAT_INC(out_invaddr);
 			error = EAFNOSUPPORT;
 			goto senderror;
 		}
 
 		switch (PFKEY_ADDR_SADDR(src0)->sa_family) {
 		case AF_INET:
 			plen = sizeof(struct in_addr) << 3;
 			break;
 		case AF_INET6:
 			plen = sizeof(struct in6_addr) << 3;
 			break;
 		default:
 			plen = 0;	/*fool gcc*/
 			break;
 		}
 
 		/* check max prefix length */
 		if (src0->sadb_address_prefixlen > plen ||
 		    dst0->sadb_address_prefixlen > plen) {
 			ipseclog((LOG_DEBUG, "%s: illegal prefixlen.\n",
 				__func__));
 			PFKEYSTAT_INC(out_invaddr);
 			error = EINVAL;
 			goto senderror;
 		}
 
 		/*
 		 * prefixlen == 0 is valid because there can be a case when
 		 * all addresses are matched.
 		 */
 	}
 
 	if (msg->sadb_msg_type >= sizeof(key_typesw)/sizeof(key_typesw[0]) ||
 	    key_typesw[msg->sadb_msg_type] == NULL) {
 		PFKEYSTAT_INC(out_invmsgtype);
 		error = EINVAL;
 		goto senderror;
 	}
 
 	return (*key_typesw[msg->sadb_msg_type])(so, m, &mh);
 
 senderror:
 	msg->sadb_msg_errno = error;
 	return key_sendup_mbuf(so, m, target);
 }
 
 static int
 key_senderror(so, m, code)
 	struct socket *so;
 	struct mbuf *m;
 	int code;
 {
 	struct sadb_msg *msg;
 
 	IPSEC_ASSERT(m->m_len >= sizeof(struct sadb_msg),
 		("mbuf too small, len %u", m->m_len));
 
 	msg = mtod(m, struct sadb_msg *);
 	msg->sadb_msg_errno = code;
 	return key_sendup_mbuf(so, m, KEY_SENDUP_ONE);
 }
 
 /*
  * set the pointer to each header into message buffer.
  * m will be freed on error.
  * XXX larger-than-MCLBYTES extension?
  */
 static int
 key_align(m, mhp)
 	struct mbuf *m;
 	struct sadb_msghdr *mhp;
 {
 	struct mbuf *n;
 	struct sadb_ext *ext;
 	size_t off, end;
 	int extlen;
 	int toff;
 
 	IPSEC_ASSERT(m != NULL, ("null mbuf"));
 	IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
 	IPSEC_ASSERT(m->m_len >= sizeof(struct sadb_msg),
 		("mbuf too small, len %u", m->m_len));
 
 	/* initialize */
 	bzero(mhp, sizeof(*mhp));
 
 	mhp->msg = mtod(m, struct sadb_msg *);
 	mhp->ext[0] = (struct sadb_ext *)mhp->msg;	/*XXX backward compat */
 
 	end = PFKEY_UNUNIT64(mhp->msg->sadb_msg_len);
 	extlen = end;	/*just in case extlen is not updated*/
 	for (off = sizeof(struct sadb_msg); off < end; off += extlen) {
 		n = m_pulldown(m, off, sizeof(struct sadb_ext), &toff);
 		if (!n) {
 			/* m is already freed */
 			return ENOBUFS;
 		}
 		ext = (struct sadb_ext *)(mtod(n, caddr_t) + toff);
 
 		/* set pointer */
 		switch (ext->sadb_ext_type) {
 		case SADB_EXT_SA:
 		case SADB_EXT_ADDRESS_SRC:
 		case SADB_EXT_ADDRESS_DST:
 		case SADB_EXT_ADDRESS_PROXY:
 		case SADB_EXT_LIFETIME_CURRENT:
 		case SADB_EXT_LIFETIME_HARD:
 		case SADB_EXT_LIFETIME_SOFT:
 		case SADB_EXT_KEY_AUTH:
 		case SADB_EXT_KEY_ENCRYPT:
 		case SADB_EXT_IDENTITY_SRC:
 		case SADB_EXT_IDENTITY_DST:
 		case SADB_EXT_SENSITIVITY:
 		case SADB_EXT_PROPOSAL:
 		case SADB_EXT_SUPPORTED_AUTH:
 		case SADB_EXT_SUPPORTED_ENCRYPT:
 		case SADB_EXT_SPIRANGE:
 		case SADB_X_EXT_POLICY:
 		case SADB_X_EXT_SA2:
 #ifdef IPSEC_NAT_T
 		case SADB_X_EXT_NAT_T_TYPE:
 		case SADB_X_EXT_NAT_T_SPORT:
 		case SADB_X_EXT_NAT_T_DPORT:
 		case SADB_X_EXT_NAT_T_OAI:
 		case SADB_X_EXT_NAT_T_OAR:
 		case SADB_X_EXT_NAT_T_FRAG:
 #endif
 			/* duplicate check */
 			/*
 			 * XXX Are there duplication payloads of either
 			 * KEY_AUTH or KEY_ENCRYPT ?
 			 */
 			if (mhp->ext[ext->sadb_ext_type] != NULL) {
 				ipseclog((LOG_DEBUG, "%s: duplicate ext_type "
 					"%u\n", __func__, ext->sadb_ext_type));
 				m_freem(m);
 				PFKEYSTAT_INC(out_dupext);
 				return EINVAL;
 			}
 			break;
 		default:
 			ipseclog((LOG_DEBUG, "%s: invalid ext_type %u\n",
 				__func__, ext->sadb_ext_type));
 			m_freem(m);
 			PFKEYSTAT_INC(out_invexttype);
 			return EINVAL;
 		}
 
 		extlen = PFKEY_UNUNIT64(ext->sadb_ext_len);
 
 		if (key_validate_ext(ext, extlen)) {
 			m_freem(m);
 			PFKEYSTAT_INC(out_invlen);
 			return EINVAL;
 		}
 
 		n = m_pulldown(m, off, extlen, &toff);
 		if (!n) {
 			/* m is already freed */
 			return ENOBUFS;
 		}
 		ext = (struct sadb_ext *)(mtod(n, caddr_t) + toff);
 
 		mhp->ext[ext->sadb_ext_type] = ext;
 		mhp->extoff[ext->sadb_ext_type] = off;
 		mhp->extlen[ext->sadb_ext_type] = extlen;
 	}
 
 	if (off != end) {
 		m_freem(m);
 		PFKEYSTAT_INC(out_invlen);
 		return EINVAL;
 	}
 
 	return 0;
 }
 
 static int
 key_validate_ext(ext, len)
 	const struct sadb_ext *ext;
 	int len;
 {
 	const struct sockaddr *sa;
 	enum { NONE, ADDR } checktype = NONE;
 	int baselen = 0;
 	const int sal = offsetof(struct sockaddr, sa_len) + sizeof(sa->sa_len);
 
 	if (len != PFKEY_UNUNIT64(ext->sadb_ext_len))
 		return EINVAL;
 
 	/* if it does not match minimum/maximum length, bail */
 	if (ext->sadb_ext_type >= sizeof(minsize) / sizeof(minsize[0]) ||
 	    ext->sadb_ext_type >= sizeof(maxsize) / sizeof(maxsize[0]))
 		return EINVAL;
 	if (!minsize[ext->sadb_ext_type] || len < minsize[ext->sadb_ext_type])
 		return EINVAL;
 	if (maxsize[ext->sadb_ext_type] && len > maxsize[ext->sadb_ext_type])
 		return EINVAL;
 
 	/* more checks based on sadb_ext_type XXX need more */
 	switch (ext->sadb_ext_type) {
 	case SADB_EXT_ADDRESS_SRC:
 	case SADB_EXT_ADDRESS_DST:
 	case SADB_EXT_ADDRESS_PROXY:
 		baselen = PFKEY_ALIGN8(sizeof(struct sadb_address));
 		checktype = ADDR;
 		break;
 	case SADB_EXT_IDENTITY_SRC:
 	case SADB_EXT_IDENTITY_DST:
 		if (((const struct sadb_ident *)ext)->sadb_ident_type ==
 		    SADB_X_IDENTTYPE_ADDR) {
 			baselen = PFKEY_ALIGN8(sizeof(struct sadb_ident));
 			checktype = ADDR;
 		} else
 			checktype = NONE;
 		break;
 	default:
 		checktype = NONE;
 		break;
 	}
 
 	switch (checktype) {
 	case NONE:
 		break;
 	case ADDR:
 		sa = (const struct sockaddr *)(((const u_int8_t*)ext)+baselen);
 		if (len < baselen + sal)
 			return EINVAL;
 		if (baselen + PFKEY_ALIGN8(sa->sa_len) != len)
 			return EINVAL;
 		break;
 	}
 
 	return 0;
 }
 
 void
 key_init(void)
 {
 	int i;
 
 	for (i = 0; i < IPSEC_DIR_MAX; i++)
 		LIST_INIT(&V_sptree[i]);
 
 	LIST_INIT(&V_sahtree);
 
 	for (i = 0; i <= SADB_SATYPE_MAX; i++)
 		LIST_INIT(&V_regtree[i]);
 
 	LIST_INIT(&V_acqtree);
 	LIST_INIT(&V_spacqtree);
 
 	/* system default */
 	V_ip4_def_policy.policy = IPSEC_POLICY_NONE;
 	V_ip4_def_policy.refcnt++;	/*never reclaim this*/
 
 	if (!IS_DEFAULT_VNET(curvnet))
 		return;
 
 	SPTREE_LOCK_INIT();
 	REGTREE_LOCK_INIT();
 	SAHTREE_LOCK_INIT();
 	ACQ_LOCK_INIT();
 	SPACQ_LOCK_INIT();
 
 #ifndef IPSEC_DEBUG2
 	callout_init(&key_timer, CALLOUT_MPSAFE);
 	callout_reset(&key_timer, hz, key_timehandler, NULL);
 #endif /*IPSEC_DEBUG2*/
 
 	/* initialize key statistics */
 	keystat.getspi_count = 1;
 
 	printf("IPsec: Initialized Security Association Processing.\n");
 }
 
 #ifdef VIMAGE
 void
 key_destroy(void)
 {
 	struct secpolicy *sp, *nextsp;
 	struct secacq *acq, *nextacq;
 	struct secspacq *spacq, *nextspacq;
 	struct secashead *sah, *nextsah;
 	struct secreg *reg;
 	int i;
 
 	SPTREE_LOCK();
 	for (i = 0; i < IPSEC_DIR_MAX; i++) {
 		for (sp = LIST_FIRST(&V_sptree[i]); 
 		    sp != NULL; sp = nextsp) {
 			nextsp = LIST_NEXT(sp, chain);
 			if (__LIST_CHAINED(sp)) {
 				LIST_REMOVE(sp, chain);
 				free(sp, M_IPSEC_SP);
 			}
 		}
 	}
 	SPTREE_UNLOCK();
 
 	SAHTREE_LOCK();
 	for (sah = LIST_FIRST(&V_sahtree); sah != NULL; sah = nextsah) {
 		nextsah = LIST_NEXT(sah, chain);
 		if (__LIST_CHAINED(sah)) {
 			LIST_REMOVE(sah, chain);
 			free(sah, M_IPSEC_SAH);
 		}
 	}
 	SAHTREE_UNLOCK();
 
 	REGTREE_LOCK();
 	for (i = 0; i <= SADB_SATYPE_MAX; i++) {
 		LIST_FOREACH(reg, &V_regtree[i], chain) {
 			if (__LIST_CHAINED(reg)) {
 				LIST_REMOVE(reg, chain);
 				free(reg, M_IPSEC_SAR);
 				break;
 			}
 		}
 	}
 	REGTREE_UNLOCK();
 
 	ACQ_LOCK();
 	for (acq = LIST_FIRST(&V_acqtree); acq != NULL; acq = nextacq) {
 		nextacq = LIST_NEXT(acq, chain);
 		if (__LIST_CHAINED(acq)) {
 			LIST_REMOVE(acq, chain);
 			free(acq, M_IPSEC_SAQ);
 		}
 	}
 	ACQ_UNLOCK();
 
 	SPACQ_LOCK();
 	for (spacq = LIST_FIRST(&V_spacqtree); spacq != NULL;
 	    spacq = nextspacq) {
 		nextspacq = LIST_NEXT(spacq, chain);
 		if (__LIST_CHAINED(spacq)) {
 			LIST_REMOVE(spacq, chain);
 			free(spacq, M_IPSEC_SAQ);
 		}
 	}
 	SPACQ_UNLOCK();
 }
 #endif
 
 /*
  * XXX: maybe This function is called after INBOUND IPsec processing.
  *
  * Special check for tunnel-mode packets.
  * We must make some checks for consistency between inner and outer IP header.
  *
  * xxx more checks to be provided
  */
 int
 key_checktunnelsanity(sav, family, src, dst)
 	struct secasvar *sav;
 	u_int family;
 	caddr_t src;
 	caddr_t dst;
 {
 	IPSEC_ASSERT(sav->sah != NULL, ("null SA header"));
 
 	/* XXX: check inner IP header */
 
 	return 1;
 }
 
 /* record data transfer on SA, and update timestamps */
 void
 key_sa_recordxfer(sav, m)
 	struct secasvar *sav;
 	struct mbuf *m;
 {
 	IPSEC_ASSERT(sav != NULL, ("Null secasvar"));
 	IPSEC_ASSERT(m != NULL, ("Null mbuf"));
 	if (!sav->lft_c)
 		return;
 
 	/*
 	 * XXX Currently, there is a difference of bytes size
 	 * between inbound and outbound processing.
 	 */
 	sav->lft_c->bytes += m->m_pkthdr.len;
 	/* to check bytes lifetime is done in key_timehandler(). */
 
 	/*
 	 * We use the number of packets as the unit of
 	 * allocations.  We increment the variable
 	 * whenever {esp,ah}_{in,out}put is called.
 	 */
 	sav->lft_c->allocations++;
 	/* XXX check for expires? */
 
 	/*
 	 * NOTE: We record CURRENT usetime by using wall clock,
 	 * in seconds.  HARD and SOFT lifetime are measured by the time
 	 * difference (again in seconds) from usetime.
 	 *
 	 *	usetime
 	 *	v     expire   expire
 	 * -----+-----+--------+---> t
 	 *	<--------------> HARD
 	 *	<-----> SOFT
 	 */
 	sav->lft_c->usetime = time_second;
 	/* XXX check for expires? */
 
 	return;
 }
 
 /* dumb version */
 void
 key_sa_routechange(dst)
 	struct sockaddr *dst;
 {
 	struct secashead *sah;
 	struct route *ro;
 
 	SAHTREE_LOCK();
 	LIST_FOREACH(sah, &V_sahtree, chain) {
 		ro = &sah->route_cache.sa_route;
 		if (ro->ro_rt && dst->sa_len == ro->ro_dst.sa_len
 		 && bcmp(dst, &ro->ro_dst, dst->sa_len) == 0) {
 			RTFREE(ro->ro_rt);
 			ro->ro_rt = (struct rtentry *)NULL;
 		}
 	}
 	SAHTREE_UNLOCK();
 }
 
 static void
 key_sa_chgstate(struct secasvar *sav, u_int8_t state)
 {
 	IPSEC_ASSERT(sav != NULL, ("NULL sav"));
 	SAHTREE_LOCK_ASSERT();
 
 	if (sav->state != state) {
 		if (__LIST_CHAINED(sav))
 			LIST_REMOVE(sav, chain);
 		sav->state = state;
 		LIST_INSERT_HEAD(&sav->sah->savtree[state], sav, chain);
 	}
 }
 
 void
 key_sa_stir_iv(sav)
 	struct secasvar *sav;
 {
 
 	IPSEC_ASSERT(sav->iv != NULL, ("null IV"));
 	key_randomfill(sav->iv, sav->ivlen);
 }
 
 /*
  * Take one of the kernel's security keys and convert it into a PF_KEY
  * structure within an mbuf, suitable for sending up to a waiting
  * application in user land.
  * 
  * IN: 
  *    src: A pointer to a kernel security key.
  *    exttype: Which type of key this is. Refer to the PF_KEY data structures.
  * OUT:
  *    a valid mbuf or NULL indicating an error
  *
  */
 
 static struct mbuf *
 key_setkey(struct seckey *src, u_int16_t exttype) 
 {
 	struct mbuf *m;
 	struct sadb_key *p;
 	int len;
 
 	if (src == NULL)
 		return NULL;
 
 	len = PFKEY_ALIGN8(sizeof(struct sadb_key) + _KEYLEN(src));
 	m = m_get2(len, M_NOWAIT, MT_DATA, 0);
 	if (m == NULL)
 		return NULL;
 	m_align(m, len);
 	m->m_len = len;
 	p = mtod(m, struct sadb_key *);
 	bzero(p, len);
 	p->sadb_key_len = PFKEY_UNIT64(len);
 	p->sadb_key_exttype = exttype;
 	p->sadb_key_bits = src->bits;
 	bcopy(src->key_data, _KEYBUF(p), _KEYLEN(src));
 
 	return m;
 }
 
 /*
  * Take one of the kernel's lifetime data structures and convert it
  * into a PF_KEY structure within an mbuf, suitable for sending up to
  * a waiting application in user land.
  * 
  * IN: 
  *    src: A pointer to a kernel lifetime structure.
  *    exttype: Which type of lifetime this is. Refer to the PF_KEY 
  *             data structures for more information.
  * OUT:
  *    a valid mbuf or NULL indicating an error
  *
  */
 
 static struct mbuf *
 key_setlifetime(struct seclifetime *src, u_int16_t exttype)
 {
 	struct mbuf *m = NULL;
 	struct sadb_lifetime *p;
 	int len = PFKEY_ALIGN8(sizeof(struct sadb_lifetime));
 
 	if (src == NULL)
 		return NULL;
 
 	m = m_get2(len, M_NOWAIT, MT_DATA, 0);
 	if (m == NULL)
 		return m;
 	m_align(m, len);
 	m->m_len = len;
 	p = mtod(m, struct sadb_lifetime *);
 
 	bzero(p, len);
 	p->sadb_lifetime_len = PFKEY_UNIT64(len);
 	p->sadb_lifetime_exttype = exttype;
 	p->sadb_lifetime_allocations = src->allocations;
 	p->sadb_lifetime_bytes = src->bytes;
 	p->sadb_lifetime_addtime = src->addtime;
 	p->sadb_lifetime_usetime = src->usetime;
 	
 	return m;
 
 }
Index: user/dchagin/lemul/sys/netipsec/xform_ah.c
===================================================================
--- user/dchagin/lemul/sys/netipsec/xform_ah.c	(revision 274240)
+++ user/dchagin/lemul/sys/netipsec/xform_ah.c	(revision 274241)
@@ -1,1228 +1,1228 @@
 /*	$FreeBSD$	*/
 /*	$OpenBSD: ip_ah.c,v 1.63 2001/06/26 06:18:58 angelos Exp $ */
 /*-
  * The authors of this code are John Ioannidis (ji@tla.org),
  * Angelos D. Keromytis (kermit@csd.uch.gr) and
  * Niels Provos (provos@physnet.uni-hamburg.de).
  *
  * The original version of this code was written by John Ioannidis
  * for BSD/OS in Athens, Greece, in November 1995.
  *
  * Ported to OpenBSD and NetBSD, with additional transforms, in December 1996,
  * by Angelos D. Keromytis.
  *
  * Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis
  * and Niels Provos.
  *
  * Additional features in 1999 by Angelos D. Keromytis and Niklas Hallqvist.
  *
  * Copyright (c) 1995, 1996, 1997, 1998, 1999 by John Ioannidis,
  * Angelos D. Keromytis and Niels Provos.
  * Copyright (c) 1999 Niklas Hallqvist.
  * Copyright (c) 2001 Angelos D. Keromytis.
  *
  * Permission to use, copy, and modify this software with or without fee
  * is hereby granted, provided that this entire notice is included in
  * all copies of any software which is or includes a copy or
  * modification of this software.
  * You may use this code under the GNU public license if you so wish. Please
  * contribute changes back to the authors under this freer than GPL license
  * so that we may further the use of strong encryption without limitations to
  * all.
  *
  * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
  * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
  * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
  * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
  * PURPOSE.
  */
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/syslog.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_ecn.h>
 #include <netinet/ip6.h>
 
 #include <net/route.h>
 #include <netipsec/ipsec.h>
 #include <netipsec/ah.h>
 #include <netipsec/ah_var.h>
 #include <netipsec/xform.h>
 
 #ifdef INET6
 #include <netinet6/ip6_var.h>
 #include <netipsec/ipsec6.h>
 #include <netinet6/ip6_ecn.h>
 #endif
 
 #include <netipsec/key.h>
 #include <netipsec/key_debug.h>
 
 #include <opencrypto/cryptodev.h>
 
 /*
  * Return header size in bytes.  The old protocol did not support
  * the replay counter; the new protocol always includes the counter.
  */
 #define HDRSIZE(sav) \
 	(((sav)->flags & SADB_X_EXT_OLD) ? \
 		sizeof (struct ah) : sizeof (struct ah) + sizeof (u_int32_t))
 /* 
  * Return authenticator size in bytes.  The old protocol is known
  * to use a fixed 16-byte authenticator.  The new algorithm use 12-byte
  * authenticator.
  */
 #define	AUTHSIZE(sav)	ah_authsize(sav)
 
 VNET_DEFINE(int, ah_enable) = 1;	/* control flow of packets with AH */
 VNET_DEFINE(int, ah_cleartos) = 1;	/* clear ip_tos when doing AH calc */
 VNET_PCPUSTAT_DEFINE(struct ahstat, ahstat);
 VNET_PCPUSTAT_SYSINIT(ahstat);
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(ahstat);
 #endif /* VIMAGE */
 
 #ifdef INET
 SYSCTL_DECL(_net_inet_ah);
-SYSCTL_VNET_INT(_net_inet_ah, OID_AUTO,
-	ah_enable,	CTLFLAG_RW,	&VNET_NAME(ah_enable),	0, "");
-SYSCTL_VNET_INT(_net_inet_ah, OID_AUTO,
-	ah_cleartos,	CTLFLAG_RW,	&VNET_NAME(ah_cleartos), 0, "");
+SYSCTL_INT(_net_inet_ah, OID_AUTO, ah_enable,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ah_enable), 0, "");
+SYSCTL_INT(_net_inet_ah, OID_AUTO, ah_cleartos,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ah_cleartos), 0, "");
 SYSCTL_VNET_PCPUSTAT(_net_inet_ah, IPSECCTL_STATS, stats, struct ahstat,
     ahstat, "AH statistics (struct ahstat, netipsec/ah_var.h)");
 #endif
 
 static unsigned char ipseczeroes[256];	/* larger than an ip6 extension hdr */
 
 static int ah_input_cb(struct cryptop*);
 static int ah_output_cb(struct cryptop*);
 
 static int
 ah_authsize(struct secasvar *sav)
 {
 
 	IPSEC_ASSERT(sav != NULL, ("%s: sav == NULL", __func__));
 
 	if (sav->flags & SADB_X_EXT_OLD)
 		return 16;
 
 	switch (sav->alg_auth) {
 	case SADB_X_AALG_SHA2_256:
 		return 16;
 	case SADB_X_AALG_SHA2_384:
 		return 24;
 	case SADB_X_AALG_SHA2_512:
 		return 32;
 	default:
 		return AH_HMAC_HASHLEN;
 	}
 	/* NOTREACHED */
 }
 /*
  * NB: this is public for use by the PF_KEY support.
  */
 struct auth_hash *
 ah_algorithm_lookup(int alg)
 {
 	if (alg > SADB_AALG_MAX)
 		return NULL;
 	switch (alg) {
 	case SADB_X_AALG_NULL:
 		return &auth_hash_null;
 	case SADB_AALG_MD5HMAC:
 		return &auth_hash_hmac_md5;
 	case SADB_AALG_SHA1HMAC:
 		return &auth_hash_hmac_sha1;
 	case SADB_X_AALG_RIPEMD160HMAC:
 		return &auth_hash_hmac_ripemd_160;
 	case SADB_X_AALG_MD5:
 		return &auth_hash_key_md5;
 	case SADB_X_AALG_SHA:
 		return &auth_hash_key_sha1;
 	case SADB_X_AALG_SHA2_256:
 		return &auth_hash_hmac_sha2_256;
 	case SADB_X_AALG_SHA2_384:
 		return &auth_hash_hmac_sha2_384;
 	case SADB_X_AALG_SHA2_512:
 		return &auth_hash_hmac_sha2_512;
 	}
 	return NULL;
 }
 
 size_t
 ah_hdrsiz(struct secasvar *sav)
 {
 	size_t size;
 
 	if (sav != NULL) {
 		int authsize;
 		IPSEC_ASSERT(sav->tdb_authalgxform != NULL, ("null xform"));
 		/*XXX not right for null algorithm--does it matter??*/
 		authsize = AUTHSIZE(sav);
 		size = roundup(authsize, sizeof (u_int32_t)) + HDRSIZE(sav);
 	} else {
 		/* default guess */
 		size = sizeof (struct ah) + sizeof (u_int32_t) + 16;
 	}
 	return size;
 }
 
 /*
  * NB: public for use by esp_init.
  */
 int
 ah_init0(struct secasvar *sav, struct xformsw *xsp, struct cryptoini *cria)
 {
 	struct auth_hash *thash;
 	int keylen;
 
 	thash = ah_algorithm_lookup(sav->alg_auth);
 	if (thash == NULL) {
 		DPRINTF(("%s: unsupported authentication algorithm %u\n",
 			__func__, sav->alg_auth));
 		return EINVAL;
 	}
 	/*
 	 * Verify the replay state block allocation is consistent with
 	 * the protocol type.  We check here so we can make assumptions
 	 * later during protocol processing.
 	 */
 	/* NB: replay state is setup elsewhere (sigh) */
 	if (((sav->flags&SADB_X_EXT_OLD) == 0) ^ (sav->replay != NULL)) {
 		DPRINTF(("%s: replay state block inconsistency, "
 			"%s algorithm %s replay state\n", __func__,
 			(sav->flags & SADB_X_EXT_OLD) ? "old" : "new",
 			sav->replay == NULL ? "without" : "with"));
 		return EINVAL;
 	}
 	if (sav->key_auth == NULL) {
 		DPRINTF(("%s: no authentication key for %s algorithm\n",
 			__func__, thash->name));
 		return EINVAL;
 	}
 	keylen = _KEYLEN(sav->key_auth);
 	if (keylen != thash->keysize && thash->keysize != 0) {
 		DPRINTF(("%s: invalid keylength %d, algorithm %s requires "
 			"keysize %d\n", __func__,
 			 keylen, thash->name, thash->keysize));
 		return EINVAL;
 	}
 
 	sav->tdb_xform = xsp;
 	sav->tdb_authalgxform = thash;
 
 	/* Initialize crypto session. */
 	bzero(cria, sizeof (*cria));
 	cria->cri_alg = sav->tdb_authalgxform->type;
 	cria->cri_klen = _KEYBITS(sav->key_auth);
 	cria->cri_key = sav->key_auth->key_data;
 	cria->cri_mlen = AUTHSIZE(sav);
 
 	return 0;
 }
 
 /*
  * ah_init() is called when an SPI is being set up.
  */
 static int
 ah_init(struct secasvar *sav, struct xformsw *xsp)
 {
 	struct cryptoini cria;
 	int error;
 
 	error = ah_init0(sav, xsp, &cria);
 	return error ? error :
 		 crypto_newsession(&sav->tdb_cryptoid, &cria, V_crypto_support);
 }
 
 /*
  * Paranoia.
  *
  * NB: public for use by esp_zeroize (XXX).
  */
 int
 ah_zeroize(struct secasvar *sav)
 {
 	int err;
 
 	if (sav->key_auth)
 		bzero(sav->key_auth->key_data, _KEYLEN(sav->key_auth));
 
 	err = crypto_freesession(sav->tdb_cryptoid);
 	sav->tdb_cryptoid = 0;
 	sav->tdb_authalgxform = NULL;
 	sav->tdb_xform = NULL;
 	return err;
 }
 
 /*
  * Massage IPv4/IPv6 headers for AH processing.
  */
 static int
 ah_massage_headers(struct mbuf **m0, int proto, int skip, int alg, int out)
 {
 	struct mbuf *m = *m0;
 	unsigned char *ptr;
 	int off, count;
 
 #ifdef INET
 	struct ip *ip;
 #endif /* INET */
 
 #ifdef INET6
 	struct ip6_ext *ip6e;
 	struct ip6_hdr ip6;
 	int alloc, len, ad;
 #endif /* INET6 */
 
 	switch (proto) {
 #ifdef INET
 	case AF_INET:
 		/*
 		 * This is the least painful way of dealing with IPv4 header
 		 * and option processing -- just make sure they're in
 		 * contiguous memory.
 		 */
 		*m0 = m = m_pullup(m, skip);
 		if (m == NULL) {
 			DPRINTF(("%s: m_pullup failed\n", __func__));
 			return ENOBUFS;
 		}
 
 		/* Fix the IP header */
 		ip = mtod(m, struct ip *);
 		if (V_ah_cleartos)
 			ip->ip_tos = 0;
 		ip->ip_ttl = 0;
 		ip->ip_sum = 0;
 
 		if (alg == CRYPTO_MD5_KPDK || alg == CRYPTO_SHA1_KPDK)
 			ip->ip_off &= htons(IP_DF);
 		else
 			ip->ip_off = htons(0);
 
 		ptr = mtod(m, unsigned char *) + sizeof(struct ip);
 
 		/* IPv4 option processing */
 		for (off = sizeof(struct ip); off < skip;) {
 			if (ptr[off] == IPOPT_EOL || ptr[off] == IPOPT_NOP ||
 			    off + 1 < skip)
 				;
 			else {
 				DPRINTF(("%s: illegal IPv4 option length for "
 					"option %d\n", __func__, ptr[off]));
 
 				m_freem(m);
 				return EINVAL;
 			}
 
 			switch (ptr[off]) {
 			case IPOPT_EOL:
 				off = skip;  /* End the loop. */
 				break;
 
 			case IPOPT_NOP:
 				off++;
 				break;
 
 			case IPOPT_SECURITY:	/* 0x82 */
 			case 0x85:	/* Extended security. */
 			case 0x86:	/* Commercial security. */
 			case 0x94:	/* Router alert */
 			case 0x95:	/* RFC1770 */
 				/* Sanity check for option length. */
 				if (ptr[off + 1] < 2) {
 					DPRINTF(("%s: illegal IPv4 option "
 						"length for option %d\n",
 						__func__, ptr[off]));
 
 					m_freem(m);
 					return EINVAL;
 				}
 
 				off += ptr[off + 1];
 				break;
 
 			case IPOPT_LSRR:
 			case IPOPT_SSRR:
 				/* Sanity check for option length. */
 				if (ptr[off + 1] < 2) {
 					DPRINTF(("%s: illegal IPv4 option "
 						"length for option %d\n",
 						__func__, ptr[off]));
 
 					m_freem(m);
 					return EINVAL;
 				}
 
 				/*
 				 * On output, if we have either of the
 				 * source routing options, we should
 				 * swap the destination address of the
 				 * IP header with the last address
 				 * specified in the option, as that is
 				 * what the destination's IP header
 				 * will look like.
 				 */
 				if (out)
 					bcopy(ptr + off + ptr[off + 1] -
 					    sizeof(struct in_addr),
 					    &(ip->ip_dst), sizeof(struct in_addr));
 
 				/* Fall through */
 			default:
 				/* Sanity check for option length. */
 				if (ptr[off + 1] < 2) {
 					DPRINTF(("%s: illegal IPv4 option "
 						"length for option %d\n",
 						__func__, ptr[off]));
 					m_freem(m);
 					return EINVAL;
 				}
 
 				/* Zeroize all other options. */
 				count = ptr[off + 1];
 				bcopy(ipseczeroes, ptr, count);
 				off += count;
 				break;
 			}
 
 			/* Sanity check. */
 			if (off > skip)	{
 				DPRINTF(("%s: malformed IPv4 options header\n",
 					__func__));
 
 				m_freem(m);
 				return EINVAL;
 			}
 		}
 
 		break;
 #endif /* INET */
 
 #ifdef INET6
 	case AF_INET6:  /* Ugly... */
 		/* Copy and "cook" the IPv6 header. */
 		m_copydata(m, 0, sizeof(ip6), (caddr_t) &ip6);
 
 		/* We don't do IPv6 Jumbograms. */
 		if (ip6.ip6_plen == 0) {
 			DPRINTF(("%s: unsupported IPv6 jumbogram\n", __func__));
 			m_freem(m);
 			return EMSGSIZE;
 		}
 
 		ip6.ip6_flow = 0;
 		ip6.ip6_hlim = 0;
 		ip6.ip6_vfc &= ~IPV6_VERSION_MASK;
 		ip6.ip6_vfc |= IPV6_VERSION;
 
 		/* Scoped address handling. */
 		if (IN6_IS_SCOPE_LINKLOCAL(&ip6.ip6_src))
 			ip6.ip6_src.s6_addr16[1] = 0;
 		if (IN6_IS_SCOPE_LINKLOCAL(&ip6.ip6_dst))
 			ip6.ip6_dst.s6_addr16[1] = 0;
 
 		/* Done with IPv6 header. */
 		m_copyback(m, 0, sizeof(struct ip6_hdr), (caddr_t) &ip6);
 
 		/* Let's deal with the remaining headers (if any). */
 		if (skip - sizeof(struct ip6_hdr) > 0) {
 			if (m->m_len <= skip) {
 				ptr = (unsigned char *) malloc(
 				    skip - sizeof(struct ip6_hdr),
 				    M_XDATA, M_NOWAIT);
 				if (ptr == NULL) {
 					DPRINTF(("%s: failed to allocate memory"
 						"for IPv6 headers\n",__func__));
 					m_freem(m);
 					return ENOBUFS;
 				}
 
 				/*
 				 * Copy all the protocol headers after
 				 * the IPv6 header.
 				 */
 				m_copydata(m, sizeof(struct ip6_hdr),
 				    skip - sizeof(struct ip6_hdr), ptr);
 				alloc = 1;
 			} else {
 				/* No need to allocate memory. */
 				ptr = mtod(m, unsigned char *) +
 				    sizeof(struct ip6_hdr);
 				alloc = 0;
 			}
 		} else
 			break;
 
 		off = ip6.ip6_nxt & 0xff; /* Next header type. */
 
 		for (len = 0; len < skip - sizeof(struct ip6_hdr);)
 			switch (off) {
 			case IPPROTO_HOPOPTS:
 			case IPPROTO_DSTOPTS:
 				ip6e = (struct ip6_ext *) (ptr + len);
 
 				/*
 				 * Process the mutable/immutable
 				 * options -- borrows heavily from the
 				 * KAME code.
 				 */
 				for (count = len + sizeof(struct ip6_ext);
 				     count < len + ((ip6e->ip6e_len + 1) << 3);) {
 					if (ptr[count] == IP6OPT_PAD1) {
 						count++;
 						continue; /* Skip padding. */
 					}
 
 					/* Sanity check. */
 					if (count > len +
 					    ((ip6e->ip6e_len + 1) << 3)) {
 						m_freem(m);
 
 						/* Free, if we allocated. */
 						if (alloc)
 							free(ptr, M_XDATA);
 						return EINVAL;
 					}
 
 					ad = ptr[count + 1];
 
 					/* If mutable option, zeroize. */
 					if (ptr[count] & IP6OPT_MUTABLE)
 						bcopy(ipseczeroes, ptr + count,
 						    ptr[count + 1]);
 
 					count += ad;
 
 					/* Sanity check. */
 					if (count >
 					    skip - sizeof(struct ip6_hdr)) {
 						m_freem(m);
 
 						/* Free, if we allocated. */
 						if (alloc)
 							free(ptr, M_XDATA);
 						return EINVAL;
 					}
 				}
 
 				/* Advance. */
 				len += ((ip6e->ip6e_len + 1) << 3);
 				off = ip6e->ip6e_nxt;
 				break;
 
 			case IPPROTO_ROUTING:
 				/*
 				 * Always include routing headers in
 				 * computation.
 				 */
 				ip6e = (struct ip6_ext *) (ptr + len);
 				len += ((ip6e->ip6e_len + 1) << 3);
 				off = ip6e->ip6e_nxt;
 				break;
 
 			default:
 				DPRINTF(("%s: unexpected IPv6 header type %d",
 					__func__, off));
 				if (alloc)
 					free(ptr, M_XDATA);
 				m_freem(m);
 				return EINVAL;
 			}
 
 		/* Copyback and free, if we allocated. */
 		if (alloc) {
 			m_copyback(m, sizeof(struct ip6_hdr),
 			    skip - sizeof(struct ip6_hdr), ptr);
 			free(ptr, M_XDATA);
 		}
 
 		break;
 #endif /* INET6 */
 	}
 
 	return 0;
 }
 
 /*
  * ah_input() gets called to verify that an input packet
  * passes authentication.
  */
 static int
 ah_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff)
 {
 	struct auth_hash *ahx;
 	struct tdb_ident *tdbi;
 	struct tdb_crypto *tc;
 	struct m_tag *mtag;
 	struct newah *ah;
 	int hl, rplen, authsize;
 
 	struct cryptodesc *crda;
 	struct cryptop *crp;
 
 	IPSEC_ASSERT(sav != NULL, ("null SA"));
 	IPSEC_ASSERT(sav->key_auth != NULL, ("null authentication key"));
 	IPSEC_ASSERT(sav->tdb_authalgxform != NULL,
 		("null authentication xform"));
 
 	/* Figure out header size. */
 	rplen = HDRSIZE(sav);
 
 	/* XXX don't pullup, just copy header */
 	IP6_EXTHDR_GET(ah, struct newah *, m, skip, rplen);
 	if (ah == NULL) {
 		DPRINTF(("ah_input: cannot pullup header\n"));
 		AHSTAT_INC(ahs_hdrops);		/*XXX*/
 		m_freem(m);
 		return ENOBUFS;
 	}
 
 	/* Check replay window, if applicable. */
 	if (sav->replay && !ipsec_chkreplay(ntohl(ah->ah_seq), sav)) {
 		AHSTAT_INC(ahs_replay);
 		DPRINTF(("%s: packet replay failure: %s\n", __func__,
 			  ipsec_logsastr(sav)));
 		m_freem(m);
 		return ENOBUFS;
 	}
 
 	/* Verify AH header length. */
 	hl = ah->ah_len * sizeof (u_int32_t);
 	ahx = sav->tdb_authalgxform;
 	authsize = AUTHSIZE(sav);
 	if (hl != authsize + rplen - sizeof (struct ah)) {
 		DPRINTF(("%s: bad authenticator length %u (expecting %lu)"
 			" for packet in SA %s/%08lx\n", __func__,
 			hl, (u_long) (authsize + rplen - sizeof (struct ah)),
 			ipsec_address(&sav->sah->saidx.dst),
 			(u_long) ntohl(sav->spi)));
 		AHSTAT_INC(ahs_badauthl);
 		m_freem(m);
 		return EACCES;
 	}
 	AHSTAT_ADD(ahs_ibytes, m->m_pkthdr.len - skip - hl);
 
 	/* Get crypto descriptors. */
 	crp = crypto_getreq(1);
 	if (crp == NULL) {
 		DPRINTF(("%s: failed to acquire crypto descriptor\n",__func__));
 		AHSTAT_INC(ahs_crypto);
 		m_freem(m);
 		return ENOBUFS;
 	}
 
 	crda = crp->crp_desc;
 	IPSEC_ASSERT(crda != NULL, ("null crypto descriptor"));
 
 	crda->crd_skip = 0;
 	crda->crd_len = m->m_pkthdr.len;
 	crda->crd_inject = skip + rplen;
 
 	/* Authentication operation. */
 	crda->crd_alg = ahx->type;
 	crda->crd_klen = _KEYBITS(sav->key_auth);
 	crda->crd_key = sav->key_auth->key_data;
 
 	/* Find out if we've already done crypto. */
 	for (mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_CRYPTO_DONE, NULL);
 	     mtag != NULL;
 	     mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_CRYPTO_DONE, mtag)) {
 		tdbi = (struct tdb_ident *) (mtag + 1);
 		if (tdbi->proto == sav->sah->saidx.proto &&
 		    tdbi->spi == sav->spi &&
 		    !bcmp(&tdbi->dst, &sav->sah->saidx.dst,
 			  sizeof (union sockaddr_union)))
 			break;
 	}
 
 	/* Allocate IPsec-specific opaque crypto info. */
 	if (mtag == NULL) {
 		tc = (struct tdb_crypto *) malloc(sizeof (struct tdb_crypto) +
 			skip + rplen + authsize, M_XDATA, M_NOWAIT|M_ZERO);
 	} else {
 		/* Hash verification has already been done successfully. */
 		tc = (struct tdb_crypto *) malloc(sizeof (struct tdb_crypto),
 						    M_XDATA, M_NOWAIT|M_ZERO);
 	}
 	if (tc == NULL) {
 		DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__));
 		AHSTAT_INC(ahs_crypto);
 		crypto_freereq(crp);
 		m_freem(m);
 		return ENOBUFS;
 	}
 
 	/* Only save information if crypto processing is needed. */
 	if (mtag == NULL) {
 		int error;
 
 		/*
 		 * Save the authenticator, the skipped portion of the packet,
 		 * and the AH header.
 		 */
 		m_copydata(m, 0, skip + rplen + authsize, (caddr_t)(tc+1));
 
 		/* Zeroize the authenticator on the packet. */
 		m_copyback(m, skip + rplen, authsize, ipseczeroes);
 
 		/* "Massage" the packet headers for crypto processing. */
 		error = ah_massage_headers(&m, sav->sah->saidx.dst.sa.sa_family,
 		    skip, ahx->type, 0);
 		if (error != 0) {
 			/* NB: mbuf is free'd by ah_massage_headers */
 			AHSTAT_INC(ahs_hdrops);
 			free(tc, M_XDATA);
 			crypto_freereq(crp);
 			return error;
 		}
 	}
 
 	/* Crypto operation descriptor. */
 	crp->crp_ilen = m->m_pkthdr.len; /* Total input length. */
 	crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
 	crp->crp_buf = (caddr_t) m;
 	crp->crp_callback = ah_input_cb;
 	crp->crp_sid = sav->tdb_cryptoid;
 	crp->crp_opaque = (caddr_t) tc;
 
 	/* These are passed as-is to the callback. */
 	tc->tc_spi = sav->spi;
 	tc->tc_dst = sav->sah->saidx.dst;
 	tc->tc_proto = sav->sah->saidx.proto;
 	tc->tc_nxt = ah->ah_nxt;
 	tc->tc_protoff = protoff;
 	tc->tc_skip = skip;
 	tc->tc_ptr = (caddr_t) mtag; /* Save the mtag we've identified. */
 	KEY_ADDREFSA(sav);
 	tc->tc_sav = sav;
 
 	if (mtag == NULL)
 		return crypto_dispatch(crp);
 	else
 		return ah_input_cb(crp);
 }
 
 /*
  * AH input callback from the crypto driver.
  */
 static int
 ah_input_cb(struct cryptop *crp)
 {
 	int rplen, error, skip, protoff;
 	unsigned char calc[AH_ALEN_MAX];
 	struct mbuf *m;
 	struct cryptodesc *crd;
 	struct auth_hash *ahx;
 	struct tdb_crypto *tc;
 	struct m_tag *mtag;
 	struct secasvar *sav;
 	struct secasindex *saidx;
 	u_int8_t nxt;
 	caddr_t ptr;
 	int authsize;
 
 	crd = crp->crp_desc;
 
 	tc = (struct tdb_crypto *) crp->crp_opaque;
 	IPSEC_ASSERT(tc != NULL, ("null opaque crypto data area!"));
 	skip = tc->tc_skip;
 	nxt = tc->tc_nxt;
 	protoff = tc->tc_protoff;
 	mtag = (struct m_tag *) tc->tc_ptr;
 	m = (struct mbuf *) crp->crp_buf;
 
 	sav = tc->tc_sav;
 	IPSEC_ASSERT(sav != NULL, ("null SA!"));
 
 	saidx = &sav->sah->saidx;
 	IPSEC_ASSERT(saidx->dst.sa.sa_family == AF_INET ||
 		saidx->dst.sa.sa_family == AF_INET6,
 		("unexpected protocol family %u", saidx->dst.sa.sa_family));
 
 	ahx = (struct auth_hash *) sav->tdb_authalgxform;
 
 	/* Check for crypto errors. */
 	if (crp->crp_etype) {
 		if (sav->tdb_cryptoid != 0)
 			sav->tdb_cryptoid = crp->crp_sid;
 
 		if (crp->crp_etype == EAGAIN)
 			return (crypto_dispatch(crp));
 
 		AHSTAT_INC(ahs_noxform);
 		DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
 		error = crp->crp_etype;
 		goto bad;
 	} else {
 		AHSTAT_INC(ahs_hist[sav->alg_auth]);
 		crypto_freereq(crp);		/* No longer needed. */
 		crp = NULL;
 	}
 
 	/* Shouldn't happen... */
 	if (m == NULL) {
 		AHSTAT_INC(ahs_crypto);
 		DPRINTF(("%s: bogus returned buffer from crypto\n", __func__));
 		error = EINVAL;
 		goto bad;
 	}
 
 	/* Figure out header size. */
 	rplen = HDRSIZE(sav);
 	authsize = AUTHSIZE(sav);
 
 	/* Copy authenticator off the packet. */
 	m_copydata(m, skip + rplen, authsize, calc);
 
 	/*
 	 * If we have an mtag, we don't need to verify the authenticator --
 	 * it has been verified by an IPsec-aware NIC.
 	 */
 	if (mtag == NULL) {
 		ptr = (caddr_t) (tc + 1);
 
 		/* Verify authenticator. */
 		if (bcmp(ptr + skip + rplen, calc, authsize)) {
 			DPRINTF(("%s: authentication hash mismatch for packet "
 			    "in SA %s/%08lx\n", __func__,
 			    ipsec_address(&saidx->dst),
 			    (u_long) ntohl(sav->spi)));
 			AHSTAT_INC(ahs_badauth);
 			error = EACCES;
 			goto bad;
 		}
 
 		/* Fix the Next Protocol field. */
 		((u_int8_t *) ptr)[protoff] = nxt;
 
 		/* Copyback the saved (uncooked) network headers. */
 		m_copyback(m, 0, skip, ptr);
 	} else {
 		/* Fix the Next Protocol field. */
 		m_copyback(m, protoff, sizeof(u_int8_t), &nxt);
 	}
 
 	free(tc, M_XDATA), tc = NULL;			/* No longer needed */
 
 	/*
 	 * Header is now authenticated.
 	 */
 	m->m_flags |= M_AUTHIPHDR|M_AUTHIPDGM;
 
 	/*
 	 * Update replay sequence number, if appropriate.
 	 */
 	if (sav->replay) {
 		u_int32_t seq;
 
 		m_copydata(m, skip + offsetof(struct newah, ah_seq),
 			   sizeof (seq), (caddr_t) &seq);
 		if (ipsec_updatereplay(ntohl(seq), sav)) {
 			AHSTAT_INC(ahs_replay);
 			error = ENOBUFS;			/*XXX as above*/
 			goto bad;
 		}
 	}
 
 	/*
 	 * Remove the AH header and authenticator from the mbuf.
 	 */
 	error = m_striphdr(m, skip, rplen + authsize);
 	if (error) {
 		DPRINTF(("%s: mangled mbuf chain for SA %s/%08lx\n", __func__,
 		    ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi)));
 
 		AHSTAT_INC(ahs_hdrops);
 		goto bad;
 	}
 
 	switch (saidx->dst.sa.sa_family) {
 #ifdef INET6
 	case AF_INET6:
 		error = ipsec6_common_input_cb(m, sav, skip, protoff, mtag);
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		error = ipsec4_common_input_cb(m, sav, skip, protoff, mtag);
 		break;
 #endif
 	default:
 		panic("%s: Unexpected address family: %d saidx=%p", __func__,
 		    saidx->dst.sa.sa_family, saidx);
 	}
 
 	KEY_FREESAV(&sav);
 	return error;
 bad:
 	if (sav)
 		KEY_FREESAV(&sav);
 	if (m != NULL)
 		m_freem(m);
 	if (tc != NULL)
 		free(tc, M_XDATA);
 	if (crp != NULL)
 		crypto_freereq(crp);
 	return error;
 }
 
 /*
  * AH output routine, called by ipsec[46]_process_packet().
  */
 static int
 ah_output(
 	struct mbuf *m,
 	struct ipsecrequest *isr,
 	struct mbuf **mp,
 	int skip,
 	int protoff)
 {
 	struct secasvar *sav;
 	struct auth_hash *ahx;
 	struct cryptodesc *crda;
 	struct tdb_crypto *tc;
 	struct mbuf *mi;
 	struct cryptop *crp;
 	u_int16_t iplen;
 	int error, rplen, authsize, maxpacketsize, roff;
 	u_int8_t prot;
 	struct newah *ah;
 
 	sav = isr->sav;
 	IPSEC_ASSERT(sav != NULL, ("null SA"));
 	ahx = sav->tdb_authalgxform;
 	IPSEC_ASSERT(ahx != NULL, ("null authentication xform"));
 
 	AHSTAT_INC(ahs_output);
 
 	/* Figure out header size. */
 	rplen = HDRSIZE(sav);
 
 	/* Check for maximum packet size violations. */
 	switch (sav->sah->saidx.dst.sa.sa_family) {
 #ifdef INET
 	case AF_INET:
 		maxpacketsize = IP_MAXPACKET;
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		maxpacketsize = IPV6_MAXPACKET;
 		break;
 #endif /* INET6 */
 	default:
 		DPRINTF(("%s: unknown/unsupported protocol family %u, "
 		    "SA %s/%08lx\n", __func__,
 		    sav->sah->saidx.dst.sa.sa_family,
 		    ipsec_address(&sav->sah->saidx.dst),
 		    (u_long) ntohl(sav->spi)));
 		AHSTAT_INC(ahs_nopf);
 		error = EPFNOSUPPORT;
 		goto bad;
 	}
 	authsize = AUTHSIZE(sav);
 	if (rplen + authsize + m->m_pkthdr.len > maxpacketsize) {
 		DPRINTF(("%s: packet in SA %s/%08lx got too big "
 		    "(len %u, max len %u)\n", __func__,
 		    ipsec_address(&sav->sah->saidx.dst),
 		    (u_long) ntohl(sav->spi),
 		    rplen + authsize + m->m_pkthdr.len, maxpacketsize));
 		AHSTAT_INC(ahs_toobig);
 		error = EMSGSIZE;
 		goto bad;
 	}
 
 	/* Update the counters. */
 	AHSTAT_ADD(ahs_obytes, m->m_pkthdr.len - skip);
 
 	m = m_unshare(m, M_NOWAIT);
 	if (m == NULL) {
 		DPRINTF(("%s: cannot clone mbuf chain, SA %s/%08lx\n", __func__,
 		    ipsec_address(&sav->sah->saidx.dst),
 		    (u_long) ntohl(sav->spi)));
 		AHSTAT_INC(ahs_hdrops);
 		error = ENOBUFS;
 		goto bad;
 	}
 
 	/* Inject AH header. */
 	mi = m_makespace(m, skip, rplen + authsize, &roff);
 	if (mi == NULL) {
 		DPRINTF(("%s: failed to inject %u byte AH header for SA "
 		    "%s/%08lx\n", __func__,
 		    rplen + authsize,
 		    ipsec_address(&sav->sah->saidx.dst),
 		    (u_long) ntohl(sav->spi)));
 		AHSTAT_INC(ahs_hdrops);		/*XXX differs from openbsd */
 		error = ENOBUFS;
 		goto bad;
 	}
 
 	/*
 	 * The AH header is guaranteed by m_makespace() to be in
 	 * contiguous memory, at roff bytes offset into the returned mbuf.
 	 */
 	ah = (struct newah *)(mtod(mi, caddr_t) + roff);
 
 	/* Initialize the AH header. */
 	m_copydata(m, protoff, sizeof(u_int8_t), (caddr_t) &ah->ah_nxt);
 	ah->ah_len = (rplen + authsize - sizeof(struct ah)) / sizeof(u_int32_t);
 	ah->ah_reserve = 0;
 	ah->ah_spi = sav->spi;
 
 	/* Zeroize authenticator. */
 	m_copyback(m, skip + rplen, authsize, ipseczeroes);
 
 	/* Insert packet replay counter, as requested.  */
 	if (sav->replay) {
 		if (sav->replay->count == ~0 &&
 		    (sav->flags & SADB_X_EXT_CYCSEQ) == 0) {
 			DPRINTF(("%s: replay counter wrapped for SA %s/%08lx\n",
 				__func__,
 				ipsec_address(&sav->sah->saidx.dst),
 				(u_long) ntohl(sav->spi)));
 			AHSTAT_INC(ahs_wrap);
 			error = EINVAL;
 			goto bad;
 		}
 #ifdef REGRESSION
 		/* Emulate replay attack when ipsec_replay is TRUE. */
 		if (!V_ipsec_replay)
 #endif
 			sav->replay->count++;
 		ah->ah_seq = htonl(sav->replay->count);
 	}
 
 	/* Get crypto descriptors. */
 	crp = crypto_getreq(1);
 	if (crp == NULL) {
 		DPRINTF(("%s: failed to acquire crypto descriptors\n",
 			__func__));
 		AHSTAT_INC(ahs_crypto);
 		error = ENOBUFS;
 		goto bad;
 	}
 
 	crda = crp->crp_desc;
 
 	crda->crd_skip = 0;
 	crda->crd_inject = skip + rplen;
 	crda->crd_len = m->m_pkthdr.len;
 
 	/* Authentication operation. */
 	crda->crd_alg = ahx->type;
 	crda->crd_key = sav->key_auth->key_data;
 	crda->crd_klen = _KEYBITS(sav->key_auth);
 
 	/* Allocate IPsec-specific opaque crypto info. */
 	tc = (struct tdb_crypto *) malloc(
 		sizeof(struct tdb_crypto) + skip, M_XDATA, M_NOWAIT|M_ZERO);
 	if (tc == NULL) {
 		crypto_freereq(crp);
 		DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__));
 		AHSTAT_INC(ahs_crypto);
 		error = ENOBUFS;
 		goto bad;
 	}
 
 	/* Save the skipped portion of the packet. */
 	m_copydata(m, 0, skip, (caddr_t) (tc + 1));
 
 	/*
 	 * Fix IP header length on the header used for
 	 * authentication. We don't need to fix the original
 	 * header length as it will be fixed by our caller.
 	 */
 	switch (sav->sah->saidx.dst.sa.sa_family) {
 #ifdef INET
 	case AF_INET:
 		bcopy(((caddr_t)(tc + 1)) +
 		    offsetof(struct ip, ip_len),
 		    (caddr_t) &iplen, sizeof(u_int16_t));
 		iplen = htons(ntohs(iplen) + rplen + authsize);
 		m_copyback(m, offsetof(struct ip, ip_len),
 		    sizeof(u_int16_t), (caddr_t) &iplen);
 		break;
 #endif /* INET */
 
 #ifdef INET6
 	case AF_INET6:
 		bcopy(((caddr_t)(tc + 1)) +
 		    offsetof(struct ip6_hdr, ip6_plen),
 		    (caddr_t) &iplen, sizeof(u_int16_t));
 		iplen = htons(ntohs(iplen) + rplen + authsize);
 		m_copyback(m, offsetof(struct ip6_hdr, ip6_plen),
 		    sizeof(u_int16_t), (caddr_t) &iplen);
 		break;
 #endif /* INET6 */
 	}
 
 	/* Fix the Next Header field in saved header. */
 	((u_int8_t *) (tc + 1))[protoff] = IPPROTO_AH;
 
 	/* Update the Next Protocol field in the IP header. */
 	prot = IPPROTO_AH;
 	m_copyback(m, protoff, sizeof(u_int8_t), (caddr_t) &prot);
 
 	/* "Massage" the packet headers for crypto processing. */
 	error = ah_massage_headers(&m, sav->sah->saidx.dst.sa.sa_family,
 			skip, ahx->type, 1);
 	if (error != 0) {
 		m = NULL;	/* mbuf was free'd by ah_massage_headers. */
 		free(tc, M_XDATA);
 		crypto_freereq(crp);
 		goto bad;
 	}
 
 	/* Crypto operation descriptor. */
 	crp->crp_ilen = m->m_pkthdr.len; /* Total input length. */
 	crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
 	crp->crp_buf = (caddr_t) m;
 	crp->crp_callback = ah_output_cb;
 	crp->crp_sid = sav->tdb_cryptoid;
 	crp->crp_opaque = (caddr_t) tc;
 
 	/* These are passed as-is to the callback. */
 	tc->tc_isr = isr;
 	KEY_ADDREFSA(sav);
 	tc->tc_sav = sav;
 	tc->tc_spi = sav->spi;
 	tc->tc_dst = sav->sah->saidx.dst;
 	tc->tc_proto = sav->sah->saidx.proto;
 	tc->tc_skip = skip;
 	tc->tc_protoff = protoff;
 
 	return crypto_dispatch(crp);
 bad:
 	if (m)
 		m_freem(m);
 	return (error);
 }
 
 /*
  * AH output callback from the crypto driver.
  */
 static int
 ah_output_cb(struct cryptop *crp)
 {
 	int skip, protoff, error;
 	struct tdb_crypto *tc;
 	struct ipsecrequest *isr;
 	struct secasvar *sav;
 	struct mbuf *m;
 	caddr_t ptr;
 
 	tc = (struct tdb_crypto *) crp->crp_opaque;
 	IPSEC_ASSERT(tc != NULL, ("null opaque data area!"));
 	skip = tc->tc_skip;
 	protoff = tc->tc_protoff;
 	ptr = (caddr_t) (tc + 1);
 	m = (struct mbuf *) crp->crp_buf;
 
 	isr = tc->tc_isr;
 	IPSECREQUEST_LOCK(isr);
 	sav = tc->tc_sav;
 	/* With the isr lock released SA pointer can be updated. */
 	if (sav != isr->sav) {
 		AHSTAT_INC(ahs_notdb);
 		DPRINTF(("%s: SA expired while in crypto\n", __func__));
 		error = ENOBUFS;		/*XXX*/
 		goto bad;
 	}
 
 	/* Check for crypto errors. */
 	if (crp->crp_etype) {
 		if (sav->tdb_cryptoid != 0)
 			sav->tdb_cryptoid = crp->crp_sid;
 
 		if (crp->crp_etype == EAGAIN) {
 			IPSECREQUEST_UNLOCK(isr);
 			return (crypto_dispatch(crp));
 		}
 
 		AHSTAT_INC(ahs_noxform);
 		DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
 		error = crp->crp_etype;
 		goto bad;
 	}
 
 	/* Shouldn't happen... */
 	if (m == NULL) {
 		AHSTAT_INC(ahs_crypto);
 		DPRINTF(("%s: bogus returned buffer from crypto\n", __func__));
 		error = EINVAL;
 		goto bad;
 	}
 	AHSTAT_INC(ahs_hist[sav->alg_auth]);
 
 	/*
 	 * Copy original headers (with the new protocol number) back
 	 * in place.
 	 */
 	m_copyback(m, 0, skip, ptr);
 
 	/* No longer needed. */
 	free(tc, M_XDATA);
 	crypto_freereq(crp);
 
 #ifdef REGRESSION
 	/* Emulate man-in-the-middle attack when ipsec_integrity is TRUE. */
 	if (V_ipsec_integrity) {
 		int alen;
 
 		/*
 		 * Corrupt HMAC if we want to test integrity verification of
 		 * the other side.
 		 */
 		alen = AUTHSIZE(sav);
 		m_copyback(m, m->m_pkthdr.len - alen, alen, ipseczeroes);
 	}
 #endif
 
 	/* NB: m is reclaimed by ipsec_process_done. */
 	error = ipsec_process_done(m, isr);
 	KEY_FREESAV(&sav);
 	IPSECREQUEST_UNLOCK(isr);
 	return error;
 bad:
 	if (sav)
 		KEY_FREESAV(&sav);
 	IPSECREQUEST_UNLOCK(isr);
 	if (m)
 		m_freem(m);
 	free(tc, M_XDATA);
 	crypto_freereq(crp);
 	return error;
 }
 
 static struct xformsw ah_xformsw = {
 	XF_AH,		XFT_AUTH,	"IPsec AH",
 	ah_init,	ah_zeroize,	ah_input,	ah_output,
 };
 
 static void
 ah_attach(void)
 {
 
 	xform_register(&ah_xformsw);
 }
 
 SYSINIT(ah_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ah_attach, NULL);
Index: user/dchagin/lemul/sys/netipsec/xform_esp.c
===================================================================
--- user/dchagin/lemul/sys/netipsec/xform_esp.c	(revision 274240)
+++ user/dchagin/lemul/sys/netipsec/xform_esp.c	(revision 274241)
@@ -1,1031 +1,1031 @@
 /*	$FreeBSD$	*/
 /*	$OpenBSD: ip_esp.c,v 1.69 2001/06/26 06:18:59 angelos Exp $ */
 /*-
  * The authors of this code are John Ioannidis (ji@tla.org),
  * Angelos D. Keromytis (kermit@csd.uch.gr) and
  * Niels Provos (provos@physnet.uni-hamburg.de).
  *
  * The original version of this code was written by John Ioannidis
  * for BSD/OS in Athens, Greece, in November 1995.
  *
  * Ported to OpenBSD and NetBSD, with additional transforms, in December 1996,
  * by Angelos D. Keromytis.
  *
  * Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis
  * and Niels Provos.
  *
  * Additional features in 1999 by Angelos D. Keromytis.
  *
  * Copyright (C) 1995, 1996, 1997, 1998, 1999 by John Ioannidis,
  * Angelos D. Keromytis and Niels Provos.
  * Copyright (c) 2001 Angelos D. Keromytis.
  *
  * Permission to use, copy, and modify this software with or without fee
  * is hereby granted, provided that this entire notice is included in
  * all copies of any software which is or includes a copy or
  * modification of this software.
  * You may use this code under the GNU public license if you so wish. Please
  * contribute changes back to the authors under this freer than GPL license
  * so that we may further the use of strong encryption without limitations to
  * all.
  *
  * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
  * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
  * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
  * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
  * PURPOSE.
  */
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/syslog.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/random.h>
 #include <sys/rwlock.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_ecn.h>
 #include <netinet/ip6.h>
 
 #include <net/route.h>
 #include <netipsec/ipsec.h>
 #include <netipsec/ah.h>
 #include <netipsec/ah_var.h>
 #include <netipsec/esp.h>
 #include <netipsec/esp_var.h>
 #include <netipsec/xform.h>
 
 #ifdef INET6
 #include <netinet6/ip6_var.h>
 #include <netipsec/ipsec6.h>
 #include <netinet6/ip6_ecn.h>
 #endif
 
 #include <netipsec/key.h>
 #include <netipsec/key_debug.h>
 
 #include <opencrypto/cryptodev.h>
 #include <opencrypto/xform.h>
 
 VNET_DEFINE(int, esp_enable) = 1;
 VNET_PCPUSTAT_DEFINE(struct espstat, espstat);
 VNET_PCPUSTAT_SYSINIT(espstat);
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(espstat);
 #endif /* VIMAGE */
 
 SYSCTL_DECL(_net_inet_esp);
-SYSCTL_VNET_INT(_net_inet_esp, OID_AUTO,
-	esp_enable,	CTLFLAG_RW,	&VNET_NAME(esp_enable),	0, "");
+SYSCTL_INT(_net_inet_esp, OID_AUTO, esp_enable,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(esp_enable), 0, "");
 SYSCTL_VNET_PCPUSTAT(_net_inet_esp, IPSECCTL_STATS, stats,
     struct espstat, espstat,
     "ESP statistics (struct espstat, netipsec/esp_var.h");
 
 static int esp_input_cb(struct cryptop *op);
 static int esp_output_cb(struct cryptop *crp);
 
 /*
  * NB: this is public for use by the PF_KEY support.
  * NB: if you add support here; be sure to add code to esp_attach below!
  */
 struct enc_xform *
 esp_algorithm_lookup(int alg)
 {
 	if (alg >= ESP_ALG_MAX)
 		return NULL;
 	switch (alg) {
 	case SADB_EALG_DESCBC:
 		return &enc_xform_des;
 	case SADB_EALG_3DESCBC:
 		return &enc_xform_3des;
 	case SADB_X_EALG_AES:
 		return &enc_xform_rijndael128;
 	case SADB_X_EALG_BLOWFISHCBC:
 		return &enc_xform_blf;
 	case SADB_X_EALG_CAST128CBC:
 		return &enc_xform_cast5;
 	case SADB_X_EALG_SKIPJACK:
 		return &enc_xform_skipjack;
 	case SADB_EALG_NULL:
 		return &enc_xform_null;
 	case SADB_X_EALG_CAMELLIACBC:
 		return &enc_xform_camellia;
 	}
 	return NULL;
 }
 
 size_t
 esp_hdrsiz(struct secasvar *sav)
 {
 	size_t size;
 
 	if (sav != NULL) {
 		/*XXX not right for null algorithm--does it matter??*/
 		IPSEC_ASSERT(sav->tdb_encalgxform != NULL,
 			("SA with null xform"));
 		if (sav->flags & SADB_X_EXT_OLD)
 			size = sizeof (struct esp);
 		else
 			size = sizeof (struct newesp);
 		size += sav->tdb_encalgxform->blocksize + 9;
 		/*XXX need alg check???*/
 		if (sav->tdb_authalgxform != NULL && sav->replay)
 			size += ah_hdrsiz(sav);
 	} else {
 		/*
 		 *   base header size
 		 * + max iv length for CBC mode
 		 * + max pad length
 		 * + sizeof (pad length field)
 		 * + sizeof (next header field)
 		 * + max icv supported.
 		 */
 		size = sizeof (struct newesp) + EALG_MAX_BLOCK_LEN + 9 + 16;
 	}
 	return size;
 }
 
 /*
  * esp_init() is called when an SPI is being set up.
  */
 static int
 esp_init(struct secasvar *sav, struct xformsw *xsp)
 {
 	struct enc_xform *txform;
 	struct cryptoini cria, crie;
 	int keylen;
 	int error;
 
 	txform = esp_algorithm_lookup(sav->alg_enc);
 	if (txform == NULL) {
 		DPRINTF(("%s: unsupported encryption algorithm %d\n",
 			__func__, sav->alg_enc));
 		return EINVAL;
 	}
 	if (sav->key_enc == NULL) {
 		DPRINTF(("%s: no encoding key for %s algorithm\n",
 			 __func__, txform->name));
 		return EINVAL;
 	}
 	if ((sav->flags&(SADB_X_EXT_OLD|SADB_X_EXT_IV4B)) == SADB_X_EXT_IV4B) {
 		DPRINTF(("%s: 4-byte IV not supported with protocol\n",
 			__func__));
 		return EINVAL;
 	}
 	keylen = _KEYLEN(sav->key_enc);
 	if (txform->minkey > keylen || keylen > txform->maxkey) {
 		DPRINTF(("%s: invalid key length %u, must be in the range "
 			"[%u..%u] for algorithm %s\n", __func__,
 			keylen, txform->minkey, txform->maxkey,
 			txform->name));
 		return EINVAL;
 	}
 
 	/*
 	 * NB: The null xform needs a non-zero blocksize to keep the
 	 *      crypto code happy but if we use it to set ivlen then
 	 *      the ESP header will be processed incorrectly.  The
 	 *      compromise is to force it to zero here.
 	 */
 	sav->ivlen = (txform == &enc_xform_null ? 0 : txform->blocksize);
 	sav->iv = (caddr_t) malloc(sav->ivlen, M_XDATA, M_WAITOK);
 	key_randomfill(sav->iv, sav->ivlen);	/*XXX*/
 
 	/*
 	 * Setup AH-related state.
 	 */
 	if (sav->alg_auth != 0) {
 		error = ah_init0(sav, xsp, &cria);
 		if (error)
 			return error;
 	}
 
 	/* NB: override anything set in ah_init0 */
 	sav->tdb_xform = xsp;
 	sav->tdb_encalgxform = txform;
 
 	/* Initialize crypto session. */
 	bzero(&crie, sizeof (crie));
 	crie.cri_alg = sav->tdb_encalgxform->type;
 	crie.cri_klen = _KEYBITS(sav->key_enc);
 	crie.cri_key = sav->key_enc->key_data;
 	/* XXX Rounds ? */
 
 	if (sav->tdb_authalgxform && sav->tdb_encalgxform) {
 		/* init both auth & enc */
 		crie.cri_next = &cria;
 		error = crypto_newsession(&sav->tdb_cryptoid,
 					  &crie, V_crypto_support);
 	} else if (sav->tdb_encalgxform) {
 		error = crypto_newsession(&sav->tdb_cryptoid,
 					  &crie, V_crypto_support);
 	} else if (sav->tdb_authalgxform) {
 		error = crypto_newsession(&sav->tdb_cryptoid,
 					  &cria, V_crypto_support);
 	} else {
 		/* XXX cannot happen? */
 		DPRINTF(("%s: no encoding OR authentication xform!\n",
 			__func__));
 		error = EINVAL;
 	}
 	return error;
 }
 
 /*
  * Paranoia.
  */
 static int
 esp_zeroize(struct secasvar *sav)
 {
 	/* NB: ah_zerorize free's the crypto session state */
 	int error = ah_zeroize(sav);
 
 	if (sav->key_enc)
 		bzero(sav->key_enc->key_data, _KEYLEN(sav->key_enc));
 	if (sav->iv) {
 		free(sav->iv, M_XDATA);
 		sav->iv = NULL;
 	}
 	sav->tdb_encalgxform = NULL;
 	sav->tdb_xform = NULL;
 	return error;
 }
 
 /*
  * ESP input processing, called (eventually) through the protocol switch.
  */
 static int
 esp_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff)
 {
 	struct auth_hash *esph;
 	struct enc_xform *espx;
 	struct tdb_ident *tdbi;
 	struct tdb_crypto *tc;
 	int plen, alen, hlen;
 	struct m_tag *mtag;
 	struct newesp *esp;
 
 	struct cryptodesc *crde;
 	struct cryptop *crp;
 
 	IPSEC_ASSERT(sav != NULL, ("null SA"));
 	IPSEC_ASSERT(sav->tdb_encalgxform != NULL, ("null encoding xform"));
 
 	/* Valid IP Packet length ? */
 	if ( (skip&3) || (m->m_pkthdr.len&3) ){
 		DPRINTF(("%s: misaligned packet, skip %u pkt len %u",
 				__func__, skip, m->m_pkthdr.len));
 		ESPSTAT_INC(esps_badilen);
 		m_freem(m);
 		return EINVAL;
 	}
 
 	/* XXX don't pullup, just copy header */
 	IP6_EXTHDR_GET(esp, struct newesp *, m, skip, sizeof (struct newesp));
 
 	esph = sav->tdb_authalgxform;
 	espx = sav->tdb_encalgxform;
 
 	/* Determine the ESP header length */
 	if (sav->flags & SADB_X_EXT_OLD)
 		hlen = sizeof (struct esp) + sav->ivlen;
 	else
 		hlen = sizeof (struct newesp) + sav->ivlen;
 	/* Authenticator hash size */
 	if (esph != NULL) {
 		switch (esph->type) {
 		case CRYPTO_SHA2_256_HMAC:
 		case CRYPTO_SHA2_384_HMAC:
 		case CRYPTO_SHA2_512_HMAC:
 			alen = esph->hashsize/2;
 			break;
 		default:
 			alen = AH_HMAC_HASHLEN;
 			break;
 		}
 	}else
 		alen = 0;
 
 	/*
 	 * Verify payload length is multiple of encryption algorithm
 	 * block size.
 	 *
 	 * NB: This works for the null algorithm because the blocksize
 	 *     is 4 and all packets must be 4-byte aligned regardless
 	 *     of the algorithm.
 	 */
 	plen = m->m_pkthdr.len - (skip + hlen + alen);
 	if ((plen & (espx->blocksize - 1)) || (plen <= 0)) {
 		DPRINTF(("%s: payload of %d octets not a multiple of %d octets,"
 		    "  SA %s/%08lx\n", __func__,
 		    plen, espx->blocksize,
 		    ipsec_address(&sav->sah->saidx.dst),
 		    (u_long) ntohl(sav->spi)));
 		ESPSTAT_INC(esps_badilen);
 		m_freem(m);
 		return EINVAL;
 	}
 
 	/*
 	 * Check sequence number.
 	 */
 	if (esph && sav->replay && !ipsec_chkreplay(ntohl(esp->esp_seq), sav)) {
 		DPRINTF(("%s: packet replay check for %s\n", __func__,
 		    ipsec_logsastr(sav)));	/*XXX*/
 		ESPSTAT_INC(esps_replay);
 		m_freem(m);
 		return ENOBUFS;		/*XXX*/
 	}
 
 	/* Update the counters */
 	ESPSTAT_ADD(esps_ibytes, m->m_pkthdr.len - (skip + hlen + alen));
 
 	/* Find out if we've already done crypto */
 	for (mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_CRYPTO_DONE, NULL);
 	     mtag != NULL;
 	     mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_CRYPTO_DONE, mtag)) {
 		tdbi = (struct tdb_ident *) (mtag + 1);
 		if (tdbi->proto == sav->sah->saidx.proto &&
 		    tdbi->spi == sav->spi &&
 		    !bcmp(&tdbi->dst, &sav->sah->saidx.dst,
 			  sizeof(union sockaddr_union)))
 			break;
 	}
 
 	/* Get crypto descriptors */
 	crp = crypto_getreq(esph && espx ? 2 : 1);
 	if (crp == NULL) {
 		DPRINTF(("%s: failed to acquire crypto descriptors\n",
 			__func__));
 		ESPSTAT_INC(esps_crypto);
 		m_freem(m);
 		return ENOBUFS;
 	}
 
 	/* Get IPsec-specific opaque pointer */
 	if (esph == NULL || mtag != NULL)
 		tc = (struct tdb_crypto *) malloc(sizeof(struct tdb_crypto),
 		    M_XDATA, M_NOWAIT|M_ZERO);
 	else
 		tc = (struct tdb_crypto *) malloc(sizeof(struct tdb_crypto) + alen,
 		    M_XDATA, M_NOWAIT|M_ZERO);
 	if (tc == NULL) {
 		crypto_freereq(crp);
 		DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__));
 		ESPSTAT_INC(esps_crypto);
 		m_freem(m);
 		return ENOBUFS;
 	}
 
 	tc->tc_ptr = (caddr_t) mtag;
 
 	if (esph) {
 		struct cryptodesc *crda = crp->crp_desc;
 
 		IPSEC_ASSERT(crda != NULL, ("null ah crypto descriptor"));
 
 		/* Authentication descriptor */
 		crda->crd_skip = skip;
 		crda->crd_len = m->m_pkthdr.len - (skip + alen);
 		crda->crd_inject = m->m_pkthdr.len - alen;
 
 		crda->crd_alg = esph->type;
 		crda->crd_key = sav->key_auth->key_data;
 		crda->crd_klen = _KEYBITS(sav->key_auth);
 
 		/* Copy the authenticator */
 		if (mtag == NULL)
 			m_copydata(m, m->m_pkthdr.len - alen, alen,
 				   (caddr_t) (tc + 1));
 
 		/* Chain authentication request */
 		crde = crda->crd_next;
 	} else {
 		crde = crp->crp_desc;
 	}
 
 	/* Crypto operation descriptor */
 	crp->crp_ilen = m->m_pkthdr.len; /* Total input length */
 	crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
 	crp->crp_buf = (caddr_t) m;
 	crp->crp_callback = esp_input_cb;
 	crp->crp_sid = sav->tdb_cryptoid;
 	crp->crp_opaque = (caddr_t) tc;
 
 	/* These are passed as-is to the callback */
 	tc->tc_spi = sav->spi;
 	tc->tc_dst = sav->sah->saidx.dst;
 	tc->tc_proto = sav->sah->saidx.proto;
 	tc->tc_protoff = protoff;
 	tc->tc_skip = skip;
 	KEY_ADDREFSA(sav);
 	tc->tc_sav = sav;
 
 	/* Decryption descriptor */
 	if (espx) {
 		IPSEC_ASSERT(crde != NULL, ("null esp crypto descriptor"));
 		crde->crd_skip = skip + hlen;
 		crde->crd_len = m->m_pkthdr.len - (skip + hlen + alen);
 		crde->crd_inject = skip + hlen - sav->ivlen;
 
 		crde->crd_alg = espx->type;
 		crde->crd_key = sav->key_enc->key_data;
 		crde->crd_klen = _KEYBITS(sav->key_enc);
 		/* XXX Rounds ? */
 	}
 
 	if (mtag == NULL)
 		return crypto_dispatch(crp);
 	else
 		return esp_input_cb(crp);
 }
 
 /*
  * ESP input callback from the crypto driver.
  */
 static int
 esp_input_cb(struct cryptop *crp)
 {
 	u_int8_t lastthree[3], aalg[AH_HMAC_MAXHASHLEN];
 	int hlen, skip, protoff, error, alen;
 	struct mbuf *m;
 	struct cryptodesc *crd;
 	struct auth_hash *esph;
 	struct enc_xform *espx;
 	struct tdb_crypto *tc;
 	struct m_tag *mtag;
 	struct secasvar *sav;
 	struct secasindex *saidx;
 	caddr_t ptr;
 
 	crd = crp->crp_desc;
 	IPSEC_ASSERT(crd != NULL, ("null crypto descriptor!"));
 
 	tc = (struct tdb_crypto *) crp->crp_opaque;
 	IPSEC_ASSERT(tc != NULL, ("null opaque crypto data area!"));
 	skip = tc->tc_skip;
 	protoff = tc->tc_protoff;
 	mtag = (struct m_tag *) tc->tc_ptr;
 	m = (struct mbuf *) crp->crp_buf;
 
 	sav = tc->tc_sav;
 	IPSEC_ASSERT(sav != NULL, ("null SA!"));
 
 	saidx = &sav->sah->saidx;
 	IPSEC_ASSERT(saidx->dst.sa.sa_family == AF_INET ||
 		saidx->dst.sa.sa_family == AF_INET6,
 		("unexpected protocol family %u", saidx->dst.sa.sa_family));
 
 	esph = sav->tdb_authalgxform;
 	espx = sav->tdb_encalgxform;
 
 	/* Check for crypto errors */
 	if (crp->crp_etype) {
 		/* Reset the session ID */
 		if (sav->tdb_cryptoid != 0)
 			sav->tdb_cryptoid = crp->crp_sid;
 
 		if (crp->crp_etype == EAGAIN)
 			return (crypto_dispatch(crp));
 
 		ESPSTAT_INC(esps_noxform);
 		DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
 		error = crp->crp_etype;
 		goto bad;
 	}
 
 	/* Shouldn't happen... */
 	if (m == NULL) {
 		ESPSTAT_INC(esps_crypto);
 		DPRINTF(("%s: bogus returned buffer from crypto\n", __func__));
 		error = EINVAL;
 		goto bad;
 	}
 	ESPSTAT_INC(esps_hist[sav->alg_enc]);
 
 	/* If authentication was performed, check now. */
 	if (esph != NULL) {
 		switch (esph->type) {
 		case CRYPTO_SHA2_256_HMAC:
 		case CRYPTO_SHA2_384_HMAC:
 		case CRYPTO_SHA2_512_HMAC:
 			alen = esph->hashsize/2;
 			break;
 		default:
 			alen = AH_HMAC_HASHLEN;
 			break;
 		}
 		/*
 		 * If we have a tag, it means an IPsec-aware NIC did
 		 * the verification for us.  Otherwise we need to
 		 * check the authentication calculation.
 		 */
 		AHSTAT_INC(ahs_hist[sav->alg_auth]);
 		if (mtag == NULL) {
 			/* Copy the authenticator from the packet */
 			m_copydata(m, m->m_pkthdr.len - alen,
 				alen, aalg);
 
 			ptr = (caddr_t) (tc + 1);
 
 			/* Verify authenticator */
 			if (bcmp(ptr, aalg, alen) != 0) {
 				DPRINTF(("%s: "
 		    "authentication hash mismatch for packet in SA %s/%08lx\n",
 				    __func__,
 				    ipsec_address(&saidx->dst),
 				    (u_long) ntohl(sav->spi)));
 				ESPSTAT_INC(esps_badauth);
 				error = EACCES;
 				goto bad;
 			}
 		}
 
 		/* Remove trailing authenticator */
 		m_adj(m, -alen);
 	}
 
 	/* Release the crypto descriptors */
 	free(tc, M_XDATA), tc = NULL;
 	crypto_freereq(crp), crp = NULL;
 
 	/*
 	 * Packet is now decrypted.
 	 */
 	m->m_flags |= M_DECRYPTED;
 
 	/*
 	 * Update replay sequence number, if appropriate.
 	 */
 	if (sav->replay) {
 		u_int32_t seq;
 
 		m_copydata(m, skip + offsetof(struct newesp, esp_seq),
 			   sizeof (seq), (caddr_t) &seq);
 		if (ipsec_updatereplay(ntohl(seq), sav)) {
 			DPRINTF(("%s: packet replay check for %s\n", __func__,
 			    ipsec_logsastr(sav)));
 			ESPSTAT_INC(esps_replay);
 			error = ENOBUFS;
 			goto bad;
 		}
 	}
 
 	/* Determine the ESP header length */
 	if (sav->flags & SADB_X_EXT_OLD)
 		hlen = sizeof (struct esp) + sav->ivlen;
 	else
 		hlen = sizeof (struct newesp) + sav->ivlen;
 
 	/* Remove the ESP header and IV from the mbuf. */
 	error = m_striphdr(m, skip, hlen);
 	if (error) {
 		ESPSTAT_INC(esps_hdrops);
 		DPRINTF(("%s: bad mbuf chain, SA %s/%08lx\n", __func__,
 		    ipsec_address(&sav->sah->saidx.dst),
 		    (u_long) ntohl(sav->spi)));
 		goto bad;
 	}
 
 	/* Save the last three bytes of decrypted data */
 	m_copydata(m, m->m_pkthdr.len - 3, 3, lastthree);
 
 	/* Verify pad length */
 	if (lastthree[1] + 2 > m->m_pkthdr.len - skip) {
 		ESPSTAT_INC(esps_badilen);
 		DPRINTF(("%s: invalid padding length %d for %u byte packet "
 			"in SA %s/%08lx\n", __func__,
 			 lastthree[1], m->m_pkthdr.len - skip,
 			 ipsec_address(&sav->sah->saidx.dst),
 			 (u_long) ntohl(sav->spi)));
 		error = EINVAL;
 		goto bad;
 	}
 
 	/* Verify correct decryption by checking the last padding bytes */
 	if ((sav->flags & SADB_X_EXT_PMASK) != SADB_X_EXT_PRAND) {
 		if (lastthree[1] != lastthree[0] && lastthree[1] != 0) {
 			ESPSTAT_INC(esps_badenc);
 			DPRINTF(("%s: decryption failed for packet in "
 				"SA %s/%08lx\n", __func__,
 				ipsec_address(&sav->sah->saidx.dst),
 				(u_long) ntohl(sav->spi)));
 			error = EINVAL;
 			goto bad;
 		}
 	}
 
 	/* Trim the mbuf chain to remove trailing authenticator and padding */
 	m_adj(m, -(lastthree[1] + 2));
 
 	/* Restore the Next Protocol field */
 	m_copyback(m, protoff, sizeof (u_int8_t), lastthree + 2);
 
 	switch (saidx->dst.sa.sa_family) {
 #ifdef INET6
 	case AF_INET6:
 		error = ipsec6_common_input_cb(m, sav, skip, protoff, mtag);
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		error = ipsec4_common_input_cb(m, sav, skip, protoff, mtag);
 		break;
 #endif
 	default:
 		panic("%s: Unexpected address family: %d saidx=%p", __func__,
 		    saidx->dst.sa.sa_family, saidx);
 	}
 
 	KEY_FREESAV(&sav);
 	return error;
 bad:
 	if (sav)
 		KEY_FREESAV(&sav);
 	if (m != NULL)
 		m_freem(m);
 	if (tc != NULL)
 		free(tc, M_XDATA);
 	if (crp != NULL)
 		crypto_freereq(crp);
 	return error;
 }
 
 /*
  * ESP output routine, called by ipsec[46]_process_packet().
  */
 static int
 esp_output(
 	struct mbuf *m,
 	struct ipsecrequest *isr,
 	struct mbuf **mp,
 	int skip,
 	int protoff
 )
 {
 	struct enc_xform *espx;
 	struct auth_hash *esph;
 	int hlen, rlen, padding, blks, alen, i, roff;
 	struct mbuf *mo = (struct mbuf *) NULL;
 	struct tdb_crypto *tc;
 	struct secasvar *sav;
 	struct secasindex *saidx;
 	unsigned char *pad;
 	u_int8_t prot;
 	int error, maxpacketsize;
 
 	struct cryptodesc *crde = NULL, *crda = NULL;
 	struct cryptop *crp;
 
 	sav = isr->sav;
 	IPSEC_ASSERT(sav != NULL, ("null SA"));
 	esph = sav->tdb_authalgxform;
 	espx = sav->tdb_encalgxform;
 	IPSEC_ASSERT(espx != NULL, ("null encoding xform"));
 
 	if (sav->flags & SADB_X_EXT_OLD)
 		hlen = sizeof (struct esp) + sav->ivlen;
 	else
 		hlen = sizeof (struct newesp) + sav->ivlen;
 
 	rlen = m->m_pkthdr.len - skip;	/* Raw payload length. */
 	/*
 	 * NB: The null encoding transform has a blocksize of 4
 	 *     so that headers are properly aligned.
 	 */
 	blks = espx->blocksize;		/* IV blocksize */
 
 	/* XXX clamp padding length a la KAME??? */
 	padding = ((blks - ((rlen + 2) % blks)) % blks) + 2;
 
 	if (esph)
 		switch (esph->type) {
 		case CRYPTO_SHA2_256_HMAC:
 		case CRYPTO_SHA2_384_HMAC:
 		case CRYPTO_SHA2_512_HMAC:
 			alen = esph->hashsize/2;
 			break;
 		default:
 		alen = AH_HMAC_HASHLEN;
 			break;
 		}
 	else
 		alen = 0;
 
 	ESPSTAT_INC(esps_output);
 
 	saidx = &sav->sah->saidx;
 	/* Check for maximum packet size violations. */
 	switch (saidx->dst.sa.sa_family) {
 #ifdef INET
 	case AF_INET:
 		maxpacketsize = IP_MAXPACKET;
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		maxpacketsize = IPV6_MAXPACKET;
 		break;
 #endif /* INET6 */
 	default:
 		DPRINTF(("%s: unknown/unsupported protocol "
 		    "family %d, SA %s/%08lx\n", __func__,
 		    saidx->dst.sa.sa_family, ipsec_address(&saidx->dst),
 		    (u_long) ntohl(sav->spi)));
 		ESPSTAT_INC(esps_nopf);
 		error = EPFNOSUPPORT;
 		goto bad;
 	}
 	if (skip + hlen + rlen + padding + alen > maxpacketsize) {
 		DPRINTF(("%s: packet in SA %s/%08lx got too big "
 		    "(len %u, max len %u)\n", __func__,
 		    ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi),
 		    skip + hlen + rlen + padding + alen, maxpacketsize));
 		ESPSTAT_INC(esps_toobig);
 		error = EMSGSIZE;
 		goto bad;
 	}
 
 	/* Update the counters. */
 	ESPSTAT_ADD(esps_obytes, m->m_pkthdr.len - skip);
 
 	m = m_unshare(m, M_NOWAIT);
 	if (m == NULL) {
 		DPRINTF(("%s: cannot clone mbuf chain, SA %s/%08lx\n", __func__,
 		    ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi)));
 		ESPSTAT_INC(esps_hdrops);
 		error = ENOBUFS;
 		goto bad;
 	}
 
 	/* Inject ESP header. */
 	mo = m_makespace(m, skip, hlen, &roff);
 	if (mo == NULL) {
 		DPRINTF(("%s: %u byte ESP hdr inject failed for SA %s/%08lx\n",
 		    __func__, hlen, ipsec_address(&saidx->dst),
 		    (u_long) ntohl(sav->spi)));
 		ESPSTAT_INC(esps_hdrops);		/* XXX diffs from openbsd */
 		error = ENOBUFS;
 		goto bad;
 	}
 
 	/* Initialize ESP header. */
 	bcopy((caddr_t) &sav->spi, mtod(mo, caddr_t) + roff, sizeof(u_int32_t));
 	if (sav->replay) {
 		u_int32_t replay;
 
 #ifdef REGRESSION
 		/* Emulate replay attack when ipsec_replay is TRUE. */
 		if (!V_ipsec_replay)
 #endif
 			sav->replay->count++;
 		replay = htonl(sav->replay->count);
 		bcopy((caddr_t) &replay,
 		    mtod(mo, caddr_t) + roff + sizeof(u_int32_t),
 		    sizeof(u_int32_t));
 	}
 
 	/*
 	 * Add padding -- better to do it ourselves than use the crypto engine,
 	 * although if/when we support compression, we'd have to do that.
 	 */
 	pad = (u_char *) m_pad(m, padding + alen);
 	if (pad == NULL) {
 		DPRINTF(("%s: m_pad failed for SA %s/%08lx\n", __func__,
 		    ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi)));
 		m = NULL;		/* NB: free'd by m_pad */
 		error = ENOBUFS;
 		goto bad;
 	}
 
 	/*
 	 * Add padding: random, zero, or self-describing.
 	 * XXX catch unexpected setting
 	 */
 	switch (sav->flags & SADB_X_EXT_PMASK) {
 	case SADB_X_EXT_PRAND:
 		(void) read_random(pad, padding - 2);
 		break;
 	case SADB_X_EXT_PZERO:
 		bzero(pad, padding - 2);
 		break;
 	case SADB_X_EXT_PSEQ:
 		for (i = 0; i < padding - 2; i++)
 			pad[i] = i+1;
 		break;
 	}
 
 	/* Fix padding length and Next Protocol in padding itself. */
 	pad[padding - 2] = padding - 2;
 	m_copydata(m, protoff, sizeof(u_int8_t), pad + padding - 1);
 
 	/* Fix Next Protocol in IPv4/IPv6 header. */
 	prot = IPPROTO_ESP;
 	m_copyback(m, protoff, sizeof(u_int8_t), (u_char *) &prot);
 
 	/* Get crypto descriptors. */
 	crp = crypto_getreq(esph && espx ? 2 : 1);
 	if (crp == NULL) {
 		DPRINTF(("%s: failed to acquire crypto descriptors\n",
 			__func__));
 		ESPSTAT_INC(esps_crypto);
 		error = ENOBUFS;
 		goto bad;
 	}
 
 	if (espx) {
 		crde = crp->crp_desc;
 		crda = crde->crd_next;
 
 		/* Encryption descriptor. */
 		crde->crd_skip = skip + hlen;
 		crde->crd_len = m->m_pkthdr.len - (skip + hlen + alen);
 		crde->crd_flags = CRD_F_ENCRYPT;
 		crde->crd_inject = skip + hlen - sav->ivlen;
 
 		/* Encryption operation. */
 		crde->crd_alg = espx->type;
 		crde->crd_key = sav->key_enc->key_data;
 		crde->crd_klen = _KEYBITS(sav->key_enc);
 		/* XXX Rounds ? */
 	} else
 		crda = crp->crp_desc;
 
 	/* IPsec-specific opaque crypto info. */
 	tc = (struct tdb_crypto *) malloc(sizeof(struct tdb_crypto),
 		M_XDATA, M_NOWAIT|M_ZERO);
 	if (tc == NULL) {
 		crypto_freereq(crp);
 		DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__));
 		ESPSTAT_INC(esps_crypto);
 		error = ENOBUFS;
 		goto bad;
 	}
 
 	/* Callback parameters */
 	tc->tc_isr = isr;
 	KEY_ADDREFSA(sav);
 	tc->tc_sav = sav;
 	tc->tc_spi = sav->spi;
 	tc->tc_dst = saidx->dst;
 	tc->tc_proto = saidx->proto;
 
 	/* Crypto operation descriptor. */
 	crp->crp_ilen = m->m_pkthdr.len; /* Total input length. */
 	crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
 	crp->crp_buf = (caddr_t) m;
 	crp->crp_callback = esp_output_cb;
 	crp->crp_opaque = (caddr_t) tc;
 	crp->crp_sid = sav->tdb_cryptoid;
 
 	if (esph) {
 		/* Authentication descriptor. */
 		crda->crd_skip = skip;
 		crda->crd_len = m->m_pkthdr.len - (skip + alen);
 		crda->crd_inject = m->m_pkthdr.len - alen;
 
 		/* Authentication operation. */
 		crda->crd_alg = esph->type;
 		crda->crd_key = sav->key_auth->key_data;
 		crda->crd_klen = _KEYBITS(sav->key_auth);
 	}
 
 	return crypto_dispatch(crp);
 bad:
 	if (m)
 		m_freem(m);
 	return (error);
 }
 
 /*
  * ESP output callback from the crypto driver.
  */
 static int
 esp_output_cb(struct cryptop *crp)
 {
 	struct tdb_crypto *tc;
 	struct ipsecrequest *isr;
 	struct secasvar *sav;
 	struct mbuf *m;
 	int error;
 
 	tc = (struct tdb_crypto *) crp->crp_opaque;
 	IPSEC_ASSERT(tc != NULL, ("null opaque data area!"));
 	m = (struct mbuf *) crp->crp_buf;
 
 	isr = tc->tc_isr;
 	IPSECREQUEST_LOCK(isr);
 	sav = tc->tc_sav;
 	/* With the isr lock released SA pointer can be updated. */
 	if (sav != isr->sav) {
 		ESPSTAT_INC(esps_notdb);
 		DPRINTF(("%s: SA gone during crypto (SA %s/%08lx proto %u)\n",
 		    __func__, ipsec_address(&tc->tc_dst),
 		    (u_long) ntohl(tc->tc_spi), tc->tc_proto));
 		error = ENOBUFS;		/*XXX*/
 		goto bad;
 	}
 
 	/* Check for crypto errors. */
 	if (crp->crp_etype) {
 		/* Reset session ID. */
 		if (sav->tdb_cryptoid != 0)
 			sav->tdb_cryptoid = crp->crp_sid;
 
 		if (crp->crp_etype == EAGAIN) {
 			IPSECREQUEST_UNLOCK(isr);
 			return (crypto_dispatch(crp));
 		}
 
 		ESPSTAT_INC(esps_noxform);
 		DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
 		error = crp->crp_etype;
 		goto bad;
 	}
 
 	/* Shouldn't happen... */
 	if (m == NULL) {
 		ESPSTAT_INC(esps_crypto);
 		DPRINTF(("%s: bogus returned buffer from crypto\n", __func__));
 		error = EINVAL;
 		goto bad;
 	}
 	ESPSTAT_INC(esps_hist[sav->alg_enc]);
 	if (sav->tdb_authalgxform != NULL)
 		AHSTAT_INC(ahs_hist[sav->alg_auth]);
 
 	/* Release crypto descriptors. */
 	free(tc, M_XDATA);
 	crypto_freereq(crp);
 
 #ifdef REGRESSION
 	/* Emulate man-in-the-middle attack when ipsec_integrity is TRUE. */
 	if (V_ipsec_integrity) {
 		static unsigned char ipseczeroes[AH_HMAC_MAXHASHLEN];
 		struct auth_hash *esph;
 
 		/*
 		 * Corrupt HMAC if we want to test integrity verification of
 		 * the other side.
 		 */
 		esph = sav->tdb_authalgxform;
 		if (esph !=  NULL) {
 			int alen;
 
 			switch (esph->type) {
 			case CRYPTO_SHA2_256_HMAC:
 			case CRYPTO_SHA2_384_HMAC:
 			case CRYPTO_SHA2_512_HMAC:
 				alen = esph->hashsize/2;
 				break;
 			default:
 				alen = AH_HMAC_HASHLEN;
 				break;
 			}
 			m_copyback(m, m->m_pkthdr.len - alen,
 			    alen, ipseczeroes);
 		}
 	}
 #endif
 
 	/* NB: m is reclaimed by ipsec_process_done. */
 	error = ipsec_process_done(m, isr);
 	KEY_FREESAV(&sav);
 	IPSECREQUEST_UNLOCK(isr);
 	return error;
 bad:
 	if (sav)
 		KEY_FREESAV(&sav);
 	IPSECREQUEST_UNLOCK(isr);
 	if (m)
 		m_freem(m);
 	free(tc, M_XDATA);
 	crypto_freereq(crp);
 	return error;
 }
 
 static struct xformsw esp_xformsw = {
 	XF_ESP,		XFT_CONF|XFT_AUTH,	"IPsec ESP",
 	esp_init,	esp_zeroize,		esp_input,
 	esp_output
 };
 
 static void
 esp_attach(void)
 {
 
 	xform_register(&esp_xformsw);
 }
 SYSINIT(esp_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, esp_attach, NULL);
Index: user/dchagin/lemul/sys/netipsec/xform_ipcomp.c
===================================================================
--- user/dchagin/lemul/sys/netipsec/xform_ipcomp.c	(revision 274240)
+++ user/dchagin/lemul/sys/netipsec/xform_ipcomp.c	(revision 274241)
@@ -1,639 +1,639 @@
 /*	$FreeBSD$	*/
 /* $OpenBSD: ip_ipcomp.c,v 1.1 2001/07/05 12:08:52 jjbg Exp $ */
 
 /*-
  * Copyright (c) 2001 Jean-Jacques Bernard-Gundol (jj@wabbitt.org)
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *   notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *   notice, this list of conditions and the following disclaimer in the
  *   documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *   derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /* IP payload compression protocol (IPComp), see RFC 2393 */
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/rwlock.h>
 #include <sys/socket.h>
 #include <sys/kernel.h>
 #include <sys/protosw.h>
 #include <sys/sysctl.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netipsec/ipsec.h>
 #include <netipsec/xform.h>
 
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netipsec/ipsec6.h>
 #endif
 
 #include <netipsec/ipcomp.h>
 #include <netipsec/ipcomp_var.h>
 
 #include <netipsec/key.h>
 #include <netipsec/key_debug.h>
 
 #include <opencrypto/cryptodev.h>
 #include <opencrypto/deflate.h>
 #include <opencrypto/xform.h>
 
 VNET_DEFINE(int, ipcomp_enable) = 1;
 VNET_PCPUSTAT_DEFINE(struct ipcompstat, ipcompstat);
 VNET_PCPUSTAT_SYSINIT(ipcompstat);
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(ipcompstat);
 #endif /* VIMAGE */
 
 SYSCTL_DECL(_net_inet_ipcomp);
-SYSCTL_VNET_INT(_net_inet_ipcomp, OID_AUTO,
-	ipcomp_enable,	CTLFLAG_RW,	&VNET_NAME(ipcomp_enable),	0, "");
+SYSCTL_INT(_net_inet_ipcomp, OID_AUTO, ipcomp_enable,
+	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipcomp_enable), 0, "");
 SYSCTL_VNET_PCPUSTAT(_net_inet_ipcomp, IPSECCTL_STATS, stats,
     struct ipcompstat, ipcompstat,
     "IPCOMP statistics (struct ipcompstat, netipsec/ipcomp_var.h");
 
 static int ipcomp_input_cb(struct cryptop *crp);
 static int ipcomp_output_cb(struct cryptop *crp);
 
 struct comp_algo *
 ipcomp_algorithm_lookup(int alg)
 {
 	if (alg >= IPCOMP_ALG_MAX)
 		return NULL;
 	switch (alg) {
 	case SADB_X_CALG_DEFLATE:
 		return &comp_algo_deflate;
 	}
 	return NULL;
 }
 
 /*
  * ipcomp_init() is called when an CPI is being set up.
  */
 static int
 ipcomp_init(struct secasvar *sav, struct xformsw *xsp)
 {
 	struct comp_algo *tcomp;
 	struct cryptoini cric;
 
 	/* NB: algorithm really comes in alg_enc and not alg_comp! */
 	tcomp = ipcomp_algorithm_lookup(sav->alg_enc);
 	if (tcomp == NULL) {
 		DPRINTF(("%s: unsupported compression algorithm %d\n", __func__,
 			 sav->alg_comp));
 		return EINVAL;
 	}
 	sav->alg_comp = sav->alg_enc;		/* set for doing histogram */
 	sav->tdb_xform = xsp;
 	sav->tdb_compalgxform = tcomp;
 
 	/* Initialize crypto session */
 	bzero(&cric, sizeof (cric));
 	cric.cri_alg = sav->tdb_compalgxform->type;
 
 	return crypto_newsession(&sav->tdb_cryptoid, &cric, V_crypto_support);
 }
 
 /*
  * ipcomp_zeroize() used when IPCA is deleted
  */
 static int
 ipcomp_zeroize(struct secasvar *sav)
 {
 	int err;
 
 	err = crypto_freesession(sav->tdb_cryptoid);
 	sav->tdb_cryptoid = 0;
 	return err;
 }
 
 /*
  * ipcomp_input() gets called to uncompress an input packet
  */
 static int
 ipcomp_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff)
 {
 	struct tdb_crypto *tc;
 	struct cryptodesc *crdc;
 	struct cryptop *crp;
 	struct ipcomp *ipcomp;
 	caddr_t addr;
 	int hlen = IPCOMP_HLENGTH;
 
 	/*
 	 * Check that the next header of the IPComp is not IPComp again, before
 	 * doing any real work.  Given it is not possible to do double
 	 * compression it means someone is playing tricks on us.
 	 */
 	if (m->m_len < skip + hlen && (m = m_pullup(m, skip + hlen)) == NULL) {
 		IPCOMPSTAT_INC(ipcomps_hdrops);		/*XXX*/
 		DPRINTF(("%s: m_pullup failed\n", __func__));
 		return (ENOBUFS);
 	}
 	addr = (caddr_t) mtod(m, struct ip *) + skip;
 	ipcomp = (struct ipcomp *)addr;
 	if (ipcomp->comp_nxt == IPPROTO_IPCOMP) {
 		m_freem(m);
 		IPCOMPSTAT_INC(ipcomps_pdrops);	/* XXX have our own stats? */
 		DPRINTF(("%s: recursive compression detected\n", __func__));
 		return (EINVAL);
 	}
 
 	/* Get crypto descriptors */
 	crp = crypto_getreq(1);
 	if (crp == NULL) {
 		m_freem(m);
 		DPRINTF(("%s: no crypto descriptors\n", __func__));
 		IPCOMPSTAT_INC(ipcomps_crypto);
 		return ENOBUFS;
 	}
 	/* Get IPsec-specific opaque pointer */
 	tc = (struct tdb_crypto *) malloc(sizeof (*tc), M_XDATA, M_NOWAIT|M_ZERO);
 	if (tc == NULL) {
 		m_freem(m);
 		crypto_freereq(crp);
 		DPRINTF(("%s: cannot allocate tdb_crypto\n", __func__));
 		IPCOMPSTAT_INC(ipcomps_crypto);
 		return ENOBUFS;
 	}
 	crdc = crp->crp_desc;
 
 	crdc->crd_skip = skip + hlen;
 	crdc->crd_len = m->m_pkthdr.len - (skip + hlen);
 	crdc->crd_inject = skip;
 
 	tc->tc_ptr = 0;
 
 	/* Decompression operation */
 	crdc->crd_alg = sav->tdb_compalgxform->type;
 
 	/* Crypto operation descriptor */
 	crp->crp_ilen = m->m_pkthdr.len - (skip + hlen);
 	crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
 	crp->crp_buf = (caddr_t) m;
 	crp->crp_callback = ipcomp_input_cb;
 	crp->crp_sid = sav->tdb_cryptoid;
 	crp->crp_opaque = (caddr_t) tc;
 
 	/* These are passed as-is to the callback */
 	tc->tc_spi = sav->spi;
 	tc->tc_dst = sav->sah->saidx.dst;
 	tc->tc_proto = sav->sah->saidx.proto;
 	tc->tc_protoff = protoff;
 	tc->tc_skip = skip;
 	KEY_ADDREFSA(sav);
 	tc->tc_sav = sav;
 
 	return crypto_dispatch(crp);
 }
 
 /*
  * IPComp input callback from the crypto driver.
  */
 static int
 ipcomp_input_cb(struct cryptop *crp)
 {
 	struct cryptodesc *crd;
 	struct tdb_crypto *tc;
 	int skip, protoff;
 	struct mtag *mtag;
 	struct mbuf *m;
 	struct secasvar *sav;
 	struct secasindex *saidx;
 	int hlen = IPCOMP_HLENGTH, error, clen;
 	u_int8_t nproto;
 	caddr_t addr;
 
 	crd = crp->crp_desc;
 
 	tc = (struct tdb_crypto *) crp->crp_opaque;
 	IPSEC_ASSERT(tc != NULL, ("null opaque crypto data area!"));
 	skip = tc->tc_skip;
 	protoff = tc->tc_protoff;
 	mtag = (struct mtag *) tc->tc_ptr;
 	m = (struct mbuf *) crp->crp_buf;
 
 	sav = tc->tc_sav;
 	IPSEC_ASSERT(sav != NULL, ("null SA!"));
 
 	saidx = &sav->sah->saidx;
 	IPSEC_ASSERT(saidx->dst.sa.sa_family == AF_INET ||
 		saidx->dst.sa.sa_family == AF_INET6,
 		("unexpected protocol family %u", saidx->dst.sa.sa_family));
 
 	/* Check for crypto errors */
 	if (crp->crp_etype) {
 		/* Reset the session ID */
 		if (sav->tdb_cryptoid != 0)
 			sav->tdb_cryptoid = crp->crp_sid;
 
 		if (crp->crp_etype == EAGAIN) {
 			return crypto_dispatch(crp);
 		}
 		IPCOMPSTAT_INC(ipcomps_noxform);
 		DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
 		error = crp->crp_etype;
 		goto bad;
 	}
 	/* Shouldn't happen... */
 	if (m == NULL) {
 		IPCOMPSTAT_INC(ipcomps_crypto);
 		DPRINTF(("%s: null mbuf returned from crypto\n", __func__));
 		error = EINVAL;
 		goto bad;
 	}
 	IPCOMPSTAT_INC(ipcomps_hist[sav->alg_comp]);
 
 	clen = crp->crp_olen;		/* Length of data after processing */
 
 	/* Release the crypto descriptors */
 	free(tc, M_XDATA), tc = NULL;
 	crypto_freereq(crp), crp = NULL;
 
 	/* In case it's not done already, adjust the size of the mbuf chain */
 	m->m_pkthdr.len = clen + hlen + skip;
 
 	if (m->m_len < skip + hlen && (m = m_pullup(m, skip + hlen)) == 0) {
 		IPCOMPSTAT_INC(ipcomps_hdrops);		/*XXX*/
 		DPRINTF(("%s: m_pullup failed\n", __func__));
 		error = EINVAL;				/*XXX*/
 		goto bad;
 	}
 
 	/* Keep the next protocol field */
 	addr = (caddr_t) mtod(m, struct ip *) + skip;
 	nproto = ((struct ipcomp *) addr)->comp_nxt;
 
 	/* Remove the IPCOMP header */
 	error = m_striphdr(m, skip, hlen);
 	if (error) {
 		IPCOMPSTAT_INC(ipcomps_hdrops);
 		DPRINTF(("%s: bad mbuf chain, IPCA %s/%08lx\n", __func__,
 			 ipsec_address(&sav->sah->saidx.dst),
 			 (u_long) ntohl(sav->spi)));
 		goto bad;
 	}
 
 	/* Restore the Next Protocol field */
 	m_copyback(m, protoff, sizeof (u_int8_t), (u_int8_t *) &nproto);
 
 	switch (saidx->dst.sa.sa_family) {
 #ifdef INET6
 	case AF_INET6:
 		error = ipsec6_common_input_cb(m, sav, skip, protoff, NULL);
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		error = ipsec4_common_input_cb(m, sav, skip, protoff, NULL);
 		break;
 #endif
 	default:
 		panic("%s: Unexpected address family: %d saidx=%p", __func__,
 		    saidx->dst.sa.sa_family, saidx);
 	}
 
 	KEY_FREESAV(&sav);
 	return error;
 bad:
 	if (sav)
 		KEY_FREESAV(&sav);
 	if (m)
 		m_freem(m);
 	if (tc != NULL)
 		free(tc, M_XDATA);
 	if (crp)
 		crypto_freereq(crp);
 	return error;
 }
 
 /*
  * IPComp output routine, called by ipsec[46]_process_packet()
  */
 static int
 ipcomp_output(
 	struct mbuf *m,
 	struct ipsecrequest *isr,
 	struct mbuf **mp,
 	int skip,
 	int protoff
 )
 {
 	struct secasvar *sav;
 	struct comp_algo *ipcompx;
 	int error, ralen, maxpacketsize;
 	struct cryptodesc *crdc;
 	struct cryptop *crp;
 	struct tdb_crypto *tc;
 
 	sav = isr->sav;
 	IPSEC_ASSERT(sav != NULL, ("null SA"));
 	ipcompx = sav->tdb_compalgxform;
 	IPSEC_ASSERT(ipcompx != NULL, ("null compression xform"));
 
 	/*
 	 * Do not touch the packet in case our payload to compress
 	 * is lower than the minimal threshold of the compression
 	 * alogrithm.  We will just send out the data uncompressed.
 	 * See RFC 3173, 2.2. Non-Expansion Policy.
 	 */
 	if (m->m_pkthdr.len <= ipcompx->minlen) {
 		IPCOMPSTAT_INC(ipcomps_threshold);
 		return ipsec_process_done(m, isr);
 	}
 
 	ralen = m->m_pkthdr.len - skip;	/* Raw payload length before comp. */
 	IPCOMPSTAT_INC(ipcomps_output);
 
 	/* Check for maximum packet size violations. */
 	switch (sav->sah->saidx.dst.sa.sa_family) {
 #ifdef INET
 	case AF_INET:
 		maxpacketsize = IP_MAXPACKET;
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		maxpacketsize = IPV6_MAXPACKET;
 		break;
 #endif /* INET6 */
 	default:
 		IPCOMPSTAT_INC(ipcomps_nopf);
 		DPRINTF(("%s: unknown/unsupported protocol family %d, "
 		    "IPCA %s/%08lx\n", __func__,
 		    sav->sah->saidx.dst.sa.sa_family,
 		    ipsec_address(&sav->sah->saidx.dst),
 		    (u_long) ntohl(sav->spi)));
 		error = EPFNOSUPPORT;
 		goto bad;
 	}
 	if (ralen + skip + IPCOMP_HLENGTH > maxpacketsize) {
 		IPCOMPSTAT_INC(ipcomps_toobig);
 		DPRINTF(("%s: packet in IPCA %s/%08lx got too big "
 		    "(len %u, max len %u)\n", __func__,
 		    ipsec_address(&sav->sah->saidx.dst),
 		    (u_long) ntohl(sav->spi),
 		    ralen + skip + IPCOMP_HLENGTH, maxpacketsize));
 		error = EMSGSIZE;
 		goto bad;
 	}
 
 	/* Update the counters */
 	IPCOMPSTAT_ADD(ipcomps_obytes, m->m_pkthdr.len - skip);
 
 	m = m_unshare(m, M_NOWAIT);
 	if (m == NULL) {
 		IPCOMPSTAT_INC(ipcomps_hdrops);
 		DPRINTF(("%s: cannot clone mbuf chain, IPCA %s/%08lx\n",
 		    __func__, ipsec_address(&sav->sah->saidx.dst),
 		    (u_long) ntohl(sav->spi)));
 		error = ENOBUFS;
 		goto bad;
 	}
 
 	/* Ok now, we can pass to the crypto processing. */
 
 	/* Get crypto descriptors */
 	crp = crypto_getreq(1);
 	if (crp == NULL) {
 		IPCOMPSTAT_INC(ipcomps_crypto);
 		DPRINTF(("%s: failed to acquire crypto descriptor\n",__func__));
 		error = ENOBUFS;
 		goto bad;
 	}
 	crdc = crp->crp_desc;
 
 	/* Compression descriptor */
 	crdc->crd_skip = skip;
 	crdc->crd_len = ralen;
 	crdc->crd_flags = CRD_F_COMP;
 	crdc->crd_inject = skip;
 
 	/* Compression operation */
 	crdc->crd_alg = ipcompx->type;
 
 	/* IPsec-specific opaque crypto info */
 	tc = (struct tdb_crypto *) malloc(sizeof(struct tdb_crypto),
 		M_XDATA, M_NOWAIT|M_ZERO);
 	if (tc == NULL) {
 		IPCOMPSTAT_INC(ipcomps_crypto);
 		DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__));
 		crypto_freereq(crp);
 		error = ENOBUFS;
 		goto bad;
 	}
 
 	tc->tc_isr = isr;
 	KEY_ADDREFSA(sav);
 	tc->tc_sav = sav;
 	tc->tc_spi = sav->spi;
 	tc->tc_dst = sav->sah->saidx.dst;
 	tc->tc_proto = sav->sah->saidx.proto;
 	tc->tc_protoff = protoff;
 	tc->tc_skip = skip;
 
 	/* Crypto operation descriptor */
 	crp->crp_ilen = m->m_pkthdr.len;	/* Total input length */
 	crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
 	crp->crp_buf = (caddr_t) m;
 	crp->crp_callback = ipcomp_output_cb;
 	crp->crp_opaque = (caddr_t) tc;
 	crp->crp_sid = sav->tdb_cryptoid;
 
 	return crypto_dispatch(crp);
 bad:
 	if (m)
 		m_freem(m);
 	return (error);
 }
 
 /*
  * IPComp output callback from the crypto driver.
  */
 static int
 ipcomp_output_cb(struct cryptop *crp)
 {
 	struct tdb_crypto *tc;
 	struct ipsecrequest *isr;
 	struct secasvar *sav;
 	struct mbuf *m;
 	int error, skip;
 
 	tc = (struct tdb_crypto *) crp->crp_opaque;
 	IPSEC_ASSERT(tc != NULL, ("null opaque data area!"));
 	m = (struct mbuf *) crp->crp_buf;
 	skip = tc->tc_skip;
 
 	isr = tc->tc_isr;
 	IPSECREQUEST_LOCK(isr);
 	sav = tc->tc_sav;
 	/* With the isr lock released SA pointer can be updated. */
 	if (sav != isr->sav) {
 		IPCOMPSTAT_INC(ipcomps_notdb);
 		DPRINTF(("%s: SA expired while in crypto\n", __func__));
 		error = ENOBUFS;		/*XXX*/
 		goto bad;
 	}
 
 	/* Check for crypto errors */
 	if (crp->crp_etype) {
 		/* Reset the session ID */
 		if (sav->tdb_cryptoid != 0)
 			sav->tdb_cryptoid = crp->crp_sid;
 
 		if (crp->crp_etype == EAGAIN) {
 			IPSECREQUEST_UNLOCK(isr);
 			return crypto_dispatch(crp);
 		}
 		IPCOMPSTAT_INC(ipcomps_noxform);
 		DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
 		error = crp->crp_etype;
 		goto bad;
 	}
 	/* Shouldn't happen... */
 	if (m == NULL) {
 		IPCOMPSTAT_INC(ipcomps_crypto);
 		DPRINTF(("%s: bogus return buffer from crypto\n", __func__));
 		error = EINVAL;
 		goto bad;
 	}
 	IPCOMPSTAT_INC(ipcomps_hist[sav->alg_comp]);
 
 	if (crp->crp_ilen - skip > crp->crp_olen) {
 		struct mbuf *mo;
 		struct ipcomp *ipcomp;
 		int roff;
 		uint8_t prot;
 
 		/* Compression helped, inject IPCOMP header. */
 		mo = m_makespace(m, skip, IPCOMP_HLENGTH, &roff);
 		if (mo == NULL) {
 			IPCOMPSTAT_INC(ipcomps_wrap);
 			DPRINTF(("%s: IPCOMP header inject failed for IPCA %s/%08lx\n",
 			    __func__, ipsec_address(&sav->sah->saidx.dst),
 			    (u_long) ntohl(sav->spi)));
 			error = ENOBUFS;
 			goto bad;
 		}
 		ipcomp = (struct ipcomp *)(mtod(mo, caddr_t) + roff);
 
 		/* Initialize the IPCOMP header */
 		/* XXX alignment always correct? */
 		switch (sav->sah->saidx.dst.sa.sa_family) {
 #ifdef INET
 		case AF_INET:
 			ipcomp->comp_nxt = mtod(m, struct ip *)->ip_p;
 			break;
 #endif /* INET */
 #ifdef INET6
 		case AF_INET6:
 			ipcomp->comp_nxt = mtod(m, struct ip6_hdr *)->ip6_nxt;
 			break;
 #endif
 		}
 		ipcomp->comp_flags = 0;
 		ipcomp->comp_cpi = htons((u_int16_t) ntohl(sav->spi));
 
 		/* Fix Next Protocol in IPv4/IPv6 header */
 		prot = IPPROTO_IPCOMP;
 		m_copyback(m, tc->tc_protoff, sizeof(u_int8_t),
 		    (u_char *)&prot);
 
 		/* Adjust the length in the IP header */
 		switch (sav->sah->saidx.dst.sa.sa_family) {
 #ifdef INET
 		case AF_INET:
 			mtod(m, struct ip *)->ip_len = htons(m->m_pkthdr.len);
 			break;
 #endif /* INET */
 #ifdef INET6
 		case AF_INET6:
 			mtod(m, struct ip6_hdr *)->ip6_plen =
 				htons(m->m_pkthdr.len) - sizeof(struct ip6_hdr);
 			break;
 #endif /* INET6 */
 		default:
 			IPCOMPSTAT_INC(ipcomps_nopf);
 			DPRINTF(("%s: unknown/unsupported protocol "
 			    "family %d, IPCA %s/%08lx\n", __func__,
 			    sav->sah->saidx.dst.sa.sa_family,
 			    ipsec_address(&sav->sah->saidx.dst),
 			    (u_long) ntohl(sav->spi)));
 			error = EPFNOSUPPORT;
 			goto bad;
 		}
 	} else {
 		/* Compression was useless, we have lost time. */
 		IPCOMPSTAT_INC(ipcomps_uncompr);
 		DPRINTF(("%s: compressions was useless %d - %d <= %d\n",
 		    __func__, crp->crp_ilen, skip, crp->crp_olen));
 		/* XXX remember state to not compress the next couple
 		 *     of packets, RFC 3173, 2.2. Non-Expansion Policy */
 	}
 
 	/* Release the crypto descriptor */
 	free(tc, M_XDATA);
 	crypto_freereq(crp);
 
 	/* NB: m is reclaimed by ipsec_process_done. */
 	error = ipsec_process_done(m, isr);
 	KEY_FREESAV(&sav);
 	IPSECREQUEST_UNLOCK(isr);
 	return error;
 bad:
 	if (sav)
 		KEY_FREESAV(&sav);
 	IPSECREQUEST_UNLOCK(isr);
 	if (m)
 		m_freem(m);
 	free(tc, M_XDATA);
 	crypto_freereq(crp);
 	return error;
 }
 
 static struct xformsw ipcomp_xformsw = {
 	XF_IPCOMP,		XFT_COMP,		"IPcomp",
 	ipcomp_init,		ipcomp_zeroize,		ipcomp_input,
 	ipcomp_output
 };
 
 static void
 ipcomp_attach(void)
 {
 
 	xform_register(&ipcomp_xformsw);
 }
 
 SYSINIT(ipcomp_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipcomp_attach, NULL);
Index: user/dchagin/lemul/sys/netipsec/xform_ipip.c
===================================================================
--- user/dchagin/lemul/sys/netipsec/xform_ipip.c	(revision 274240)
+++ user/dchagin/lemul/sys/netipsec/xform_ipip.c	(revision 274241)
@@ -1,668 +1,668 @@
 /*	$FreeBSD$	*/
 /*	$OpenBSD: ip_ipip.c,v 1.25 2002/06/10 18:04:55 itojun Exp $ */
 /*-
  * The authors of this code are John Ioannidis (ji@tla.org),
  * Angelos D. Keromytis (kermit@csd.uch.gr) and
  * Niels Provos (provos@physnet.uni-hamburg.de).
  *
  * The original version of this code was written by John Ioannidis
  * for BSD/OS in Athens, Greece, in November 1995.
  *
  * Ported to OpenBSD and NetBSD, with additional transforms, in December 1996,
  * by Angelos D. Keromytis.
  *
  * Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis
  * and Niels Provos.
  *
  * Additional features in 1999 by Angelos D. Keromytis.
  *
  * Copyright (C) 1995, 1996, 1997, 1998, 1999 by John Ioannidis,
  * Angelos D. Keromytis and Niels Provos.
  * Copyright (c) 2001, Angelos D. Keromytis.
  *
  * Permission to use, copy, and modify this software with or without fee
  * is hereby granted, provided that this entire notice is included in
  * all copies of any software which is or includes a copy or
  * modification of this software.
  * You may use this code under the GNU public license if you so wish. Please
  * contribute changes back to the authors under this freer than GPL license
  * so that we may further the use of strong encryption without limitations to
  * all.
  *
  * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
  * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
  * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
  * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
  * PURPOSE.
  */
 
 /*
  * IP-inside-IP processing
  */
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_enc.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/kernel.h>
 #include <sys/protosw.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/pfil.h>
 #include <net/route.h>
 #include <net/netisr.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_ecn.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_encap.h>
 
 #include <netipsec/ipsec.h>
 #include <netipsec/xform.h>
 
 #include <netipsec/ipip_var.h>
 
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netipsec/ipsec6.h>
 #include <netinet6/ip6_ecn.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6protosw.h>
 #endif
 
 #include <netipsec/key.h>
 #include <netipsec/key_debug.h>
 
 #include <machine/stdarg.h>
 
 /*
  * We can control the acceptance of IP4 packets by altering the sysctl
  * net.inet.ipip.allow value.  Zero means drop them, all else is acceptance.
  */
 VNET_DEFINE(int, ipip_allow) = 0;
 VNET_PCPUSTAT_DEFINE(struct ipipstat, ipipstat);
 VNET_PCPUSTAT_SYSINIT(ipipstat);
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(ipipstat);
 #endif /* VIMAGE */
 
 SYSCTL_DECL(_net_inet_ipip);
-SYSCTL_VNET_INT(_net_inet_ipip, OID_AUTO,
-	ipip_allow,	CTLFLAG_RW,	&VNET_NAME(ipip_allow),	0, "");
+SYSCTL_INT(_net_inet_ipip, OID_AUTO, ipip_allow, CTLFLAG_VNET | CTLFLAG_RW,
+    &VNET_NAME(ipip_allow), 0, "");
 SYSCTL_VNET_PCPUSTAT(_net_inet_ipip, IPSECCTL_STATS, stats,
     struct ipipstat, ipipstat,
     "IPIP statistics (struct ipipstat, netipsec/ipip_var.h)");
 
 /* XXX IPCOMP */
 #define	M_IPSEC	(M_AUTHIPHDR|M_AUTHIPDGM|M_DECRYPTED)
 
 static void _ipip_input(struct mbuf *m, int iphlen, struct ifnet *gifp);
 
 #ifdef INET6
 /*
  * Really only a wrapper for ipip_input(), for use with IPv6.
  */
 int
 ip4_input6(struct mbuf **m, int *offp, int proto)
 {
 #if 0
 	/* If we do not accept IP-in-IP explicitly, drop.  */
 	if (!V_ipip_allow && ((*m)->m_flags & M_IPSEC) == 0) {
 		DPRINTF(("%s: dropped due to policy\n", __func__));
 		IPIPSTAT_INC(ipips_pdrops);
 		m_freem(*m);
 		return IPPROTO_DONE;
 	}
 #endif
 	_ipip_input(*m, *offp, NULL);
 	return IPPROTO_DONE;
 }
 #endif /* INET6 */
 
 #ifdef INET
 /*
  * Really only a wrapper for ipip_input(), for use with IPv4.
  */
 int
 ip4_input(struct mbuf **mp, int *offp, int proto)
 {
 #if 0
 	/* If we do not accept IP-in-IP explicitly, drop.  */
 	if (!V_ipip_allow && (m->m_flags & M_IPSEC) == 0) {
 		DPRINTF(("%s: dropped due to policy\n", __func__));
 		IPIPSTAT_INC(ipips_pdrops);
 		m_freem(m);
 		return;
 	}
 #endif
 	_ipip_input(*mp, *offp, NULL);
 	return (IPPROTO_DONE);
 }
 #endif /* INET */
 
 /*
  * ipip_input gets called when we receive an IP{46} encapsulated packet,
  * either because we got it at a real interface, or because AH or ESP
  * were being used in tunnel mode (in which case the rcvif element will
  * contain the address of the encX interface associated with the tunnel.
  */
 
 static void
 _ipip_input(struct mbuf *m, int iphlen, struct ifnet *gifp)
 {
 	struct ip *ipo;
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 	u_int8_t itos;
 #endif
 	int isr;
 	u_int8_t otos;
 	u_int8_t v;
 	int hlen;
 
 	IPIPSTAT_INC(ipips_ipackets);
 
 	m_copydata(m, 0, 1, &v);
 
 	switch (v >> 4) {
 #ifdef INET
         case 4:
 		hlen = sizeof(struct ip);
 		break;
 #endif /* INET */
 #ifdef INET6
         case 6:
 		hlen = sizeof(struct ip6_hdr);
 		break;
 #endif
         default:
 		IPIPSTAT_INC(ipips_family);
 		m_freem(m);
 		return /* EAFNOSUPPORT */;
 	}
 
 	/* Bring the IP header in the first mbuf, if not there already */
 	if (m->m_len < hlen) {
 		if ((m = m_pullup(m, hlen)) == NULL) {
 			DPRINTF(("%s: m_pullup (1) failed\n", __func__));
 			IPIPSTAT_INC(ipips_hdrops);
 			return;
 		}
 	}
 	ipo = mtod(m, struct ip *);
 
 	/* Keep outer ecn field. */
 	switch (v >> 4) {
 #ifdef INET
 	case 4:
 		otos = ipo->ip_tos;
 		break;
 #endif /* INET */
 #ifdef INET6
 	case 6:
 		otos = (ntohl(mtod(m, struct ip6_hdr *)->ip6_flow) >> 20) & 0xff;
 		break;
 #endif
 	default:
 		panic("ipip_input: unknown ip version %u (outer)", v>>4);
 	}
 
 	/* Remove outer IP header */
 	m_adj(m, iphlen);
 
 	/* Sanity check */
 	if (m->m_pkthdr.len < sizeof(struct ip))  {
 		IPIPSTAT_INC(ipips_hdrops);
 		m_freem(m);
 		return;
 	}
 
 	m_copydata(m, 0, 1, &v);
 
 	switch (v >> 4) {
 #ifdef INET
         case 4:
 		hlen = sizeof(struct ip);
 		break;
 #endif /* INET */
 
 #ifdef INET6
         case 6:
 		hlen = sizeof(struct ip6_hdr);
 		break;
 #endif
 	default:
 		IPIPSTAT_INC(ipips_family);
 		m_freem(m);
 		return; /* EAFNOSUPPORT */
 	}
 
 	/*
 	 * Bring the inner IP header in the first mbuf, if not there already.
 	 */
 	if (m->m_len < hlen) {
 		if ((m = m_pullup(m, hlen)) == NULL) {
 			DPRINTF(("%s: m_pullup (2) failed\n", __func__));
 			IPIPSTAT_INC(ipips_hdrops);
 			return;
 		}
 	}
 
 	/*
 	 * RFC 1853 specifies that the inner TTL should not be touched on
 	 * decapsulation. There's no reason this comment should be here, but
 	 * this is as good as any a position.
 	 */
 
 	/* Some sanity checks in the inner IP header */
 	switch (v >> 4) {
 #ifdef INET
     	case 4:
                 ipo = mtod(m, struct ip *);
 		ip_ecn_egress(V_ip4_ipsec_ecn, &otos, &ipo->ip_tos);
                 break;
 #endif /* INET */
 #ifdef INET6
     	case 6:
                 ip6 = (struct ip6_hdr *) ipo;
 		itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
 		ip_ecn_egress(V_ip6_ipsec_ecn, &otos, &itos);
 		ip6->ip6_flow &= ~htonl(0xff << 20);
 		ip6->ip6_flow |= htonl((u_int32_t) itos << 20);
                 break;
 #endif
 	default:
 		panic("ipip_input: unknown ip version %u (inner)", v>>4);
 	}
 
 	/* Check for local address spoofing. */
 	if ((m->m_pkthdr.rcvif == NULL ||
 	    !(m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK)) &&
 	    V_ipip_allow != 2) {
 #ifdef INET
 		if ((v >> 4) == IPVERSION &&
 		    in_localip(ipo->ip_src) != 0) {
 			IPIPSTAT_INC(ipips_spoof);
 			m_freem(m);
 			return;
 		}
 #endif
 #ifdef INET6
 		if ((v & IPV6_VERSION_MASK) == IPV6_VERSION &&
 		    in6_localip(&ip6->ip6_src) != 0) {
 			IPIPSTAT_INC(ipips_spoof);
 			m_freem(m);
 			return;
 		}
 #endif
 	}
 
 	/* Statistics */
 	IPIPSTAT_ADD(ipips_ibytes, m->m_pkthdr.len - iphlen);
 
 	/*
 	 * Interface pointer stays the same; if no IPsec processing has
 	 * been done (or will be done), this will point to a normal
 	 * interface. Otherwise, it'll point to an enc interface, which
 	 * will allow a packet filter to distinguish between secure and
 	 * untrusted packets.
 	 */
 
 	switch (v >> 4) {
 #ifdef INET
 	case 4:
 		isr = NETISR_IP;
 		break;
 #endif
 #ifdef INET6
 	case 6:
 		isr = NETISR_IPV6;
 		break;
 #endif
 	default:
 		panic("%s: bogus ip version %u", __func__, v>>4);
 	}
 
 	if (netisr_queue(isr, m)) {	/* (0) on success. */
 		IPIPSTAT_INC(ipips_qfull);
 		DPRINTF(("%s: packet dropped because of full queue\n",
 			__func__));
 	}
 }
 
 int
 ipip_output(
 	struct mbuf *m,
 	struct ipsecrequest *isr,
 	struct mbuf **mp,
 	int skip,
 	int protoff
 )
 {
 	struct secasvar *sav;
 	u_int8_t tp, otos;
 	struct secasindex *saidx;
 	int error;
 #if defined(INET) || defined(INET6)
 	u_int8_t itos;
 #endif
 #ifdef INET
 	struct ip *ipo;
 #endif /* INET */
 #ifdef INET6
 	struct ip6_hdr *ip6, *ip6o;
 #endif /* INET6 */
 
 	sav = isr->sav;
 	IPSEC_ASSERT(sav != NULL, ("null SA"));
 	IPSEC_ASSERT(sav->sah != NULL, ("null SAH"));
 
 	/* XXX Deal with empty TDB source/destination addresses. */
 
 	m_copydata(m, 0, 1, &tp);
 	tp = (tp >> 4) & 0xff;  /* Get the IP version number. */
 
 	saidx = &sav->sah->saidx;
 	switch (saidx->dst.sa.sa_family) {
 #ifdef INET
 	case AF_INET:
 		if (saidx->src.sa.sa_family != AF_INET ||
 		    saidx->src.sin.sin_addr.s_addr == INADDR_ANY ||
 		    saidx->dst.sin.sin_addr.s_addr == INADDR_ANY) {
 			DPRINTF(("%s: unspecified tunnel endpoint "
 			    "address in SA %s/%08lx\n", __func__,
 			    ipsec_address(&saidx->dst),
 			    (u_long) ntohl(sav->spi)));
 			IPIPSTAT_INC(ipips_unspec);
 			error = EINVAL;
 			goto bad;
 		}
 
 		M_PREPEND(m, sizeof(struct ip), M_NOWAIT);
 		if (m == 0) {
 			DPRINTF(("%s: M_PREPEND failed\n", __func__));
 			IPIPSTAT_INC(ipips_hdrops);
 			error = ENOBUFS;
 			goto bad;
 		}
 
 		ipo = mtod(m, struct ip *);
 
 		ipo->ip_v = IPVERSION;
 		ipo->ip_hl = 5;
 		ipo->ip_len = htons(m->m_pkthdr.len);
 		ipo->ip_ttl = V_ip_defttl;
 		ipo->ip_sum = 0;
 		ipo->ip_src = saidx->src.sin.sin_addr;
 		ipo->ip_dst = saidx->dst.sin.sin_addr;
 
 		ipo->ip_id = ip_newid();
 
 		/* If the inner protocol is IP... */
 		switch (tp) {
 		case IPVERSION:
 			/* Save ECN notification */
 			m_copydata(m, sizeof(struct ip) +
 			    offsetof(struct ip, ip_tos),
 			    sizeof(u_int8_t), (caddr_t) &itos);
 
 			ipo->ip_p = IPPROTO_IPIP;
 
 			/*
 			 * We should be keeping tunnel soft-state and
 			 * send back ICMPs if needed.
 			 */
 			m_copydata(m, sizeof(struct ip) +
 			    offsetof(struct ip, ip_off),
 			    sizeof(u_int16_t), (caddr_t) &ipo->ip_off);
 			ipo->ip_off = ntohs(ipo->ip_off);
 			ipo->ip_off &= ~(IP_DF | IP_MF | IP_OFFMASK);
 			ipo->ip_off = htons(ipo->ip_off);
 			break;
 #ifdef INET6
 		case (IPV6_VERSION >> 4):
 		{
 			u_int32_t itos32;
 
 			/* Save ECN notification. */
 			m_copydata(m, sizeof(struct ip) +
 			    offsetof(struct ip6_hdr, ip6_flow),
 			    sizeof(u_int32_t), (caddr_t) &itos32);
 			itos = ntohl(itos32) >> 20;
 			ipo->ip_p = IPPROTO_IPV6;
 			ipo->ip_off = 0;
 			break;
 		}
 #endif /* INET6 */
 		default:
 			goto nofamily;
 		}
 
 		otos = 0;
 		ip_ecn_ingress(ECN_ALLOWED, &otos, &itos);
 		ipo->ip_tos = otos;
 		break;
 #endif /* INET */
 
 #ifdef INET6
 	case AF_INET6:
 		if (IN6_IS_ADDR_UNSPECIFIED(&saidx->dst.sin6.sin6_addr) ||
 		    saidx->src.sa.sa_family != AF_INET6 ||
 		    IN6_IS_ADDR_UNSPECIFIED(&saidx->src.sin6.sin6_addr)) {
 			DPRINTF(("%s: unspecified tunnel endpoint "
 			    "address in SA %s/%08lx\n", __func__,
 			    ipsec_address(&saidx->dst),
 			    (u_long) ntohl(sav->spi)));
 			IPIPSTAT_INC(ipips_unspec);
 			error = ENOBUFS;
 			goto bad;
 		}
 
 		/* scoped address handling */
 		ip6 = mtod(m, struct ip6_hdr *);
 		if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src))
 			ip6->ip6_src.s6_addr16[1] = 0;
 		if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst))
 			ip6->ip6_dst.s6_addr16[1] = 0;
 
 		M_PREPEND(m, sizeof(struct ip6_hdr), M_NOWAIT);
 		if (m == 0) {
 			DPRINTF(("%s: M_PREPEND failed\n", __func__));
 			IPIPSTAT_INC(ipips_hdrops);
 			error = ENOBUFS;
 			goto bad;
 		}
 
 		/* Initialize IPv6 header */
 		ip6o = mtod(m, struct ip6_hdr *);
 		ip6o->ip6_flow = 0;
 		ip6o->ip6_vfc &= ~IPV6_VERSION_MASK;
 		ip6o->ip6_vfc |= IPV6_VERSION;
 		ip6o->ip6_plen = htons(m->m_pkthdr.len);
 		ip6o->ip6_hlim = IPV6_DEFHLIM;
 		ip6o->ip6_dst = saidx->dst.sin6.sin6_addr;
 		ip6o->ip6_src = saidx->src.sin6.sin6_addr;
 
 		/* Fix payload length */
 		ip6o->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
 
 		switch (tp) {
 #ifdef INET
 		case IPVERSION:
 			/* Save ECN notification */
 			m_copydata(m, sizeof(struct ip6_hdr) +
 			    offsetof(struct ip, ip_tos), sizeof(u_int8_t),
 			    (caddr_t) &itos);
 
 			/* This is really IPVERSION. */
 			ip6o->ip6_nxt = IPPROTO_IPIP;
 			break;
 #endif /* INET */
 		case (IPV6_VERSION >> 4):
 		{
 			u_int32_t itos32;
 
 			/* Save ECN notification. */
 			m_copydata(m, sizeof(struct ip6_hdr) +
 			    offsetof(struct ip6_hdr, ip6_flow),
 			    sizeof(u_int32_t), (caddr_t) &itos32);
 			itos = ntohl(itos32) >> 20;
 
 			ip6o->ip6_nxt = IPPROTO_IPV6;
 			break;
 		}
 		default:
 			goto nofamily;
 		}
 
 		otos = 0;
 		ip_ecn_ingress(V_ip6_ipsec_ecn, &otos, &itos);
 		ip6o->ip6_flow |= htonl((u_int32_t) otos << 20);
 		break;
 #endif /* INET6 */
 
 	default:
 nofamily:
 		DPRINTF(("%s: unsupported protocol family %u\n", __func__,
 		    saidx->dst.sa.sa_family));
 		IPIPSTAT_INC(ipips_family);
 		error = EAFNOSUPPORT;		/* XXX diffs from openbsd */
 		goto bad;
 	}
 
 	IPIPSTAT_INC(ipips_opackets);
 	*mp = m;
 
 #ifdef INET
 	if (saidx->dst.sa.sa_family == AF_INET) {
 #if 0
 		if (sav->tdb_xform->xf_type == XF_IP4)
 			tdb->tdb_cur_bytes +=
 			    m->m_pkthdr.len - sizeof(struct ip);
 #endif
 		IPIPSTAT_ADD(ipips_obytes,
 		    m->m_pkthdr.len - sizeof(struct ip));
 	}
 #endif /* INET */
 
 #ifdef INET6
 	if (saidx->dst.sa.sa_family == AF_INET6) {
 #if 0
 		if (sav->tdb_xform->xf_type == XF_IP4)
 			tdb->tdb_cur_bytes +=
 			    m->m_pkthdr.len - sizeof(struct ip6_hdr);
 #endif
 		IPIPSTAT_ADD(ipips_obytes,
 		    m->m_pkthdr.len - sizeof(struct ip6_hdr));
 	}
 #endif /* INET6 */
 
 	return 0;
 bad:
 	if (m)
 		m_freem(m);
 	*mp = NULL;
 	return (error);
 }
 
 #ifdef IPSEC
 #if defined(INET) || defined(INET6)
 static int
 ipe4_init(struct secasvar *sav, struct xformsw *xsp)
 {
 	sav->tdb_xform = xsp;
 	return 0;
 }
 
 static int
 ipe4_zeroize(struct secasvar *sav)
 {
 	sav->tdb_xform = NULL;
 	return 0;
 }
 
 static int
 ipe4_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff)
 {
 	/* This is a rather serious mistake, so no conditional printing. */
 	printf("%s: should never be called\n", __func__);
 	if (m)
 		m_freem(m);
 	return EOPNOTSUPP;
 }
 
 static struct xformsw ipe4_xformsw = {
 	XF_IP4,		0,		"IPv4 Simple Encapsulation",
 	ipe4_init,	ipe4_zeroize,	ipe4_input,	ipip_output,
 };
 
 extern struct domain inetdomain;
 #endif /* INET || INET6 */
 #ifdef INET
 static struct protosw ipe4_protosw = {
 	.pr_type =	SOCK_RAW,
 	.pr_domain =	&inetdomain,
 	.pr_protocol =	IPPROTO_IPV4,
 	.pr_flags =	PR_ATOMIC|PR_ADDR|PR_LASTHDR,
 	.pr_input =	ip4_input,
 	.pr_ctloutput =	rip_ctloutput,
 	.pr_usrreqs =	&rip_usrreqs
 };
 #endif /* INET */
 #if defined(INET6) && defined(INET)
 static struct protosw ipe6_protosw = {
 	.pr_type =	SOCK_RAW,
 	.pr_domain =	&inetdomain,
 	.pr_protocol =	IPPROTO_IPV6,
 	.pr_flags =	PR_ATOMIC|PR_ADDR|PR_LASTHDR,
 	.pr_input =	ip4_input6,
 	.pr_ctloutput =	rip_ctloutput,
 	.pr_usrreqs =	&rip_usrreqs
 };
 #endif /* INET6 && INET */
 
 #ifdef INET
 /*
  * Check the encapsulated packet to see if we want it
  */
 static int
 ipe4_encapcheck(const struct mbuf *m, int off, int proto, void *arg)
 {
 	/*
 	 * Only take packets coming from IPSEC tunnels; the rest
 	 * must be handled by the gif tunnel code.  Note that we
 	 * also return a minimum priority when we want the packet
 	 * so any explicit gif tunnels take precedence.
 	 */
 	return ((m->m_flags & M_IPSEC) != 0 ? 1 : 0);
 }
 #endif /* INET */
 
 static void
 ipe4_attach(void)
 {
 
 	xform_register(&ipe4_xformsw);
 	/* attach to encapsulation framework */
 	/* XXX save return cookie for detach on module remove */
 #ifdef INET
 	(void) encap_attach_func(AF_INET, -1,
 		ipe4_encapcheck, &ipe4_protosw, NULL);
 #endif
 #if defined(INET6) && defined(INET)
 	(void) encap_attach_func(AF_INET6, -1,
 		ipe4_encapcheck, (struct protosw *)&ipe6_protosw, NULL);
 #endif
 }
 SYSINIT(ipe4_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipe4_attach, NULL);
 #endif	/* IPSEC */
Index: user/dchagin/lemul/sys/netpfil/ipfw/ip_fw2.c
===================================================================
--- user/dchagin/lemul/sys/netpfil/ipfw/ip_fw2.c	(revision 274240)
+++ user/dchagin/lemul/sys/netpfil/ipfw/ip_fw2.c	(revision 274241)
@@ -1,2902 +1,2905 @@
 /*-
  * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * The FreeBSD IP packet firewall, main file
  */
 
 #include "opt_ipfw.h"
 #include "opt_ipdivert.h"
 #include "opt_inet.h"
 #ifndef INET
 #error "IPFIREWALL requires INET"
 #endif /* INET */
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/condvar.h>
 #include <sys/counter.h>
 #include <sys/eventhandler.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/jail.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/ucred.h>
 #include <net/ethernet.h> /* for ETHERTYPE_IP */
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/pfil.h>
 #include <net/vnet.h>
 
 #include <netpfil/pf/pf_mtag.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_fw.h>
 #include <netinet/ip_carp.h>
 #include <netinet/pim.h>
 #include <netinet/tcp_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <netinet/sctp.h>
 
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #ifdef INET6
 #include <netinet6/in6_pcb.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/ip6_var.h>
 #endif
 
 #include <netpfil/ipfw/ip_fw_private.h>
 
 #include <machine/in_cksum.h>	/* XXX for in_cksum */
 
 #ifdef MAC
 #include <security/mac/mac_framework.h>
 #endif
 
 /*
  * static variables followed by global ones.
  * All ipfw global variables are here.
  */
 
 static VNET_DEFINE(int, fw_deny_unknown_exthdrs);
 #define	V_fw_deny_unknown_exthdrs	VNET(fw_deny_unknown_exthdrs)
 
 static VNET_DEFINE(int, fw_permit_single_frag6) = 1;
 #define	V_fw_permit_single_frag6	VNET(fw_permit_single_frag6)
 
 #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
 static int default_to_accept = 1;
 #else
 static int default_to_accept;
 #endif
 
 VNET_DEFINE(int, autoinc_step);
 VNET_DEFINE(int, fw_one_pass) = 1;
 
 VNET_DEFINE(unsigned int, fw_tables_max);
 VNET_DEFINE(unsigned int, fw_tables_sets) = 0;	/* Don't use set-aware tables */
 /* Use 128 tables by default */
 static unsigned int default_fw_tables = IPFW_TABLES_DEFAULT;
 
 #ifndef LINEAR_SKIPTO
 static int jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num,
     int tablearg, int jump_backwards);
 #define	JUMP(ch, f, num, targ, back)	jump_fast(ch, f, num, targ, back)
 #else
 static int jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num,
     int tablearg, int jump_backwards);
 #define	JUMP(ch, f, num, targ, back)	jump_linear(ch, f, num, targ, back)
 #endif
 
 /*
  * Each rule belongs to one of 32 different sets (0..31).
  * The variable set_disable contains one bit per set.
  * If the bit is set, all rules in the corresponding set
  * are disabled. Set RESVD_SET(31) is reserved for the default rule
  * and rules that are not deleted by the flush command,
  * and CANNOT be disabled.
  * Rules in set RESVD_SET can only be deleted individually.
  */
 VNET_DEFINE(u_int32_t, set_disable);
 #define	V_set_disable			VNET(set_disable)
 
 VNET_DEFINE(int, fw_verbose);
 /* counter for ipfw_log(NULL...) */
 VNET_DEFINE(u_int64_t, norule_counter);
 VNET_DEFINE(int, verbose_limit);
 
 /* layer3_chain contains the list of rules for layer 3 */
 VNET_DEFINE(struct ip_fw_chain, layer3_chain);
 
 /* ipfw_vnet_ready controls when we are open for business */
 VNET_DEFINE(int, ipfw_vnet_ready) = 0;
 
 VNET_DEFINE(int, ipfw_nat_ready) = 0;
 
 ipfw_nat_t *ipfw_nat_ptr = NULL;
 struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int);
 ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
 ipfw_nat_cfg_t *ipfw_nat_del_ptr;
 ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
 ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
 
 #ifdef SYSCTL_NODE
 uint32_t dummy_def = IPFW_DEFAULT_RULE;
 static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS);
 static int sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS);
 
 SYSBEGIN(f3)
 
 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
-    CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0,
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
+    CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0,
     "Only do a single pass through ipfw when using dummynet(4)");
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step,
-    CTLFLAG_RW, &VNET_NAME(autoinc_step), 0,
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(autoinc_step), 0,
     "Rule number auto-increment step");
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose,
-    CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0,
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose,
+    CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0,
     "Log matches to ipfw rules");
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit,
-    CTLFLAG_RW, &VNET_NAME(verbose_limit), 0,
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(verbose_limit), 0,
     "Set upper limit of matches of ipfw rules logged");
 SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD,
     &dummy_def, 0,
     "The default/max possible rule number.");
-SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, tables_max,
-    CTLTYPE_UINT|CTLFLAG_RW, 0, 0, sysctl_ipfw_table_num, "IU",
+SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_max,
+    CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, 0, 0, sysctl_ipfw_table_num, "IU",
     "Maximum number of concurrently used tables");
-SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, tables_sets,
-    CTLTYPE_UINT|CTLFLAG_RW, 0, 0, sysctl_ipfw_tables_sets, "IU",
+SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_sets,
+    CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
+    0, 0, sysctl_ipfw_tables_sets, "IU",
     "Use per-set namespace for tables");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN,
     &default_to_accept, 0,
     "Make the default rule accept all packets.");
 TUNABLE_INT("net.inet.ip.fw.tables_max", (int *)&default_fw_tables);
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count,
-    CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0,
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count,
+    CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0,
     "Number of static rules");
 
 #ifdef INET6
 SYSCTL_DECL(_net_inet6_ip6);
 SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
-SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs,
-    CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0,
+SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs,
+    CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
+    &VNET_NAME(fw_deny_unknown_exthdrs), 0,
     "Deny packets with unknown IPv6 Extension Headers");
-SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6,
-    CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_permit_single_frag6), 0,
+SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6,
+    CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
+    &VNET_NAME(fw_permit_single_frag6), 0,
     "Permit single packet IPv6 fragments");
 #endif /* INET6 */
 
 SYSEND
 
 #endif /* SYSCTL_NODE */
 
 
 /*
  * Some macros used in the various matching options.
  * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T
  * Other macros just cast void * into the appropriate type
  */
 #define	L3HDR(T, ip)	((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
 #define	TCP(p)		((struct tcphdr *)(p))
 #define	SCTP(p)		((struct sctphdr *)(p))
 #define	UDP(p)		((struct udphdr *)(p))
 #define	ICMP(p)		((struct icmphdr *)(p))
 #define	ICMP6(p)	((struct icmp6_hdr *)(p))
 
 static __inline int
 icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd)
 {
 	int type = icmp->icmp_type;
 
 	return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<<type)) );
 }
 
 #define TT	( (1 << ICMP_ECHO) | (1 << ICMP_ROUTERSOLICIT) | \
     (1 << ICMP_TSTAMP) | (1 << ICMP_IREQ) | (1 << ICMP_MASKREQ) )
 
 static int
 is_icmp_query(struct icmphdr *icmp)
 {
 	int type = icmp->icmp_type;
 
 	return (type <= ICMP_MAXTYPE && (TT & (1<<type)) );
 }
 #undef TT
 
 /*
  * The following checks use two arrays of 8 or 16 bits to store the
  * bits that we want set or clear, respectively. They are in the
  * low and high half of cmd->arg1 or cmd->d[0].
  *
  * We scan options and store the bits we find set. We succeed if
  *
  *	(want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
  *
  * The code is sometimes optimized not to store additional variables.
  */
 
 static int
 flags_match(ipfw_insn *cmd, u_int8_t bits)
 {
 	u_char want_clear;
 	bits = ~bits;
 
 	if ( ((cmd->arg1 & 0xff) & bits) != 0)
 		return 0; /* some bits we want set were clear */
 	want_clear = (cmd->arg1 >> 8) & 0xff;
 	if ( (want_clear & bits) != want_clear)
 		return 0; /* some bits we want clear were set */
 	return 1;
 }
 
 static int
 ipopts_match(struct ip *ip, ipfw_insn *cmd)
 {
 	int optlen, bits = 0;
 	u_char *cp = (u_char *)(ip + 1);
 	int x = (ip->ip_hl << 2) - sizeof (struct ip);
 
 	for (; x > 0; x -= optlen, cp += optlen) {
 		int opt = cp[IPOPT_OPTVAL];
 
 		if (opt == IPOPT_EOL)
 			break;
 		if (opt == IPOPT_NOP)
 			optlen = 1;
 		else {
 			optlen = cp[IPOPT_OLEN];
 			if (optlen <= 0 || optlen > x)
 				return 0; /* invalid or truncated */
 		}
 		switch (opt) {
 
 		default:
 			break;
 
 		case IPOPT_LSRR:
 			bits |= IP_FW_IPOPT_LSRR;
 			break;
 
 		case IPOPT_SSRR:
 			bits |= IP_FW_IPOPT_SSRR;
 			break;
 
 		case IPOPT_RR:
 			bits |= IP_FW_IPOPT_RR;
 			break;
 
 		case IPOPT_TS:
 			bits |= IP_FW_IPOPT_TS;
 			break;
 		}
 	}
 	return (flags_match(cmd, bits));
 }
 
 static int
 tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd)
 {
 	int optlen, bits = 0;
 	u_char *cp = (u_char *)(tcp + 1);
 	int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
 
 	for (; x > 0; x -= optlen, cp += optlen) {
 		int opt = cp[0];
 		if (opt == TCPOPT_EOL)
 			break;
 		if (opt == TCPOPT_NOP)
 			optlen = 1;
 		else {
 			optlen = cp[1];
 			if (optlen <= 0)
 				break;
 		}
 
 		switch (opt) {
 
 		default:
 			break;
 
 		case TCPOPT_MAXSEG:
 			bits |= IP_FW_TCPOPT_MSS;
 			break;
 
 		case TCPOPT_WINDOW:
 			bits |= IP_FW_TCPOPT_WINDOW;
 			break;
 
 		case TCPOPT_SACK_PERMITTED:
 		case TCPOPT_SACK:
 			bits |= IP_FW_TCPOPT_SACK;
 			break;
 
 		case TCPOPT_TIMESTAMP:
 			bits |= IP_FW_TCPOPT_TS;
 			break;
 
 		}
 	}
 	return (flags_match(cmd, bits));
 }
 
 static int
 iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain,
     uint32_t *tablearg)
 {
 
 	if (ifp == NULL)	/* no iface with this packet, match fails */
 		return (0);
 
 	/* Check by name or by IP address */
 	if (cmd->name[0] != '\0') { /* match by name */
 		if (cmd->name[0] == '\1') /* use tablearg to match */
 			return ipfw_lookup_table_extended(chain, cmd->p.kidx, 0,
 				&ifp->if_index, tablearg);
 		/* Check name */
 		if (cmd->p.glob) {
 			if (fnmatch(cmd->name, ifp->if_xname, 0) == 0)
 				return(1);
 		} else {
 			if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
 				return(1);
 		}
 	} else {
 #if !defined(USERSPACE) && defined(__FreeBSD__)	/* and OSX too ? */
 		struct ifaddr *ia;
 
 		if_addr_rlock(ifp);
 		TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) {
 			if (ia->ifa_addr->sa_family != AF_INET)
 				continue;
 			if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
 			    (ia->ifa_addr))->sin_addr.s_addr) {
 				if_addr_runlock(ifp);
 				return(1);	/* match */
 			}
 		}
 		if_addr_runlock(ifp);
 #endif /* __FreeBSD__ */
 	}
 	return(0);	/* no match, fail ... */
 }
 
 /*
  * The verify_path function checks if a route to the src exists and
  * if it is reachable via ifp (when provided).
  * 
  * The 'verrevpath' option checks that the interface that an IP packet
  * arrives on is the same interface that traffic destined for the
  * packet's source address would be routed out of.
  * The 'versrcreach' option just checks that the source address is
  * reachable via any route (except default) in the routing table.
  * These two are a measure to block forged packets. This is also
  * commonly known as "anti-spoofing" or Unicast Reverse Path
  * Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs
  * is purposely reminiscent of the Cisco IOS command,
  *
  *   ip verify unicast reverse-path
  *   ip verify unicast source reachable-via any
  *
  * which implements the same functionality. But note that the syntax
  * is misleading, and the check may be performed on all IP packets
  * whether unicast, multicast, or broadcast.
  */
 static int
 verify_path(struct in_addr src, struct ifnet *ifp, u_int fib)
 {
 #if defined(USERSPACE) || !defined(__FreeBSD__)
 	return 0;
 #else
 	struct route ro;
 	struct sockaddr_in *dst;
 
 	bzero(&ro, sizeof(ro));
 
 	dst = (struct sockaddr_in *)&(ro.ro_dst);
 	dst->sin_family = AF_INET;
 	dst->sin_len = sizeof(*dst);
 	dst->sin_addr = src;
 	in_rtalloc_ign(&ro, 0, fib);
 
 	if (ro.ro_rt == NULL)
 		return 0;
 
 	/*
 	 * If ifp is provided, check for equality with rtentry.
 	 * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp,
 	 * in order to pass packets injected back by if_simloop():
 	 * routing entry (via lo0) for our own address
 	 * may exist, so we need to handle routing assymetry.
 	 */
 	if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) {
 		RTFREE(ro.ro_rt);
 		return 0;
 	}
 
 	/* if no ifp provided, check if rtentry is not default route */
 	if (ifp == NULL &&
 	     satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) {
 		RTFREE(ro.ro_rt);
 		return 0;
 	}
 
 	/* or if this is a blackhole/reject route */
 	if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
 		RTFREE(ro.ro_rt);
 		return 0;
 	}
 
 	/* found valid route */
 	RTFREE(ro.ro_rt);
 	return 1;
 #endif /* __FreeBSD__ */
 }
 
 #ifdef INET6
 /*
  * ipv6 specific rules here...
  */
 static __inline int
 icmp6type_match (int type, ipfw_insn_u32 *cmd)
 {
 	return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) );
 }
 
 static int
 flow6id_match( int curr_flow, ipfw_insn_u32 *cmd )
 {
 	int i;
 	for (i=0; i <= cmd->o.arg1; ++i )
 		if (curr_flow == cmd->d[i] )
 			return 1;
 	return 0;
 }
 
 /* support for IP6_*_ME opcodes */
 static int
 search_ip6_addr_net (struct in6_addr * ip6_addr)
 {
 	struct ifnet *mdc;
 	struct ifaddr *mdc2;
 	struct in6_ifaddr *fdm;
 	struct in6_addr copia;
 
 	TAILQ_FOREACH(mdc, &V_ifnet, if_link) {
 		if_addr_rlock(mdc);
 		TAILQ_FOREACH(mdc2, &mdc->if_addrhead, ifa_link) {
 			if (mdc2->ifa_addr->sa_family == AF_INET6) {
 				fdm = (struct in6_ifaddr *)mdc2;
 				copia = fdm->ia_addr.sin6_addr;
 				/* need for leaving scope_id in the sock_addr */
 				in6_clearscope(&copia);
 				if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia)) {
 					if_addr_runlock(mdc);
 					return 1;
 				}
 			}
 		}
 		if_addr_runlock(mdc);
 	}
 	return 0;
 }
 
 static int
 verify_path6(struct in6_addr *src, struct ifnet *ifp, u_int fib)
 {
 	struct route_in6 ro;
 	struct sockaddr_in6 *dst;
 
 	bzero(&ro, sizeof(ro));
 
 	dst = (struct sockaddr_in6 * )&(ro.ro_dst);
 	dst->sin6_family = AF_INET6;
 	dst->sin6_len = sizeof(*dst);
 	dst->sin6_addr = *src;
 
 	in6_rtalloc_ign(&ro, 0, fib);
 	if (ro.ro_rt == NULL)
 		return 0;
 
 	/* 
 	 * if ifp is provided, check for equality with rtentry
 	 * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp,
 	 * to support the case of sending packets to an address of our own.
 	 * (where the former interface is the first argument of if_simloop()
 	 *  (=ifp), the latter is lo0)
 	 */
 	if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) {
 		RTFREE(ro.ro_rt);
 		return 0;
 	}
 
 	/* if no ifp provided, check if rtentry is not default route */
 	if (ifp == NULL &&
 	    IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(ro.ro_rt))->sin6_addr)) {
 		RTFREE(ro.ro_rt);
 		return 0;
 	}
 
 	/* or if this is a blackhole/reject route */
 	if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
 		RTFREE(ro.ro_rt);
 		return 0;
 	}
 
 	/* found valid route */
 	RTFREE(ro.ro_rt);
 	return 1;
 
 }
 
 static int
 is_icmp6_query(int icmp6_type)
 {
 	if ((icmp6_type <= ICMP6_MAXTYPE) &&
 	    (icmp6_type == ICMP6_ECHO_REQUEST ||
 	    icmp6_type == ICMP6_MEMBERSHIP_QUERY ||
 	    icmp6_type == ICMP6_WRUREQUEST ||
 	    icmp6_type == ICMP6_FQDN_QUERY ||
 	    icmp6_type == ICMP6_NI_QUERY))
 		return (1);
 
 	return (0);
 }
 
 static void
 send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6)
 {
 	struct mbuf *m;
 
 	m = args->m;
 	if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) {
 		struct tcphdr *tcp;
 		tcp = (struct tcphdr *)((char *)ip6 + hlen);
 
 		if ((tcp->th_flags & TH_RST) == 0) {
 			struct mbuf *m0;
 			m0 = ipfw_send_pkt(args->m, &(args->f_id),
 			    ntohl(tcp->th_seq), ntohl(tcp->th_ack),
 			    tcp->th_flags | TH_RST);
 			if (m0 != NULL)
 				ip6_output(m0, NULL, NULL, 0, NULL, NULL,
 				    NULL);
 		}
 		FREE_PKT(m);
 	} else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */
 #if 0
 		/*
 		 * Unlike above, the mbufs need to line up with the ip6 hdr,
 		 * as the contents are read. We need to m_adj() the
 		 * needed amount.
 		 * The mbuf will however be thrown away so we can adjust it.
 		 * Remember we did an m_pullup on it already so we
 		 * can make some assumptions about contiguousness.
 		 */
 		if (args->L3offset)
 			m_adj(m, args->L3offset);
 #endif
 		icmp6_error(m, ICMP6_DST_UNREACH, code, 0);
 	} else
 		FREE_PKT(m);
 
 	args->m = NULL;
 }
 
 #endif /* INET6 */
 
 
 /*
  * sends a reject message, consuming the mbuf passed as an argument.
  */
 static void
 send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip)
 {
 
 #if 0
 	/* XXX When ip is not guaranteed to be at mtod() we will
 	 * need to account for this */
 	 * The mbuf will however be thrown away so we can adjust it.
 	 * Remember we did an m_pullup on it already so we
 	 * can make some assumptions about contiguousness.
 	 */
 	if (args->L3offset)
 		m_adj(m, args->L3offset);
 #endif
 	if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
 		icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
 	} else if (args->f_id.proto == IPPROTO_TCP) {
 		struct tcphdr *const tcp =
 		    L3HDR(struct tcphdr, mtod(args->m, struct ip *));
 		if ( (tcp->th_flags & TH_RST) == 0) {
 			struct mbuf *m;
 			m = ipfw_send_pkt(args->m, &(args->f_id),
 				ntohl(tcp->th_seq), ntohl(tcp->th_ack),
 				tcp->th_flags | TH_RST);
 			if (m != NULL)
 				ip_output(m, NULL, NULL, 0, NULL, NULL);
 		}
 		FREE_PKT(args->m);
 	} else
 		FREE_PKT(args->m);
 	args->m = NULL;
 }
 
 /*
  * Support for uid/gid/jail lookup. These tests are expensive
  * (because we may need to look into the list of active sockets)
  * so we cache the results. ugid_lookupp is 0 if we have not
  * yet done a lookup, 1 if we succeeded, and -1 if we tried
  * and failed. The function always returns the match value.
  * We could actually spare the variable and use *uc, setting
  * it to '(void *)check_uidgid if we have no info, NULL if
  * we tried and failed, or any other value if successful.
  */
 static int
 check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp,
     struct ucred **uc)
 {
 #if defined(USERSPACE)
 	return 0;	// not supported in userspace
 #else
 #ifndef __FreeBSD__
 	/* XXX */
 	return cred_check(insn, proto, oif,
 	    dst_ip, dst_port, src_ip, src_port,
 	    (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb);
 #else  /* FreeBSD */
 	struct in_addr src_ip, dst_ip;
 	struct inpcbinfo *pi;
 	struct ipfw_flow_id *id;
 	struct inpcb *pcb, *inp;
 	struct ifnet *oif;
 	int lookupflags;
 	int match;
 
 	id = &args->f_id;
 	inp = args->inp;
 	oif = args->oif;
 
 	/*
 	 * Check to see if the UDP or TCP stack supplied us with
 	 * the PCB. If so, rather then holding a lock and looking
 	 * up the PCB, we can use the one that was supplied.
 	 */
 	if (inp && *ugid_lookupp == 0) {
 		INP_LOCK_ASSERT(inp);
 		if (inp->inp_socket != NULL) {
 			*uc = crhold(inp->inp_cred);
 			*ugid_lookupp = 1;
 		} else
 			*ugid_lookupp = -1;
 	}
 	/*
 	 * If we have already been here and the packet has no
 	 * PCB entry associated with it, then we can safely
 	 * assume that this is a no match.
 	 */
 	if (*ugid_lookupp == -1)
 		return (0);
 	if (id->proto == IPPROTO_TCP) {
 		lookupflags = 0;
 		pi = &V_tcbinfo;
 	} else if (id->proto == IPPROTO_UDP) {
 		lookupflags = INPLOOKUP_WILDCARD;
 		pi = &V_udbinfo;
 	} else
 		return 0;
 	lookupflags |= INPLOOKUP_RLOCKPCB;
 	match = 0;
 	if (*ugid_lookupp == 0) {
 		if (id->addr_type == 6) {
 #ifdef INET6
 			if (oif == NULL)
 				pcb = in6_pcblookup_mbuf(pi,
 				    &id->src_ip6, htons(id->src_port),
 				    &id->dst_ip6, htons(id->dst_port),
 				    lookupflags, oif, args->m);
 			else
 				pcb = in6_pcblookup_mbuf(pi,
 				    &id->dst_ip6, htons(id->dst_port),
 				    &id->src_ip6, htons(id->src_port),
 				    lookupflags, oif, args->m);
 #else
 			*ugid_lookupp = -1;
 			return (0);
 #endif
 		} else {
 			src_ip.s_addr = htonl(id->src_ip);
 			dst_ip.s_addr = htonl(id->dst_ip);
 			if (oif == NULL)
 				pcb = in_pcblookup_mbuf(pi,
 				    src_ip, htons(id->src_port),
 				    dst_ip, htons(id->dst_port),
 				    lookupflags, oif, args->m);
 			else
 				pcb = in_pcblookup_mbuf(pi,
 				    dst_ip, htons(id->dst_port),
 				    src_ip, htons(id->src_port),
 				    lookupflags, oif, args->m);
 		}
 		if (pcb != NULL) {
 			INP_RLOCK_ASSERT(pcb);
 			*uc = crhold(pcb->inp_cred);
 			*ugid_lookupp = 1;
 			INP_RUNLOCK(pcb);
 		}
 		if (*ugid_lookupp == 0) {
 			/*
 			 * We tried and failed, set the variable to -1
 			 * so we will not try again on this packet.
 			 */
 			*ugid_lookupp = -1;
 			return (0);
 		}
 	}
 	if (insn->o.opcode == O_UID)
 		match = ((*uc)->cr_uid == (uid_t)insn->d[0]);
 	else if (insn->o.opcode == O_GID)
 		match = groupmember((gid_t)insn->d[0], *uc);
 	else if (insn->o.opcode == O_JAIL)
 		match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]);
 	return (match);
 #endif /* __FreeBSD__ */
 #endif /* not supported in userspace */
 }
 
 /*
  * Helper function to set args with info on the rule after the matching
  * one. slot is precise, whereas we guess rule_id as they are
  * assigned sequentially.
  */
 static inline void
 set_match(struct ip_fw_args *args, int slot,
 	struct ip_fw_chain *chain)
 {
 	args->rule.chain_id = chain->id;
 	args->rule.slot = slot + 1; /* we use 0 as a marker */
 	args->rule.rule_id = 1 + chain->map[slot]->id;
 	args->rule.rulenum = chain->map[slot]->rulenum;
 }
 
 #ifndef LINEAR_SKIPTO
 /*
  * Helper function to enable cached rule lookups using
  * cached_id and cached_pos fields in ipfw rule.
  */
 static int
 jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num,
     int tablearg, int jump_backwards)
 {
 	int f_pos;
 
 	/* If possible use cached f_pos (in f->cached_pos),
 	 * whose version is written in f->cached_id
 	 * (horrible hacks to avoid changing the ABI).
 	 */
 	if (num != IP_FW_TARG && f->cached_id == chain->id)
 		f_pos = f->cached_pos;
 	else {
 		int i = IP_FW_ARG_TABLEARG(chain, num, skipto);
 		/* make sure we do not jump backward */
 		if (jump_backwards == 0 && i <= f->rulenum)
 			i = f->rulenum + 1;
 		if (chain->idxmap != NULL)
 			f_pos = chain->idxmap[i];
 		else
 			f_pos = ipfw_find_rule(chain, i, 0);
 		/* update the cache */
 		if (num != IP_FW_TARG) {
 			f->cached_id = chain->id;
 			f->cached_pos = f_pos;
 		}
 	}
 
 	return (f_pos);
 }
 #else
 /*
  * Helper function to enable real fast rule lookups.
  */
 static int
 jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num,
     int tablearg, int jump_backwards)
 {
 	int f_pos;
 
 	num = IP_FW_ARG_TABLEARG(chain, num, skipto);
 	/* make sure we do not jump backward */
 	if (jump_backwards == 0 && num <= f->rulenum)
 		num = f->rulenum + 1;
 	f_pos = chain->idxmap[num];
 
 	return (f_pos);
 }
 #endif
 
 #define	TARG(k, f)	IP_FW_ARG_TABLEARG(chain, k, f)
 /*
  * The main check routine for the firewall.
  *
  * All arguments are in args so we can modify them and return them
  * back to the caller.
  *
  * Parameters:
  *
  *	args->m	(in/out) The packet; we set to NULL when/if we nuke it.
  *		Starts with the IP header.
  *	args->eh (in)	Mac header if present, NULL for layer3 packet.
  *	args->L3offset	Number of bytes bypassed if we came from L2.
  *			e.g. often sizeof(eh)  ** NOTYET **
  *	args->oif	Outgoing interface, NULL if packet is incoming.
  *		The incoming interface is in the mbuf. (in)
  *	args->divert_rule (in/out)
  *		Skip up to the first rule past this rule number;
  *		upon return, non-zero port number for divert or tee.
  *
  *	args->rule	Pointer to the last matching rule (in/out)
  *	args->next_hop	Socket we are forwarding to (out).
  *	args->next_hop6	IPv6 next hop we are forwarding to (out).
  *	args->f_id	Addresses grabbed from the packet (out)
  * 	args->rule.info	a cookie depending on rule action
  *
  * Return value:
  *
  *	IP_FW_PASS	the packet must be accepted
  *	IP_FW_DENY	the packet must be dropped
  *	IP_FW_DIVERT	divert packet, port in m_tag
  *	IP_FW_TEE	tee packet, port in m_tag
  *	IP_FW_DUMMYNET	to dummynet, pipe in args->cookie
  *	IP_FW_NETGRAPH	into netgraph, cookie args->cookie
  *		args->rule contains the matching rule,
  *		args->rule.info has additional information.
  *
  */
 int
 ipfw_chk(struct ip_fw_args *args)
 {
 
 	/*
 	 * Local variables holding state while processing a packet:
 	 *
 	 * IMPORTANT NOTE: to speed up the processing of rules, there
 	 * are some assumption on the values of the variables, which
 	 * are documented here. Should you change them, please check
 	 * the implementation of the various instructions to make sure
 	 * that they still work.
 	 *
 	 * args->eh	The MAC header. It is non-null for a layer2
 	 *	packet, it is NULL for a layer-3 packet.
 	 * **notyet**
 	 * args->L3offset Offset in the packet to the L3 (IP or equiv.) header.
 	 *
 	 * m | args->m	Pointer to the mbuf, as received from the caller.
 	 *	It may change if ipfw_chk() does an m_pullup, or if it
 	 *	consumes the packet because it calls send_reject().
 	 *	XXX This has to change, so that ipfw_chk() never modifies
 	 *	or consumes the buffer.
 	 * ip	is the beginning of the ip(4 or 6) header.
 	 *	Calculated by adding the L3offset to the start of data.
 	 *	(Until we start using L3offset, the packet is
 	 *	supposed to start with the ip header).
 	 */
 	struct mbuf *m = args->m;
 	struct ip *ip = mtod(m, struct ip *);
 
 	/*
 	 * For rules which contain uid/gid or jail constraints, cache
 	 * a copy of the users credentials after the pcb lookup has been
 	 * executed. This will speed up the processing of rules with
 	 * these types of constraints, as well as decrease contention
 	 * on pcb related locks.
 	 */
 #ifndef __FreeBSD__
 	struct bsd_ucred ucred_cache;
 #else
 	struct ucred *ucred_cache = NULL;
 #endif
 	int ucred_lookup = 0;
 
 	/*
 	 * oif | args->oif	If NULL, ipfw_chk has been called on the
 	 *	inbound path (ether_input, ip_input).
 	 *	If non-NULL, ipfw_chk has been called on the outbound path
 	 *	(ether_output, ip_output).
 	 */
 	struct ifnet *oif = args->oif;
 
 	int f_pos = 0;		/* index of current rule in the array */
 	int retval = 0;
 
 	/*
 	 * hlen	The length of the IP header.
 	 */
 	u_int hlen = 0;		/* hlen >0 means we have an IP pkt */
 
 	/*
 	 * offset	The offset of a fragment. offset != 0 means that
 	 *	we have a fragment at this offset of an IPv4 packet.
 	 *	offset == 0 means that (if this is an IPv4 packet)
 	 *	this is the first or only fragment.
 	 *	For IPv6 offset|ip6f_mf == 0 means there is no Fragment Header
 	 *	or there is a single packet fragement (fragement header added
 	 *	without needed).  We will treat a single packet fragment as if
 	 *	there was no fragment header (or log/block depending on the
 	 *	V_fw_permit_single_frag6 sysctl setting).
 	 */
 	u_short offset = 0;
 	u_short ip6f_mf = 0;
 
 	/*
 	 * Local copies of addresses. They are only valid if we have
 	 * an IP packet.
 	 *
 	 * proto	The protocol. Set to 0 for non-ip packets,
 	 *	or to the protocol read from the packet otherwise.
 	 *	proto != 0 means that we have an IPv4 packet.
 	 *
 	 * src_port, dst_port	port numbers, in HOST format. Only
 	 *	valid for TCP and UDP packets.
 	 *
 	 * src_ip, dst_ip	ip addresses, in NETWORK format.
 	 *	Only valid for IPv4 packets.
 	 */
 	uint8_t proto;
 	uint16_t src_port = 0, dst_port = 0;	/* NOTE: host format	*/
 	struct in_addr src_ip, dst_ip;		/* NOTE: network format	*/
 	uint16_t iplen=0;
 	int pktlen;
 	uint16_t	etype = 0;	/* Host order stored ether type */
 
 	/*
 	 * dyn_dir = MATCH_UNKNOWN when rules unchecked,
 	 * 	MATCH_NONE when checked and not matched (q = NULL),
 	 *	MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL)
 	 */
 	int dyn_dir = MATCH_UNKNOWN;
 	ipfw_dyn_rule *q = NULL;
 	struct ip_fw_chain *chain = &V_layer3_chain;
 
 	/*
 	 * We store in ulp a pointer to the upper layer protocol header.
 	 * In the ipv4 case this is easy to determine from the header,
 	 * but for ipv6 we might have some additional headers in the middle.
 	 * ulp is NULL if not found.
 	 */
 	void *ulp = NULL;		/* upper layer protocol pointer. */
 
 	/* XXX ipv6 variables */
 	int is_ipv6 = 0;
 	uint8_t	icmp6_type = 0;
 	uint16_t ext_hd = 0;	/* bits vector for extension header filtering */
 	/* end of ipv6 variables */
 
 	int is_ipv4 = 0;
 
 	int done = 0;		/* flag to exit the outer loop */
 	IPFW_RLOCK_TRACKER;
 
 	if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready))
 		return (IP_FW_PASS);	/* accept */
 
 	dst_ip.s_addr = 0;		/* make sure it is initialized */
 	src_ip.s_addr = 0;		/* make sure it is initialized */
 	pktlen = m->m_pkthdr.len;
 	args->f_id.fib = M_GETFIB(m); /* note mbuf not altered) */
 	proto = args->f_id.proto = 0;	/* mark f_id invalid */
 		/* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */
 
 /*
  * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
  * then it sets p to point at the offset "len" in the mbuf. WARNING: the
  * pointer might become stale after other pullups (but we never use it
  * this way).
  */
 #define PULLUP_TO(_len, p, T)	PULLUP_LEN(_len, p, sizeof(T))
 #define PULLUP_LEN(_len, p, T)					\
 do {								\
 	int x = (_len) + T;					\
 	if ((m)->m_len < x) {					\
 		args->m = m = m_pullup(m, x);			\
 		if (m == NULL)					\
 			goto pullup_failed;			\
 	}							\
 	p = (mtod(m, char *) + (_len));				\
 } while (0)
 
 	/*
 	 * if we have an ether header,
 	 */
 	if (args->eh)
 		etype = ntohs(args->eh->ether_type);
 
 	/* Identify IP packets and fill up variables. */
 	if (pktlen >= sizeof(struct ip6_hdr) &&
 	    (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) {
 		struct ip6_hdr *ip6 = (struct ip6_hdr *)ip;
 		is_ipv6 = 1;
 		args->f_id.addr_type = 6;
 		hlen = sizeof(struct ip6_hdr);
 		proto = ip6->ip6_nxt;
 
 		/* Search extension headers to find upper layer protocols */
 		while (ulp == NULL && offset == 0) {
 			switch (proto) {
 			case IPPROTO_ICMPV6:
 				PULLUP_TO(hlen, ulp, struct icmp6_hdr);
 				icmp6_type = ICMP6(ulp)->icmp6_type;
 				break;
 
 			case IPPROTO_TCP:
 				PULLUP_TO(hlen, ulp, struct tcphdr);
 				dst_port = TCP(ulp)->th_dport;
 				src_port = TCP(ulp)->th_sport;
 				/* save flags for dynamic rules */
 				args->f_id._flags = TCP(ulp)->th_flags;
 				break;
 
 			case IPPROTO_SCTP:
 				PULLUP_TO(hlen, ulp, struct sctphdr);
 				src_port = SCTP(ulp)->src_port;
 				dst_port = SCTP(ulp)->dest_port;
 				break;
 
 			case IPPROTO_UDP:
 				PULLUP_TO(hlen, ulp, struct udphdr);
 				dst_port = UDP(ulp)->uh_dport;
 				src_port = UDP(ulp)->uh_sport;
 				break;
 
 			case IPPROTO_HOPOPTS:	/* RFC 2460 */
 				PULLUP_TO(hlen, ulp, struct ip6_hbh);
 				ext_hd |= EXT_HOPOPTS;
 				hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
 				proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
 				ulp = NULL;
 				break;
 
 			case IPPROTO_ROUTING:	/* RFC 2460 */
 				PULLUP_TO(hlen, ulp, struct ip6_rthdr);
 				switch (((struct ip6_rthdr *)ulp)->ip6r_type) {
 				case 0:
 					ext_hd |= EXT_RTHDR0;
 					break;
 				case 2:
 					ext_hd |= EXT_RTHDR2;
 					break;
 				default:
 					if (V_fw_verbose)
 						printf("IPFW2: IPV6 - Unknown "
 						    "Routing Header type(%d)\n",
 						    ((struct ip6_rthdr *)
 						    ulp)->ip6r_type);
 					if (V_fw_deny_unknown_exthdrs)
 					    return (IP_FW_DENY);
 					break;
 				}
 				ext_hd |= EXT_ROUTING;
 				hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
 				proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
 				ulp = NULL;
 				break;
 
 			case IPPROTO_FRAGMENT:	/* RFC 2460 */
 				PULLUP_TO(hlen, ulp, struct ip6_frag);
 				ext_hd |= EXT_FRAGMENT;
 				hlen += sizeof (struct ip6_frag);
 				proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
 				offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
 					IP6F_OFF_MASK;
 				ip6f_mf = ((struct ip6_frag *)ulp)->ip6f_offlg &
 					IP6F_MORE_FRAG;
 				if (V_fw_permit_single_frag6 == 0 &&
 				    offset == 0 && ip6f_mf == 0) {
 					if (V_fw_verbose)
 						printf("IPFW2: IPV6 - Invalid "
 						    "Fragment Header\n");
 					if (V_fw_deny_unknown_exthdrs)
 					    return (IP_FW_DENY);
 					break;
 				}
 				args->f_id.extra =
 				    ntohl(((struct ip6_frag *)ulp)->ip6f_ident);
 				ulp = NULL;
 				break;
 
 			case IPPROTO_DSTOPTS:	/* RFC 2460 */
 				PULLUP_TO(hlen, ulp, struct ip6_hbh);
 				ext_hd |= EXT_DSTOPTS;
 				hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
 				proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
 				ulp = NULL;
 				break;
 
 			case IPPROTO_AH:	/* RFC 2402 */
 				PULLUP_TO(hlen, ulp, struct ip6_ext);
 				ext_hd |= EXT_AH;
 				hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
 				proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
 				ulp = NULL;
 				break;
 
 			case IPPROTO_ESP:	/* RFC 2406 */
 				PULLUP_TO(hlen, ulp, uint32_t);	/* SPI, Seq# */
 				/* Anything past Seq# is variable length and
 				 * data past this ext. header is encrypted. */
 				ext_hd |= EXT_ESP;
 				break;
 
 			case IPPROTO_NONE:	/* RFC 2460 */
 				/*
 				 * Packet ends here, and IPv6 header has
 				 * already been pulled up. If ip6e_len!=0
 				 * then octets must be ignored.
 				 */
 				ulp = ip; /* non-NULL to get out of loop. */
 				break;
 
 			case IPPROTO_OSPFIGP:
 				/* XXX OSPF header check? */
 				PULLUP_TO(hlen, ulp, struct ip6_ext);
 				break;
 
 			case IPPROTO_PIM:
 				/* XXX PIM header check? */
 				PULLUP_TO(hlen, ulp, struct pim);
 				break;
 
 			case IPPROTO_CARP:
 				PULLUP_TO(hlen, ulp, struct carp_header);
 				if (((struct carp_header *)ulp)->carp_version !=
 				    CARP_VERSION) 
 					return (IP_FW_DENY);
 				if (((struct carp_header *)ulp)->carp_type !=
 				    CARP_ADVERTISEMENT) 
 					return (IP_FW_DENY);
 				break;
 
 			case IPPROTO_IPV6:	/* RFC 2893 */
 				PULLUP_TO(hlen, ulp, struct ip6_hdr);
 				break;
 
 			case IPPROTO_IPV4:	/* RFC 2893 */
 				PULLUP_TO(hlen, ulp, struct ip);
 				break;
 
 			default:
 				if (V_fw_verbose)
 					printf("IPFW2: IPV6 - Unknown "
 					    "Extension Header(%d), ext_hd=%x\n",
 					     proto, ext_hd);
 				if (V_fw_deny_unknown_exthdrs)
 				    return (IP_FW_DENY);
 				PULLUP_TO(hlen, ulp, struct ip6_ext);
 				break;
 			} /*switch */
 		}
 		ip = mtod(m, struct ip *);
 		ip6 = (struct ip6_hdr *)ip;
 		args->f_id.src_ip6 = ip6->ip6_src;
 		args->f_id.dst_ip6 = ip6->ip6_dst;
 		args->f_id.src_ip = 0;
 		args->f_id.dst_ip = 0;
 		args->f_id.flow_id6 = ntohl(ip6->ip6_flow);
 	} else if (pktlen >= sizeof(struct ip) &&
 	    (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) {
 	    	is_ipv4 = 1;
 		hlen = ip->ip_hl << 2;
 		args->f_id.addr_type = 4;
 
 		/*
 		 * Collect parameters into local variables for faster matching.
 		 */
 		proto = ip->ip_p;
 		src_ip = ip->ip_src;
 		dst_ip = ip->ip_dst;
 		offset = ntohs(ip->ip_off) & IP_OFFMASK;
 		iplen = ntohs(ip->ip_len);
 		pktlen = iplen < pktlen ? iplen : pktlen;
 
 		if (offset == 0) {
 			switch (proto) {
 			case IPPROTO_TCP:
 				PULLUP_TO(hlen, ulp, struct tcphdr);
 				dst_port = TCP(ulp)->th_dport;
 				src_port = TCP(ulp)->th_sport;
 				/* save flags for dynamic rules */
 				args->f_id._flags = TCP(ulp)->th_flags;
 				break;
 
 			case IPPROTO_SCTP:
 				PULLUP_TO(hlen, ulp, struct sctphdr);
 				src_port = SCTP(ulp)->src_port;
 				dst_port = SCTP(ulp)->dest_port;
 				break;
 
 			case IPPROTO_UDP:
 				PULLUP_TO(hlen, ulp, struct udphdr);
 				dst_port = UDP(ulp)->uh_dport;
 				src_port = UDP(ulp)->uh_sport;
 				break;
 
 			case IPPROTO_ICMP:
 				PULLUP_TO(hlen, ulp, struct icmphdr);
 				//args->f_id.flags = ICMP(ulp)->icmp_type;
 				break;
 
 			default:
 				break;
 			}
 		}
 
 		ip = mtod(m, struct ip *);
 		args->f_id.src_ip = ntohl(src_ip.s_addr);
 		args->f_id.dst_ip = ntohl(dst_ip.s_addr);
 	}
 #undef PULLUP_TO
 	if (proto) { /* we may have port numbers, store them */
 		args->f_id.proto = proto;
 		args->f_id.src_port = src_port = ntohs(src_port);
 		args->f_id.dst_port = dst_port = ntohs(dst_port);
 	}
 
 	IPFW_PF_RLOCK(chain);
 	if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */
 		IPFW_PF_RUNLOCK(chain);
 		return (IP_FW_PASS);	/* accept */
 	}
 	if (args->rule.slot) {
 		/*
 		 * Packet has already been tagged as a result of a previous
 		 * match on rule args->rule aka args->rule_id (PIPE, QUEUE,
 		 * REASS, NETGRAPH, DIVERT/TEE...)
 		 * Validate the slot and continue from the next one
 		 * if still present, otherwise do a lookup.
 		 */
 		f_pos = (args->rule.chain_id == chain->id) ?
 		    args->rule.slot :
 		    ipfw_find_rule(chain, args->rule.rulenum,
 			args->rule.rule_id);
 	} else {
 		f_pos = 0;
 	}
 
 	/*
 	 * Now scan the rules, and parse microinstructions for each rule.
 	 * We have two nested loops and an inner switch. Sometimes we
 	 * need to break out of one or both loops, or re-enter one of
 	 * the loops with updated variables. Loop variables are:
 	 *
 	 *	f_pos (outer loop) points to the current rule.
 	 *		On output it points to the matching rule.
 	 *	done (outer loop) is used as a flag to break the loop.
 	 *	l (inner loop)	residual length of current rule.
 	 *		cmd points to the current microinstruction.
 	 *
 	 * We break the inner loop by setting l=0 and possibly
 	 * cmdlen=0 if we don't want to advance cmd.
 	 * We break the outer loop by setting done=1
 	 * We can restart the inner loop by setting l>0 and f_pos, f, cmd
 	 * as needed.
 	 */
 	for (; f_pos < chain->n_rules; f_pos++) {
 		ipfw_insn *cmd;
 		uint32_t tablearg = 0;
 		int l, cmdlen, skip_or; /* skip rest of OR block */
 		struct ip_fw *f;
 
 		f = chain->map[f_pos];
 		if (V_set_disable & (1 << f->set) )
 			continue;
 
 		skip_or = 0;
 		for (l = f->cmd_len, cmd = f->cmd ; l > 0 ;
 		    l -= cmdlen, cmd += cmdlen) {
 			int match;
 
 			/*
 			 * check_body is a jump target used when we find a
 			 * CHECK_STATE, and need to jump to the body of
 			 * the target rule.
 			 */
 
 /* check_body: */
 			cmdlen = F_LEN(cmd);
 			/*
 			 * An OR block (insn_1 || .. || insn_n) has the
 			 * F_OR bit set in all but the last instruction.
 			 * The first match will set "skip_or", and cause
 			 * the following instructions to be skipped until
 			 * past the one with the F_OR bit clear.
 			 */
 			if (skip_or) {		/* skip this instruction */
 				if ((cmd->len & F_OR) == 0)
 					skip_or = 0;	/* next one is good */
 				continue;
 			}
 			match = 0; /* set to 1 if we succeed */
 
 			switch (cmd->opcode) {
 			/*
 			 * The first set of opcodes compares the packet's
 			 * fields with some pattern, setting 'match' if a
 			 * match is found. At the end of the loop there is
 			 * logic to deal with F_NOT and F_OR flags associated
 			 * with the opcode.
 			 */
 			case O_NOP:
 				match = 1;
 				break;
 
 			case O_FORWARD_MAC:
 				printf("ipfw: opcode %d unimplemented\n",
 				    cmd->opcode);
 				break;
 
 			case O_GID:
 			case O_UID:
 			case O_JAIL:
 				/*
 				 * We only check offset == 0 && proto != 0,
 				 * as this ensures that we have a
 				 * packet with the ports info.
 				 */
 				if (offset != 0)
 					break;
 				if (proto == IPPROTO_TCP ||
 				    proto == IPPROTO_UDP)
 					match = check_uidgid(
 						    (ipfw_insn_u32 *)cmd,
 						    args, &ucred_lookup,
 #ifdef __FreeBSD__
 						    &ucred_cache);
 #else
 						    (void *)&ucred_cache);
 #endif
 				break;
 
 			case O_RECV:
 				match = iface_match(m->m_pkthdr.rcvif,
 				    (ipfw_insn_if *)cmd, chain, &tablearg);
 				break;
 
 			case O_XMIT:
 				match = iface_match(oif, (ipfw_insn_if *)cmd,
 				    chain, &tablearg);
 				break;
 
 			case O_VIA:
 				match = iface_match(oif ? oif :
 				    m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd,
 				    chain, &tablearg);
 				break;
 
 			case O_MACADDR2:
 				if (args->eh != NULL) {	/* have MAC header */
 					u_int32_t *want = (u_int32_t *)
 						((ipfw_insn_mac *)cmd)->addr;
 					u_int32_t *mask = (u_int32_t *)
 						((ipfw_insn_mac *)cmd)->mask;
 					u_int32_t *hdr = (u_int32_t *)args->eh;
 
 					match =
 					    ( want[0] == (hdr[0] & mask[0]) &&
 					      want[1] == (hdr[1] & mask[1]) &&
 					      want[2] == (hdr[2] & mask[2]) );
 				}
 				break;
 
 			case O_MAC_TYPE:
 				if (args->eh != NULL) {
 					u_int16_t *p =
 					    ((ipfw_insn_u16 *)cmd)->ports;
 					int i;
 
 					for (i = cmdlen - 1; !match && i>0;
 					    i--, p += 2)
 						match = (etype >= p[0] &&
 						    etype <= p[1]);
 				}
 				break;
 
 			case O_FRAG:
 				match = (offset != 0);
 				break;
 
 			case O_IN:	/* "out" is "not in" */
 				match = (oif == NULL);
 				break;
 
 			case O_LAYER2:
 				match = (args->eh != NULL);
 				break;
 
 			case O_DIVERTED:
 			    {
 				/* For diverted packets, args->rule.info
 				 * contains the divert port (in host format)
 				 * reason and direction.
 				 */
 				uint32_t i = args->rule.info;
 				match = (i&IPFW_IS_MASK) == IPFW_IS_DIVERT &&
 				    cmd->arg1 & ((i & IPFW_INFO_IN) ? 1 : 2);
 			    }
 				break;
 
 			case O_PROTO:
 				/*
 				 * We do not allow an arg of 0 so the
 				 * check of "proto" only suffices.
 				 */
 				match = (proto == cmd->arg1);
 				break;
 
 			case O_IP_SRC:
 				match = is_ipv4 &&
 				    (((ipfw_insn_ip *)cmd)->addr.s_addr ==
 				    src_ip.s_addr);
 				break;
 
 			case O_IP_SRC_LOOKUP:
 			case O_IP_DST_LOOKUP:
 				if (is_ipv4) {
 				    uint32_t key =
 					(cmd->opcode == O_IP_DST_LOOKUP) ?
 					    dst_ip.s_addr : src_ip.s_addr;
 				    uint32_t v = 0;
 
 				    if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) {
 					/* generic lookup. The key must be
 					 * in 32bit big-endian format.
 					 */
 					v = ((ipfw_insn_u32 *)cmd)->d[1];
 					if (v == 0)
 					    key = dst_ip.s_addr;
 					else if (v == 1)
 					    key = src_ip.s_addr;
 					else if (v == 6) /* dscp */
 					    key = (ip->ip_tos >> 2) & 0x3f;
 					else if (offset != 0)
 					    break;
 					else if (proto != IPPROTO_TCP &&
 						proto != IPPROTO_UDP)
 					    break;
 					else if (v == 2)
 					    key = dst_port;
 					else if (v == 3)
 					    key = src_port;
 #ifndef USERSPACE
 					else if (v == 4 || v == 5) {
 					    check_uidgid(
 						(ipfw_insn_u32 *)cmd,
 						args, &ucred_lookup,
 #ifdef __FreeBSD__
 						&ucred_cache);
 					    if (v == 4 /* O_UID */)
 						key = ucred_cache->cr_uid;
 					    else if (v == 5 /* O_JAIL */)
 						key = ucred_cache->cr_prison->pr_id;
 #else /* !__FreeBSD__ */
 						(void *)&ucred_cache);
 					    if (v ==4 /* O_UID */)
 						key = ucred_cache.uid;
 					    else if (v == 5 /* O_JAIL */)
 						key = ucred_cache.xid;
 #endif /* !__FreeBSD__ */
 					} else
 #endif /* !USERSPACE */
 					    break;
 				    }
 				    match = ipfw_lookup_table(chain,
 					cmd->arg1, key, &v);
 				    if (!match)
 					break;
 				    if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
 					match =
 					    ((ipfw_insn_u32 *)cmd)->d[0] == v;
 				    else
 					tablearg = v;
 				} else if (is_ipv6) {
 					uint32_t v = 0;
 					void *pkey = (cmd->opcode == O_IP_DST_LOOKUP) ?
 						&args->f_id.dst_ip6: &args->f_id.src_ip6;
 					match = ipfw_lookup_table_extended(chain,
 							cmd->arg1,
 							sizeof(struct in6_addr),
 							pkey, &v);
 					if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
 						match = ((ipfw_insn_u32 *)cmd)->d[0] == v;
 					if (match)
 						tablearg = v;
 				}
 				break;
 
 			case O_IP_FLOW_LOOKUP:
 				{
 					uint32_t v = 0;
 					match = ipfw_lookup_table_extended(chain,
 					    cmd->arg1, 0, &args->f_id, &v);
 					if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
 						match = ((ipfw_insn_u32 *)cmd)->d[0] == v;
 					if (match)
 						tablearg = v;
 				}
 				break;
 			case O_IP_SRC_MASK:
 			case O_IP_DST_MASK:
 				if (is_ipv4) {
 				    uint32_t a =
 					(cmd->opcode == O_IP_DST_MASK) ?
 					    dst_ip.s_addr : src_ip.s_addr;
 				    uint32_t *p = ((ipfw_insn_u32 *)cmd)->d;
 				    int i = cmdlen-1;
 
 				    for (; !match && i>0; i-= 2, p+= 2)
 					match = (p[0] == (a & p[1]));
 				}
 				break;
 
 			case O_IP_SRC_ME:
 				if (is_ipv4) {
 					struct ifnet *tif;
 
 					INADDR_TO_IFP(src_ip, tif);
 					match = (tif != NULL);
 					break;
 				}
 #ifdef INET6
 				/* FALLTHROUGH */
 			case O_IP6_SRC_ME:
 				match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6);
 #endif
 				break;
 
 			case O_IP_DST_SET:
 			case O_IP_SRC_SET:
 				if (is_ipv4) {
 					u_int32_t *d = (u_int32_t *)(cmd+1);
 					u_int32_t addr =
 					    cmd->opcode == O_IP_DST_SET ?
 						args->f_id.dst_ip :
 						args->f_id.src_ip;
 
 					    if (addr < d[0])
 						    break;
 					    addr -= d[0]; /* subtract base */
 					    match = (addr < cmd->arg1) &&
 						( d[ 1 + (addr>>5)] &
 						  (1<<(addr & 0x1f)) );
 				}
 				break;
 
 			case O_IP_DST:
 				match = is_ipv4 &&
 				    (((ipfw_insn_ip *)cmd)->addr.s_addr ==
 				    dst_ip.s_addr);
 				break;
 
 			case O_IP_DST_ME:
 				if (is_ipv4) {
 					struct ifnet *tif;
 
 					INADDR_TO_IFP(dst_ip, tif);
 					match = (tif != NULL);
 					break;
 				}
 #ifdef INET6
 				/* FALLTHROUGH */
 			case O_IP6_DST_ME:
 				match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6);
 #endif
 				break;
 
 
 			case O_IP_SRCPORT:
 			case O_IP_DSTPORT:
 				/*
 				 * offset == 0 && proto != 0 is enough
 				 * to guarantee that we have a
 				 * packet with port info.
 				 */
 				if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP)
 				    && offset == 0) {
 					u_int16_t x =
 					    (cmd->opcode == O_IP_SRCPORT) ?
 						src_port : dst_port ;
 					u_int16_t *p =
 					    ((ipfw_insn_u16 *)cmd)->ports;
 					int i;
 
 					for (i = cmdlen - 1; !match && i>0;
 					    i--, p += 2)
 						match = (x>=p[0] && x<=p[1]);
 				}
 				break;
 
 			case O_ICMPTYPE:
 				match = (offset == 0 && proto==IPPROTO_ICMP &&
 				    icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) );
 				break;
 
 #ifdef INET6
 			case O_ICMP6TYPE:
 				match = is_ipv6 && offset == 0 &&
 				    proto==IPPROTO_ICMPV6 &&
 				    icmp6type_match(
 					ICMP6(ulp)->icmp6_type,
 					(ipfw_insn_u32 *)cmd);
 				break;
 #endif /* INET6 */
 
 			case O_IPOPT:
 				match = (is_ipv4 &&
 				    ipopts_match(ip, cmd) );
 				break;
 
 			case O_IPVER:
 				match = (is_ipv4 &&
 				    cmd->arg1 == ip->ip_v);
 				break;
 
 			case O_IPID:
 			case O_IPLEN:
 			case O_IPTTL:
 				if (is_ipv4) {	/* only for IP packets */
 				    uint16_t x;
 				    uint16_t *p;
 				    int i;
 
 				    if (cmd->opcode == O_IPLEN)
 					x = iplen;
 				    else if (cmd->opcode == O_IPTTL)
 					x = ip->ip_ttl;
 				    else /* must be IPID */
 					x = ntohs(ip->ip_id);
 				    if (cmdlen == 1) {
 					match = (cmd->arg1 == x);
 					break;
 				    }
 				    /* otherwise we have ranges */
 				    p = ((ipfw_insn_u16 *)cmd)->ports;
 				    i = cmdlen - 1;
 				    for (; !match && i>0; i--, p += 2)
 					match = (x >= p[0] && x <= p[1]);
 				}
 				break;
 
 			case O_IPPRECEDENCE:
 				match = (is_ipv4 &&
 				    (cmd->arg1 == (ip->ip_tos & 0xe0)) );
 				break;
 
 			case O_IPTOS:
 				match = (is_ipv4 &&
 				    flags_match(cmd, ip->ip_tos));
 				break;
 
 			case O_DSCP:
 			    {
 				uint32_t *p;
 				uint16_t x;
 
 				p = ((ipfw_insn_u32 *)cmd)->d;
 
 				if (is_ipv4)
 					x = ip->ip_tos >> 2;
 				else if (is_ipv6) {
 					uint8_t *v;
 					v = &((struct ip6_hdr *)ip)->ip6_vfc;
 					x = (*v & 0x0F) << 2;
 					v++;
 					x |= *v >> 6;
 				} else
 					break;
 
 				/* DSCP bitmask is stored as low_u32 high_u32 */
 				if (x > 32)
 					match = *(p + 1) & (1 << (x - 32));
 				else
 					match = *p & (1 << x);
 			    }
 				break;
 
 			case O_TCPDATALEN:
 				if (proto == IPPROTO_TCP && offset == 0) {
 				    struct tcphdr *tcp;
 				    uint16_t x;
 				    uint16_t *p;
 				    int i;
 
 				    tcp = TCP(ulp);
 				    x = iplen -
 					((ip->ip_hl + tcp->th_off) << 2);
 				    if (cmdlen == 1) {
 					match = (cmd->arg1 == x);
 					break;
 				    }
 				    /* otherwise we have ranges */
 				    p = ((ipfw_insn_u16 *)cmd)->ports;
 				    i = cmdlen - 1;
 				    for (; !match && i>0; i--, p += 2)
 					match = (x >= p[0] && x <= p[1]);
 				}
 				break;
 
 			case O_TCPFLAGS:
 				match = (proto == IPPROTO_TCP && offset == 0 &&
 				    flags_match(cmd, TCP(ulp)->th_flags));
 				break;
 
 			case O_TCPOPTS:
 				if (proto == IPPROTO_TCP && offset == 0 && ulp){
 					PULLUP_LEN(hlen, ulp,
 					    (TCP(ulp)->th_off << 2));
 					match = tcpopts_match(TCP(ulp), cmd);
 				}
 				break;
 
 			case O_TCPSEQ:
 				match = (proto == IPPROTO_TCP && offset == 0 &&
 				    ((ipfw_insn_u32 *)cmd)->d[0] ==
 					TCP(ulp)->th_seq);
 				break;
 
 			case O_TCPACK:
 				match = (proto == IPPROTO_TCP && offset == 0 &&
 				    ((ipfw_insn_u32 *)cmd)->d[0] ==
 					TCP(ulp)->th_ack);
 				break;
 
 			case O_TCPWIN:
 				if (proto == IPPROTO_TCP && offset == 0) {
 				    uint16_t x;
 				    uint16_t *p;
 				    int i;
 
 				    x = ntohs(TCP(ulp)->th_win);
 				    if (cmdlen == 1) {
 					match = (cmd->arg1 == x);
 					break;
 				    }
 				    /* Otherwise we have ranges. */
 				    p = ((ipfw_insn_u16 *)cmd)->ports;
 				    i = cmdlen - 1;
 				    for (; !match && i > 0; i--, p += 2)
 					match = (x >= p[0] && x <= p[1]);
 				}
 				break;
 
 			case O_ESTAB:
 				/* reject packets which have SYN only */
 				/* XXX should i also check for TH_ACK ? */
 				match = (proto == IPPROTO_TCP && offset == 0 &&
 				    (TCP(ulp)->th_flags &
 				     (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
 				break;
 
 			case O_ALTQ: {
 				struct pf_mtag *at;
 				struct m_tag *mtag;
 				ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
 
 				/*
 				 * ALTQ uses mbuf tags from another
 				 * packet filtering system - pf(4).
 				 * We allocate a tag in its format
 				 * and fill it in, pretending to be pf(4).
 				 */
 				match = 1;
 				at = pf_find_mtag(m);
 				if (at != NULL && at->qid != 0)
 					break;
 				mtag = m_tag_get(PACKET_TAG_PF,
 				    sizeof(struct pf_mtag), M_NOWAIT | M_ZERO);
 				if (mtag == NULL) {
 					/*
 					 * Let the packet fall back to the
 					 * default ALTQ.
 					 */
 					break;
 				}
 				m_tag_prepend(m, mtag);
 				at = (struct pf_mtag *)(mtag + 1);
 				at->qid = altq->qid;
 				at->hdr = ip;
 				break;
 			}
 
 			case O_LOG:
 				ipfw_log(chain, f, hlen, args, m,
 				    oif, offset | ip6f_mf, tablearg, ip);
 				match = 1;
 				break;
 
 			case O_PROB:
 				match = (random()<((ipfw_insn_u32 *)cmd)->d[0]);
 				break;
 
 			case O_VERREVPATH:
 				/* Outgoing packets automatically pass/match */
 				match = ((oif != NULL) ||
 				    (m->m_pkthdr.rcvif == NULL) ||
 				    (
 #ifdef INET6
 				    is_ipv6 ?
 					verify_path6(&(args->f_id.src_ip6),
 					    m->m_pkthdr.rcvif, args->f_id.fib) :
 #endif
 				    verify_path(src_ip, m->m_pkthdr.rcvif,
 				        args->f_id.fib)));
 				break;
 
 			case O_VERSRCREACH:
 				/* Outgoing packets automatically pass/match */
 				match = (hlen > 0 && ((oif != NULL) ||
 #ifdef INET6
 				    is_ipv6 ?
 				        verify_path6(&(args->f_id.src_ip6),
 				            NULL, args->f_id.fib) :
 #endif
 				    verify_path(src_ip, NULL, args->f_id.fib)));
 				break;
 
 			case O_ANTISPOOF:
 				/* Outgoing packets automatically pass/match */
 				if (oif == NULL && hlen > 0 &&
 				    (  (is_ipv4 && in_localaddr(src_ip))
 #ifdef INET6
 				    || (is_ipv6 &&
 				        in6_localaddr(&(args->f_id.src_ip6)))
 #endif
 				    ))
 					match =
 #ifdef INET6
 					    is_ipv6 ? verify_path6(
 					        &(args->f_id.src_ip6),
 					        m->m_pkthdr.rcvif,
 						args->f_id.fib) :
 #endif
 					    verify_path(src_ip,
 					    	m->m_pkthdr.rcvif,
 					        args->f_id.fib);
 				else
 					match = 1;
 				break;
 
 			case O_IPSEC:
 #ifdef IPSEC
 				match = (m_tag_find(m,
 				    PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL);
 #endif
 				/* otherwise no match */
 				break;
 
 #ifdef INET6
 			case O_IP6_SRC:
 				match = is_ipv6 &&
 				    IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6,
 				    &((ipfw_insn_ip6 *)cmd)->addr6);
 				break;
 
 			case O_IP6_DST:
 				match = is_ipv6 &&
 				IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6,
 				    &((ipfw_insn_ip6 *)cmd)->addr6);
 				break;
 			case O_IP6_SRC_MASK:
 			case O_IP6_DST_MASK:
 				if (is_ipv6) {
 					int i = cmdlen - 1;
 					struct in6_addr p;
 					struct in6_addr *d =
 					    &((ipfw_insn_ip6 *)cmd)->addr6;
 
 					for (; !match && i > 0; d += 2,
 					    i -= F_INSN_SIZE(struct in6_addr)
 					    * 2) {
 						p = (cmd->opcode ==
 						    O_IP6_SRC_MASK) ?
 						    args->f_id.src_ip6:
 						    args->f_id.dst_ip6;
 						APPLY_MASK(&p, &d[1]);
 						match =
 						    IN6_ARE_ADDR_EQUAL(&d[0],
 						    &p);
 					}
 				}
 				break;
 
 			case O_FLOW6ID:
 				match = is_ipv6 &&
 				    flow6id_match(args->f_id.flow_id6,
 				    (ipfw_insn_u32 *) cmd);
 				break;
 
 			case O_EXT_HDR:
 				match = is_ipv6 &&
 				    (ext_hd & ((ipfw_insn *) cmd)->arg1);
 				break;
 
 			case O_IP6:
 				match = is_ipv6;
 				break;
 #endif
 
 			case O_IP4:
 				match = is_ipv4;
 				break;
 
 			case O_TAG: {
 				struct m_tag *mtag;
 				uint32_t tag = TARG(cmd->arg1, tag);
 
 				/* Packet is already tagged with this tag? */
 				mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL);
 
 				/* We have `untag' action when F_NOT flag is
 				 * present. And we must remove this mtag from
 				 * mbuf and reset `match' to zero (`match' will
 				 * be inversed later).
 				 * Otherwise we should allocate new mtag and
 				 * push it into mbuf.
 				 */
 				if (cmd->len & F_NOT) { /* `untag' action */
 					if (mtag != NULL)
 						m_tag_delete(m, mtag);
 					match = 0;
 				} else {
 					if (mtag == NULL) {
 						mtag = m_tag_alloc( MTAG_IPFW,
 						    tag, 0, M_NOWAIT);
 						if (mtag != NULL)
 							m_tag_prepend(m, mtag);
 					}
 					match = 1;
 				}
 				break;
 			}
 
 			case O_FIB: /* try match the specified fib */
 				if (args->f_id.fib == cmd->arg1)
 					match = 1;
 				break;
 
 			case O_SOCKARG:	{
 #ifndef USERSPACE	/* not supported in userspace */
 				struct inpcb *inp = args->inp;
 				struct inpcbinfo *pi;
 				
 				if (is_ipv6) /* XXX can we remove this ? */
 					break;
 
 				if (proto == IPPROTO_TCP)
 					pi = &V_tcbinfo;
 				else if (proto == IPPROTO_UDP)
 					pi = &V_udbinfo;
 				else
 					break;
 
 				/*
 				 * XXXRW: so_user_cookie should almost
 				 * certainly be inp_user_cookie?
 				 */
 
 				/* For incomming packet, lookup up the 
 				inpcb using the src/dest ip/port tuple */
 				if (inp == NULL) {
 					inp = in_pcblookup(pi, 
 						src_ip, htons(src_port),
 						dst_ip, htons(dst_port),
 						INPLOOKUP_RLOCKPCB, NULL);
 					if (inp != NULL) {
 						tablearg =
 						    inp->inp_socket->so_user_cookie;
 						if (tablearg)
 							match = 1;
 						INP_RUNLOCK(inp);
 					}
 				} else {
 					if (inp->inp_socket) {
 						tablearg =
 						    inp->inp_socket->so_user_cookie;
 						if (tablearg)
 							match = 1;
 					}
 				}
 #endif /* !USERSPACE */
 				break;
 			}
 
 			case O_TAGGED: {
 				struct m_tag *mtag;
 				uint32_t tag = TARG(cmd->arg1, tag);
 
 				if (cmdlen == 1) {
 					match = m_tag_locate(m, MTAG_IPFW,
 					    tag, NULL) != NULL;
 					break;
 				}
 
 				/* we have ranges */
 				for (mtag = m_tag_first(m);
 				    mtag != NULL && !match;
 				    mtag = m_tag_next(m, mtag)) {
 					uint16_t *p;
 					int i;
 
 					if (mtag->m_tag_cookie != MTAG_IPFW)
 						continue;
 
 					p = ((ipfw_insn_u16 *)cmd)->ports;
 					i = cmdlen - 1;
 					for(; !match && i > 0; i--, p += 2)
 						match =
 						    mtag->m_tag_id >= p[0] &&
 						    mtag->m_tag_id <= p[1];
 				}
 				break;
 			}
 				
 			/*
 			 * The second set of opcodes represents 'actions',
 			 * i.e. the terminal part of a rule once the packet
 			 * matches all previous patterns.
 			 * Typically there is only one action for each rule,
 			 * and the opcode is stored at the end of the rule
 			 * (but there are exceptions -- see below).
 			 *
 			 * In general, here we set retval and terminate the
 			 * outer loop (would be a 'break 3' in some language,
 			 * but we need to set l=0, done=1)
 			 *
 			 * Exceptions:
 			 * O_COUNT and O_SKIPTO actions:
 			 *   instead of terminating, we jump to the next rule
 			 *   (setting l=0), or to the SKIPTO target (setting
 			 *   f/f_len, cmd and l as needed), respectively.
 			 *
 			 * O_TAG, O_LOG and O_ALTQ action parameters:
 			 *   perform some action and set match = 1;
 			 *
 			 * O_LIMIT and O_KEEP_STATE: these opcodes are
 			 *   not real 'actions', and are stored right
 			 *   before the 'action' part of the rule.
 			 *   These opcodes try to install an entry in the
 			 *   state tables; if successful, we continue with
 			 *   the next opcode (match=1; break;), otherwise
 			 *   the packet must be dropped (set retval,
 			 *   break loops with l=0, done=1)
 			 *
 			 * O_PROBE_STATE and O_CHECK_STATE: these opcodes
 			 *   cause a lookup of the state table, and a jump
 			 *   to the 'action' part of the parent rule
 			 *   if an entry is found, or
 			 *   (CHECK_STATE only) a jump to the next rule if
 			 *   the entry is not found.
 			 *   The result of the lookup is cached so that
 			 *   further instances of these opcodes become NOPs.
 			 *   The jump to the next rule is done by setting
 			 *   l=0, cmdlen=0.
 			 */
 			case O_LIMIT:
 			case O_KEEP_STATE:
 				if (ipfw_install_state(chain, f,
 				    (ipfw_insn_limit *)cmd, args, tablearg)) {
 					/* error or limit violation */
 					retval = IP_FW_DENY;
 					l = 0;	/* exit inner loop */
 					done = 1; /* exit outer loop */
 				}
 				match = 1;
 				break;
 
 			case O_PROBE_STATE:
 			case O_CHECK_STATE:
 				/*
 				 * dynamic rules are checked at the first
 				 * keep-state or check-state occurrence,
 				 * with the result being stored in dyn_dir.
 				 * The compiler introduces a PROBE_STATE
 				 * instruction for us when we have a
 				 * KEEP_STATE (because PROBE_STATE needs
 				 * to be run first).
 				 */
 				if (dyn_dir == MATCH_UNKNOWN &&
 				    (q = ipfw_lookup_dyn_rule(&args->f_id,
 				     &dyn_dir, proto == IPPROTO_TCP ?
 					TCP(ulp) : NULL))
 					!= NULL) {
 					/*
 					 * Found dynamic entry, update stats
 					 * and jump to the 'action' part of
 					 * the parent rule by setting
 					 * f, cmd, l and clearing cmdlen.
 					 */
 					IPFW_INC_DYN_COUNTER(q, pktlen);
 					/* XXX we would like to have f_pos
 					 * readily accessible in the dynamic
 				         * rule, instead of having to
 					 * lookup q->rule.
 					 */
 					f = q->rule;
 					f_pos = ipfw_find_rule(chain,
 						f->rulenum, f->id);
 					cmd = ACTION_PTR(f);
 					l = f->cmd_len - f->act_ofs;
 					ipfw_dyn_unlock(q);
 					cmdlen = 0;
 					match = 1;
 					break;
 				}
 				/*
 				 * Dynamic entry not found. If CHECK_STATE,
 				 * skip to next rule, if PROBE_STATE just
 				 * ignore and continue with next opcode.
 				 */
 				if (cmd->opcode == O_CHECK_STATE)
 					l = 0;	/* exit inner loop */
 				match = 1;
 				break;
 
 			case O_ACCEPT:
 				retval = 0;	/* accept */
 				l = 0;		/* exit inner loop */
 				done = 1;	/* exit outer loop */
 				break;
 
 			case O_PIPE:
 			case O_QUEUE:
 				set_match(args, f_pos, chain);
 				args->rule.info = TARG(cmd->arg1, pipe);
 				if (cmd->opcode == O_PIPE)
 					args->rule.info |= IPFW_IS_PIPE;
 				if (V_fw_one_pass)
 					args->rule.info |= IPFW_ONEPASS;
 				retval = IP_FW_DUMMYNET;
 				l = 0;          /* exit inner loop */
 				done = 1;       /* exit outer loop */
 				break;
 
 			case O_DIVERT:
 			case O_TEE:
 				if (args->eh) /* not on layer 2 */
 				    break;
 				/* otherwise this is terminal */
 				l = 0;		/* exit inner loop */
 				done = 1;	/* exit outer loop */
 				retval = (cmd->opcode == O_DIVERT) ?
 					IP_FW_DIVERT : IP_FW_TEE;
 				set_match(args, f_pos, chain);
 				args->rule.info = TARG(cmd->arg1, divert);
 				break;
 
 			case O_COUNT:
 				IPFW_INC_RULE_COUNTER(f, pktlen);
 				l = 0;		/* exit inner loop */
 				break;
 
 			case O_SKIPTO:
 			    IPFW_INC_RULE_COUNTER(f, pktlen);
 			    f_pos = JUMP(chain, f, cmd->arg1, tablearg, 0);
 			    /*
 			     * Skip disabled rules, and re-enter
 			     * the inner loop with the correct
 			     * f_pos, f, l and cmd.
 			     * Also clear cmdlen and skip_or
 			     */
 			    for (; f_pos < chain->n_rules - 1 &&
 				    (V_set_disable &
 				     (1 << chain->map[f_pos]->set));
 				    f_pos++)
 				;
 			    /* Re-enter the inner loop at the skipto rule. */
 			    f = chain->map[f_pos];
 			    l = f->cmd_len;
 			    cmd = f->cmd;
 			    match = 1;
 			    cmdlen = 0;
 			    skip_or = 0;
 			    continue;
 			    break;	/* not reached */
 
 			case O_CALLRETURN: {
 				/*
 				 * Implementation of `subroutine' call/return,
 				 * in the stack carried in an mbuf tag. This
 				 * is different from `skipto' in that any call
 				 * address is possible (`skipto' must prevent
 				 * backward jumps to avoid endless loops).
 				 * We have `return' action when F_NOT flag is
 				 * present. The `m_tag_id' field is used as
 				 * stack pointer.
 				 */
 				struct m_tag *mtag;
 				uint16_t jmpto, *stack;
 
 #define	IS_CALL		((cmd->len & F_NOT) == 0)
 #define	IS_RETURN	((cmd->len & F_NOT) != 0)
 				/*
 				 * Hand-rolled version of m_tag_locate() with
 				 * wildcard `type'.
 				 * If not already tagged, allocate new tag.
 				 */
 				mtag = m_tag_first(m);
 				while (mtag != NULL) {
 					if (mtag->m_tag_cookie ==
 					    MTAG_IPFW_CALL)
 						break;
 					mtag = m_tag_next(m, mtag);
 				}
 				if (mtag == NULL && IS_CALL) {
 					mtag = m_tag_alloc(MTAG_IPFW_CALL, 0,
 					    IPFW_CALLSTACK_SIZE *
 					    sizeof(uint16_t), M_NOWAIT);
 					if (mtag != NULL)
 						m_tag_prepend(m, mtag);
 				}
 
 				/*
 				 * On error both `call' and `return' just
 				 * continue with next rule.
 				 */
 				if (IS_RETURN && (mtag == NULL ||
 				    mtag->m_tag_id == 0)) {
 					l = 0;		/* exit inner loop */
 					break;
 				}
 				if (IS_CALL && (mtag == NULL ||
 				    mtag->m_tag_id >= IPFW_CALLSTACK_SIZE)) {
 					printf("ipfw: call stack error, "
 					    "go to next rule\n");
 					l = 0;		/* exit inner loop */
 					break;
 				}
 
 				IPFW_INC_RULE_COUNTER(f, pktlen);
 				stack = (uint16_t *)(mtag + 1);
 
 				/*
 				 * The `call' action may use cached f_pos
 				 * (in f->next_rule), whose version is written
 				 * in f->next_rule.
 				 * The `return' action, however, doesn't have
 				 * fixed jump address in cmd->arg1 and can't use
 				 * cache.
 				 */
 				if (IS_CALL) {
 					stack[mtag->m_tag_id] = f->rulenum;
 					mtag->m_tag_id++;
 			    		f_pos = JUMP(chain, f, cmd->arg1,
 					    tablearg, 1);
 				} else {	/* `return' action */
 					mtag->m_tag_id--;
 					jmpto = stack[mtag->m_tag_id] + 1;
 					f_pos = ipfw_find_rule(chain, jmpto, 0);
 				}
 
 				/*
 				 * Skip disabled rules, and re-enter
 				 * the inner loop with the correct
 				 * f_pos, f, l and cmd.
 				 * Also clear cmdlen and skip_or
 				 */
 				for (; f_pos < chain->n_rules - 1 &&
 				    (V_set_disable &
 				    (1 << chain->map[f_pos]->set)); f_pos++)
 					;
 				/* Re-enter the inner loop at the dest rule. */
 				f = chain->map[f_pos];
 				l = f->cmd_len;
 				cmd = f->cmd;
 				cmdlen = 0;
 				skip_or = 0;
 				continue;
 				break;	/* NOTREACHED */
 			}
 #undef IS_CALL
 #undef IS_RETURN
 
 			case O_REJECT:
 				/*
 				 * Drop the packet and send a reject notice
 				 * if the packet is not ICMP (or is an ICMP
 				 * query), and it is not multicast/broadcast.
 				 */
 				if (hlen > 0 && is_ipv4 && offset == 0 &&
 				    (proto != IPPROTO_ICMP ||
 				     is_icmp_query(ICMP(ulp))) &&
 				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
 				    !IN_MULTICAST(ntohl(dst_ip.s_addr))) {
 					send_reject(args, cmd->arg1, iplen, ip);
 					m = args->m;
 				}
 				/* FALLTHROUGH */
 #ifdef INET6
 			case O_UNREACH6:
 				if (hlen > 0 && is_ipv6 &&
 				    ((offset & IP6F_OFF_MASK) == 0) &&
 				    (proto != IPPROTO_ICMPV6 ||
 				     (is_icmp6_query(icmp6_type) == 1)) &&
 				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
 				    !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) {
 					send_reject6(
 					    args, cmd->arg1, hlen,
 					    (struct ip6_hdr *)ip);
 					m = args->m;
 				}
 				/* FALLTHROUGH */
 #endif
 			case O_DENY:
 				retval = IP_FW_DENY;
 				l = 0;		/* exit inner loop */
 				done = 1;	/* exit outer loop */
 				break;
 
 			case O_FORWARD_IP:
 				if (args->eh)	/* not valid on layer2 pkts */
 					break;
 				if (q == NULL || q->rule != f ||
 				    dyn_dir == MATCH_FORWARD) {
 				    struct sockaddr_in *sa;
 				    sa = &(((ipfw_insn_sa *)cmd)->sa);
 				    if (sa->sin_addr.s_addr == INADDR_ANY) {
 					bcopy(sa, &args->hopstore,
 							sizeof(*sa));
 					args->hopstore.sin_addr.s_addr =
 						    htonl(tablearg);
 					args->next_hop = &args->hopstore;
 				    } else {
 					args->next_hop = sa;
 				    }
 				}
 				retval = IP_FW_PASS;
 				l = 0;          /* exit inner loop */
 				done = 1;       /* exit outer loop */
 				break;
 
 #ifdef INET6
 			case O_FORWARD_IP6:
 				if (args->eh)	/* not valid on layer2 pkts */
 					break;
 				if (q == NULL || q->rule != f ||
 				    dyn_dir == MATCH_FORWARD) {
 					struct sockaddr_in6 *sin6;
 
 					sin6 = &(((ipfw_insn_sa6 *)cmd)->sa);
 					args->next_hop6 = sin6;
 				}
 				retval = IP_FW_PASS;
 				l = 0;		/* exit inner loop */
 				done = 1;	/* exit outer loop */
 				break;
 #endif
 
 			case O_NETGRAPH:
 			case O_NGTEE:
 				set_match(args, f_pos, chain);
 				args->rule.info = TARG(cmd->arg1, netgraph);
 				if (V_fw_one_pass)
 					args->rule.info |= IPFW_ONEPASS;
 				retval = (cmd->opcode == O_NETGRAPH) ?
 				    IP_FW_NETGRAPH : IP_FW_NGTEE;
 				l = 0;          /* exit inner loop */
 				done = 1;       /* exit outer loop */
 				break;
 
 			case O_SETFIB: {
 				uint32_t fib;
 
 				IPFW_INC_RULE_COUNTER(f, pktlen);
 				fib = TARG(cmd->arg1, fib) & 0x7FFFF;
 				if (fib >= rt_numfibs)
 					fib = 0;
 				M_SETFIB(m, fib);
 				args->f_id.fib = fib;
 				l = 0;		/* exit inner loop */
 				break;
 		        }
 
 			case O_SETDSCP: {
 				uint16_t code;
 
 				code = TARG(cmd->arg1, dscp) & 0x3F;
 				l = 0;		/* exit inner loop */
 				if (is_ipv4) {
 					uint16_t a;
 
 					a = ip->ip_tos;
 					ip->ip_tos = (code << 2) | (ip->ip_tos & 0x03);
 					a += ntohs(ip->ip_sum) - ip->ip_tos;
 					ip->ip_sum = htons(a);
 				} else if (is_ipv6) {
 					uint8_t *v;
 
 					v = &((struct ip6_hdr *)ip)->ip6_vfc;
 					*v = (*v & 0xF0) | (code >> 2);
 					v++;
 					*v = (*v & 0x3F) | ((code & 0x03) << 6);
 				} else
 					break;
 
 				IPFW_INC_RULE_COUNTER(f, pktlen);
 				break;
 			}
 
 			case O_NAT:
 				l = 0;          /* exit inner loop */
 				done = 1;       /* exit outer loop */
  				if (!IPFW_NAT_LOADED) {
 				    retval = IP_FW_DENY;
 				    break;
 				}
 
 				struct cfg_nat *t;
 				int nat_id;
 
 				set_match(args, f_pos, chain);
 				/* Check if this is 'global' nat rule */
 				if (cmd->arg1 == 0) {
 					retval = ipfw_nat_ptr(args, NULL, m);
 					break;
 				}
 				t = ((ipfw_insn_nat *)cmd)->nat;
 				if (t == NULL) {
 					nat_id = TARG(cmd->arg1, nat);
 					t = (*lookup_nat_ptr)(&chain->nat, nat_id);
 
 					if (t == NULL) {
 					    retval = IP_FW_DENY;
 					    break;
 					}
 					if (cmd->arg1 != IP_FW_TARG)
 					    ((ipfw_insn_nat *)cmd)->nat = t;
 				}
 				retval = ipfw_nat_ptr(args, t, m);
 				break;
 
 			case O_REASS: {
 				int ip_off;
 
 				IPFW_INC_RULE_COUNTER(f, pktlen);
 				l = 0;	/* in any case exit inner loop */
 				ip_off = ntohs(ip->ip_off);
 
 				/* if not fragmented, go to next rule */
 				if ((ip_off & (IP_MF | IP_OFFMASK)) == 0)
 				    break;
 
 				args->m = m = ip_reass(m);
 
 				/*
 				 * do IP header checksum fixup.
 				 */
 				if (m == NULL) { /* fragment got swallowed */
 				    retval = IP_FW_DENY;
 				} else { /* good, packet complete */
 				    int hlen;
 
 				    ip = mtod(m, struct ip *);
 				    hlen = ip->ip_hl << 2;
 				    ip->ip_sum = 0;
 				    if (hlen == sizeof(struct ip))
 					ip->ip_sum = in_cksum_hdr(ip);
 				    else
 					ip->ip_sum = in_cksum(m, hlen);
 				    retval = IP_FW_REASS;
 				    set_match(args, f_pos, chain);
 				}
 				done = 1;	/* exit outer loop */
 				break;
 			}
 
 			default:
 				panic("-- unknown opcode %d\n", cmd->opcode);
 			} /* end of switch() on opcodes */
 			/*
 			 * if we get here with l=0, then match is irrelevant.
 			 */
 
 			if (cmd->len & F_NOT)
 				match = !match;
 
 			if (match) {
 				if (cmd->len & F_OR)
 					skip_or = 1;
 			} else {
 				if (!(cmd->len & F_OR)) /* not an OR block, */
 					break;		/* try next rule    */
 			}
 
 		}	/* end of inner loop, scan opcodes */
 #undef PULLUP_LEN
 
 		if (done)
 			break;
 
 /* next_rule:; */	/* try next rule		*/
 
 	}		/* end of outer for, scan rules */
 
 	if (done) {
 		struct ip_fw *rule = chain->map[f_pos];
 		/* Update statistics */
 		IPFW_INC_RULE_COUNTER(rule, pktlen);
 	} else {
 		retval = IP_FW_DENY;
 		printf("ipfw: ouch!, skip past end of rules, denying packet\n");
 	}
 	IPFW_PF_RUNLOCK(chain);
 #ifdef __FreeBSD__
 	if (ucred_cache != NULL)
 		crfree(ucred_cache);
 #endif
 	return (retval);
 
 pullup_failed:
 	if (V_fw_verbose)
 		printf("ipfw: pullup failed\n");
 	return (IP_FW_DENY);
 }
 
 /*
  * Set maximum number of tables that can be used in given VNET ipfw instance.
  */
 #ifdef SYSCTL_NODE
 static int
 sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	unsigned int ntables;
 
 	ntables = V_fw_tables_max;
 
 	error = sysctl_handle_int(oidp, &ntables, 0, req);
 	/* Read operation or some error */
 	if ((error != 0) || (req->newptr == NULL))
 		return (error);
 
 	return (ipfw_resize_tables(&V_layer3_chain, ntables));
 }
 
 /*
  * Switches table namespace between global and per-set.
  */
 static int
 sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	unsigned int sets;
 
 	sets = V_fw_tables_sets;
 
 	error = sysctl_handle_int(oidp, &sets, 0, req);
 	/* Read operation or some error */
 	if ((error != 0) || (req->newptr == NULL))
 		return (error);
 
 	return (ipfw_switch_tables_namespace(&V_layer3_chain, sets));
 }
 #endif
 
 /*
  * Module and VNET glue
  */
 
 /*
  * Stuff that must be initialised only on boot or module load
  */
 static int
 ipfw_init(void)
 {
 	int error = 0;
 
 	/*
  	 * Only print out this stuff the first time around,
 	 * when called from the sysinit code.
 	 */
 	printf("ipfw2 "
 #ifdef INET6
 		"(+ipv6) "
 #endif
 		"initialized, divert %s, nat %s, "
 		"default to %s, logging ",
 #ifdef IPDIVERT
 		"enabled",
 #else
 		"loadable",
 #endif
 #ifdef IPFIREWALL_NAT
 		"enabled",
 #else
 		"loadable",
 #endif
 		default_to_accept ? "accept" : "deny");
 
 	/*
 	 * Note: V_xxx variables can be accessed here but the vnet specific
 	 * initializer may not have been called yet for the VIMAGE case.
 	 * Tuneables will have been processed. We will print out values for
 	 * the default vnet. 
 	 * XXX This should all be rationalized AFTER 8.0
 	 */
 	if (V_fw_verbose == 0)
 		printf("disabled\n");
 	else if (V_verbose_limit == 0)
 		printf("unlimited\n");
 	else
 		printf("limited to %d packets/entry by default\n",
 		    V_verbose_limit);
 
 	/* Check user-supplied table count for validness */
 	if (default_fw_tables > IPFW_TABLES_MAX)
 	  default_fw_tables = IPFW_TABLES_MAX;
 
 	ipfw_init_sopt_handler();
 	ipfw_log_bpf(1); /* init */
 	ipfw_iface_init();
 	return (error);
 }
 
 /*
  * Called for the removal of the last instance only on module unload.
  */
 static void
 ipfw_destroy(void)
 {
 
 	ipfw_iface_destroy();
 	ipfw_log_bpf(0); /* uninit */
 	ipfw_destroy_sopt_handler();
 	printf("IP firewall unloaded\n");
 }
 
 /*
  * Stuff that must be initialized for every instance
  * (including the first of course).
  */
 static int
 vnet_ipfw_init(const void *unused)
 {
 	int error, first;
 	struct ip_fw *rule = NULL;
 	struct ip_fw_chain *chain;
 
 	chain = &V_layer3_chain;
 
 	first = IS_DEFAULT_VNET(curvnet) ? 1 : 0;
 
 	/* First set up some values that are compile time options */
 	V_autoinc_step = 100;	/* bounded to 1..1000 in add_rule() */
 	V_fw_deny_unknown_exthdrs = 1;
 #ifdef IPFIREWALL_VERBOSE
 	V_fw_verbose = 1;
 #endif
 #ifdef IPFIREWALL_VERBOSE_LIMIT
 	V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
 #endif
 #ifdef IPFIREWALL_NAT
 	LIST_INIT(&chain->nat);
 #endif
 
 	ipfw_init_counters();
 	/* insert the default rule and create the initial map */
 	chain->n_rules = 1;
 	chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_WAITOK | M_ZERO);
 	rule = ipfw_alloc_rule(chain, sizeof(struct ip_fw));
 
 	/* Set initial number of tables */
 	V_fw_tables_max = default_fw_tables;
 	error = ipfw_init_tables(chain, first);
 	if (error) {
 		printf("ipfw2: setting up tables failed\n");
 		free(chain->map, M_IPFW);
 		free(rule, M_IPFW);
 		return (ENOSPC);
 	}
 
 	/* fill and insert the default rule */
 	rule->act_ofs = 0;
 	rule->rulenum = IPFW_DEFAULT_RULE;
 	rule->cmd_len = 1;
 	rule->set = RESVD_SET;
 	rule->cmd[0].len = 1;
 	rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY;
 	chain->default_rule = chain->map[0] = rule;
 	chain->id = rule->id = 1;
 	/* Pre-calculate rules length for legacy dump format */
 	chain->static_len = sizeof(struct ip_fw_rule0);
 
 	IPFW_LOCK_INIT(chain);
 	ipfw_dyn_init(chain);
 #ifdef LINEAR_SKIPTO
 	ipfw_init_skipto_cache(chain);
 #endif
 
 	/* First set up some values that are compile time options */
 	V_ipfw_vnet_ready = 1;		/* Open for business */
 
 	/*
 	 * Hook the sockopt handler and pfil hooks for ipv4 and ipv6.
 	 * Even if the latter two fail we still keep the module alive
 	 * because the sockopt and layer2 paths are still useful.
 	 * ipfw[6]_hook return 0 on success, ENOENT on failure,
 	 * so we can ignore the exact return value and just set a flag.
 	 *
 	 * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so
 	 * changes in the underlying (per-vnet) variables trigger
 	 * immediate hook()/unhook() calls.
 	 * In layer2 we have the same behaviour, except that V_ether_ipfw
 	 * is checked on each packet because there are no pfil hooks.
 	 */
 	V_ip_fw_ctl_ptr = ipfw_ctl3;
 	error = ipfw_attach_hooks(1);
 	return (error);
 }
 
 /*
  * Called for the removal of each instance.
  */
 static int
 vnet_ipfw_uninit(const void *unused)
 {
 	struct ip_fw *reap;
 	struct ip_fw_chain *chain = &V_layer3_chain;
 	int i, last;
 
 	V_ipfw_vnet_ready = 0; /* tell new callers to go away */
 	/*
 	 * disconnect from ipv4, ipv6, layer2 and sockopt.
 	 * Then grab, release and grab again the WLOCK so we make
 	 * sure the update is propagated and nobody will be in.
 	 */
 	(void)ipfw_attach_hooks(0 /* detach */);
 	V_ip_fw_ctl_ptr = NULL;
 
 	last = IS_DEFAULT_VNET(curvnet) ? 1 : 0;
 
 	IPFW_UH_WLOCK(chain);
 	IPFW_UH_WUNLOCK(chain);
 	IPFW_UH_WLOCK(chain);
 
 	IPFW_WLOCK(chain);
 	ipfw_dyn_uninit(0);	/* run the callout_drain */
 	IPFW_WUNLOCK(chain);
 
 	reap = NULL;
 	IPFW_WLOCK(chain);
 	for (i = 0; i < chain->n_rules; i++)
 		ipfw_reap_add(chain, &reap, chain->map[i]);
 	free(chain->map, M_IPFW);
 #ifdef LINEAR_SKIPTO
 	ipfw_destroy_skipto_cache(chain);
 #endif
 	IPFW_WUNLOCK(chain);
 	IPFW_UH_WUNLOCK(chain);
 	ipfw_destroy_tables(chain, last);
 	if (reap != NULL)
 		ipfw_reap_rules(reap);
 	vnet_ipfw_iface_destroy(chain);
 	IPFW_LOCK_DESTROY(chain);
 	ipfw_dyn_uninit(1);	/* free the remaining parts */
 	ipfw_destroy_counters();
 	return (0);
 }
 
 /*
  * Module event handler.
  * In general we have the choice of handling most of these events by the
  * event handler or by the (VNET_)SYS(UN)INIT handlers. I have chosen to
  * use the SYSINIT handlers as they are more capable of expressing the
  * flow of control during module and vnet operations, so this is just
  * a skeleton. Note there is no SYSINIT equivalent of the module
  * SHUTDOWN handler, but we don't have anything to do in that case anyhow.
  */
 static int
 ipfw_modevent(module_t mod, int type, void *unused)
 {
 	int err = 0;
 
 	switch (type) {
 	case MOD_LOAD:
 		/* Called once at module load or
 	 	 * system boot if compiled in. */
 		break;
 	case MOD_QUIESCE:
 		/* Called before unload. May veto unloading. */
 		break;
 	case MOD_UNLOAD:
 		/* Called during unload. */
 		break;
 	case MOD_SHUTDOWN:
 		/* Called during system shutdown. */
 		break;
 	default:
 		err = EOPNOTSUPP;
 		break;
 	}
 	return err;
 }
 
 static moduledata_t ipfwmod = {
 	"ipfw",
 	ipfw_modevent,
 	0
 };
 
 /* Define startup order. */
 #define	IPFW_SI_SUB_FIREWALL	SI_SUB_PROTO_IFATTACHDOMAIN
 #define	IPFW_MODEVENT_ORDER	(SI_ORDER_ANY - 255) /* On boot slot in here. */
 #define	IPFW_MODULE_ORDER	(IPFW_MODEVENT_ORDER + 1) /* A little later. */
 #define	IPFW_VNET_ORDER		(IPFW_MODEVENT_ORDER + 2) /* Later still. */
 
 DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER);
 FEATURE(ipfw_ctl3, "ipfw new sockopt calls");
 MODULE_VERSION(ipfw, 3);
 /* should declare some dependencies here */
 
 /*
  * Starting up. Done in order after ipfwmod() has been called.
  * VNET_SYSINIT is also called for each existing vnet and each new vnet.
  */
 SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER,
 	    ipfw_init, NULL);
 VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER,
 	    vnet_ipfw_init, NULL);
  
 /*
  * Closing up shop. These are done in REVERSE ORDER, but still
  * after ipfwmod() has been called. Not called on reboot.
  * VNET_SYSUNINIT is also called for each exiting vnet as it exits.
  * or when the module is unloaded.
  */
 SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER,
 	    ipfw_destroy, NULL);
 VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER,
 	    vnet_ipfw_uninit, NULL);
 /* end of file */
Index: user/dchagin/lemul/sys/netpfil/ipfw/ip_fw_dynamic.c
===================================================================
--- user/dchagin/lemul/sys/netpfil/ipfw/ip_fw_dynamic.c	(revision 274240)
+++ user/dchagin/lemul/sys/netpfil/ipfw/ip_fw_dynamic.c	(revision 274241)
@@ -1,1605 +1,1605 @@
 /*-
  * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #define        DEB(x)
 #define        DDB(x) x
 
 /*
  * Dynamic rule support for ipfw
  */
 
 #include "opt_ipfw.h"
 #include "opt_inet.h"
 #ifndef INET
 #error IPFIREWALL requires INET.
 #endif /* INET */
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <net/ethernet.h> /* for ETHERTYPE_IP */
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>	/* ip_defttl */
 #include <netinet/ip_fw.h>
 #include <netinet/tcp_var.h>
 #include <netinet/udp.h>
 
 #include <netinet/ip6.h>	/* IN6_ARE_ADDR_EQUAL */
 #ifdef INET6
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #endif
 
 #include <netpfil/ipfw/ip_fw_private.h>
 
 #include <machine/in_cksum.h>	/* XXX for in_cksum */
 
 #ifdef MAC
 #include <security/mac/mac_framework.h>
 #endif
 
 /*
  * Description of dynamic rules.
  *
  * Dynamic rules are stored in lists accessed through a hash table
  * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can
  * be modified through the sysctl variable dyn_buckets which is
  * updated when the table becomes empty.
  *
  * XXX currently there is only one list, ipfw_dyn.
  *
  * When a packet is received, its address fields are first masked
  * with the mask defined for the rule, then hashed, then matched
  * against the entries in the corresponding list.
  * Dynamic rules can be used for different purposes:
  *  + stateful rules;
  *  + enforcing limits on the number of sessions;
  *  + in-kernel NAT (not implemented yet)
  *
  * The lifetime of dynamic rules is regulated by dyn_*_lifetime,
  * measured in seconds and depending on the flags.
  *
  * The total number of dynamic rules is equal to UMA zone items count.
  * The max number of dynamic rules is dyn_max. When we reach
  * the maximum number of rules we do not create anymore. This is
  * done to avoid consuming too much memory, but also too much
  * time when searching on each packet (ideally, we should try instead
  * to put a limit on the length of the list on each bucket...).
  *
  * Each dynamic rule holds a pointer to the parent ipfw rule so
  * we know what action to perform. Dynamic rules are removed when
  * the parent rule is deleted. This can be changed by dyn_keep_states
  * sysctl.
  *
  * There are some limitations with dynamic rules -- we do not
  * obey the 'randomized match', and we do not do multiple
  * passes through the firewall. XXX check the latter!!!
  */
 
 struct ipfw_dyn_bucket {
 	struct mtx	mtx;		/* Bucket protecting lock */
 	ipfw_dyn_rule	*head;		/* Pointer to first rule */
 };
 
 /*
  * Static variables followed by global ones
  */
 static VNET_DEFINE(struct ipfw_dyn_bucket *, ipfw_dyn_v);
 static VNET_DEFINE(u_int32_t, dyn_buckets_max);
 static VNET_DEFINE(u_int32_t, curr_dyn_buckets);
 static VNET_DEFINE(struct callout, ipfw_timeout);
 #define	V_ipfw_dyn_v			VNET(ipfw_dyn_v)
 #define	V_dyn_buckets_max		VNET(dyn_buckets_max)
 #define	V_curr_dyn_buckets		VNET(curr_dyn_buckets)
 #define V_ipfw_timeout                  VNET(ipfw_timeout)
 
 static VNET_DEFINE(uma_zone_t, ipfw_dyn_rule_zone);
 #define	V_ipfw_dyn_rule_zone		VNET(ipfw_dyn_rule_zone)
 
 #define	IPFW_BUCK_LOCK_INIT(b)	\
 	mtx_init(&(b)->mtx, "IPFW dynamic bucket", NULL, MTX_DEF)
 #define	IPFW_BUCK_LOCK_DESTROY(b)	\
 	mtx_destroy(&(b)->mtx)
 #define	IPFW_BUCK_LOCK(i)	mtx_lock(&V_ipfw_dyn_v[(i)].mtx)
 #define	IPFW_BUCK_UNLOCK(i)	mtx_unlock(&V_ipfw_dyn_v[(i)].mtx)
 #define	IPFW_BUCK_ASSERT(i)	mtx_assert(&V_ipfw_dyn_v[(i)].mtx, MA_OWNED)
 
 
 static VNET_DEFINE(int, dyn_keep_states);
 #define	V_dyn_keep_states		VNET(dyn_keep_states)
 
 /*
  * Timeouts for various events in handing dynamic rules.
  */
 static VNET_DEFINE(u_int32_t, dyn_ack_lifetime);
 static VNET_DEFINE(u_int32_t, dyn_syn_lifetime);
 static VNET_DEFINE(u_int32_t, dyn_fin_lifetime);
 static VNET_DEFINE(u_int32_t, dyn_rst_lifetime);
 static VNET_DEFINE(u_int32_t, dyn_udp_lifetime);
 static VNET_DEFINE(u_int32_t, dyn_short_lifetime);
 
 #define	V_dyn_ack_lifetime		VNET(dyn_ack_lifetime)
 #define	V_dyn_syn_lifetime		VNET(dyn_syn_lifetime)
 #define	V_dyn_fin_lifetime		VNET(dyn_fin_lifetime)
 #define	V_dyn_rst_lifetime		VNET(dyn_rst_lifetime)
 #define	V_dyn_udp_lifetime		VNET(dyn_udp_lifetime)
 #define	V_dyn_short_lifetime		VNET(dyn_short_lifetime)
 
 /*
  * Keepalives are sent if dyn_keepalive is set. They are sent every
  * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
  * seconds of lifetime of a rule.
  * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
  * than dyn_keepalive_period.
  */
 
 static VNET_DEFINE(u_int32_t, dyn_keepalive_interval);
 static VNET_DEFINE(u_int32_t, dyn_keepalive_period);
 static VNET_DEFINE(u_int32_t, dyn_keepalive);
 static VNET_DEFINE(time_t, dyn_keepalive_last);
 
 #define	V_dyn_keepalive_interval	VNET(dyn_keepalive_interval)
 #define	V_dyn_keepalive_period		VNET(dyn_keepalive_period)
 #define	V_dyn_keepalive			VNET(dyn_keepalive)
 #define	V_dyn_keepalive_last		VNET(dyn_keepalive_last)
 
 static VNET_DEFINE(u_int32_t, dyn_max);		/* max # of dynamic rules */
 
 #define	DYN_COUNT			uma_zone_get_cur(V_ipfw_dyn_rule_zone)
 #define	V_dyn_max			VNET(dyn_max)
 
 /* for userspace, we emulate the uma_zone_counter with ipfw_dyn_count */
 static int ipfw_dyn_count;	/* number of objects */
 
 #ifdef USERSPACE /* emulation of UMA object counters for userspace */
 #define uma_zone_get_cur(x)	ipfw_dyn_count
 #endif /* USERSPACE */
 
 static int last_log;	/* Log ratelimiting */
 
 static void ipfw_dyn_tick(void *vnetx);
 static void check_dyn_rules(struct ip_fw_chain *, ipfw_range_tlv *, int, int);
 #ifdef SYSCTL_NODE
 
 static int sysctl_ipfw_dyn_count(SYSCTL_HANDLER_ARGS);
 static int sysctl_ipfw_dyn_max(SYSCTL_HANDLER_ARGS);
 
 SYSBEGIN(f2)
 
 SYSCTL_DECL(_net_inet_ip_fw);
-SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_buckets,
-    CTLFLAG_RW, &VNET_NAME(dyn_buckets_max), 0,
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_buckets,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_buckets_max), 0,
     "Max number of dyn. buckets");
-SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets,
-    CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0,
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets,
+    CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0,
     "Current Number of dyn. buckets");
-SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count,
-    CTLTYPE_UINT|CTLFLAG_RD, 0, 0, sysctl_ipfw_dyn_count, "IU",
+SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count,
+    CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RD, 0, 0, sysctl_ipfw_dyn_count, "IU",
     "Number of dyn. rules");
-SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
-    CTLTYPE_UINT|CTLFLAG_RW, 0, 0, sysctl_ipfw_dyn_max, "IU",
+SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
+    CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, 0, 0, sysctl_ipfw_dyn_max, "IU",
     "Max number of dyn. rules");
-SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime,
-    CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0,
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0,
     "Lifetime of dyn. rules for acks");
-SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime,
-    CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0,
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0,
     "Lifetime of dyn. rules for syn");
-SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime,
-    CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0,
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0,
     "Lifetime of dyn. rules for fin");
-SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime,
-    CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0,
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0,
     "Lifetime of dyn. rules for rst");
-SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime,
-    CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0,
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0,
     "Lifetime of dyn. rules for UDP");
-SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime,
-    CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0,
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0,
     "Lifetime of dyn. rules for other situations");
-SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive,
-    CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0,
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0,
     "Enable keepalives for dyn. rules");
-SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keep_states,
-    CTLFLAG_RW, &VNET_NAME(dyn_keep_states), 0,
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keep_states,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keep_states), 0,
     "Do not flush dynamic states on rule deletion");
 
 SYSEND
 
 #endif /* SYSCTL_NODE */
 
 
 #ifdef INET6
 static __inline int
 hash_packet6(struct ipfw_flow_id *id)
 {
 	u_int32_t i;
 	i = (id->dst_ip6.__u6_addr.__u6_addr32[2]) ^
 	    (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^
 	    (id->src_ip6.__u6_addr.__u6_addr32[2]) ^
 	    (id->src_ip6.__u6_addr.__u6_addr32[3]) ^
 	    (id->dst_port) ^ (id->src_port);
 	return i;
 }
 #endif
 
 /*
  * IMPORTANT: the hash function for dynamic rules must be commutative
  * in source and destination (ip,port), because rules are bidirectional
  * and we want to find both in the same bucket.
  */
 static __inline int
 hash_packet(struct ipfw_flow_id *id, int buckets)
 {
 	u_int32_t i;
 
 #ifdef INET6
 	if (IS_IP6_FLOW_ID(id)) 
 		i = hash_packet6(id);
 	else
 #endif /* INET6 */
 	i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port);
 	i &= (buckets - 1);
 	return i;
 }
 
 /**
  * Print customizable flow id description via log(9) facility.
  */
 static void
 print_dyn_rule_flags(struct ipfw_flow_id *id, int dyn_type, int log_flags,
     char *prefix, char *postfix)
 {
 	struct in_addr da;
 #ifdef INET6
 	char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
 #else
 	char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
 #endif
 
 #ifdef INET6
 	if (IS_IP6_FLOW_ID(id)) {
 		ip6_sprintf(src, &id->src_ip6);
 		ip6_sprintf(dst, &id->dst_ip6);
 	} else
 #endif
 	{
 		da.s_addr = htonl(id->src_ip);
 		inet_ntop(AF_INET, &da, src, sizeof(src));
 		da.s_addr = htonl(id->dst_ip);
 		inet_ntop(AF_INET, &da, dst, sizeof(dst));
 	}
 	log(log_flags, "ipfw: %s type %d %s %d -> %s %d, %d %s\n",
 	    prefix, dyn_type, src, id->src_port, dst,
 	    id->dst_port, DYN_COUNT, postfix);
 }
 
 #define	print_dyn_rule(id, dtype, prefix, postfix)	\
 	print_dyn_rule_flags(id, dtype, LOG_DEBUG, prefix, postfix)
 
 #define TIME_LEQ(a,b)       ((int)((a)-(b)) <= 0)
 #define TIME_LE(a,b)       ((int)((a)-(b)) < 0)
 
 /*
  * Lookup a dynamic rule, locked version.
  */
 static ipfw_dyn_rule *
 lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int i, int *match_direction,
     struct tcphdr *tcp)
 {
 	/*
 	 * Stateful ipfw extensions.
 	 * Lookup into dynamic session queue.
 	 */
 #define MATCH_REVERSE	0
 #define MATCH_FORWARD	1
 #define MATCH_NONE	2
 #define MATCH_UNKNOWN	3
 	int dir = MATCH_NONE;
 	ipfw_dyn_rule *prev, *q = NULL;
 
 	IPFW_BUCK_ASSERT(i);
 
 	for (prev = NULL, q = V_ipfw_dyn_v[i].head; q; prev = q, q = q->next) {
 		if (q->dyn_type == O_LIMIT_PARENT && q->count)
 			continue;
 
 		if (pkt->proto != q->id.proto || q->dyn_type == O_LIMIT_PARENT)
 			continue;
 
 		if (IS_IP6_FLOW_ID(pkt)) {
 			if (IN6_ARE_ADDR_EQUAL(&pkt->src_ip6, &q->id.src_ip6) &&
 			    IN6_ARE_ADDR_EQUAL(&pkt->dst_ip6, &q->id.dst_ip6) &&
 			    pkt->src_port == q->id.src_port &&
 			    pkt->dst_port == q->id.dst_port) {
 				dir = MATCH_FORWARD;
 				break;
 			}
 			if (IN6_ARE_ADDR_EQUAL(&pkt->src_ip6, &q->id.dst_ip6) &&
 			    IN6_ARE_ADDR_EQUAL(&pkt->dst_ip6, &q->id.src_ip6) &&
 			    pkt->src_port == q->id.dst_port &&
 			    pkt->dst_port == q->id.src_port) {
 				dir = MATCH_REVERSE;
 				break;
 			}
 		} else {
 			if (pkt->src_ip == q->id.src_ip &&
 			    pkt->dst_ip == q->id.dst_ip &&
 			    pkt->src_port == q->id.src_port &&
 			    pkt->dst_port == q->id.dst_port) {
 				dir = MATCH_FORWARD;
 				break;
 			}
 			if (pkt->src_ip == q->id.dst_ip &&
 			    pkt->dst_ip == q->id.src_ip &&
 			    pkt->src_port == q->id.dst_port &&
 			    pkt->dst_port == q->id.src_port) {
 				dir = MATCH_REVERSE;
 				break;
 			}
 		}
 	}
 	if (q == NULL)
 		goto done;	/* q = NULL, not found */
 
 	if (prev != NULL) {	/* found and not in front */
 		prev->next = q->next;
 		q->next = V_ipfw_dyn_v[i].head;
 		V_ipfw_dyn_v[i].head = q;
 	}
 	if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
 		uint32_t ack;
 		u_char flags = pkt->_flags & (TH_FIN | TH_SYN | TH_RST);
 
 #define BOTH_SYN	(TH_SYN | (TH_SYN << 8))
 #define BOTH_FIN	(TH_FIN | (TH_FIN << 8))
 #define	TCP_FLAGS	(TH_FLAGS | (TH_FLAGS << 8))
 #define	ACK_FWD		0x10000			/* fwd ack seen */
 #define	ACK_REV		0x20000			/* rev ack seen */
 
 		q->state |= (dir == MATCH_FORWARD) ? flags : (flags << 8);
 		switch (q->state & TCP_FLAGS) {
 		case TH_SYN:			/* opening */
 			q->expire = time_uptime + V_dyn_syn_lifetime;
 			break;
 
 		case BOTH_SYN:			/* move to established */
 		case BOTH_SYN | TH_FIN:		/* one side tries to close */
 		case BOTH_SYN | (TH_FIN << 8):
 #define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0)
 			if (tcp == NULL)
 				break;
 
 			ack = ntohl(tcp->th_ack);
 			if (dir == MATCH_FORWARD) {
 				if (q->ack_fwd == 0 ||
 				    _SEQ_GE(ack, q->ack_fwd)) {
 					q->ack_fwd = ack;
 					q->state |= ACK_FWD;
 				}
 			} else {
 				if (q->ack_rev == 0 ||
 				    _SEQ_GE(ack, q->ack_rev)) {
 					q->ack_rev = ack;
 					q->state |= ACK_REV;
 				}
 			}
 			if ((q->state & (ACK_FWD | ACK_REV)) ==
 			    (ACK_FWD | ACK_REV)) {
 				q->expire = time_uptime + V_dyn_ack_lifetime;
 				q->state &= ~(ACK_FWD | ACK_REV);
 			}
 			break;
 
 		case BOTH_SYN | BOTH_FIN:	/* both sides closed */
 			if (V_dyn_fin_lifetime >= V_dyn_keepalive_period)
 				V_dyn_fin_lifetime = V_dyn_keepalive_period - 1;
 			q->expire = time_uptime + V_dyn_fin_lifetime;
 			break;
 
 		default:
 #if 0
 			/*
 			 * reset or some invalid combination, but can also
 			 * occur if we use keep-state the wrong way.
 			 */
 			if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0)
 				printf("invalid state: 0x%x\n", q->state);
 #endif
 			if (V_dyn_rst_lifetime >= V_dyn_keepalive_period)
 				V_dyn_rst_lifetime = V_dyn_keepalive_period - 1;
 			q->expire = time_uptime + V_dyn_rst_lifetime;
 			break;
 		}
 	} else if (pkt->proto == IPPROTO_UDP) {
 		q->expire = time_uptime + V_dyn_udp_lifetime;
 	} else {
 		/* other protocols */
 		q->expire = time_uptime + V_dyn_short_lifetime;
 	}
 done:
 	if (match_direction != NULL)
 		*match_direction = dir;
 	return (q);
 }
 
 ipfw_dyn_rule *
 ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction,
     struct tcphdr *tcp)
 {
 	ipfw_dyn_rule *q;
 	int i;
 
 	i = hash_packet(pkt, V_curr_dyn_buckets);
 
 	IPFW_BUCK_LOCK(i);
 	q = lookup_dyn_rule_locked(pkt, i, match_direction, tcp);
 	if (q == NULL)
 		IPFW_BUCK_UNLOCK(i);
 	/* NB: return table locked when q is not NULL */
 	return q;
 }
 
 /*
  * Unlock bucket mtx
  * @p - pointer to dynamic rule
  */
 void
 ipfw_dyn_unlock(ipfw_dyn_rule *q)
 {
 
 	IPFW_BUCK_UNLOCK(q->bucket);
 }
 
 static int
 resize_dynamic_table(struct ip_fw_chain *chain, int nbuckets)
 {
 	int i, k, nbuckets_old;
 	ipfw_dyn_rule *q;
 	struct ipfw_dyn_bucket *dyn_v, *dyn_v_old;
 
 	/* Check if given number is power of 2 and less than 64k */
 	if ((nbuckets > 65536) || (!powerof2(nbuckets)))
 		return 1;
 
 	CTR3(KTR_NET, "%s: resize dynamic hash: %d -> %d", __func__,
 	    V_curr_dyn_buckets, nbuckets);
 
 	/* Allocate and initialize new hash */
 	dyn_v = malloc(nbuckets * sizeof(ipfw_dyn_rule), M_IPFW,
 	    M_WAITOK | M_ZERO);
 
 	for (i = 0 ; i < nbuckets; i++)
 		IPFW_BUCK_LOCK_INIT(&dyn_v[i]);
 
 	/*
 	 * Call upper half lock, as get_map() do to ease
 	 * read-only access to dynamic rules hash from sysctl
 	 */
 	IPFW_UH_WLOCK(chain);
 
 	/*
 	 * Acquire chain write lock to permit hash access
 	 * for main traffic path without additional locks
 	 */
 	IPFW_WLOCK(chain);
 
 	/* Save old values */
 	nbuckets_old = V_curr_dyn_buckets;
 	dyn_v_old = V_ipfw_dyn_v;
 
 	/* Skip relinking if array is not set up */
 	if (V_ipfw_dyn_v == NULL)
 		V_curr_dyn_buckets = 0;
 
 	/* Re-link all dynamic states */
 	for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
 		while (V_ipfw_dyn_v[i].head != NULL) {
 			/* Remove from current chain */
 			q = V_ipfw_dyn_v[i].head;
 			V_ipfw_dyn_v[i].head = q->next;
 
 			/* Get new hash value */
 			k = hash_packet(&q->id, nbuckets);
 			q->bucket = k;
 			/* Add to the new head */
 			q->next = dyn_v[k].head;
 			dyn_v[k].head = q;
              }
 	}
 
 	/* Update current pointers/buckets values */
 	V_curr_dyn_buckets = nbuckets;
 	V_ipfw_dyn_v = dyn_v;
 
 	IPFW_WUNLOCK(chain);
 
 	IPFW_UH_WUNLOCK(chain);
 
 	/* Start periodic callout on initial creation */
 	if (dyn_v_old == NULL) {
         	callout_reset_on(&V_ipfw_timeout, hz, ipfw_dyn_tick, curvnet, 0);
 		return (0);
 	}
 
 	/* Destroy all mutexes */
 	for (i = 0 ; i < nbuckets_old ; i++)
 		IPFW_BUCK_LOCK_DESTROY(&dyn_v_old[i]);
 
 	/* Free old hash */
 	free(dyn_v_old, M_IPFW);
 
 	return 0;
 }
 
 /**
  * Install state of type 'type' for a dynamic session.
  * The hash table contains two type of rules:
  * - regular rules (O_KEEP_STATE)
  * - rules for sessions with limited number of sess per user
  *   (O_LIMIT). When they are created, the parent is
  *   increased by 1, and decreased on delete. In this case,
  *   the third parameter is the parent rule and not the chain.
  * - "parent" rules for the above (O_LIMIT_PARENT).
  */
 static ipfw_dyn_rule *
 add_dyn_rule(struct ipfw_flow_id *id, int i, u_int8_t dyn_type, struct ip_fw *rule)
 {
 	ipfw_dyn_rule *r;
 
 	IPFW_BUCK_ASSERT(i);
 
 	r = uma_zalloc(V_ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO);
 	if (r == NULL) {
 		if (last_log != time_uptime) {
 			last_log = time_uptime;
 			log(LOG_DEBUG,
 			    "ipfw: Cannot allocate dynamic state, "
 			    "consider increasing net.inet.ip.fw.dyn_max\n");
 		}
 		return NULL;
 	}
 	ipfw_dyn_count++;
 
 	/*
 	 * refcount on parent is already incremented, so
 	 * it is safe to use parent unlocked.
 	 */
 	if (dyn_type == O_LIMIT) {
 		ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule;
 		if ( parent->dyn_type != O_LIMIT_PARENT)
 			panic("invalid parent");
 		r->parent = parent;
 		rule = parent->rule;
 	}
 
 	r->id = *id;
 	r->expire = time_uptime + V_dyn_syn_lifetime;
 	r->rule = rule;
 	r->dyn_type = dyn_type;
 	IPFW_ZERO_DYN_COUNTER(r);
 	r->count = 0;
 
 	r->bucket = i;
 	r->next = V_ipfw_dyn_v[i].head;
 	V_ipfw_dyn_v[i].head = r;
 	DEB(print_dyn_rule(id, dyn_type, "add dyn entry", "total");)
 	return r;
 }
 
 /**
  * lookup dynamic parent rule using pkt and rule as search keys.
  * If the lookup fails, then install one.
  */
 static ipfw_dyn_rule *
 lookup_dyn_parent(struct ipfw_flow_id *pkt, int *pindex, struct ip_fw *rule)
 {
 	ipfw_dyn_rule *q;
 	int i, is_v6;
 
 	is_v6 = IS_IP6_FLOW_ID(pkt);
 	i = hash_packet( pkt, V_curr_dyn_buckets );
 	*pindex = i;
 	IPFW_BUCK_LOCK(i);
 	for (q = V_ipfw_dyn_v[i].head ; q != NULL ; q=q->next)
 		if (q->dyn_type == O_LIMIT_PARENT &&
 		    rule== q->rule &&
 		    pkt->proto == q->id.proto &&
 		    pkt->src_port == q->id.src_port &&
 		    pkt->dst_port == q->id.dst_port &&
 		    (
 			(is_v6 &&
 			 IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
 				&(q->id.src_ip6)) &&
 			 IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
 				&(q->id.dst_ip6))) ||
 			(!is_v6 &&
 			 pkt->src_ip == q->id.src_ip &&
 			 pkt->dst_ip == q->id.dst_ip)
 		    )
 		) {
 			q->expire = time_uptime + V_dyn_short_lifetime;
 			DEB(print_dyn_rule(pkt, q->dyn_type,
 			    "lookup_dyn_parent found", "");)
 			return q;
 		}
 
 	/* Add virtual limiting rule */
 	return add_dyn_rule(pkt, i, O_LIMIT_PARENT, rule);
 }
 
 /**
  * Install dynamic state for rule type cmd->o.opcode
  *
  * Returns 1 (failure) if state is not installed because of errors or because
  * session limitations are enforced.
  */
 int
 ipfw_install_state(struct ip_fw_chain *chain, struct ip_fw *rule,
     ipfw_insn_limit *cmd, struct ip_fw_args *args, uint32_t tablearg)
 {
 	ipfw_dyn_rule *q;
 	int i;
 
 	DEB(print_dyn_rule(&args->f_id, cmd->o.opcode, "install_state", "");)
 	
 	i = hash_packet(&args->f_id, V_curr_dyn_buckets);
 
 	IPFW_BUCK_LOCK(i);
 
 	q = lookup_dyn_rule_locked(&args->f_id, i, NULL, NULL);
 
 	if (q != NULL) {	/* should never occur */
 		DEB(
 		if (last_log != time_uptime) {
 			last_log = time_uptime;
 			printf("ipfw: %s: entry already present, done\n",
 			    __func__);
 		})
 		IPFW_BUCK_UNLOCK(i);
 		return (0);
 	}
 
 	/*
 	 * State limiting is done via uma(9) zone limiting.
 	 * Save pointer to newly-installed rule and reject
 	 * packet if add_dyn_rule() returned NULL.
 	 * Note q is currently set to NULL.
 	 */
 
 	switch (cmd->o.opcode) {
 	case O_KEEP_STATE:	/* bidir rule */
 		q = add_dyn_rule(&args->f_id, i, O_KEEP_STATE, rule);
 		break;
 
 	case O_LIMIT: {		/* limit number of sessions */
 		struct ipfw_flow_id id;
 		ipfw_dyn_rule *parent;
 		uint32_t conn_limit;
 		uint16_t limit_mask = cmd->limit_mask;
 		int pindex;
 
 		conn_limit = IP_FW_ARG_TABLEARG(chain, cmd->conn_limit, limit);
 		  
 		DEB(
 		if (cmd->conn_limit == IP_FW_TARG)
 			printf("ipfw: %s: O_LIMIT rule, conn_limit: %u "
 			    "(tablearg)\n", __func__, conn_limit);
 		else
 			printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n",
 			    __func__, conn_limit);
 		)
 
 		id.dst_ip = id.src_ip = id.dst_port = id.src_port = 0;
 		id.proto = args->f_id.proto;
 		id.addr_type = args->f_id.addr_type;
 		id.fib = M_GETFIB(args->m);
 
 		if (IS_IP6_FLOW_ID (&(args->f_id))) {
 			if (limit_mask & DYN_SRC_ADDR)
 				id.src_ip6 = args->f_id.src_ip6;
 			if (limit_mask & DYN_DST_ADDR)
 				id.dst_ip6 = args->f_id.dst_ip6;
 		} else {
 			if (limit_mask & DYN_SRC_ADDR)
 				id.src_ip = args->f_id.src_ip;
 			if (limit_mask & DYN_DST_ADDR)
 				id.dst_ip = args->f_id.dst_ip;
 		}
 		if (limit_mask & DYN_SRC_PORT)
 			id.src_port = args->f_id.src_port;
 		if (limit_mask & DYN_DST_PORT)
 			id.dst_port = args->f_id.dst_port;
 
 		/*
 		 * We have to release lock for previous bucket to
 		 * avoid possible deadlock
 		 */
 		IPFW_BUCK_UNLOCK(i);
 
 		if ((parent = lookup_dyn_parent(&id, &pindex, rule)) == NULL) {
 			printf("ipfw: %s: add parent failed\n", __func__);
 			IPFW_BUCK_UNLOCK(pindex);
 			return (1);
 		}
 
 		if (parent->count >= conn_limit) {
 			if (V_fw_verbose && last_log != time_uptime) {
 				last_log = time_uptime;
 				char sbuf[24];
 				last_log = time_uptime;
 				snprintf(sbuf, sizeof(sbuf),
 				    "%d drop session",
 				    parent->rule->rulenum);
 				print_dyn_rule_flags(&args->f_id,
 				    cmd->o.opcode,
 				    LOG_SECURITY | LOG_DEBUG,
 				    sbuf, "too many entries");
 			}
 			IPFW_BUCK_UNLOCK(pindex);
 			return (1);
 		}
 		/* Increment counter on parent */
 		parent->count++;
 		IPFW_BUCK_UNLOCK(pindex);
 
 		IPFW_BUCK_LOCK(i);
 		q = add_dyn_rule(&args->f_id, i, O_LIMIT, (struct ip_fw *)parent);
 		if (q == NULL) {
 			/* Decrement index and notify caller */
 			IPFW_BUCK_UNLOCK(i);
 			IPFW_BUCK_LOCK(pindex);
 			parent->count--;
 			IPFW_BUCK_UNLOCK(pindex);
 			return (1);
 		}
 		break;
 	}
 	default:
 		printf("ipfw: %s: unknown dynamic rule type %u\n",
 		    __func__, cmd->o.opcode);
 	}
 
 	if (q == NULL) {
 		IPFW_BUCK_UNLOCK(i);
 		return (1);	/* Notify caller about failure */
 	}
 
 	/* XXX just set lifetime */
 	lookup_dyn_rule_locked(&args->f_id, i, NULL, NULL);
 
 	IPFW_BUCK_UNLOCK(i);
 	return (0);
 }
 
 /*
  * Generate a TCP packet, containing either a RST or a keepalive.
  * When flags & TH_RST, we are sending a RST packet, because of a
  * "reset" action matched the packet.
  * Otherwise we are sending a keepalive, and flags & TH_
  * The 'replyto' mbuf is the mbuf being replied to, if any, and is required
  * so that MAC can label the reply appropriately.
  */
 struct mbuf *
 ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq,
     u_int32_t ack, int flags)
 {
 	struct mbuf *m = NULL;		/* stupid compiler */
 	int len, dir;
 	struct ip *h = NULL;		/* stupid compiler */
 #ifdef INET6
 	struct ip6_hdr *h6 = NULL;
 #endif
 	struct tcphdr *th = NULL;
 
 	MGETHDR(m, M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (NULL);
 
 	M_SETFIB(m, id->fib);
 #ifdef MAC
 	if (replyto != NULL)
 		mac_netinet_firewall_reply(replyto, m);
 	else
 		mac_netinet_firewall_send(m);
 #else
 	(void)replyto;		/* don't warn about unused arg */
 #endif
 
 	switch (id->addr_type) {
 	case 4:
 		len = sizeof(struct ip) + sizeof(struct tcphdr);
 		break;
 #ifdef INET6
 	case 6:
 		len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 		break;
 #endif
 	default:
 		/* XXX: log me?!? */
 		FREE_PKT(m);
 		return (NULL);
 	}
 	dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN);
 
 	m->m_data += max_linkhdr;
 	m->m_flags |= M_SKIP_FIREWALL;
 	m->m_pkthdr.len = m->m_len = len;
 	m->m_pkthdr.rcvif = NULL;
 	bzero(m->m_data, len);
 
 	switch (id->addr_type) {
 	case 4:
 		h = mtod(m, struct ip *);
 
 		/* prepare for checksum */
 		h->ip_p = IPPROTO_TCP;
 		h->ip_len = htons(sizeof(struct tcphdr));
 		if (dir) {
 			h->ip_src.s_addr = htonl(id->src_ip);
 			h->ip_dst.s_addr = htonl(id->dst_ip);
 		} else {
 			h->ip_src.s_addr = htonl(id->dst_ip);
 			h->ip_dst.s_addr = htonl(id->src_ip);
 		}
 
 		th = (struct tcphdr *)(h + 1);
 		break;
 #ifdef INET6
 	case 6:
 		h6 = mtod(m, struct ip6_hdr *);
 
 		/* prepare for checksum */
 		h6->ip6_nxt = IPPROTO_TCP;
 		h6->ip6_plen = htons(sizeof(struct tcphdr));
 		if (dir) {
 			h6->ip6_src = id->src_ip6;
 			h6->ip6_dst = id->dst_ip6;
 		} else {
 			h6->ip6_src = id->dst_ip6;
 			h6->ip6_dst = id->src_ip6;
 		}
 
 		th = (struct tcphdr *)(h6 + 1);
 		break;
 #endif
 	}
 
 	if (dir) {
 		th->th_sport = htons(id->src_port);
 		th->th_dport = htons(id->dst_port);
 	} else {
 		th->th_sport = htons(id->dst_port);
 		th->th_dport = htons(id->src_port);
 	}
 	th->th_off = sizeof(struct tcphdr) >> 2;
 
 	if (flags & TH_RST) {
 		if (flags & TH_ACK) {
 			th->th_seq = htonl(ack);
 			th->th_flags = TH_RST;
 		} else {
 			if (flags & TH_SYN)
 				seq++;
 			th->th_ack = htonl(seq);
 			th->th_flags = TH_RST | TH_ACK;
 		}
 	} else {
 		/*
 		 * Keepalive - use caller provided sequence numbers
 		 */
 		th->th_seq = htonl(seq);
 		th->th_ack = htonl(ack);
 		th->th_flags = TH_ACK;
 	}
 
 	switch (id->addr_type) {
 	case 4:
 		th->th_sum = in_cksum(m, len);
 
 		/* finish the ip header */
 		h->ip_v = 4;
 		h->ip_hl = sizeof(*h) >> 2;
 		h->ip_tos = IPTOS_LOWDELAY;
 		h->ip_off = htons(0);
 		h->ip_len = htons(len);
 		h->ip_ttl = V_ip_defttl;
 		h->ip_sum = 0;
 		break;
 #ifdef INET6
 	case 6:
 		th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6),
 		    sizeof(struct tcphdr));
 
 		/* finish the ip6 header */
 		h6->ip6_vfc |= IPV6_VERSION;
 		h6->ip6_hlim = IPV6_DEFHLIM;
 		break;
 #endif
 	}
 
 	return (m);
 }
 
 /*
  * Queue keepalive packets for given dynamic rule
  */
 static struct mbuf **
 ipfw_dyn_send_ka(struct mbuf **mtailp, ipfw_dyn_rule *q)
 {
 	struct mbuf *m_rev, *m_fwd;
 
 	m_rev = (q->state & ACK_REV) ? NULL :
 	    ipfw_send_pkt(NULL, &(q->id), q->ack_rev - 1, q->ack_fwd, TH_SYN);
 	m_fwd = (q->state & ACK_FWD) ? NULL :
 	    ipfw_send_pkt(NULL, &(q->id), q->ack_fwd - 1, q->ack_rev, 0);
 
 	if (m_rev != NULL) {
 		*mtailp = m_rev;
 		mtailp = &(*mtailp)->m_nextpkt;
 	}
 	if (m_fwd != NULL) {
 		*mtailp = m_fwd;
 		mtailp = &(*mtailp)->m_nextpkt;
 	}
 
 	return (mtailp);
 }
 
 /*
  * This procedure is used to perform various maintance
  * on dynamic hash list. Currently it is called every second.
  */
 static void
 ipfw_dyn_tick(void * vnetx) 
 {
 	struct ip_fw_chain *chain;
 	int check_ka = 0;
 #ifdef VIMAGE
 	struct vnet *vp = vnetx;
 #endif
 
 	CURVNET_SET(vp);
 
 	chain = &V_layer3_chain;
 
 	/* Run keepalive checks every keepalive_period iff ka is enabled */
 	if ((V_dyn_keepalive_last + V_dyn_keepalive_period <= time_uptime) &&
 	    (V_dyn_keepalive != 0)) {
 		V_dyn_keepalive_last = time_uptime;
 		check_ka = 1;
 	}
 
 	check_dyn_rules(chain, NULL, check_ka, 1);
 
 	callout_reset_on(&V_ipfw_timeout, hz, ipfw_dyn_tick, vnetx, 0);
 
 	CURVNET_RESTORE();
 }
 
 
 /*
  * Walk thru all dynamic states doing generic maintance:
  * 1) free expired states
  * 2) free all states based on deleted rule / set
  * 3) send keepalives for states if needed
  *
  * @chain - pointer to current ipfw rules chain
  * @rule - delete all states originated by given rule if != NULL
  * @set - delete all states originated by any rule in set @set if != RESVD_SET
  * @check_ka - perform checking/sending keepalives
  * @timer - indicate call from timer routine.
  *
  * Timer routine must call this function unlocked to permit
  * sending keepalives/resizing table.
  *
  * Others has to call function with IPFW_UH_WLOCK held.
  * Additionally, function assume that dynamic rule/set is
  * ALREADY deleted so no new states can be generated by
  * 'deleted' rules.
  *
  * Write lock is needed to ensure that unused parent rules
  * are not freed by other instance (see stage 2, 3)
  */
 static void
 check_dyn_rules(struct ip_fw_chain *chain, ipfw_range_tlv *rt,
     int check_ka, int timer)
 {
 	struct mbuf *m0, *m, *mnext, **mtailp;
 	struct ip *h;
 	int i, dyn_count, new_buckets = 0, max_buckets;
 	int expired = 0, expired_limits = 0, parents = 0, total = 0;
 	ipfw_dyn_rule *q, *q_prev, *q_next;
 	ipfw_dyn_rule *exp_head, **exptailp;
 	ipfw_dyn_rule *exp_lhead, **expltailp;
 
 	KASSERT(V_ipfw_dyn_v != NULL, ("%s: dynamic table not allocated",
 	    __func__));
 
 	/* Avoid possible LOR */
 	KASSERT(!check_ka || timer, ("%s: keepalive check with lock held",
 	    __func__));
 
 	/*
 	 * Do not perform any checks if we currently have no dynamic states
 	 */
 	if (DYN_COUNT == 0)
 		return;
 
 	/* Expired states */
 	exp_head = NULL;
 	exptailp = &exp_head;
 
 	/* Expired limit states */
 	exp_lhead = NULL;
 	expltailp = &exp_lhead;
 
 	/*
 	 * We make a chain of packets to go out here -- not deferring
 	 * until after we drop the IPFW dynamic rule lock would result
 	 * in a lock order reversal with the normal packet input -> ipfw
 	 * call stack.
 	 */
 	m0 = NULL;
 	mtailp = &m0;
 
 	/* Protect from hash resizing */
 	if (timer != 0)
 		IPFW_UH_WLOCK(chain);
 	else
 		IPFW_UH_WLOCK_ASSERT(chain);
 
 #define	NEXT_RULE()	{ q_prev = q; q = q->next ; continue; }
 
 	/* Stage 1: perform requested deletion */
 	for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
 		IPFW_BUCK_LOCK(i);
 		for (q = V_ipfw_dyn_v[i].head, q_prev = q; q ; ) {
 			/* account every rule */
 			total++;
 
 			/* Skip parent rules at all */
 			if (q->dyn_type == O_LIMIT_PARENT) {
 				parents++;
 				NEXT_RULE();
 			}
 
 			/*
 			 * Remove rules which are:
 			 * 1) expired
 			 * 2) matches deletion range
 			 */
 			if ((TIME_LEQ(q->expire, time_uptime)) ||
 			    (rt != NULL && ipfw_match_range(q->rule, rt))) {
 				if (TIME_LE(time_uptime, q->expire) &&
 				    q->dyn_type == O_KEEP_STATE &&
 				    V_dyn_keep_states != 0) {
 					/*
 					 * Do not delete state if
 					 * it is not expired and
 					 * dyn_keep_states is ON.
 					 * However we need to re-link it
 					 * to any other stable rule
 					 */
 					q->rule = chain->default_rule;
 					NEXT_RULE();
 				}
 
 				/* Unlink q from current list */
 				q_next = q->next;
 				if (q == V_ipfw_dyn_v[i].head)
 					V_ipfw_dyn_v[i].head = q_next;
 				else
 					q_prev->next = q_next;
 
 				q->next = NULL;
 
 				/* queue q to expire list */
 				if (q->dyn_type != O_LIMIT) {
 					*exptailp = q;
 					exptailp = &(*exptailp)->next;
 					DEB(print_dyn_rule(&q->id, q->dyn_type,
 					    "unlink entry", "left");
 					)
 				} else {
 					/* Separate list for limit rules */
 					*expltailp = q;
 					expltailp = &(*expltailp)->next;
 					expired_limits++;
 					DEB(print_dyn_rule(&q->id, q->dyn_type,
 					    "unlink limit entry", "left");
 					)
 				}
 
 				q = q_next;
 				expired++;
 				continue;
 			}
 
 			/*
 			 * Check if we need to send keepalive:
 			 * we need to ensure if is time to do KA,
 			 * this is established TCP session, and
 			 * expire time is within keepalive interval
 			 */
 			if ((check_ka != 0) && (q->id.proto == IPPROTO_TCP) &&
 			    ((q->state & BOTH_SYN) == BOTH_SYN) &&
 			    (TIME_LEQ(q->expire, time_uptime +
 			      V_dyn_keepalive_interval)))
 				mtailp = ipfw_dyn_send_ka(mtailp, q);
 
 			NEXT_RULE();
 		}
 		IPFW_BUCK_UNLOCK(i);
 	}
 
 	/* Stage 2: decrement counters from O_LIMIT parents */
 	if (expired_limits != 0) {
 		/*
 		 * XXX: Note that deleting set with more than one
 		 * heavily-used LIMIT rules can result in overwhelming
 		 * locking due to lack of per-hash value sorting
 		 *
 		 * We should probably think about:
 		 * 1) pre-allocating hash of size, say,
 		 * MAX(16, V_curr_dyn_buckets / 1024)
 		 * 2) checking if expired_limits is large enough
 		 * 3) If yes, init hash (or its part), re-link
 		 * current list and start decrementing procedure in
 		 * each bucket separately
 		 */
 
 		/*
 		 * Small optimization: do not unlock bucket until
 		 * we see the next item resides in different bucket
 		 */
 		if (exp_lhead != NULL) {
 			i = exp_lhead->parent->bucket;
 			IPFW_BUCK_LOCK(i);
 		}
 		for (q = exp_lhead; q != NULL; q = q->next) {
 			if (i != q->parent->bucket) {
 				IPFW_BUCK_UNLOCK(i);
 				i = q->parent->bucket;
 				IPFW_BUCK_LOCK(i);
 			}
 
 			/* Decrease parent refcount */
 			q->parent->count--;
 		}
 		if (exp_lhead != NULL)
 			IPFW_BUCK_UNLOCK(i);
 	}
 
 	/*
 	 * We protectet ourselves from unused parent deletion
 	 * (from the timer function) by holding UH write lock.
 	 */
 
 	/* Stage 3: remove unused parent rules */
 	if ((parents != 0) && (expired != 0)) {
 		for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
 			IPFW_BUCK_LOCK(i);
 			for (q = V_ipfw_dyn_v[i].head, q_prev = q ; q ; ) {
 				if (q->dyn_type != O_LIMIT_PARENT)
 					NEXT_RULE();
 
 				if (q->count != 0)
 					NEXT_RULE();
 
 				/* Parent rule without consumers */
 
 				/* Unlink q from current list */
 				q_next = q->next;
 				if (q == V_ipfw_dyn_v[i].head)
 					V_ipfw_dyn_v[i].head = q_next;
 				else
 					q_prev->next = q_next;
 
 				q->next = NULL;
 
 				/* Add to expired list */
 				*exptailp = q;
 				exptailp = &(*exptailp)->next;
 
 				DEB(print_dyn_rule(&q->id, q->dyn_type,
 				    "unlink parent entry", "left");
 				)
 
 				expired++;
 
 				q = q_next;
 			}
 			IPFW_BUCK_UNLOCK(i);
 		}
 	}
 
 #undef NEXT_RULE
 
 	if (timer != 0) {
 		/*
 		 * Check if we need to resize hash:
 		 * if current number of states exceeds number of buckes in hash,
 		 * grow hash size to the minimum power of 2 which is bigger than
 		 * current states count. Limit hash size by 64k.
 		 */
 		max_buckets = (V_dyn_buckets_max > 65536) ?
 		    65536 : V_dyn_buckets_max;
 	
 		dyn_count = DYN_COUNT;
 	
 		if ((dyn_count > V_curr_dyn_buckets * 2) &&
 		    (dyn_count < max_buckets)) {
 			new_buckets = V_curr_dyn_buckets;
 			while (new_buckets < dyn_count) {
 				new_buckets *= 2;
 	
 				if (new_buckets >= max_buckets)
 					break;
 			}
 		}
 
 		IPFW_UH_WUNLOCK(chain);
 	}
 
 	/* Finally delete old states ad limits if any */
 	for (q = exp_head; q != NULL; q = q_next) {
 		q_next = q->next;
 		uma_zfree(V_ipfw_dyn_rule_zone, q);
 		ipfw_dyn_count--;
 	}
 
 	for (q = exp_lhead; q != NULL; q = q_next) {
 		q_next = q->next;
 		uma_zfree(V_ipfw_dyn_rule_zone, q);
 		ipfw_dyn_count--;
 	}
 
 	/*
 	 * The rest code MUST be called from timer routine only
 	 * without holding any locks
 	 */
 	if (timer == 0)
 		return;
 
 	/* Send keepalive packets if any */
 	for (m = m0; m != NULL; m = mnext) {
 		mnext = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		h = mtod(m, struct ip *);
 		if (h->ip_v == 4)
 			ip_output(m, NULL, NULL, 0, NULL, NULL);
 #ifdef INET6
 		else
 			ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
 #endif
 	}
 
 	/* Run table resize without holding any locks */
 	if (new_buckets != 0)
 		resize_dynamic_table(chain, new_buckets);
 }
 
 /*
  * Deletes all dynamic rules originated by given rule or all rules in
  * given set. Specify RESVD_SET to indicate set should not be used.
  * @chain - pointer to current ipfw rules chain
  * @rr - delete all states originated by rules in matched range.
  *
  * Function has to be called with IPFW_UH_WLOCK held.
  * Additionally, function assume that dynamic rule/set is
  * ALREADY deleted so no new states can be generated by
  * 'deleted' rules.
  */
 void
 ipfw_expire_dyn_rules(struct ip_fw_chain *chain, ipfw_range_tlv *rt)
 {
 
 	check_dyn_rules(chain, rt, 0, 0);
 }
 
 /*
  * Check if rule contains at least one dynamic opcode.
  *
  * Returns 1 if such opcode is found, 0 otherwise.
  */
 int
 ipfw_is_dyn_rule(struct ip_fw *rule)
 {
 	int cmdlen, l;
 	ipfw_insn *cmd;
 
 	l = rule->cmd_len;
 	cmd = rule->cmd;
 	cmdlen = 0;
 	for ( ;	l > 0 ; l -= cmdlen, cmd += cmdlen) {
 		cmdlen = F_LEN(cmd);
 
 		switch (cmd->opcode) {
 		case O_LIMIT:
 		case O_KEEP_STATE:
 		case O_PROBE_STATE:
 		case O_CHECK_STATE:
 			return (1);
 		}
 	}
 
 	return (0);
 }
 
 void
 ipfw_dyn_init(struct ip_fw_chain *chain)
 {
 
         V_ipfw_dyn_v = NULL;
         V_dyn_buckets_max = 256; /* must be power of 2 */
         V_curr_dyn_buckets = 256; /* must be power of 2 */
  
         V_dyn_ack_lifetime = 300;
         V_dyn_syn_lifetime = 20;
         V_dyn_fin_lifetime = 1;
         V_dyn_rst_lifetime = 1;
         V_dyn_udp_lifetime = 10;
         V_dyn_short_lifetime = 5;
 
         V_dyn_keepalive_interval = 20;
         V_dyn_keepalive_period = 5;
         V_dyn_keepalive = 1;    /* do send keepalives */
 	V_dyn_keepalive_last = time_uptime;
         
         V_dyn_max = 16384; /* max # of dynamic rules */
 
 	V_ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule",
 	    sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, 0);
 
 	/* Enforce limit on dynamic rules */
 	uma_zone_set_max(V_ipfw_dyn_rule_zone, V_dyn_max);
 
         callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE);
 
 	/*
 	 * This can potentially be done on first dynamic rule
 	 * being added to chain.
 	 */
 	resize_dynamic_table(chain, V_curr_dyn_buckets);
 }
 
 void
 ipfw_dyn_uninit(int pass)
 {
 	int i;
 
 	if (pass == 0) {
 		callout_drain(&V_ipfw_timeout);
 		return;
 	}
 
 	if (V_ipfw_dyn_v != NULL) {
 		/*
 		 * Skip deleting all dynamic states -
 		 * uma_zdestroy() does this more efficiently;
 		 */
 
 		/* Destroy all mutexes */
 		for (i = 0 ; i < V_curr_dyn_buckets ; i++)
 			IPFW_BUCK_LOCK_DESTROY(&V_ipfw_dyn_v[i]);
 		free(V_ipfw_dyn_v, M_IPFW);
 		V_ipfw_dyn_v = NULL;
 	}
 
         uma_zdestroy(V_ipfw_dyn_rule_zone);
 }
 
 #ifdef SYSCTL_NODE
 /*
  * Get/set maximum number of dynamic states in given VNET instance.
  */
 static int
 sysctl_ipfw_dyn_max(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	unsigned int nstates;
 
 	nstates = V_dyn_max;
 
 	error = sysctl_handle_int(oidp, &nstates, 0, req);
 	/* Read operation or some error */
 	if ((error != 0) || (req->newptr == NULL))
 		return (error);
 
 	V_dyn_max = nstates;
 	uma_zone_set_max(V_ipfw_dyn_rule_zone, V_dyn_max);
 
 	return (0);
 }
 
 /*
  * Get current number of dynamic states in given VNET instance.
  */
 static int
 sysctl_ipfw_dyn_count(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	unsigned int nstates;
 
 	nstates = DYN_COUNT;
 
 	error = sysctl_handle_int(oidp, &nstates, 0, req);
 
 	return (error);
 }
 #endif
 
 /*
  * Returns size of dynamic states in legacy format
  */
 int
 ipfw_dyn_len(void)
 {
 
 	return (V_ipfw_dyn_v == NULL) ? 0 :
 		(DYN_COUNT * sizeof(ipfw_dyn_rule));
 }
 
 /*
  * Returns number of dynamic states.
  * Used by dump format v1 (current).
  */
 int
 ipfw_dyn_get_count(void)
 {
 
 	return (V_ipfw_dyn_v == NULL) ? 0 : DYN_COUNT;
 }
 
 static void
 export_dyn_rule(ipfw_dyn_rule *src, ipfw_dyn_rule *dst)
 {
 
 	memcpy(dst, src, sizeof(*src));
 	memcpy(&(dst->rule), &(src->rule->rulenum), sizeof(src->rule->rulenum));
 	/*
 	 * store set number into high word of
 	 * dst->rule pointer.
 	 */
 	memcpy((char *)&dst->rule + sizeof(src->rule->rulenum),
 	    &(src->rule->set), sizeof(src->rule->set));
 	/*
 	 * store a non-null value in "next".
 	 * The userland code will interpret a
 	 * NULL here as a marker
 	 * for the last dynamic rule.
 	 */
 	memcpy(&dst->next, &dst, sizeof(dst));
 	dst->expire =
 	    TIME_LEQ(dst->expire, time_uptime) ?  0 : dst->expire - time_uptime;
 }
 
 /*
  * Fills int buffer given by @sd with dynamic states.
  * Used by dump format v1 (current).
  *
  * Returns 0 on success.
  */
 int
 ipfw_dump_states(struct ip_fw_chain *chain, struct sockopt_data *sd)
 {
 	ipfw_dyn_rule *p;
 	ipfw_obj_dyntlv *dst, *last;
 	ipfw_obj_ctlv *ctlv;
 	int i;
 	size_t sz;
 
 	if (V_ipfw_dyn_v == NULL)
 		return (0);
 
 	IPFW_UH_RLOCK_ASSERT(chain);
 
 	ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv));
 	if (ctlv == NULL)
 		return (ENOMEM);
 	sz = sizeof(ipfw_obj_dyntlv);
 	ctlv->head.type = IPFW_TLV_DYNSTATE_LIST;
 	ctlv->objsize = sz;
 	last = NULL;
 
 	for (i = 0 ; i < V_curr_dyn_buckets; i++) {
 		IPFW_BUCK_LOCK(i);
 		for (p = V_ipfw_dyn_v[i].head ; p != NULL; p = p->next) {
 			dst = (ipfw_obj_dyntlv *)ipfw_get_sopt_space(sd, sz);
 			if (dst == NULL) {
 				IPFW_BUCK_UNLOCK(i);
 				return (ENOMEM);
 			}
 
 			export_dyn_rule(p, &dst->state);
 			dst->head.length = sz;
 			dst->head.type = IPFW_TLV_DYN_ENT;
 			last = dst;
 		}
 		IPFW_BUCK_UNLOCK(i);
 	}
 
 	if (last != NULL) /* mark last dynamic rule */
 		last->head.flags = IPFW_DF_LAST;
 
 	return (0);
 }
 
 /*
  * Fill given buffer with dynamic states (legacy format).
  * IPFW_UH_RLOCK has to be held while calling.
  */
 void
 ipfw_get_dynamic(struct ip_fw_chain *chain, char **pbp, const char *ep)
 {
 	ipfw_dyn_rule *p, *last = NULL;
 	char *bp;
 	int i;
 
 	if (V_ipfw_dyn_v == NULL)
 		return;
 	bp = *pbp;
 
 	IPFW_UH_RLOCK_ASSERT(chain);
 
 	for (i = 0 ; i < V_curr_dyn_buckets; i++) {
 		IPFW_BUCK_LOCK(i);
 		for (p = V_ipfw_dyn_v[i].head ; p != NULL; p = p->next) {
 			if (bp + sizeof *p <= ep) {
 				ipfw_dyn_rule *dst =
 					(ipfw_dyn_rule *)bp;
 
 				export_dyn_rule(p, dst);
 				last = dst;
 				bp += sizeof(ipfw_dyn_rule);
 			}
 		}
 		IPFW_BUCK_UNLOCK(i);
 	}
 
 	if (last != NULL) /* mark last dynamic rule */
 		bzero(&last->next, sizeof(last));
 	*pbp = bp;
 }
 /* end of file */
Index: user/dchagin/lemul/sys/netpfil/ipfw/ip_fw_pfil.c
===================================================================
--- user/dchagin/lemul/sys/netpfil/ipfw/ip_fw_pfil.c	(revision 274240)
+++ user/dchagin/lemul/sys/netpfil/ipfw/ip_fw_pfil.c	(revision 274241)
@@ -1,574 +1,575 @@
 /*-
  * Copyright (c) 2004 Andre Oppermann, Internet Business Solutions AG
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ipfw.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #ifndef INET
 #error IPFIREWALL requires INET.
 #endif /* INET */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/route.h>
 #include <net/ethernet.h>
 #include <net/pfil.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_fw.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #endif
 
 #include <netgraph/ng_ipfw.h>
 
 #include <netpfil/ipfw/ip_fw_private.h>
 
 #include <machine/in_cksum.h>
 
 static VNET_DEFINE(int, fw_enable) = 1;
 #define V_fw_enable	VNET(fw_enable)
 
 #ifdef INET6
 static VNET_DEFINE(int, fw6_enable) = 1;
 #define V_fw6_enable	VNET(fw6_enable)
 #endif
 
 static VNET_DEFINE(int, fwlink_enable) = 0;
 #define V_fwlink_enable	VNET(fwlink_enable)
 
 int ipfw_chg_hook(SYSCTL_HANDLER_ARGS);
 
 /* Forward declarations. */
 static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int);
 int ipfw_check_packet(void *, struct mbuf **, struct ifnet *, int,
 	struct inpcb *);
 int ipfw_check_frame(void *, struct mbuf **, struct ifnet *, int,
 	struct inpcb *);
 
 #ifdef SYSCTL_NODE
 
 SYSBEGIN(f1)
 
 SYSCTL_DECL(_net_inet_ip_fw);
-SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0,
-    ipfw_chg_hook, "I", "Enable ipfw");
+SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable,
+    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3,
+    &VNET_NAME(fw_enable), 0, ipfw_chg_hook, "I", "Enable ipfw");
 #ifdef INET6
 SYSCTL_DECL(_net_inet6_ip6_fw);
-SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0,
-    ipfw_chg_hook, "I", "Enable ipfw+6");
+SYSCTL_PROC(_net_inet6_ip6_fw, OID_AUTO, enable,
+    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3,
+    &VNET_NAME(fw6_enable), 0, ipfw_chg_hook, "I", "Enable ipfw+6");
 #endif /* INET6 */
 
 SYSCTL_DECL(_net_link_ether);
-SYSCTL_VNET_PROC(_net_link_ether, OID_AUTO, ipfw,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fwlink_enable), 0,
-    ipfw_chg_hook, "I", "Pass ether pkts through firewall");
+SYSCTL_PROC(_net_link_ether, OID_AUTO, ipfw,
+    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3,
+    &VNET_NAME(fwlink_enable), 0, ipfw_chg_hook, "I",
+    "Pass ether pkts through firewall");
 
 SYSEND
 
 #endif /* SYSCTL_NODE */
 
 /*
  * The pfilter hook to pass packets to ipfw_chk and then to
  * dummynet, divert, netgraph or other modules.
  * The packet may be consumed.
  */
 int
 ipfw_check_packet(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
     struct inpcb *inp)
 {
 	struct ip_fw_args args;
 	struct m_tag *tag;
 	int ipfw;
 	int ret;
 
 	/* convert dir to IPFW values */
 	dir = (dir == PFIL_IN) ? DIR_IN : DIR_OUT;
 	bzero(&args, sizeof(args));
 
 again:
 	/*
 	 * extract and remove the tag if present. If we are left
 	 * with onepass, optimize the outgoing path.
 	 */
 	tag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL);
 	if (tag != NULL) {
 		args.rule = *((struct ipfw_rule_ref *)(tag+1));
 		m_tag_delete(*m0, tag);
 		if (args.rule.info & IPFW_ONEPASS)
 			return (0);
 	}
 
 	args.m = *m0;
 	args.oif = dir == DIR_OUT ? ifp : NULL;
 	args.inp = inp;
 
 	ipfw = ipfw_chk(&args);
 	*m0 = args.m;
 
 	KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL",
 	    __func__));
 
 	/* breaking out of the switch means drop */
 	ret = 0;	/* default return value for pass */
 	switch (ipfw) {
 	case IP_FW_PASS:
 		/* next_hop may be set by ipfw_chk */
 		if (args.next_hop == NULL && args.next_hop6 == NULL)
 			break; /* pass */
 #if (!defined(INET6) && !defined(INET))
 		ret = EACCES;
 #else
 	    {
 		struct m_tag *fwd_tag;
 		size_t len;
 
 		KASSERT(args.next_hop == NULL || args.next_hop6 == NULL,
 		    ("%s: both next_hop=%p and next_hop6=%p not NULL", __func__,
 		     args.next_hop, args.next_hop6));
 #ifdef INET6
 		if (args.next_hop6 != NULL)
 			len = sizeof(struct sockaddr_in6);
 #endif
 #ifdef INET
 		if (args.next_hop != NULL)
 			len = sizeof(struct sockaddr_in);
 #endif
 
 		/* Incoming packets should not be tagged so we do not
 		 * m_tag_find. Outgoing packets may be tagged, so we
 		 * reuse the tag if present.
 		 */
 		fwd_tag = (dir == DIR_IN) ? NULL :
 			m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL);
 		if (fwd_tag != NULL) {
 			m_tag_unlink(*m0, fwd_tag);
 		} else {
 			fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD, len,
 			    M_NOWAIT);
 			if (fwd_tag == NULL) {
 				ret = EACCES;
 				break; /* i.e. drop */
 			}
 		}
 #ifdef INET6
 		if (args.next_hop6 != NULL) {
 			bcopy(args.next_hop6, (fwd_tag+1), len);
 			if (in6_localip(&args.next_hop6->sin6_addr))
 				(*m0)->m_flags |= M_FASTFWD_OURS;
 			(*m0)->m_flags |= M_IP6_NEXTHOP;
 		}
 #endif
 #ifdef INET
 		if (args.next_hop != NULL) {
 			bcopy(args.next_hop, (fwd_tag+1), len);
 			if (in_localip(args.next_hop->sin_addr))
 				(*m0)->m_flags |= M_FASTFWD_OURS;
 			(*m0)->m_flags |= M_IP_NEXTHOP;
 		}
 #endif
 		m_tag_prepend(*m0, fwd_tag);
 	    }
 #endif /* INET || INET6 */
 		break;
 
 	case IP_FW_DENY:
 		ret = EACCES;
 		break; /* i.e. drop */
 
 	case IP_FW_DUMMYNET:
 		ret = EACCES;
 		if (ip_dn_io_ptr == NULL)
 			break; /* i.e. drop */
 		if (mtod(*m0, struct ip *)->ip_v == 4)
 			ret = ip_dn_io_ptr(m0, dir, &args);
 		else if (mtod(*m0, struct ip *)->ip_v == 6)
 			ret = ip_dn_io_ptr(m0, dir | PROTO_IPV6, &args);
 		else
 			break; /* drop it */
 		/*
 		 * XXX should read the return value.
 		 * dummynet normally eats the packet and sets *m0=NULL
 		 * unless the packet can be sent immediately. In this
 		 * case args is updated and we should re-run the
 		 * check without clearing args.
 		 */
 		if (*m0 != NULL)
 			goto again;
 		break;
 
 	case IP_FW_TEE:
 	case IP_FW_DIVERT:
 		if (ip_divert_ptr == NULL) {
 			ret = EACCES;
 			break; /* i.e. drop */
 		}
 		ret = ipfw_divert(m0, dir, &args.rule,
 			(ipfw == IP_FW_TEE) ? 1 : 0);
 		/* continue processing for the original packet (tee). */
 		if (*m0)
 			goto again;
 		break;
 
 	case IP_FW_NGTEE:
 	case IP_FW_NETGRAPH:
 		if (ng_ipfw_input_p == NULL) {
 			ret = EACCES;
 			break; /* i.e. drop */
 		}
 		ret = ng_ipfw_input_p(m0, dir, &args,
 			(ipfw == IP_FW_NGTEE) ? 1 : 0);
 		if (ipfw == IP_FW_NGTEE) /* ignore errors for NGTEE */
 			goto again;	/* continue with packet */
 		break;
 
 	case IP_FW_NAT:
 		/* honor one-pass in case of successful nat */
 		if (V_fw_one_pass)
 			break; /* ret is already 0 */
 		goto again;
 
 	case IP_FW_REASS:
 		goto again;		/* continue with packet */
 	
 	default:
 		KASSERT(0, ("%s: unknown retval", __func__));
 	}
 
 	if (ret != 0) {
 		if (*m0)
 			FREE_PKT(*m0);
 		*m0 = NULL;
 	}
 
 	return ret;
 }
 
 /*
  * ipfw processing for ethernet packets (in and out).
  * Inteface is NULL from ether_demux, and ifp from
  * ether_output_frame.
  */
 int
 ipfw_check_frame(void *arg, struct mbuf **m0, struct ifnet *dst, int dir,
     struct inpcb *inp)
 {
 	struct ether_header *eh;
 	struct ether_header save_eh;
 	struct mbuf *m;
 	int i, ret;
 	struct ip_fw_args args;
 	struct m_tag *mtag;
 
 	/* fetch start point from rule, if any */
 	mtag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL);
 	if (mtag == NULL) {
 		args.rule.slot = 0;
 	} else {
 		/* dummynet packet, already partially processed */
 		struct ipfw_rule_ref *r;
 
 		/* XXX can we free it after use ? */
 		mtag->m_tag_id = PACKET_TAG_NONE;
 		r = (struct ipfw_rule_ref *)(mtag + 1);
 		if (r->info & IPFW_ONEPASS)
 			return (0);
 		args.rule = *r;
 	}
 
 	/* I need some amt of data to be contiguous */
 	m = *m0;
 	i = min(m->m_pkthdr.len, max_protohdr);
 	if (m->m_len < i) {
 		m = m_pullup(m, i);
 		if (m == NULL) {
 			*m0 = m;
 			return (0);
 		}
 	}
 	eh = mtod(m, struct ether_header *);
 	save_eh = *eh;			/* save copy for restore below */
 	m_adj(m, ETHER_HDR_LEN);	/* strip ethernet header */
 
 	args.m = m;		/* the packet we are looking at		*/
 	args.oif = dir == PFIL_OUT ? dst: NULL;	/* destination, if any	*/
 	args.next_hop = NULL;	/* we do not support forward yet	*/
 	args.next_hop6 = NULL;	/* we do not support forward yet	*/
 	args.eh = &save_eh;	/* MAC header for bridged/MAC packets	*/
 	args.inp = NULL;	/* used by ipfw uid/gid/jail rules	*/
 	i = ipfw_chk(&args);
 	m = args.m;
 	if (m != NULL) {
 		/*
 		 * Restore Ethernet header, as needed, in case the
 		 * mbuf chain was replaced by ipfw.
 		 */
 		M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT);
 		if (m == NULL) {
 			*m0 = NULL;
 			return (0);
 		}
 		if (eh != mtod(m, struct ether_header *))
 			bcopy(&save_eh, mtod(m, struct ether_header *),
 				ETHER_HDR_LEN);
 	}
 	*m0 = m;
 
 	ret = 0;
 	/* Check result of ipfw_chk() */
 	switch (i) {
 	case IP_FW_PASS:
 		break;
 
 	case IP_FW_DENY:
 		ret = EACCES;
 		break; /* i.e. drop */
 
 	case IP_FW_DUMMYNET:
 		ret = EACCES;
 		int dir;
 
 		if (ip_dn_io_ptr == NULL)
 			break; /* i.e. drop */
 
 		*m0 = NULL;
 		dir = PROTO_LAYER2 | (dst ? DIR_OUT : DIR_IN);
 		ip_dn_io_ptr(&m, dir, &args);
 		return 0;
 
 	default:
 		KASSERT(0, ("%s: unknown retval", __func__));
 	}
 
 	if (ret != 0) {
 		if (*m0)
 			FREE_PKT(*m0);
 		*m0 = NULL;
 	}
 
 	return ret;
 }
 
 /* do the divert, return 1 on error 0 on success */
 static int
 ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule,
 	int tee)
 {
 	/*
 	 * ipfw_chk() has already tagged the packet with the divert tag.
 	 * If tee is set, copy packet and return original.
 	 * If not tee, consume packet and send it to divert socket.
 	 */
 	struct mbuf *clone;
 	struct ip *ip = mtod(*m0, struct ip *);
 	struct m_tag *tag;
 
 	/* Cloning needed for tee? */
 	if (tee == 0) {
 		clone = *m0;	/* use the original mbuf */
 		*m0 = NULL;
 	} else {
 		clone = m_dup(*m0, M_NOWAIT);
 		/* If we cannot duplicate the mbuf, we sacrifice the divert
 		 * chain and continue with the tee-ed packet.
 		 */
 		if (clone == NULL)
 			return 1;
 	}
 
 	/*
 	 * Divert listeners can normally handle non-fragmented packets,
 	 * but we can only reass in the non-tee case.
 	 * This means that listeners on a tee rule may get fragments,
 	 * and have to live with that.
 	 * Note that we now have the 'reass' ipfw option so if we care
 	 * we can do it before a 'tee'.
 	 */
 	if (!tee) switch (ip->ip_v) {
 	case IPVERSION:
 	    if (ntohs(ip->ip_off) & (IP_MF | IP_OFFMASK)) {
 		int hlen;
 		struct mbuf *reass;
 
 		reass = ip_reass(clone); /* Reassemble packet. */
 		if (reass == NULL)
 			return 0; /* not an error */
 		/* if reass = NULL then it was consumed by ip_reass */
 		/*
 		 * IP header checksum fixup after reassembly and leave header
 		 * in network byte order.
 		 */
 		ip = mtod(reass, struct ip *);
 		hlen = ip->ip_hl << 2;
 		ip->ip_sum = 0;
 		if (hlen == sizeof(struct ip))
 			ip->ip_sum = in_cksum_hdr(ip);
 		else
 			ip->ip_sum = in_cksum(reass, hlen);
 		clone = reass;
 	    }
 	    break;
 #ifdef INET6
 	case IPV6_VERSION >> 4:
 	    {
 	    struct ip6_hdr *const ip6 = mtod(clone, struct ip6_hdr *);
 
 		if (ip6->ip6_nxt == IPPROTO_FRAGMENT) {
 			int nxt, off;
 
 			off = sizeof(struct ip6_hdr);
 			nxt = frag6_input(&clone, &off, 0);
 			if (nxt == IPPROTO_DONE)
 				return (0);
 		}
 		break;
 	    }
 #endif
 	}
 
 	/* attach a tag to the packet with the reinject info */
 	tag = m_tag_alloc(MTAG_IPFW_RULE, 0,
 		    sizeof(struct ipfw_rule_ref), M_NOWAIT);
 	if (tag == NULL) {
 		FREE_PKT(clone);
 		return 1;
 	}
 	*((struct ipfw_rule_ref *)(tag+1)) = *rule;
 	m_tag_prepend(clone, tag);
 
 	/* Do the dirty job... */
 	ip_divert_ptr(clone, incoming);
 	return 0;
 }
 
 /*
  * attach or detach hooks for a given protocol family
  */
 static int
 ipfw_hook(int onoff, int pf)
 {
 	struct pfil_head *pfh;
 	void *hook_func;
 
 	pfh = pfil_head_get(PFIL_TYPE_AF, pf);
 	if (pfh == NULL)
 		return ENOENT;
 
 	hook_func = (pf == AF_LINK) ? ipfw_check_frame : ipfw_check_packet;
 
 	(void) (onoff ? pfil_add_hook : pfil_remove_hook)
 	    (hook_func, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh);
 
 	return 0;
 }
 
 int
 ipfw_attach_hooks(int arg)
 {
 	int error = 0;
 
 	if (arg == 0) /* detach */
 		ipfw_hook(0, AF_INET);
 	else if (V_fw_enable && ipfw_hook(1, AF_INET) != 0) {
                 error = ENOENT; /* see ip_fw_pfil.c::ipfw_hook() */
                 printf("ipfw_hook() error\n");
         }
 #ifdef INET6
 	if (arg == 0) /* detach */
 		ipfw_hook(0, AF_INET6);
 	else if (V_fw6_enable && ipfw_hook(1, AF_INET6) != 0) {
                 error = ENOENT;
                 printf("ipfw6_hook() error\n");
         }
 #endif
 	if (arg == 0) /* detach */
 		ipfw_hook(0, AF_LINK);
 	else if (V_fwlink_enable && ipfw_hook(1, AF_LINK) != 0) {
                 error = ENOENT;
                 printf("ipfw_link_hook() error\n");
         }
 	return error;
 }
 
 int
 ipfw_chg_hook(SYSCTL_HANDLER_ARGS)
 {
 	int newval;
 	int error;
 	int af;
 
 	if (arg1 == &V_fw_enable)
 		af = AF_INET;
 #ifdef INET6
 	else if (arg1 == &V_fw6_enable)
 		af = AF_INET6;
 #endif
 	else if (arg1 == &V_fwlink_enable)
 		af = AF_LINK;
 	else 
 		return (EINVAL);
 
 	newval = *(int *)arg1;
 	/* Handle sysctl change */
 	error = sysctl_handle_int(oidp, &newval, 0, req);
 
 	if (error)
 		return (error);
 
 	/* Formalize new value */
 	newval = (newval) ? 1 : 0;
 
 	if (*(int *)arg1 == newval)
 		return (0);
 
 	error = ipfw_hook(newval, af);
 	if (error)
 		return (error);
 	*(int *)arg1 = newval;
 
 	return (0);
 }
 /* end of file */
Index: user/dchagin/lemul/sys/netpfil/pf/if_pfsync.c
===================================================================
--- user/dchagin/lemul/sys/netpfil/pf/if_pfsync.c	(revision 274240)
+++ user/dchagin/lemul/sys/netpfil/pf/if_pfsync.c	(revision 274241)
@@ -1,2421 +1,2421 @@
 /*-
  * Copyright (c) 2002 Michael Shalayeff
  * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*-
  * Copyright (c) 2009 David Gwynne <dlg@openbsd.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
  * copyright notice and this permission notice appear in all copies.
  *
  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
 /*
  * $OpenBSD: if_pfsync.c,v 1.110 2009/02/24 05:39:19 dlg Exp $
  *
  * Revisions picked from OpenBSD after revision 1.110 import:
  * 1.119 - don't m_copydata() beyond the len of mbuf in pfsync_input()
  * 1.118, 1.124, 1.148, 1.149, 1.151, 1.171 - fixes to bulk updates
  * 1.120, 1.175 - use monotonic time_uptime
  * 1.122 - reduce number of updates for non-TCP sessions
  * 1.125, 1.127 - rewrite merge or stale processing
  * 1.128 - cleanups
  * 1.146 - bzero() mbuf before sparsely filling it with data
  * 1.170 - SIOCSIFMTU checks
  * 1.126, 1.142 - deferred packets processing
  * 1.173 - correct expire time processing
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_pf.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/endian.h>
 #include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 
 #include <net/bpf.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_clone.h>
 #include <net/if_types.h>
 #include <net/vnet.h>
 #include <net/pfvar.h>
 #include <net/if_pfsync.h>
 
 #include <netinet/if_ether.h>
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_carp.h>
 #include <netinet/ip_var.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 
 #define PFSYNC_MINPKT ( \
 	sizeof(struct ip) + \
 	sizeof(struct pfsync_header) + \
 	sizeof(struct pfsync_subheader) )
 
 struct pfsync_pkt {
 	struct ip *ip;
 	struct in_addr src;
 	u_int8_t flags;
 };
 
 static int	pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *,
 		    struct pfsync_state_peer *);
 static int	pfsync_in_clr(struct pfsync_pkt *, struct mbuf *, int, int);
 static int	pfsync_in_ins(struct pfsync_pkt *, struct mbuf *, int, int);
 static int	pfsync_in_iack(struct pfsync_pkt *, struct mbuf *, int, int);
 static int	pfsync_in_upd(struct pfsync_pkt *, struct mbuf *, int, int);
 static int	pfsync_in_upd_c(struct pfsync_pkt *, struct mbuf *, int, int);
 static int	pfsync_in_ureq(struct pfsync_pkt *, struct mbuf *, int, int);
 static int	pfsync_in_del(struct pfsync_pkt *, struct mbuf *, int, int);
 static int	pfsync_in_del_c(struct pfsync_pkt *, struct mbuf *, int, int);
 static int	pfsync_in_bus(struct pfsync_pkt *, struct mbuf *, int, int);
 static int	pfsync_in_tdb(struct pfsync_pkt *, struct mbuf *, int, int);
 static int	pfsync_in_eof(struct pfsync_pkt *, struct mbuf *, int, int);
 static int	pfsync_in_error(struct pfsync_pkt *, struct mbuf *, int, int);
 
 static int (*pfsync_acts[])(struct pfsync_pkt *, struct mbuf *, int, int) = {
 	pfsync_in_clr,			/* PFSYNC_ACT_CLR */
 	pfsync_in_ins,			/* PFSYNC_ACT_INS */
 	pfsync_in_iack,			/* PFSYNC_ACT_INS_ACK */
 	pfsync_in_upd,			/* PFSYNC_ACT_UPD */
 	pfsync_in_upd_c,		/* PFSYNC_ACT_UPD_C */
 	pfsync_in_ureq,			/* PFSYNC_ACT_UPD_REQ */
 	pfsync_in_del,			/* PFSYNC_ACT_DEL */
 	pfsync_in_del_c,		/* PFSYNC_ACT_DEL_C */
 	pfsync_in_error,		/* PFSYNC_ACT_INS_F */
 	pfsync_in_error,		/* PFSYNC_ACT_DEL_F */
 	pfsync_in_bus,			/* PFSYNC_ACT_BUS */
 	pfsync_in_tdb,			/* PFSYNC_ACT_TDB */
 	pfsync_in_eof			/* PFSYNC_ACT_EOF */
 };
 
 struct pfsync_q {
 	void		(*write)(struct pf_state *, void *);
 	size_t		len;
 	u_int8_t	action;
 };
 
 /* we have one of these for every PFSYNC_S_ */
 static void	pfsync_out_state(struct pf_state *, void *);
 static void	pfsync_out_iack(struct pf_state *, void *);
 static void	pfsync_out_upd_c(struct pf_state *, void *);
 static void	pfsync_out_del(struct pf_state *, void *);
 
 static struct pfsync_q pfsync_qs[] = {
 	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
 	{ pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
 	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD },
 	{ pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
 	{ pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C }
 };
 
 static void	pfsync_q_ins(struct pf_state *, int);
 static void	pfsync_q_del(struct pf_state *);
 
 static void	pfsync_update_state(struct pf_state *);
 
 struct pfsync_upd_req_item {
 	TAILQ_ENTRY(pfsync_upd_req_item)	ur_entry;
 	struct pfsync_upd_req			ur_msg;
 };
 
 struct pfsync_deferral {
 	struct pfsync_softc		*pd_sc;
 	TAILQ_ENTRY(pfsync_deferral)	pd_entry;
 	u_int				pd_refs;
 	struct callout			pd_tmo;
 
 	struct pf_state			*pd_st;
 	struct mbuf			*pd_m;
 };
 
 struct pfsync_softc {
 	/* Configuration */
 	struct ifnet		*sc_ifp;
 	struct ifnet		*sc_sync_if;
 	struct ip_moptions	sc_imo;
 	struct in_addr		sc_sync_peer;
 	uint32_t		sc_flags;
 #define	PFSYNCF_OK		0x00000001
 #define	PFSYNCF_DEFER		0x00000002
 #define	PFSYNCF_PUSH		0x00000004
 	uint8_t			sc_maxupdates;
 	struct ip		sc_template;
 	struct callout		sc_tmo;
 	struct mtx		sc_mtx;
 
 	/* Queued data */
 	size_t			sc_len;
 	TAILQ_HEAD(, pf_state)			sc_qs[PFSYNC_S_COUNT];
 	TAILQ_HEAD(, pfsync_upd_req_item)	sc_upd_req_list;
 	TAILQ_HEAD(, pfsync_deferral)		sc_deferrals;
 	u_int			sc_deferred;
 	void			*sc_plus;
 	size_t			sc_pluslen;
 
 	/* Bulk update info */
 	struct mtx		sc_bulk_mtx;
 	uint32_t		sc_ureq_sent;
 	int			sc_bulk_tries;
 	uint32_t		sc_ureq_received;
 	int			sc_bulk_hashid;
 	uint64_t		sc_bulk_stateid;
 	uint32_t		sc_bulk_creatorid;
 	struct callout		sc_bulk_tmo;
 	struct callout		sc_bulkfail_tmo;
 };
 
 #define	PFSYNC_LOCK(sc)		mtx_lock(&(sc)->sc_mtx)
 #define	PFSYNC_UNLOCK(sc)	mtx_unlock(&(sc)->sc_mtx)
 #define	PFSYNC_LOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_mtx, MA_OWNED)
 
 #define	PFSYNC_BLOCK(sc)	mtx_lock(&(sc)->sc_bulk_mtx)
 #define	PFSYNC_BUNLOCK(sc)	mtx_unlock(&(sc)->sc_bulk_mtx)
 #define	PFSYNC_BLOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_bulk_mtx, MA_OWNED)
 
 static const char pfsyncname[] = "pfsync";
 static MALLOC_DEFINE(M_PFSYNC, pfsyncname, "pfsync(4) data");
 static VNET_DEFINE(struct pfsync_softc	*, pfsyncif) = NULL;
 #define	V_pfsyncif		VNET(pfsyncif)
 static VNET_DEFINE(void *, pfsync_swi_cookie) = NULL;
 #define	V_pfsync_swi_cookie	VNET(pfsync_swi_cookie)
 static VNET_DEFINE(struct pfsyncstats, pfsyncstats);
 #define	V_pfsyncstats		VNET(pfsyncstats)
 static VNET_DEFINE(int, pfsync_carp_adj) = CARP_MAXSKEW;
 #define	V_pfsync_carp_adj	VNET(pfsync_carp_adj)
 
 static void	pfsync_timeout(void *);
 static void	pfsync_push(struct pfsync_softc *);
 static void	pfsyncintr(void *);
 static int	pfsync_multicast_setup(struct pfsync_softc *, struct ifnet *,
 		    void *);
 static void	pfsync_multicast_cleanup(struct pfsync_softc *);
 static void	pfsync_pointers_init(void);
 static void	pfsync_pointers_uninit(void);
 static int	pfsync_init(void);
 static void	pfsync_uninit(void);
 
 SYSCTL_NODE(_net, OID_AUTO, pfsync, CTLFLAG_RW, 0, "PFSYNC");
-SYSCTL_VNET_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_RW,
+SYSCTL_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(pfsyncstats), pfsyncstats,
     "PFSYNC statistics (struct pfsyncstats, net/if_pfsync.h)");
 SYSCTL_INT(_net_pfsync, OID_AUTO, carp_demotion_factor, CTLFLAG_RW,
     &VNET_NAME(pfsync_carp_adj), 0, "pfsync's CARP demotion factor adjustment");
 
 static int	pfsync_clone_create(struct if_clone *, int, caddr_t);
 static void	pfsync_clone_destroy(struct ifnet *);
 static int	pfsync_alloc_scrub_memory(struct pfsync_state_peer *,
 		    struct pf_state_peer *);
 static int	pfsyncoutput(struct ifnet *, struct mbuf *,
 		    const struct sockaddr *, struct route *);
 static int	pfsyncioctl(struct ifnet *, u_long, caddr_t);
 
 static int	pfsync_defer(struct pf_state *, struct mbuf *);
 static void	pfsync_undefer(struct pfsync_deferral *, int);
 static void	pfsync_undefer_state(struct pf_state *, int);
 static void	pfsync_defer_tmo(void *);
 
 static void	pfsync_request_update(u_int32_t, u_int64_t);
 static void	pfsync_update_state_req(struct pf_state *);
 
 static void	pfsync_drop(struct pfsync_softc *);
 static void	pfsync_sendout(int);
 static void	pfsync_send_plus(void *, size_t);
 
 static void	pfsync_bulk_start(void);
 static void	pfsync_bulk_status(u_int8_t);
 static void	pfsync_bulk_update(void *);
 static void	pfsync_bulk_fail(void *);
 
 #ifdef IPSEC
 static void	pfsync_update_net_tdb(struct pfsync_tdb *);
 #endif
 
 #define PFSYNC_MAX_BULKTRIES	12
 
 VNET_DEFINE(struct if_clone *, pfsync_cloner);
 #define	V_pfsync_cloner	VNET(pfsync_cloner)
 
 static int
 pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param)
 {
 	struct pfsync_softc *sc;
 	struct ifnet *ifp;
 	int q;
 
 	if (unit != 0)
 		return (EINVAL);
 
 	sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO);
 	sc->sc_flags |= PFSYNCF_OK;
 
 	for (q = 0; q < PFSYNC_S_COUNT; q++)
 		TAILQ_INIT(&sc->sc_qs[q]);
 
 	TAILQ_INIT(&sc->sc_upd_req_list);
 	TAILQ_INIT(&sc->sc_deferrals);
 
 	sc->sc_len = PFSYNC_MINPKT;
 	sc->sc_maxupdates = 128;
 
 	ifp = sc->sc_ifp = if_alloc(IFT_PFSYNC);
 	if (ifp == NULL) {
 		free(sc, M_PFSYNC);
 		return (ENOSPC);
 	}
 	if_initname(ifp, pfsyncname, unit);
 	ifp->if_softc = sc;
 	ifp->if_ioctl = pfsyncioctl;
 	ifp->if_output = pfsyncoutput;
 	ifp->if_type = IFT_PFSYNC;
 	ifp->if_snd.ifq_maxlen = ifqmaxlen;
 	ifp->if_hdrlen = sizeof(struct pfsync_header);
 	ifp->if_mtu = ETHERMTU;
 	mtx_init(&sc->sc_mtx, pfsyncname, NULL, MTX_DEF);
 	mtx_init(&sc->sc_bulk_mtx, "pfsync bulk", NULL, MTX_DEF);
 	callout_init(&sc->sc_tmo, CALLOUT_MPSAFE);
 	callout_init_mtx(&sc->sc_bulk_tmo, &sc->sc_bulk_mtx, 0);
 	callout_init_mtx(&sc->sc_bulkfail_tmo, &sc->sc_bulk_mtx, 0);
 
 	if_attach(ifp);
 
 	bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
 
 	V_pfsyncif = sc;
 
 	return (0);
 }
 
 static void
 pfsync_clone_destroy(struct ifnet *ifp)
 {
 	struct pfsync_softc *sc = ifp->if_softc;
 
 	/*
 	 * At this stage, everything should have already been
 	 * cleared by pfsync_uninit(), and we have only to
 	 * drain callouts.
 	 */
 	while (sc->sc_deferred > 0) {
 		struct pfsync_deferral *pd = TAILQ_FIRST(&sc->sc_deferrals);
 
 		TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
 		sc->sc_deferred--;
 		if (callout_stop(&pd->pd_tmo)) {
 			pf_release_state(pd->pd_st);
 			m_freem(pd->pd_m);
 			free(pd, M_PFSYNC);
 		} else {
 			pd->pd_refs++;
 			callout_drain(&pd->pd_tmo);
 			free(pd, M_PFSYNC);
 		}
 	}
 
 	callout_drain(&sc->sc_tmo);
 	callout_drain(&sc->sc_bulkfail_tmo);
 	callout_drain(&sc->sc_bulk_tmo);
 
 	if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
 		(*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync destroy");
 	bpfdetach(ifp);
 	if_detach(ifp);
 
 	pfsync_drop(sc);
 
 	if_free(ifp);
 	if (sc->sc_imo.imo_membership)
 		pfsync_multicast_cleanup(sc);
 	mtx_destroy(&sc->sc_mtx);
 	mtx_destroy(&sc->sc_bulk_mtx);
 	free(sc, M_PFSYNC);
 
 	V_pfsyncif = NULL;
 }
 
 static int
 pfsync_alloc_scrub_memory(struct pfsync_state_peer *s,
     struct pf_state_peer *d)
 {
 	if (s->scrub.scrub_flag && d->scrub == NULL) {
 		d->scrub = uma_zalloc(V_pf_state_scrub_z, M_NOWAIT | M_ZERO);
 		if (d->scrub == NULL)
 			return (ENOMEM);
 	}
 
 	return (0);
 }
 
 
 static int
 pfsync_state_import(struct pfsync_state *sp, u_int8_t flags)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 #ifndef	__NO_STRICT_ALIGNMENT
 	struct pfsync_state_key key[2];
 #endif
 	struct pfsync_state_key *kw, *ks;
 	struct pf_state	*st = NULL;
 	struct pf_state_key *skw = NULL, *sks = NULL;
 	struct pf_rule *r = NULL;
 	struct pfi_kif	*kif;
 	int error;
 
 	PF_RULES_RASSERT();
 
 	if (sp->creatorid == 0) {
 		if (V_pf_status.debug >= PF_DEBUG_MISC)
 			printf("%s: invalid creator id: %08x\n", __func__,
 			    ntohl(sp->creatorid));
 		return (EINVAL);
 	}
 
 	if ((kif = pfi_kif_find(sp->ifname)) == NULL) {
 		if (V_pf_status.debug >= PF_DEBUG_MISC)
 			printf("%s: unknown interface: %s\n", __func__,
 			    sp->ifname);
 		if (flags & PFSYNC_SI_IOCTL)
 			return (EINVAL);
 		return (0);	/* skip this state */
 	}
 
 	/*
 	 * If the ruleset checksums match or the state is coming from the ioctl,
 	 * it's safe to associate the state with the rule of that number.
 	 */
 	if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) &&
 	    (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->rule) <
 	    pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount)
 		r = pf_main_ruleset.rules[
 		    PF_RULESET_FILTER].active.ptr_array[ntohl(sp->rule)];
 	else
 		r = &V_pf_default_rule;
 
 	if ((r->max_states &&
 	    counter_u64_fetch(r->states_cur) >= r->max_states))
 		goto cleanup;
 
 	/*
 	 * XXXGL: consider M_WAITOK in ioctl path after.
 	 */
 	if ((st = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO)) == NULL)
 		goto cleanup;
 
 	if ((skw = uma_zalloc(V_pf_state_key_z, M_NOWAIT)) == NULL)
 		goto cleanup;
 
 #ifndef	__NO_STRICT_ALIGNMENT
 	bcopy(&sp->key, key, sizeof(struct pfsync_state_key) * 2);
 	kw = &key[PF_SK_WIRE];
 	ks = &key[PF_SK_STACK];
 #else
 	kw = &sp->key[PF_SK_WIRE];
 	ks = &sp->key[PF_SK_STACK];
 #endif
 
 	if (PF_ANEQ(&kw->addr[0], &ks->addr[0], sp->af) ||
 	    PF_ANEQ(&kw->addr[1], &ks->addr[1], sp->af) ||
 	    kw->port[0] != ks->port[0] ||
 	    kw->port[1] != ks->port[1]) {
 		sks = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
 		if (sks == NULL)
 			goto cleanup;
 	} else
 		sks = skw;
 
 	/* allocate memory for scrub info */
 	if (pfsync_alloc_scrub_memory(&sp->src, &st->src) ||
 	    pfsync_alloc_scrub_memory(&sp->dst, &st->dst))
 		goto cleanup;
 
 	/* Copy to state key(s). */
 	skw->addr[0] = kw->addr[0];
 	skw->addr[1] = kw->addr[1];
 	skw->port[0] = kw->port[0];
 	skw->port[1] = kw->port[1];
 	skw->proto = sp->proto;
 	skw->af = sp->af;
 	if (sks != skw) {
 		sks->addr[0] = ks->addr[0];
 		sks->addr[1] = ks->addr[1];
 		sks->port[0] = ks->port[0];
 		sks->port[1] = ks->port[1];
 		sks->proto = sp->proto;
 		sks->af = sp->af;
 	}
 
 	/* copy to state */
 	bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr));
 	st->creation = time_uptime - ntohl(sp->creation);
 	st->expire = time_uptime;
 	if (sp->expire) {
 		uint32_t timeout;
 
 		timeout = r->timeout[sp->timeout];
 		if (!timeout)
 			timeout = V_pf_default_rule.timeout[sp->timeout];
 
 		/* sp->expire may have been adaptively scaled by export. */
 		st->expire -= timeout - ntohl(sp->expire);
 	}
 
 	st->direction = sp->direction;
 	st->log = sp->log;
 	st->timeout = sp->timeout;
 	st->state_flags = sp->state_flags;
 
 	st->id = sp->id;
 	st->creatorid = sp->creatorid;
 	pf_state_peer_ntoh(&sp->src, &st->src);
 	pf_state_peer_ntoh(&sp->dst, &st->dst);
 
 	st->rule.ptr = r;
 	st->nat_rule.ptr = NULL;
 	st->anchor.ptr = NULL;
 	st->rt_kif = NULL;
 
 	st->pfsync_time = time_uptime;
 	st->sync_state = PFSYNC_S_NONE;
 
 	if (!(flags & PFSYNC_SI_IOCTL))
 		st->state_flags |= PFSTATE_NOSYNC;
 
 	if ((error = pf_state_insert(kif, skw, sks, st)) != 0)
 		goto cleanup_state;
 
 	/* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */
 	counter_u64_add(r->states_cur, 1);
 	counter_u64_add(r->states_tot, 1);
 
 	if (!(flags & PFSYNC_SI_IOCTL)) {
 		st->state_flags &= ~PFSTATE_NOSYNC;
 		if (st->state_flags & PFSTATE_ACK) {
 			pfsync_q_ins(st, PFSYNC_S_IACK);
 			pfsync_push(sc);
 		}
 	}
 	st->state_flags &= ~PFSTATE_ACK;
 	PF_STATE_UNLOCK(st);
 
 	return (0);
 
 cleanup:
 	error = ENOMEM;
 	if (skw == sks)
 		sks = NULL;
 	if (skw != NULL)
 		uma_zfree(V_pf_state_key_z, skw);
 	if (sks != NULL)
 		uma_zfree(V_pf_state_key_z, sks);
 
 cleanup_state:	/* pf_state_insert() frees the state keys. */
 	if (st) {
 		if (st->dst.scrub)
 			uma_zfree(V_pf_state_scrub_z, st->dst.scrub);
 		if (st->src.scrub)
 			uma_zfree(V_pf_state_scrub_z, st->src.scrub);
 		uma_zfree(V_pf_state_z, st);
 	}
 	return (error);
 }
 
 static int
 pfsync_input(struct mbuf **mp, int *offp __unused, int proto __unused)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_pkt pkt;
 	struct mbuf *m = *mp;
 	struct ip *ip = mtod(m, struct ip *);
 	struct pfsync_header *ph;
 	struct pfsync_subheader subh;
 
 	int offset, len;
 	int rv;
 	uint16_t count;
 
 	*mp = NULL;
 	V_pfsyncstats.pfsyncs_ipackets++;
 
 	/* Verify that we have a sync interface configured. */
 	if (!sc || !sc->sc_sync_if || !V_pf_status.running ||
 	    (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 		goto done;
 
 	/* verify that the packet came in on the right interface */
 	if (sc->sc_sync_if != m->m_pkthdr.rcvif) {
 		V_pfsyncstats.pfsyncs_badif++;
 		goto done;
 	}
 
 	if_inc_counter(sc->sc_ifp, IFCOUNTER_IPACKETS, 1);
 	if_inc_counter(sc->sc_ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 	/* verify that the IP TTL is 255. */
 	if (ip->ip_ttl != PFSYNC_DFLTTL) {
 		V_pfsyncstats.pfsyncs_badttl++;
 		goto done;
 	}
 
 	offset = ip->ip_hl << 2;
 	if (m->m_pkthdr.len < offset + sizeof(*ph)) {
 		V_pfsyncstats.pfsyncs_hdrops++;
 		goto done;
 	}
 
 	if (offset + sizeof(*ph) > m->m_len) {
 		if (m_pullup(m, offset + sizeof(*ph)) == NULL) {
 			V_pfsyncstats.pfsyncs_hdrops++;
 			return (IPPROTO_DONE);
 		}
 		ip = mtod(m, struct ip *);
 	}
 	ph = (struct pfsync_header *)((char *)ip + offset);
 
 	/* verify the version */
 	if (ph->version != PFSYNC_VERSION) {
 		V_pfsyncstats.pfsyncs_badver++;
 		goto done;
 	}
 
 	len = ntohs(ph->len) + offset;
 	if (m->m_pkthdr.len < len) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		goto done;
 	}
 
 	/* Cheaper to grab this now than having to mess with mbufs later */
 	pkt.ip = ip;
 	pkt.src = ip->ip_src;
 	pkt.flags = 0;
 
 	/*
 	 * Trusting pf_chksum during packet processing, as well as seeking
 	 * in interface name tree, require holding PF_RULES_RLOCK().
 	 */
 	PF_RULES_RLOCK();
 	if (!bcmp(&ph->pfcksum, &V_pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
 		pkt.flags |= PFSYNC_SI_CKSUM;
 
 	offset += sizeof(*ph);
 	while (offset <= len - sizeof(subh)) {
 		m_copydata(m, offset, sizeof(subh), (caddr_t)&subh);
 		offset += sizeof(subh);
 
 		if (subh.action >= PFSYNC_ACT_MAX) {
 			V_pfsyncstats.pfsyncs_badact++;
 			PF_RULES_RUNLOCK();
 			goto done;
 		}
 
 		count = ntohs(subh.count);
 		V_pfsyncstats.pfsyncs_iacts[subh.action] += count;
 		rv = (*pfsync_acts[subh.action])(&pkt, m, offset, count);
 		if (rv == -1) {
 			PF_RULES_RUNLOCK();
 			return (IPPROTO_DONE);
 		}
 
 		offset += rv;
 	}
 	PF_RULES_RUNLOCK();
 
 done:
 	m_freem(m);
 	return (IPPROTO_DONE);
 }
 
 static int
 pfsync_in_clr(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	struct pfsync_clr *clr;
 	struct mbuf *mp;
 	int len = sizeof(*clr) * count;
 	int i, offp;
 	u_int32_t creatorid;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	clr = (struct pfsync_clr *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++) {
 		creatorid = clr[i].creatorid;
 
 		if (clr[i].ifname[0] != '\0' &&
 		    pfi_kif_find(clr[i].ifname) == NULL)
 			continue;
 
 		for (int i = 0; i <= pf_hashmask; i++) {
 			struct pf_idhash *ih = &V_pf_idhash[i];
 			struct pf_state *s;
 relock:
 			PF_HASHROW_LOCK(ih);
 			LIST_FOREACH(s, &ih->states, entry) {
 				if (s->creatorid == creatorid) {
 					s->state_flags |= PFSTATE_NOSYNC;
 					pf_unlink_state(s, PF_ENTER_LOCKED);
 					goto relock;
 				}
 			}
 			PF_HASHROW_UNLOCK(ih);
 		}
 	}
 
 	return (len);
 }
 
 static int
 pfsync_in_ins(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	struct mbuf *mp;
 	struct pfsync_state *sa, *sp;
 	int len = sizeof(*sp) * count;
 	int i, offp;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	sa = (struct pfsync_state *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++) {
 		sp = &sa[i];
 
 		/* Check for invalid values. */
 		if (sp->timeout >= PFTM_MAX ||
 		    sp->src.state > PF_TCPS_PROXY_DST ||
 		    sp->dst.state > PF_TCPS_PROXY_DST ||
 		    sp->direction > PF_OUT ||
 		    (sp->af != AF_INET && sp->af != AF_INET6)) {
 			if (V_pf_status.debug >= PF_DEBUG_MISC)
 				printf("%s: invalid value\n", __func__);
 			V_pfsyncstats.pfsyncs_badval++;
 			continue;
 		}
 
 		if (pfsync_state_import(sp, pkt->flags) == ENOMEM)
 			/* Drop out, but process the rest of the actions. */
 			break;
 	}
 
 	return (len);
 }
 
 static int
 pfsync_in_iack(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	struct pfsync_ins_ack *ia, *iaa;
 	struct pf_state *st;
 
 	struct mbuf *mp;
 	int len = count * sizeof(*ia);
 	int offp, i;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	iaa = (struct pfsync_ins_ack *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++) {
 		ia = &iaa[i];
 
 		st = pf_find_state_byid(ia->id, ia->creatorid);
 		if (st == NULL)
 			continue;
 
 		if (st->state_flags & PFSTATE_ACK) {
 			PFSYNC_LOCK(V_pfsyncif);
 			pfsync_undefer_state(st, 0);
 			PFSYNC_UNLOCK(V_pfsyncif);
 		}
 		PF_STATE_UNLOCK(st);
 	}
 	/*
 	 * XXX this is not yet implemented, but we know the size of the
 	 * message so we can skip it.
 	 */
 
 	return (count * sizeof(struct pfsync_ins_ack));
 }
 
 static int
 pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src,
     struct pfsync_state_peer *dst)
 {
 	int sync = 0;
 
 	PF_STATE_LOCK_ASSERT(st);
 
 	/*
 	 * The state should never go backwards except
 	 * for syn-proxy states.  Neither should the
 	 * sequence window slide backwards.
 	 */
 	if ((st->src.state > src->state &&
 	    (st->src.state < PF_TCPS_PROXY_SRC ||
 	    src->state >= PF_TCPS_PROXY_SRC)) ||
 
 	    (st->src.state == src->state &&
 	    SEQ_GT(st->src.seqlo, ntohl(src->seqlo))))
 		sync++;
 	else
 		pf_state_peer_ntoh(src, &st->src);
 
 	if ((st->dst.state > dst->state) ||
 
 	    (st->dst.state >= TCPS_SYN_SENT &&
 	    SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo))))
 		sync++;
 	else
 		pf_state_peer_ntoh(dst, &st->dst);
 
 	return (sync);
 }
 
 static int
 pfsync_in_upd(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_state *sa, *sp;
 	struct pf_state *st;
 	int sync;
 
 	struct mbuf *mp;
 	int len = count * sizeof(*sp);
 	int offp, i;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	sa = (struct pfsync_state *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++) {
 		sp = &sa[i];
 
 		/* check for invalid values */
 		if (sp->timeout >= PFTM_MAX ||
 		    sp->src.state > PF_TCPS_PROXY_DST ||
 		    sp->dst.state > PF_TCPS_PROXY_DST) {
 			if (V_pf_status.debug >= PF_DEBUG_MISC) {
 				printf("pfsync_input: PFSYNC_ACT_UPD: "
 				    "invalid value\n");
 			}
 			V_pfsyncstats.pfsyncs_badval++;
 			continue;
 		}
 
 		st = pf_find_state_byid(sp->id, sp->creatorid);
 		if (st == NULL) {
 			/* insert the update */
 			if (pfsync_state_import(sp, 0))
 				V_pfsyncstats.pfsyncs_badstate++;
 			continue;
 		}
 
 		if (st->state_flags & PFSTATE_ACK) {
 			PFSYNC_LOCK(sc);
 			pfsync_undefer_state(st, 1);
 			PFSYNC_UNLOCK(sc);
 		}
 
 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
 			sync = pfsync_upd_tcp(st, &sp->src, &sp->dst);
 		else {
 			sync = 0;
 
 			/*
 			 * Non-TCP protocol state machine always go
 			 * forwards
 			 */
 			if (st->src.state > sp->src.state)
 				sync++;
 			else
 				pf_state_peer_ntoh(&sp->src, &st->src);
 			if (st->dst.state > sp->dst.state)
 				sync++;
 			else
 				pf_state_peer_ntoh(&sp->dst, &st->dst);
 		}
 		if (sync < 2) {
 			pfsync_alloc_scrub_memory(&sp->dst, &st->dst);
 			pf_state_peer_ntoh(&sp->dst, &st->dst);
 			st->expire = time_uptime;
 			st->timeout = sp->timeout;
 		}
 		st->pfsync_time = time_uptime;
 
 		if (sync) {
 			V_pfsyncstats.pfsyncs_stale++;
 
 			pfsync_update_state(st);
 			PF_STATE_UNLOCK(st);
 			PFSYNC_LOCK(sc);
 			pfsync_push(sc);
 			PFSYNC_UNLOCK(sc);
 			continue;
 		}
 		PF_STATE_UNLOCK(st);
 	}
 
 	return (len);
 }
 
 static int
 pfsync_in_upd_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_upd_c *ua, *up;
 	struct pf_state *st;
 	int len = count * sizeof(*up);
 	int sync;
 	struct mbuf *mp;
 	int offp, i;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	ua = (struct pfsync_upd_c *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++) {
 		up = &ua[i];
 
 		/* check for invalid values */
 		if (up->timeout >= PFTM_MAX ||
 		    up->src.state > PF_TCPS_PROXY_DST ||
 		    up->dst.state > PF_TCPS_PROXY_DST) {
 			if (V_pf_status.debug >= PF_DEBUG_MISC) {
 				printf("pfsync_input: "
 				    "PFSYNC_ACT_UPD_C: "
 				    "invalid value\n");
 			}
 			V_pfsyncstats.pfsyncs_badval++;
 			continue;
 		}
 
 		st = pf_find_state_byid(up->id, up->creatorid);
 		if (st == NULL) {
 			/* We don't have this state. Ask for it. */
 			PFSYNC_LOCK(sc);
 			pfsync_request_update(up->creatorid, up->id);
 			PFSYNC_UNLOCK(sc);
 			continue;
 		}
 
 		if (st->state_flags & PFSTATE_ACK) {
 			PFSYNC_LOCK(sc);
 			pfsync_undefer_state(st, 1);
 			PFSYNC_UNLOCK(sc);
 		}
 
 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
 			sync = pfsync_upd_tcp(st, &up->src, &up->dst);
 		else {
 			sync = 0;
 
 			/*
 			 * Non-TCP protocol state machine always go
 			 * forwards
 			 */
 			if (st->src.state > up->src.state)
 				sync++;
 			else
 				pf_state_peer_ntoh(&up->src, &st->src);
 			if (st->dst.state > up->dst.state)
 				sync++;
 			else
 				pf_state_peer_ntoh(&up->dst, &st->dst);
 		}
 		if (sync < 2) {
 			pfsync_alloc_scrub_memory(&up->dst, &st->dst);
 			pf_state_peer_ntoh(&up->dst, &st->dst);
 			st->expire = time_uptime;
 			st->timeout = up->timeout;
 		}
 		st->pfsync_time = time_uptime;
 
 		if (sync) {
 			V_pfsyncstats.pfsyncs_stale++;
 
 			pfsync_update_state(st);
 			PF_STATE_UNLOCK(st);
 			PFSYNC_LOCK(sc);
 			pfsync_push(sc);
 			PFSYNC_UNLOCK(sc);
 			continue;
 		}
 		PF_STATE_UNLOCK(st);
 	}
 
 	return (len);
 }
 
 static int
 pfsync_in_ureq(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	struct pfsync_upd_req *ur, *ura;
 	struct mbuf *mp;
 	int len = count * sizeof(*ur);
 	int i, offp;
 
 	struct pf_state *st;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	ura = (struct pfsync_upd_req *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++) {
 		ur = &ura[i];
 
 		if (ur->id == 0 && ur->creatorid == 0)
 			pfsync_bulk_start();
 		else {
 			st = pf_find_state_byid(ur->id, ur->creatorid);
 			if (st == NULL) {
 				V_pfsyncstats.pfsyncs_badstate++;
 				continue;
 			}
 			if (st->state_flags & PFSTATE_NOSYNC) {
 				PF_STATE_UNLOCK(st);
 				continue;
 			}
 
 			pfsync_update_state_req(st);
 			PF_STATE_UNLOCK(st);
 		}
 	}
 
 	return (len);
 }
 
 static int
 pfsync_in_del(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	struct mbuf *mp;
 	struct pfsync_state *sa, *sp;
 	struct pf_state *st;
 	int len = count * sizeof(*sp);
 	int offp, i;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	sa = (struct pfsync_state *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++) {
 		sp = &sa[i];
 
 		st = pf_find_state_byid(sp->id, sp->creatorid);
 		if (st == NULL) {
 			V_pfsyncstats.pfsyncs_badstate++;
 			continue;
 		}
 		st->state_flags |= PFSTATE_NOSYNC;
 		pf_unlink_state(st, PF_ENTER_LOCKED);
 	}
 
 	return (len);
 }
 
 static int
 pfsync_in_del_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	struct mbuf *mp;
 	struct pfsync_del_c *sa, *sp;
 	struct pf_state *st;
 	int len = count * sizeof(*sp);
 	int offp, i;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	sa = (struct pfsync_del_c *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++) {
 		sp = &sa[i];
 
 		st = pf_find_state_byid(sp->id, sp->creatorid);
 		if (st == NULL) {
 			V_pfsyncstats.pfsyncs_badstate++;
 			continue;
 		}
 
 		st->state_flags |= PFSTATE_NOSYNC;
 		pf_unlink_state(st, PF_ENTER_LOCKED);
 	}
 
 	return (len);
 }
 
 static int
 pfsync_in_bus(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_bus *bus;
 	struct mbuf *mp;
 	int len = count * sizeof(*bus);
 	int offp;
 
 	PFSYNC_BLOCK(sc);
 
 	/* If we're not waiting for a bulk update, who cares. */
 	if (sc->sc_ureq_sent == 0) {
 		PFSYNC_BUNLOCK(sc);
 		return (len);
 	}
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		PFSYNC_BUNLOCK(sc);
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	bus = (struct pfsync_bus *)(mp->m_data + offp);
 
 	switch (bus->status) {
 	case PFSYNC_BUS_START:
 		callout_reset(&sc->sc_bulkfail_tmo, 4 * hz +
 		    V_pf_limits[PF_LIMIT_STATES].limit /
 		    ((sc->sc_ifp->if_mtu - PFSYNC_MINPKT) /
 		    sizeof(struct pfsync_state)),
 		    pfsync_bulk_fail, sc);
 		if (V_pf_status.debug >= PF_DEBUG_MISC)
 			printf("pfsync: received bulk update start\n");
 		break;
 
 	case PFSYNC_BUS_END:
 		if (time_uptime - ntohl(bus->endtime) >=
 		    sc->sc_ureq_sent) {
 			/* that's it, we're happy */
 			sc->sc_ureq_sent = 0;
 			sc->sc_bulk_tries = 0;
 			callout_stop(&sc->sc_bulkfail_tmo);
 			if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
 				(*carp_demote_adj_p)(-V_pfsync_carp_adj,
 				    "pfsync bulk done");
 			sc->sc_flags |= PFSYNCF_OK;
 			if (V_pf_status.debug >= PF_DEBUG_MISC)
 				printf("pfsync: received valid "
 				    "bulk update end\n");
 		} else {
 			if (V_pf_status.debug >= PF_DEBUG_MISC)
 				printf("pfsync: received invalid "
 				    "bulk update end: bad timestamp\n");
 		}
 		break;
 	}
 	PFSYNC_BUNLOCK(sc);
 
 	return (len);
 }
 
 static int
 pfsync_in_tdb(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	int len = count * sizeof(struct pfsync_tdb);
 
 #if defined(IPSEC)
 	struct pfsync_tdb *tp;
 	struct mbuf *mp;
 	int offp;
 	int i;
 	int s;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	tp = (struct pfsync_tdb *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++)
 		pfsync_update_net_tdb(&tp[i]);
 #endif
 
 	return (len);
 }
 
 #if defined(IPSEC)
 /* Update an in-kernel tdb. Silently fail if no tdb is found. */
 static void
 pfsync_update_net_tdb(struct pfsync_tdb *pt)
 {
 	struct tdb		*tdb;
 	int			 s;
 
 	/* check for invalid values */
 	if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
 	    (pt->dst.sa.sa_family != AF_INET &&
 	    pt->dst.sa.sa_family != AF_INET6))
 		goto bad;
 
 	tdb = gettdb(pt->spi, &pt->dst, pt->sproto);
 	if (tdb) {
 		pt->rpl = ntohl(pt->rpl);
 		pt->cur_bytes = (unsigned long long)be64toh(pt->cur_bytes);
 
 		/* Neither replay nor byte counter should ever decrease. */
 		if (pt->rpl < tdb->tdb_rpl ||
 		    pt->cur_bytes < tdb->tdb_cur_bytes) {
 			goto bad;
 		}
 
 		tdb->tdb_rpl = pt->rpl;
 		tdb->tdb_cur_bytes = pt->cur_bytes;
 	}
 	return;
 
 bad:
 	if (V_pf_status.debug >= PF_DEBUG_MISC)
 		printf("pfsync_insert: PFSYNC_ACT_TDB_UPD: "
 		    "invalid value\n");
 	V_pfsyncstats.pfsyncs_badstate++;
 	return;
 }
 #endif
 
 
 static int
 pfsync_in_eof(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	/* check if we are at the right place in the packet */
 	if (offset != m->m_pkthdr.len)
 		V_pfsyncstats.pfsyncs_badlen++;
 
 	/* we're done. free and let the caller return */
 	m_freem(m);
 	return (-1);
 }
 
 static int
 pfsync_in_error(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	V_pfsyncstats.pfsyncs_badact++;
 
 	m_freem(m);
 	return (-1);
 }
 
 static int
 pfsyncoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
 	struct route *rt)
 {
 	m_freem(m);
 	return (0);
 }
 
 /* ARGSUSED */
 static int
 pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct pfsync_softc *sc = ifp->if_softc;
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct pfsyncreq pfsyncr;
 	int error;
 
 	switch (cmd) {
 	case SIOCSIFFLAGS:
 		PFSYNC_LOCK(sc);
 		if (ifp->if_flags & IFF_UP) {
 			ifp->if_drv_flags |= IFF_DRV_RUNNING;
 			PFSYNC_UNLOCK(sc);
 			pfsync_pointers_init();
 		} else {
 			ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 			PFSYNC_UNLOCK(sc);
 			pfsync_pointers_uninit();
 		}
 		break;
 	case SIOCSIFMTU:
 		if (!sc->sc_sync_if ||
 		    ifr->ifr_mtu <= PFSYNC_MINPKT ||
 		    ifr->ifr_mtu > sc->sc_sync_if->if_mtu)
 			return (EINVAL);
 		if (ifr->ifr_mtu < ifp->if_mtu) {
 			PFSYNC_LOCK(sc);
 			if (sc->sc_len > PFSYNC_MINPKT)
 				pfsync_sendout(1);
 			PFSYNC_UNLOCK(sc);
 		}
 		ifp->if_mtu = ifr->ifr_mtu;
 		break;
 	case SIOCGETPFSYNC:
 		bzero(&pfsyncr, sizeof(pfsyncr));
 		PFSYNC_LOCK(sc);
 		if (sc->sc_sync_if) {
 			strlcpy(pfsyncr.pfsyncr_syncdev,
 			    sc->sc_sync_if->if_xname, IFNAMSIZ);
 		}
 		pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer;
 		pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
 		pfsyncr.pfsyncr_defer = (PFSYNCF_DEFER ==
 		    (sc->sc_flags & PFSYNCF_DEFER));
 		PFSYNC_UNLOCK(sc);
 		return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)));
 
 	case SIOCSETPFSYNC:
 	    {
 		struct ip_moptions *imo = &sc->sc_imo;
 		struct ifnet *sifp;
 		struct ip *ip;
 		void *mship = NULL;
 
 		if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0)
 			return (error);
 		if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr))))
 			return (error);
 
 		if (pfsyncr.pfsyncr_maxupdates > 255)
 			return (EINVAL);
 
 		if (pfsyncr.pfsyncr_syncdev[0] == 0)
 			sifp = NULL;
 		else if ((sifp = ifunit_ref(pfsyncr.pfsyncr_syncdev)) == NULL)
 			return (EINVAL);
 
 		if (sifp != NULL && (
 		    pfsyncr.pfsyncr_syncpeer.s_addr == 0 ||
 		    pfsyncr.pfsyncr_syncpeer.s_addr ==
 		    htonl(INADDR_PFSYNC_GROUP)))
 			mship = malloc((sizeof(struct in_multi *) *
 			    IP_MIN_MEMBERSHIPS), M_PFSYNC, M_WAITOK | M_ZERO);
 
 		PFSYNC_LOCK(sc);
 		if (pfsyncr.pfsyncr_syncpeer.s_addr == 0)
 			sc->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP);
 		else
 			sc->sc_sync_peer.s_addr =
 			    pfsyncr.pfsyncr_syncpeer.s_addr;
 
 		sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
 		if (pfsyncr.pfsyncr_defer) {
 			sc->sc_flags |= PFSYNCF_DEFER;
 			pfsync_defer_ptr = pfsync_defer;
 		} else {
 			sc->sc_flags &= ~PFSYNCF_DEFER;
 			pfsync_defer_ptr = NULL;
 		}
 
 		if (sifp == NULL) {
 			if (sc->sc_sync_if)
 				if_rele(sc->sc_sync_if);
 			sc->sc_sync_if = NULL;
 			if (imo->imo_membership)
 				pfsync_multicast_cleanup(sc);
 			PFSYNC_UNLOCK(sc);
 			break;
 		}
 
 		if (sc->sc_len > PFSYNC_MINPKT &&
 		    (sifp->if_mtu < sc->sc_ifp->if_mtu ||
 		    (sc->sc_sync_if != NULL &&
 		    sifp->if_mtu < sc->sc_sync_if->if_mtu) ||
 		    sifp->if_mtu < MCLBYTES - sizeof(struct ip)))
 			pfsync_sendout(1);
 
 		if (imo->imo_membership)
 			pfsync_multicast_cleanup(sc);
 
 		if (sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) {
 			error = pfsync_multicast_setup(sc, sifp, mship);
 			if (error) {
 				if_rele(sifp);
 				free(mship, M_PFSYNC);
 				return (error);
 			}
 		}
 		if (sc->sc_sync_if)
 			if_rele(sc->sc_sync_if);
 		sc->sc_sync_if = sifp;
 
 		ip = &sc->sc_template;
 		bzero(ip, sizeof(*ip));
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = sizeof(sc->sc_template) >> 2;
 		ip->ip_tos = IPTOS_LOWDELAY;
 		/* len and id are set later. */
 		ip->ip_off = htons(IP_DF);
 		ip->ip_ttl = PFSYNC_DFLTTL;
 		ip->ip_p = IPPROTO_PFSYNC;
 		ip->ip_src.s_addr = INADDR_ANY;
 		ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr;
 
 		/* Request a full state table update. */
 		if ((sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
 			(*carp_demote_adj_p)(V_pfsync_carp_adj,
 			    "pfsync bulk start");
 		sc->sc_flags &= ~PFSYNCF_OK;
 		if (V_pf_status.debug >= PF_DEBUG_MISC)
 			printf("pfsync: requesting bulk update\n");
 		pfsync_request_update(0, 0);
 		PFSYNC_UNLOCK(sc);
 		PFSYNC_BLOCK(sc);
 		sc->sc_ureq_sent = time_uptime;
 		callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulk_fail,
 		    sc);
 		PFSYNC_BUNLOCK(sc);
 
 		break;
 	    }
 	default:
 		return (ENOTTY);
 	}
 
 	return (0);
 }
 
 static void
 pfsync_out_state(struct pf_state *st, void *buf)
 {
 	struct pfsync_state *sp = buf;
 
 	pfsync_state_export(sp, st);
 }
 
 static void
 pfsync_out_iack(struct pf_state *st, void *buf)
 {
 	struct pfsync_ins_ack *iack = buf;
 
 	iack->id = st->id;
 	iack->creatorid = st->creatorid;
 }
 
 static void
 pfsync_out_upd_c(struct pf_state *st, void *buf)
 {
 	struct pfsync_upd_c *up = buf;
 
 	bzero(up, sizeof(*up));
 	up->id = st->id;
 	pf_state_peer_hton(&st->src, &up->src);
 	pf_state_peer_hton(&st->dst, &up->dst);
 	up->creatorid = st->creatorid;
 	up->timeout = st->timeout;
 }
 
 static void
 pfsync_out_del(struct pf_state *st, void *buf)
 {
 	struct pfsync_del_c *dp = buf;
 
 	dp->id = st->id;
 	dp->creatorid = st->creatorid;
 	st->state_flags |= PFSTATE_NOSYNC;
 }
 
 static void
 pfsync_drop(struct pfsync_softc *sc)
 {
 	struct pf_state *st, *next;
 	struct pfsync_upd_req_item *ur;
 	int q;
 
 	for (q = 0; q < PFSYNC_S_COUNT; q++) {
 		if (TAILQ_EMPTY(&sc->sc_qs[q]))
 			continue;
 
 		TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, next) {
 			KASSERT(st->sync_state == q,
 				("%s: st->sync_state == q",
 					__func__));
 			st->sync_state = PFSYNC_S_NONE;
 			pf_release_state(st);
 		}
 		TAILQ_INIT(&sc->sc_qs[q]);
 	}
 
 	while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
 		TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
 		free(ur, M_PFSYNC);
 	}
 
 	sc->sc_plus = NULL;
 	sc->sc_len = PFSYNC_MINPKT;
 }
 
 static void
 pfsync_sendout(int schedswi)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct ifnet *ifp = sc->sc_ifp;
 	struct mbuf *m;
 	struct ip *ip;
 	struct pfsync_header *ph;
 	struct pfsync_subheader *subh;
 	struct pf_state *st;
 	struct pfsync_upd_req_item *ur;
 	int offset;
 	int q, count = 0;
 
 	KASSERT(sc != NULL, ("%s: null sc", __func__));
 	KASSERT(sc->sc_len > PFSYNC_MINPKT,
 	    ("%s: sc_len %zu", __func__, sc->sc_len));
 	PFSYNC_LOCK_ASSERT(sc);
 
 	if (ifp->if_bpf == NULL && sc->sc_sync_if == NULL) {
 		pfsync_drop(sc);
 		return;
 	}
 
 	m = m_get2(max_linkhdr + sc->sc_len, M_NOWAIT, MT_DATA, M_PKTHDR);
 	if (m == NULL) {
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
 		V_pfsyncstats.pfsyncs_onomem++;
 		return;
 	}
 	m->m_data += max_linkhdr;
 	m->m_len = m->m_pkthdr.len = sc->sc_len;
 
 	/* build the ip header */
 	ip = (struct ip *)m->m_data;
 	bcopy(&sc->sc_template, ip, sizeof(*ip));
 	offset = sizeof(*ip);
 
 	ip->ip_len = htons(m->m_pkthdr.len);
 	ip->ip_id = htons(ip_randomid());
 
 	/* build the pfsync header */
 	ph = (struct pfsync_header *)(m->m_data + offset);
 	bzero(ph, sizeof(*ph));
 	offset += sizeof(*ph);
 
 	ph->version = PFSYNC_VERSION;
 	ph->len = htons(sc->sc_len - sizeof(*ip));
 	bcopy(V_pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);
 
 	/* walk the queues */
 	for (q = 0; q < PFSYNC_S_COUNT; q++) {
 		if (TAILQ_EMPTY(&sc->sc_qs[q]))
 			continue;
 
 		subh = (struct pfsync_subheader *)(m->m_data + offset);
 		offset += sizeof(*subh);
 
 		count = 0;
 		TAILQ_FOREACH(st, &sc->sc_qs[q], sync_list) {
 			KASSERT(st->sync_state == q,
 				("%s: st->sync_state == q",
 					__func__));
 			/*
 			 * XXXGL: some of write methods do unlocked reads
 			 * of state data :(
 			 */
 			pfsync_qs[q].write(st, m->m_data + offset);
 			offset += pfsync_qs[q].len;
 			st->sync_state = PFSYNC_S_NONE;
 			pf_release_state(st);
 			count++;
 		}
 		TAILQ_INIT(&sc->sc_qs[q]);
 
 		bzero(subh, sizeof(*subh));
 		subh->action = pfsync_qs[q].action;
 		subh->count = htons(count);
 		V_pfsyncstats.pfsyncs_oacts[pfsync_qs[q].action] += count;
 	}
 
 	if (!TAILQ_EMPTY(&sc->sc_upd_req_list)) {
 		subh = (struct pfsync_subheader *)(m->m_data + offset);
 		offset += sizeof(*subh);
 
 		count = 0;
 		while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
 			TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
 
 			bcopy(&ur->ur_msg, m->m_data + offset,
 			    sizeof(ur->ur_msg));
 			offset += sizeof(ur->ur_msg);
 			free(ur, M_PFSYNC);
 			count++;
 		}
 
 		bzero(subh, sizeof(*subh));
 		subh->action = PFSYNC_ACT_UPD_REQ;
 		subh->count = htons(count);
 		V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_UPD_REQ] += count;
 	}
 
 	/* has someone built a custom region for us to add? */
 	if (sc->sc_plus != NULL) {
 		bcopy(sc->sc_plus, m->m_data + offset, sc->sc_pluslen);
 		offset += sc->sc_pluslen;
 
 		sc->sc_plus = NULL;
 	}
 
 	subh = (struct pfsync_subheader *)(m->m_data + offset);
 	offset += sizeof(*subh);
 
 	bzero(subh, sizeof(*subh));
 	subh->action = PFSYNC_ACT_EOF;
 	subh->count = htons(1);
 	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_EOF]++;
 
 	/* we're done, let's put it on the wire */
 	if (ifp->if_bpf) {
 		m->m_data += sizeof(*ip);
 		m->m_len = m->m_pkthdr.len = sc->sc_len - sizeof(*ip);
 		BPF_MTAP(ifp, m);
 		m->m_data -= sizeof(*ip);
 		m->m_len = m->m_pkthdr.len = sc->sc_len;
 	}
 
 	if (sc->sc_sync_if == NULL) {
 		sc->sc_len = PFSYNC_MINPKT;
 		m_freem(m);
 		return;
 	}
 
 	if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1);
 	if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len);
 	sc->sc_len = PFSYNC_MINPKT;
 
 	if (!_IF_QFULL(&sc->sc_ifp->if_snd))
 		_IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
 	else {
 		m_freem(m);
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OQDROPS, 1);
 	}
 	if (schedswi)
 		swi_sched(V_pfsync_swi_cookie, 0);
 }
 
 static void
 pfsync_insert_state(struct pf_state *st)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 
 	if (st->state_flags & PFSTATE_NOSYNC)
 		return;
 
 	if ((st->rule.ptr->rule_flag & PFRULE_NOSYNC) ||
 	    st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) {
 		st->state_flags |= PFSTATE_NOSYNC;
 		return;
 	}
 
 	KASSERT(st->sync_state == PFSYNC_S_NONE,
 		("%s: st->sync_state %u", __func__, st->sync_state));
 
 	PFSYNC_LOCK(sc);
 	if (sc->sc_len == PFSYNC_MINPKT)
 		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
 
 	pfsync_q_ins(st, PFSYNC_S_INS);
 	PFSYNC_UNLOCK(sc);
 
 	st->sync_updates = 0;
 }
 
 static int
 pfsync_defer(struct pf_state *st, struct mbuf *m)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_deferral *pd;
 
 	if (m->m_flags & (M_BCAST|M_MCAST))
 		return (0);
 
 	PFSYNC_LOCK(sc);
 
 	if (sc == NULL || !(sc->sc_ifp->if_flags & IFF_DRV_RUNNING) ||
 	    !(sc->sc_flags & PFSYNCF_DEFER)) {
 		PFSYNC_UNLOCK(sc);
 		return (0);
 	}
 
 	 if (sc->sc_deferred >= 128)
 		pfsync_undefer(TAILQ_FIRST(&sc->sc_deferrals), 0);
 
 	pd = malloc(sizeof(*pd), M_PFSYNC, M_NOWAIT);
 	if (pd == NULL)
 		return (0);
 	sc->sc_deferred++;
 
 	m->m_flags |= M_SKIP_FIREWALL;
 	st->state_flags |= PFSTATE_ACK;
 
 	pd->pd_sc = sc;
 	pd->pd_refs = 0;
 	pd->pd_st = st;
 	pf_ref_state(st);
 	pd->pd_m = m;
 
 	TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry);
 	callout_init_mtx(&pd->pd_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
 	callout_reset(&pd->pd_tmo, 10, pfsync_defer_tmo, pd);
 
 	pfsync_push(sc);
 
 	return (1);
 }
 
 static void
 pfsync_undefer(struct pfsync_deferral *pd, int drop)
 {
 	struct pfsync_softc *sc = pd->pd_sc;
 	struct mbuf *m = pd->pd_m;
 	struct pf_state *st = pd->pd_st;
 
 	PFSYNC_LOCK_ASSERT(sc);
 
 	TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
 	sc->sc_deferred--;
 	pd->pd_st->state_flags &= ~PFSTATE_ACK;	/* XXX: locking! */
 	free(pd, M_PFSYNC);
 	pf_release_state(st);
 
 	if (drop)
 		m_freem(m);
 	else {
 		_IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
 		pfsync_push(sc);
 	}
 }
 
 static void
 pfsync_defer_tmo(void *arg)
 {
 	struct pfsync_deferral *pd = arg;
 	struct pfsync_softc *sc = pd->pd_sc;
 	struct mbuf *m = pd->pd_m;
 	struct pf_state *st = pd->pd_st;
 
 	PFSYNC_LOCK_ASSERT(sc);
 
 	CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);
 
 	TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
 	sc->sc_deferred--;
 	pd->pd_st->state_flags &= ~PFSTATE_ACK;	/* XXX: locking! */
 	if (pd->pd_refs == 0)
 		free(pd, M_PFSYNC);
 	PFSYNC_UNLOCK(sc);
 
 	ip_output(m, NULL, NULL, 0, NULL, NULL);
 
 	pf_release_state(st);
 
 	CURVNET_RESTORE();
 }
 
 static void
 pfsync_undefer_state(struct pf_state *st, int drop)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_deferral *pd;
 
 	PFSYNC_LOCK_ASSERT(sc);
 
 	TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) {
 		 if (pd->pd_st == st) {
 			if (callout_stop(&pd->pd_tmo))
 				pfsync_undefer(pd, drop);
 			return;
 		}
 	}
 
 	panic("%s: unable to find deferred state", __func__);
 }
 
 static void
 pfsync_update_state(struct pf_state *st)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	int sync = 0;
 
 	PF_STATE_LOCK_ASSERT(st);
 	PFSYNC_LOCK(sc);
 
 	if (st->state_flags & PFSTATE_ACK)
 		pfsync_undefer_state(st, 0);
 	if (st->state_flags & PFSTATE_NOSYNC) {
 		if (st->sync_state != PFSYNC_S_NONE)
 			pfsync_q_del(st);
 		PFSYNC_UNLOCK(sc);
 		return;
 	}
 
 	if (sc->sc_len == PFSYNC_MINPKT)
 		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
 
 	switch (st->sync_state) {
 	case PFSYNC_S_UPD_C:
 	case PFSYNC_S_UPD:
 	case PFSYNC_S_INS:
 		/* we're already handling it */
 
 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
 			st->sync_updates++;
 			if (st->sync_updates >= sc->sc_maxupdates)
 				sync = 1;
 		}
 		break;
 
 	case PFSYNC_S_IACK:
 		pfsync_q_del(st);
 	case PFSYNC_S_NONE:
 		pfsync_q_ins(st, PFSYNC_S_UPD_C);
 		st->sync_updates = 0;
 		break;
 
 	default:
 		panic("%s: unexpected sync state %d", __func__, st->sync_state);
 	}
 
 	if (sync || (time_uptime - st->pfsync_time) < 2)
 		pfsync_push(sc);
 
 	PFSYNC_UNLOCK(sc);
 }
 
 static void
 pfsync_request_update(u_int32_t creatorid, u_int64_t id)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_upd_req_item *item;
 	size_t nlen = sizeof(struct pfsync_upd_req);
 
 	PFSYNC_LOCK_ASSERT(sc);
 
 	/*
 	 * This code does a bit to prevent multiple update requests for the
 	 * same state being generated. It searches current subheader queue,
 	 * but it doesn't lookup into queue of already packed datagrams.
 	 */
 	TAILQ_FOREACH(item, &sc->sc_upd_req_list, ur_entry)
 		if (item->ur_msg.id == id &&
 		    item->ur_msg.creatorid == creatorid)
 			return;
 
 	item = malloc(sizeof(*item), M_PFSYNC, M_NOWAIT);
 	if (item == NULL)
 		return; /* XXX stats */
 
 	item->ur_msg.id = id;
 	item->ur_msg.creatorid = creatorid;
 
 	if (TAILQ_EMPTY(&sc->sc_upd_req_list))
 		nlen += sizeof(struct pfsync_subheader);
 
 	if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
 		pfsync_sendout(1);
 
 		nlen = sizeof(struct pfsync_subheader) +
 		    sizeof(struct pfsync_upd_req);
 	}
 
 	TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
 	sc->sc_len += nlen;
 }
 
 static void
 pfsync_update_state_req(struct pf_state *st)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 
 	PF_STATE_LOCK_ASSERT(st);
 	PFSYNC_LOCK(sc);
 
 	if (st->state_flags & PFSTATE_NOSYNC) {
 		if (st->sync_state != PFSYNC_S_NONE)
 			pfsync_q_del(st);
 		PFSYNC_UNLOCK(sc);
 		return;
 	}
 
 	switch (st->sync_state) {
 	case PFSYNC_S_UPD_C:
 	case PFSYNC_S_IACK:
 		pfsync_q_del(st);
 	case PFSYNC_S_NONE:
 		pfsync_q_ins(st, PFSYNC_S_UPD);
 		pfsync_push(sc);
 		break;
 
 	case PFSYNC_S_INS:
 	case PFSYNC_S_UPD:
 	case PFSYNC_S_DEL:
 		/* we're already handling it */
 		break;
 
 	default:
 		panic("%s: unexpected sync state %d", __func__, st->sync_state);
 	}
 
 	PFSYNC_UNLOCK(sc);
 }
 
 static void
 pfsync_delete_state(struct pf_state *st)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 
 	PFSYNC_LOCK(sc);
 	if (st->state_flags & PFSTATE_ACK)
 		pfsync_undefer_state(st, 1);
 	if (st->state_flags & PFSTATE_NOSYNC) {
 		if (st->sync_state != PFSYNC_S_NONE)
 			pfsync_q_del(st);
 		PFSYNC_UNLOCK(sc);
 		return;
 	}
 
 	if (sc->sc_len == PFSYNC_MINPKT)
 		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
 
 	switch (st->sync_state) {
 	case PFSYNC_S_INS:
 		/* We never got to tell the world so just forget about it. */
 		pfsync_q_del(st);
 		break;
 
 	case PFSYNC_S_UPD_C:
 	case PFSYNC_S_UPD:
 	case PFSYNC_S_IACK:
 		pfsync_q_del(st);
 		/* FALLTHROUGH to putting it on the del list */
 
 	case PFSYNC_S_NONE:
 		pfsync_q_ins(st, PFSYNC_S_DEL);
 		break;
 
 	default:
 		panic("%s: unexpected sync state %d", __func__, st->sync_state);
 	}
 	PFSYNC_UNLOCK(sc);
 }
 
 static void
 pfsync_clear_states(u_int32_t creatorid, const char *ifname)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct {
 		struct pfsync_subheader subh;
 		struct pfsync_clr clr;
 	} __packed r;
 
 	bzero(&r, sizeof(r));
 
 	r.subh.action = PFSYNC_ACT_CLR;
 	r.subh.count = htons(1);
 	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_CLR]++;
 
 	strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname));
 	r.clr.creatorid = creatorid;
 
 	PFSYNC_LOCK(sc);
 	pfsync_send_plus(&r, sizeof(r));
 	PFSYNC_UNLOCK(sc);
 }
 
 static void
 pfsync_q_ins(struct pf_state *st, int q)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	size_t nlen = pfsync_qs[q].len;
 
 	PFSYNC_LOCK_ASSERT(sc);
 
 	KASSERT(st->sync_state == PFSYNC_S_NONE,
 		("%s: st->sync_state %u", __func__, st->sync_state));
 	KASSERT(sc->sc_len >= PFSYNC_MINPKT, ("pfsync pkt len is too low %zu",
 	    sc->sc_len));
 
 	if (TAILQ_EMPTY(&sc->sc_qs[q]))
 		nlen += sizeof(struct pfsync_subheader);
 
 	if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
 		pfsync_sendout(1);
 
 		nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
 	}
 
 	sc->sc_len += nlen;
 	TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
 	st->sync_state = q;
 	pf_ref_state(st);
 }
 
 static void
 pfsync_q_del(struct pf_state *st)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	int q = st->sync_state;
 
 	PFSYNC_LOCK_ASSERT(sc);
 	KASSERT(st->sync_state != PFSYNC_S_NONE,
 		("%s: st->sync_state != PFSYNC_S_NONE", __func__));
 
 	sc->sc_len -= pfsync_qs[q].len;
 	TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
 	st->sync_state = PFSYNC_S_NONE;
 	pf_release_state(st);
 
 	if (TAILQ_EMPTY(&sc->sc_qs[q]))
 		sc->sc_len -= sizeof(struct pfsync_subheader);
 }
 
 static void
 pfsync_bulk_start(void)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 
 	if (V_pf_status.debug >= PF_DEBUG_MISC)
 		printf("pfsync: received bulk update request\n");
 
 	PFSYNC_BLOCK(sc);
 
 	sc->sc_ureq_received = time_uptime;
 	sc->sc_bulk_hashid = 0;
 	sc->sc_bulk_stateid = 0;
 	pfsync_bulk_status(PFSYNC_BUS_START);
 	callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, sc);
 	PFSYNC_BUNLOCK(sc);
 }
 
 static void
 pfsync_bulk_update(void *arg)
 {
 	struct pfsync_softc *sc = arg;
 	struct pf_state *s;
 	int i, sent = 0;
 
 	PFSYNC_BLOCK_ASSERT(sc);
 	CURVNET_SET(sc->sc_ifp->if_vnet);
 
 	/*
 	 * Start with last state from previous invocation.
 	 * It may had gone, in this case start from the
 	 * hash slot.
 	 */
 	s = pf_find_state_byid(sc->sc_bulk_stateid, sc->sc_bulk_creatorid);
 
 	if (s != NULL)
 		i = PF_IDHASH(s);
 	else
 		i = sc->sc_bulk_hashid;
 
 	for (; i <= pf_hashmask; i++) {
 		struct pf_idhash *ih = &V_pf_idhash[i];
 
 		if (s != NULL)
 			PF_HASHROW_ASSERT(ih);
 		else {
 			PF_HASHROW_LOCK(ih);
 			s = LIST_FIRST(&ih->states);
 		}
 
 		for (; s; s = LIST_NEXT(s, entry)) {
 
 			if (sent > 1 && (sc->sc_ifp->if_mtu - sc->sc_len) <
 			    sizeof(struct pfsync_state)) {
 				/* We've filled a packet. */
 				sc->sc_bulk_hashid = i;
 				sc->sc_bulk_stateid = s->id;
 				sc->sc_bulk_creatorid = s->creatorid;
 				PF_HASHROW_UNLOCK(ih);
 				callout_reset(&sc->sc_bulk_tmo, 1,
 				    pfsync_bulk_update, sc);
 				goto full;
 			}
 
 			if (s->sync_state == PFSYNC_S_NONE &&
 			    s->timeout < PFTM_MAX &&
 			    s->pfsync_time <= sc->sc_ureq_received) {
 				pfsync_update_state_req(s);
 				sent++;
 			}
 		}
 		PF_HASHROW_UNLOCK(ih);
 	}
 
 	/* We're done. */
 	pfsync_bulk_status(PFSYNC_BUS_END);
 
 full:
 	CURVNET_RESTORE();
 }
 
 static void
 pfsync_bulk_status(u_int8_t status)
 {
 	struct {
 		struct pfsync_subheader subh;
 		struct pfsync_bus bus;
 	} __packed r;
 
 	struct pfsync_softc *sc = V_pfsyncif;
 
 	bzero(&r, sizeof(r));
 
 	r.subh.action = PFSYNC_ACT_BUS;
 	r.subh.count = htons(1);
 	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_BUS]++;
 
 	r.bus.creatorid = V_pf_status.hostid;
 	r.bus.endtime = htonl(time_uptime - sc->sc_ureq_received);
 	r.bus.status = status;
 
 	PFSYNC_LOCK(sc);
 	pfsync_send_plus(&r, sizeof(r));
 	PFSYNC_UNLOCK(sc);
 }
 
 static void
 pfsync_bulk_fail(void *arg)
 {
 	struct pfsync_softc *sc = arg;
 
 	CURVNET_SET(sc->sc_ifp->if_vnet);
 
 	PFSYNC_BLOCK_ASSERT(sc);
 
 	if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) {
 		/* Try again */
 		callout_reset(&sc->sc_bulkfail_tmo, 5 * hz,
 		    pfsync_bulk_fail, V_pfsyncif);
 		PFSYNC_LOCK(sc);
 		pfsync_request_update(0, 0);
 		PFSYNC_UNLOCK(sc);
 	} else {
 		/* Pretend like the transfer was ok. */
 		sc->sc_ureq_sent = 0;
 		sc->sc_bulk_tries = 0;
 		PFSYNC_LOCK(sc);
 		if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
 			(*carp_demote_adj_p)(-V_pfsync_carp_adj,
 			    "pfsync bulk fail");
 		sc->sc_flags |= PFSYNCF_OK;
 		PFSYNC_UNLOCK(sc);
 		if (V_pf_status.debug >= PF_DEBUG_MISC)
 			printf("pfsync: failed to receive bulk update\n");
 	}
 
 	CURVNET_RESTORE();
 }
 
 static void
 pfsync_send_plus(void *plus, size_t pluslen)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 
 	PFSYNC_LOCK_ASSERT(sc);
 
 	if (sc->sc_len + pluslen > sc->sc_ifp->if_mtu)
 		pfsync_sendout(1);
 
 	sc->sc_plus = plus;
 	sc->sc_len += (sc->sc_pluslen = pluslen);
 
 	pfsync_sendout(1);
 }
 
 static void
 pfsync_timeout(void *arg)
 {
 	struct pfsync_softc *sc = arg;
 
 	CURVNET_SET(sc->sc_ifp->if_vnet);
 	PFSYNC_LOCK(sc);
 	pfsync_push(sc);
 	PFSYNC_UNLOCK(sc);
 	CURVNET_RESTORE();
 }
 
 static void
 pfsync_push(struct pfsync_softc *sc)
 {
 
 	PFSYNC_LOCK_ASSERT(sc);
 
 	sc->sc_flags |= PFSYNCF_PUSH;
 	swi_sched(V_pfsync_swi_cookie, 0);
 }
 
 static void
 pfsyncintr(void *arg)
 {
 	struct pfsync_softc *sc = arg;
 	struct mbuf *m, *n;
 
 	CURVNET_SET(sc->sc_ifp->if_vnet);
 
 	PFSYNC_LOCK(sc);
 	if ((sc->sc_flags & PFSYNCF_PUSH) && sc->sc_len > PFSYNC_MINPKT) {
 		pfsync_sendout(0);
 		sc->sc_flags &= ~PFSYNCF_PUSH;
 	}
 	_IF_DEQUEUE_ALL(&sc->sc_ifp->if_snd, m);
 	PFSYNC_UNLOCK(sc);
 
 	for (; m != NULL; m = n) {
 
 		n = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 
 		/*
 		 * We distinguish between a deferral packet and our
 		 * own pfsync packet based on M_SKIP_FIREWALL
 		 * flag. This is XXX.
 		 */
 		if (m->m_flags & M_SKIP_FIREWALL)
 			ip_output(m, NULL, NULL, 0, NULL, NULL);
 		else if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo,
 		    NULL) == 0)
 			V_pfsyncstats.pfsyncs_opackets++;
 		else
 			V_pfsyncstats.pfsyncs_oerrors++;
 	}
 	CURVNET_RESTORE();
 }
 
 static int
 pfsync_multicast_setup(struct pfsync_softc *sc, struct ifnet *ifp, void *mship)
 {
 	struct ip_moptions *imo = &sc->sc_imo;
 	int error;
 
 	if (!(ifp->if_flags & IFF_MULTICAST))
 		return (EADDRNOTAVAIL);
 
 	imo->imo_membership = (struct in_multi **)mship;
 	imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
 	imo->imo_multicast_vif = -1;
 
 	if ((error = in_joingroup(ifp, &sc->sc_sync_peer, NULL,
 	    &imo->imo_membership[0])) != 0) {
 		imo->imo_membership = NULL;
 		return (error);
 	}
 	imo->imo_num_memberships++;
 	imo->imo_multicast_ifp = ifp;
 	imo->imo_multicast_ttl = PFSYNC_DFLTTL;
 	imo->imo_multicast_loop = 0;
 
 	return (0);
 }
 
 static void
 pfsync_multicast_cleanup(struct pfsync_softc *sc)
 {
 	struct ip_moptions *imo = &sc->sc_imo;
 
 	in_leavegroup(imo->imo_membership[0], NULL);
 	free(imo->imo_membership, M_PFSYNC);
 	imo->imo_membership = NULL;
 	imo->imo_multicast_ifp = NULL;
 }
 
 #ifdef INET
 extern  struct domain inetdomain;
 static struct protosw in_pfsync_protosw = {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inetdomain,
 	.pr_protocol =		IPPROTO_PFSYNC,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_input =		pfsync_input,
 	.pr_output =		rip_output,
 	.pr_ctloutput =		rip_ctloutput,
 	.pr_usrreqs =		&rip_usrreqs
 };
 #endif
 
 static void
 pfsync_pointers_init()
 {
 
 	PF_RULES_WLOCK();
 	pfsync_state_import_ptr = pfsync_state_import;
 	pfsync_insert_state_ptr = pfsync_insert_state;
 	pfsync_update_state_ptr = pfsync_update_state;
 	pfsync_delete_state_ptr = pfsync_delete_state;
 	pfsync_clear_states_ptr = pfsync_clear_states;
 	pfsync_defer_ptr = pfsync_defer;
 	PF_RULES_WUNLOCK();
 }
 
 static void
 pfsync_pointers_uninit()
 {
 
 	PF_RULES_WLOCK();
 	pfsync_state_import_ptr = NULL;
 	pfsync_insert_state_ptr = NULL;
 	pfsync_update_state_ptr = NULL;
 	pfsync_delete_state_ptr = NULL;
 	pfsync_clear_states_ptr = NULL;
 	pfsync_defer_ptr = NULL;
 	PF_RULES_WUNLOCK();
 }
 
 static int
 pfsync_init()
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 	int error = 0;
 
 	VNET_LIST_RLOCK();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		V_pfsync_cloner = if_clone_simple(pfsyncname,
 		    pfsync_clone_create, pfsync_clone_destroy, 1);
 		error = swi_add(NULL, pfsyncname, pfsyncintr, V_pfsyncif,
 		    SWI_NET, INTR_MPSAFE, &V_pfsync_swi_cookie);
 		CURVNET_RESTORE();
 		if (error)
 			goto fail_locked;
 	}
 	VNET_LIST_RUNLOCK();
 #ifdef INET
 	error = pf_proto_register(PF_INET, &in_pfsync_protosw);
 	if (error)
 		goto fail;
 	error = ipproto_register(IPPROTO_PFSYNC);
 	if (error) {
 		pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
 		goto fail;
 	}
 #endif
 	pfsync_pointers_init();
 
 	return (0);
 
 fail:
 	VNET_LIST_RLOCK();
 fail_locked:
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		if (V_pfsync_swi_cookie) {
 			swi_remove(V_pfsync_swi_cookie);
 			if_clone_detach(V_pfsync_cloner);
 		}
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK();
 
 	return (error);
 }
 
 static void
 pfsync_uninit()
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	pfsync_pointers_uninit();
 
 	ipproto_unregister(IPPROTO_PFSYNC);
 	pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
 	VNET_LIST_RLOCK();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		if_clone_detach(V_pfsync_cloner);
 		swi_remove(V_pfsync_swi_cookie);
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK();
 }
 
 static int
 pfsync_modevent(module_t mod, int type, void *data)
 {
 	int error = 0;
 
 	switch (type) {
 	case MOD_LOAD:
 		error = pfsync_init();
 		break;
 	case MOD_QUIESCE:
 		/*
 		 * Module should not be unloaded due to race conditions.
 		 */
 		error = EBUSY;
 		break;
 	case MOD_UNLOAD:
 		pfsync_uninit();
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	return (error);
 }
 
 static moduledata_t pfsync_mod = {
 	pfsyncname,
 	pfsync_modevent,
 	0
 };
 
 #define PFSYNC_MODVER 1
 
 DECLARE_MODULE(pfsync, pfsync_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
 MODULE_VERSION(pfsync, PFSYNC_MODVER);
 MODULE_DEPEND(pfsync, pf, PF_MODVER, PF_MODVER, PF_MODVER);
Index: user/dchagin/lemul/sys
===================================================================
--- user/dchagin/lemul/sys	(revision 274240)
+++ user/dchagin/lemul/sys	(revision 274241)

Property changes on: user/dchagin/lemul/sys
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/sys:r274221-274240
Index: user/dchagin/lemul
===================================================================
--- user/dchagin/lemul	(revision 274240)
+++ user/dchagin/lemul	(revision 274241)

Property changes on: user/dchagin/lemul
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r274221-274240