Index: projects/ifnet/sys/dev/virtio/network/if_vtnet.c =================================================================== --- projects/ifnet/sys/dev/virtio/network/if_vtnet.c (revision 277599) +++ projects/ifnet/sys/dev/virtio/network/if_vtnet.c (revision 277600) @@ -1,3801 +1,3801 @@ /*- * Copyright (c) 2011, Bryan Venteicher <bryanv@FreeBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Driver for VirtIO network devices. */ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/sockio.h> #include <sys/mbuf.h> #include <sys/malloc.h> #include <sys/module.h> #include <sys/socket.h> #include <sys/sysctl.h> #include <sys/random.h> #include <sys/sglist.h> #include <sys/lock.h> #include <sys/mutex.h> #include <sys/taskqueue.h> #include <sys/smp.h> #include <machine/smp.h> #include <vm/uma.h> #include <net/ethernet.h> #include <net/if.h> #include <net/if_arp.h> #include <net/if_dl.h> #include <net/if_types.h> #include <net/if_media.h> #include <net/if_vlan_var.h> #include <net/bpf.h> #include <netinet/in_systm.h> #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #include <netinet/udp.h> #include <netinet/tcp.h> #include <netinet/sctp.h> #include <machine/bus.h> #include <machine/resource.h> #include <sys/bus.h> #include <sys/rman.h> #include <dev/virtio/virtio.h> #include <dev/virtio/virtqueue.h> #include <dev/virtio/network/virtio_net.h> #include <dev/virtio/network/if_vtnetvar.h> #ifdef DEV_NETMAP #include <dev/netmap/netmap_kern.h> #endif /* DEV_NETMAP */ #include "virtio_if.h" #include "opt_inet.h" #include "opt_inet6.h" static int vtnet_modevent(module_t, int, void *); static int vtnet_probe(device_t); static int vtnet_attach(device_t); static int vtnet_detach(device_t); static int vtnet_suspend(device_t); static int vtnet_resume(device_t); static int vtnet_shutdown(device_t); static int vtnet_attach_completed(device_t); static int vtnet_config_change(device_t); static void vtnet_negotiate_features(struct vtnet_softc *); static void vtnet_setup_features(struct vtnet_softc *); static int vtnet_init_rxq(struct vtnet_softc *, int); static int vtnet_init_txq(struct vtnet_softc *, int); static int vtnet_alloc_rxtx_queues(struct vtnet_softc *); static void vtnet_free_rxtx_queues(struct vtnet_softc *); static int vtnet_alloc_rx_filters(struct vtnet_softc *); static void vtnet_free_rx_filters(struct vtnet_softc *); static int vtnet_alloc_virtqueues(struct vtnet_softc *); static void vtnet_setup_interface(struct vtnet_softc *); static int vtnet_change_mtu(struct vtnet_softc *, int); static int vtnet_ioctl(if_t, u_long, void *, struct thread *); static uint64_t vtnet_get_counter(if_t, ift_counter); static int vtnet_rxq_populate(struct vtnet_rxq *); static void vtnet_rxq_free_mbufs(struct vtnet_rxq *); static struct mbuf * vtnet_rx_alloc_buf(struct vtnet_softc *, int, struct mbuf **); static int vtnet_rxq_replace_lro_nomgr_buf(struct vtnet_rxq *, struct mbuf *, int); static int
vtnet_rxq_replace_buf(struct vtnet_rxq *, struct mbuf *, int); static int vtnet_rxq_enqueue_buf(struct vtnet_rxq *, struct mbuf *); static int vtnet_rxq_new_buf(struct vtnet_rxq *); static int vtnet_rxq_csum(struct vtnet_rxq *, struct mbuf *, struct virtio_net_hdr *); static void vtnet_rxq_discard_merged_bufs(struct vtnet_rxq *, int); static void vtnet_rxq_discard_buf(struct vtnet_rxq *, struct mbuf *); static int vtnet_rxq_merged_eof(struct vtnet_rxq *, struct mbuf *, int); static void vtnet_rxq_input(struct vtnet_rxq *, struct mbuf *, struct virtio_net_hdr *); static int vtnet_rxq_eof(struct vtnet_rxq *); static void vtnet_rx_vq_intr(void *); static void vtnet_rxq_tq_intr(void *, int); static int vtnet_txq_below_threshold(struct vtnet_txq *); static int vtnet_txq_notify(struct vtnet_txq *); static void vtnet_txq_free_mbufs(struct vtnet_txq *); static int vtnet_txq_offload_ctx(struct vtnet_txq *, struct mbuf *, int *, int *, int *); static int vtnet_txq_offload_tso(struct vtnet_txq *, struct mbuf *, int, int, struct virtio_net_hdr *); static struct mbuf * vtnet_txq_offload(struct vtnet_txq *, struct mbuf *, struct virtio_net_hdr *); static int vtnet_txq_enqueue_buf(struct vtnet_txq *, struct mbuf **, struct vtnet_tx_header *); static int vtnet_txq_encap(struct vtnet_txq *, struct mbuf **); static int vtnet_txq_mq_start_locked(struct vtnet_txq *, struct mbuf *); static int vtnet_txq_mq_start(if_t, struct mbuf *); static void vtnet_txq_tq_deferred(void *, int); static void vtnet_txq_start(struct vtnet_txq *); static void vtnet_txq_tq_intr(void *, int); static int vtnet_txq_eof(struct vtnet_txq *); static void vtnet_tx_vq_intr(void *); static void vtnet_tx_start_all(struct vtnet_softc *); static void vtnet_qflush(if_t); static int vtnet_watchdog(struct vtnet_txq *); static void vtnet_accum_stats(struct vtnet_softc *, struct vtnet_rxq_stats *, struct vtnet_txq_stats *); static void vtnet_tick(void *); static void vtnet_start_taskqueues(struct vtnet_softc *); static void vtnet_free_taskqueues(struct vtnet_softc *); static void vtnet_drain_taskqueues(struct vtnet_softc *); static void vtnet_drain_rxtx_queues(struct vtnet_softc *); static void vtnet_stop_rendezvous(struct vtnet_softc *); static void vtnet_stop(struct vtnet_softc *); static int vtnet_virtio_reinit(struct vtnet_softc *); static void vtnet_init_rx_filters(struct vtnet_softc *); static int vtnet_init_rx_queues(struct vtnet_softc *); static int vtnet_init_tx_queues(struct vtnet_softc *); static int vtnet_init_rxtx_queues(struct vtnet_softc *); static void vtnet_set_active_vq_pairs(struct vtnet_softc *); static int vtnet_reinit(struct vtnet_softc *); static void vtnet_init_locked(struct vtnet_softc *); static void vtnet_init(void *); static void vtnet_free_ctrl_vq(struct vtnet_softc *); static void vtnet_exec_ctrl_cmd(struct vtnet_softc *, void *, struct sglist *, int, int); static int vtnet_ctrl_mac_cmd(struct vtnet_softc *, uint8_t *); static int vtnet_ctrl_mq_cmd(struct vtnet_softc *, uint16_t); static int vtnet_ctrl_rx_cmd(struct vtnet_softc *, int, int); static int vtnet_set_promisc(struct vtnet_softc *, int); static int vtnet_set_allmulti(struct vtnet_softc *, int); static void vtnet_attach_disable_promisc(struct vtnet_softc *); static void vtnet_rx_filter(struct vtnet_softc *); static void vtnet_rx_filter_mac(struct vtnet_softc *); static int vtnet_exec_vlan_filter(struct vtnet_softc *, int, uint16_t); static void vtnet_rx_filter_vlan(struct vtnet_softc *); static void vtnet_update_vlan_filter(struct vtnet_softc *, int, 
uint16_t); static void vtnet_register_vlan(void *, if_t, uint16_t); static void vtnet_unregister_vlan(void *, if_t, uint16_t); static int vtnet_is_link_up(struct vtnet_softc *); static void vtnet_update_link_status(struct vtnet_softc *); static int vtnet_ifmedia_upd(if_t); static void vtnet_ifmedia_sts(if_t, struct ifmediareq *); static void vtnet_get_hwaddr(struct vtnet_softc *); static void vtnet_set_hwaddr(struct vtnet_softc *); static void vtnet_vlan_tag_remove(struct mbuf *); static void vtnet_set_rx_process_limit(struct vtnet_softc *); static void vtnet_set_tx_intr_threshold(struct vtnet_softc *); static void vtnet_setup_rxq_sysctl(struct sysctl_ctx_list *, struct sysctl_oid_list *, struct vtnet_rxq *); static void vtnet_setup_txq_sysctl(struct sysctl_ctx_list *, struct sysctl_oid_list *, struct vtnet_txq *); static void vtnet_setup_queue_sysctl(struct vtnet_softc *); static void vtnet_setup_sysctl(struct vtnet_softc *); static int vtnet_rxq_enable_intr(struct vtnet_rxq *); static void vtnet_rxq_disable_intr(struct vtnet_rxq *); static int vtnet_txq_enable_intr(struct vtnet_txq *); static void vtnet_txq_disable_intr(struct vtnet_txq *); static void vtnet_enable_rx_interrupts(struct vtnet_softc *); static void vtnet_enable_tx_interrupts(struct vtnet_softc *); static void vtnet_enable_interrupts(struct vtnet_softc *); static void vtnet_disable_rx_interrupts(struct vtnet_softc *); static void vtnet_disable_tx_interrupts(struct vtnet_softc *); static void vtnet_disable_interrupts(struct vtnet_softc *); static int vtnet_tunable_int(struct vtnet_softc *, const char *, int); /* Tunables. */ static int vtnet_csum_disable = 0; TUNABLE_INT("hw.vtnet.csum_disable", &vtnet_csum_disable); static int vtnet_tso_disable = 0; TUNABLE_INT("hw.vtnet.tso_disable", &vtnet_tso_disable); static int vtnet_lro_disable = 0; TUNABLE_INT("hw.vtnet.lro_disable", &vtnet_lro_disable); static int vtnet_mq_disable = 0; TUNABLE_INT("hw.vtnet.mq_disable", &vtnet_mq_disable); static int vtnet_mq_max_pairs = 0; TUNABLE_INT("hw.vtnet.mq_max_pairs", &vtnet_mq_max_pairs); static int vtnet_rx_process_limit = 512; TUNABLE_INT("hw.vtnet.rx_process_limit", &vtnet_rx_process_limit); static uma_zone_t vtnet_tx_header_zone; static struct virtio_feature_desc vtnet_feature_desc[] = { { VIRTIO_NET_F_CSUM, "TxChecksum" }, { VIRTIO_NET_F_GUEST_CSUM, "RxChecksum" }, { VIRTIO_NET_F_MAC, "MacAddress" }, { VIRTIO_NET_F_GSO, "TxAllGSO" }, { VIRTIO_NET_F_GUEST_TSO4, "RxTSOv4" }, { VIRTIO_NET_F_GUEST_TSO6, "RxTSOv6" }, { VIRTIO_NET_F_GUEST_ECN, "RxECN" }, { VIRTIO_NET_F_GUEST_UFO, "RxUFO" }, { VIRTIO_NET_F_HOST_TSO4, "TxTSOv4" }, { VIRTIO_NET_F_HOST_TSO6, "TxTSOv6" }, { VIRTIO_NET_F_HOST_ECN, "TxTSOECN" }, { VIRTIO_NET_F_HOST_UFO, "TxUFO" }, { VIRTIO_NET_F_MRG_RXBUF, "MrgRxBuf" }, { VIRTIO_NET_F_STATUS, "Status" }, { VIRTIO_NET_F_CTRL_VQ, "ControlVq" }, { VIRTIO_NET_F_CTRL_RX, "RxMode" }, { VIRTIO_NET_F_CTRL_VLAN, "VLanFilter" }, { VIRTIO_NET_F_CTRL_RX_EXTRA, "RxModeExtra" }, { VIRTIO_NET_F_GUEST_ANNOUNCE, "GuestAnnounce" }, { VIRTIO_NET_F_MQ, "Multiqueue" }, { VIRTIO_NET_F_CTRL_MAC_ADDR, "SetMacAddress" }, { 0, NULL } }; static device_method_t vtnet_methods[] = { /* Device methods. */ DEVMETHOD(device_probe, vtnet_probe), DEVMETHOD(device_attach, vtnet_attach), DEVMETHOD(device_detach, vtnet_detach), DEVMETHOD(device_suspend, vtnet_suspend), DEVMETHOD(device_resume, vtnet_resume), DEVMETHOD(device_shutdown, vtnet_shutdown), /* VirtIO methods. 
*/ DEVMETHOD(virtio_attach_completed, vtnet_attach_completed), DEVMETHOD(virtio_config_change, vtnet_config_change), DEVMETHOD_END }; static driver_t vtnet_driver = { "vtnet", vtnet_methods, sizeof(struct vtnet_softc) }; static devclass_t vtnet_devclass; DRIVER_MODULE(vtnet, virtio_mmio, vtnet_driver, vtnet_devclass, vtnet_modevent, 0); DRIVER_MODULE(vtnet, virtio_pci, vtnet_driver, vtnet_devclass, vtnet_modevent, 0); MODULE_VERSION(vtnet, 1); MODULE_DEPEND(vtnet, virtio, 1, 1, 1); static struct ifdriver vtnet_ifdrv = { .ifdrv_ops = { .ifop_origin = IFOP_ORIGIN_DRIVER, .ifop_ioctl = vtnet_ioctl, .ifop_init = vtnet_init, .ifop_get_counter = vtnet_get_counter, .ifop_transmit = vtnet_txq_mq_start, .ifop_qflush = vtnet_qflush, }, .ifdrv_name = "vtnet", .ifdrv_type = IFT_ETHER, .ifdrv_hdrlen = sizeof(struct ether_vlan_header), }; static int vtnet_modevent(module_t mod, int type, void *unused) { int error; error = 0; switch (type) { case MOD_LOAD: vtnet_tx_header_zone = uma_zcreate("vtnet_tx_hdr", sizeof(struct vtnet_tx_header), NULL, NULL, NULL, NULL, 0, 0); break; case MOD_QUIESCE: case MOD_UNLOAD: if (uma_zone_get_cur(vtnet_tx_header_zone) > 0) error = EBUSY; else if (type == MOD_UNLOAD) { uma_zdestroy(vtnet_tx_header_zone); vtnet_tx_header_zone = NULL; } break; case MOD_SHUTDOWN: break; default: error = EOPNOTSUPP; break; } return (error); } static int vtnet_probe(device_t dev) { if (virtio_get_device_type(dev) != VIRTIO_ID_NETWORK) return (ENXIO); device_set_desc(dev, "VirtIO Networking Adapter"); return (BUS_PROBE_DEFAULT); } static int vtnet_attach(device_t dev) { struct vtnet_softc *sc; int error; sc = device_get_softc(dev); sc->vtnet_dev = dev; /* Register our feature descriptions. */ virtio_set_feature_desc(dev, vtnet_feature_desc); VTNET_CORE_LOCK_INIT(sc); callout_init_mtx(&sc->vtnet_tick_ch, VTNET_CORE_MTX(sc), 0); vtnet_setup_sysctl(sc); vtnet_setup_features(sc); error = vtnet_alloc_rx_filters(sc); if (error) { device_printf(dev, "cannot allocate Rx filters\n"); goto fail; } error = vtnet_alloc_rxtx_queues(sc); if (error) { device_printf(dev, "cannot allocate queues\n"); goto fail; } error = vtnet_alloc_virtqueues(sc); if (error) { device_printf(dev, "cannot allocate virtqueues\n"); goto fail; } error = virtio_setup_intr(dev, INTR_TYPE_NET); if (error) { device_printf(dev, "cannot setup virtqueue interrupts\n"); goto fail; } vtnet_setup_interface(sc); #ifdef DEV_NETMAP vtnet_netmap_attach(sc); #endif /* DEV_NETMAP */ vtnet_start_taskqueues(sc); fail: if (error) vtnet_detach(dev); return (error); } static int vtnet_detach(device_t dev) { struct vtnet_softc *sc; sc = device_get_softc(dev); if (device_is_attached(dev)) { VTNET_CORE_LOCK(sc); vtnet_stop(sc); VTNET_CORE_UNLOCK(sc); callout_drain(&sc->vtnet_tick_ch); vtnet_drain_taskqueues(sc); #ifdef DEV_NETMAP netmap_detach(sc->vtnet_ifp); #endif /* DEV_NETMAP */ if_detach(sc->vtnet_ifp); } vtnet_free_taskqueues(sc); if (sc->vtnet_vlan_attach != NULL) { EVENTHANDLER_DEREGISTER(vlan_config, sc->vtnet_vlan_attach); sc->vtnet_vlan_attach = NULL; } if (sc->vtnet_vlan_detach != NULL) { EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vtnet_vlan_detach); sc->vtnet_vlan_detach = NULL; } ifmedia_removeall(&sc->vtnet_media); vtnet_free_rxtx_queues(sc); vtnet_free_rx_filters(sc); if (sc->vtnet_ctrl_vq != NULL) vtnet_free_ctrl_vq(sc); VTNET_CORE_LOCK_DESTROY(sc); return (0); } static int vtnet_suspend(device_t dev) { struct vtnet_softc *sc; sc = device_get_softc(dev); VTNET_CORE_LOCK(sc); vtnet_stop(sc); sc->vtnet_flags |= VTNET_FLAG_SUSPENDED;
VTNET_CORE_UNLOCK(sc); return (0); } static int vtnet_resume(device_t dev) { struct vtnet_softc *sc; sc = device_get_softc(dev); VTNET_CORE_LOCK(sc); if (sc->vtnet_if_flags & IFF_UP) vtnet_init_locked(sc); sc->vtnet_flags &= ~VTNET_FLAG_SUSPENDED; VTNET_CORE_UNLOCK(sc); return (0); } static int vtnet_shutdown(device_t dev) { /* * Suspend already does all of what we need to * do here; we just never expect to be resumed. */ return (vtnet_suspend(dev)); } static int vtnet_attach_completed(device_t dev) { vtnet_attach_disable_promisc(device_get_softc(dev)); return (0); } static int vtnet_config_change(device_t dev) { struct vtnet_softc *sc; sc = device_get_softc(dev); VTNET_CORE_LOCK(sc); vtnet_update_link_status(sc); if (sc->vtnet_link_active != 0) vtnet_tx_start_all(sc); VTNET_CORE_UNLOCK(sc); return (0); } static void vtnet_negotiate_features(struct vtnet_softc *sc) { device_t dev; uint64_t mask, features; dev = sc->vtnet_dev; mask = 0; /* * TSO and LRO are only available when their corresponding checksum * offload feature is also negotiated. */ if (vtnet_tunable_int(sc, "csum_disable", vtnet_csum_disable)) { mask |= VIRTIO_NET_F_CSUM | VIRTIO_NET_F_GUEST_CSUM; mask |= VTNET_TSO_FEATURES | VTNET_LRO_FEATURES; } if (vtnet_tunable_int(sc, "tso_disable", vtnet_tso_disable)) mask |= VTNET_TSO_FEATURES; if (vtnet_tunable_int(sc, "lro_disable", vtnet_lro_disable)) mask |= VTNET_LRO_FEATURES; if (vtnet_tunable_int(sc, "mq_disable", vtnet_mq_disable)) mask |= VIRTIO_NET_F_MQ; features = VTNET_FEATURES & ~mask; sc->vtnet_features = virtio_negotiate_features(dev, features); if (virtio_with_feature(dev, VTNET_LRO_FEATURES) && virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF) == 0) { /* * LRO without mergeable buffers requires special care. This * is not ideal because every receive buffer must be large * enough to hold the maximum TCP packet, the Ethernet header, * and the header. This requires up to 34 descriptors with * MCLBYTES clusters. If we do not have indirect descriptors, * LRO is disabled since the virtqueue will not contain very * many receive buffers. */ if (!virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC)) { device_printf(dev, "LRO disabled due to both mergeable buffers and " "indirect descriptors not negotiated\n"); features &= ~VTNET_LRO_FEATURES; sc->vtnet_features = virtio_negotiate_features(dev, features); } else sc->vtnet_flags |= VTNET_FLAG_LRO_NOMRG; } } static void vtnet_setup_features(struct vtnet_softc *sc) { device_t dev; int max_pairs, max; dev = sc->vtnet_dev; vtnet_negotiate_features(sc); if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC)) sc->vtnet_flags |= VTNET_FLAG_INDIRECT; if (virtio_with_feature(dev, VIRTIO_RING_F_EVENT_IDX)) sc->vtnet_flags |= VTNET_FLAG_EVENT_IDX; if (virtio_with_feature(dev, VIRTIO_NET_F_MAC)) { /* This feature should always be negotiated. 
*/ sc->vtnet_flags |= VTNET_FLAG_MAC; } if (virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF)) { sc->vtnet_flags |= VTNET_FLAG_MRG_RXBUFS; sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf); } else sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr); if (sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) sc->vtnet_rx_nsegs = VTNET_MRG_RX_SEGS; else if (sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG) sc->vtnet_rx_nsegs = VTNET_MAX_RX_SEGS; else sc->vtnet_rx_nsegs = VTNET_MIN_RX_SEGS; if (virtio_with_feature(dev, VIRTIO_NET_F_GSO) || virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO4) || virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO6)) sc->vtnet_tx_nsegs = VTNET_MAX_TX_SEGS; else sc->vtnet_tx_nsegs = VTNET_MIN_TX_SEGS; if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VQ)) { sc->vtnet_flags |= VTNET_FLAG_CTRL_VQ; if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_RX)) sc->vtnet_flags |= VTNET_FLAG_CTRL_RX; if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VLAN)) sc->vtnet_flags |= VTNET_FLAG_VLAN_FILTER; if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_MAC_ADDR)) sc->vtnet_flags |= VTNET_FLAG_CTRL_MAC; } if (virtio_with_feature(dev, VIRTIO_NET_F_MQ) && sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) { max_pairs = virtio_read_dev_config_2(dev, offsetof(struct virtio_net_config, max_virtqueue_pairs)); if (max_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN || max_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX) max_pairs = 1; } else max_pairs = 1; if (max_pairs > 1) { /* * Limit the maximum number of queue pairs to the number of * CPUs or the configured maximum. The actual number of * queues that get used may be less. */ max = vtnet_tunable_int(sc, "mq_max_pairs", vtnet_mq_max_pairs); if (max > 0 && max_pairs > max) max_pairs = max; if (max_pairs > mp_ncpus) max_pairs = mp_ncpus; if (max_pairs > VTNET_MAX_QUEUE_PAIRS) max_pairs = VTNET_MAX_QUEUE_PAIRS; if (max_pairs > 1) sc->vtnet_flags |= VTNET_FLAG_MULTIQ; } sc->vtnet_max_vq_pairs = max_pairs; } static int vtnet_init_rxq(struct vtnet_softc *sc, int id) { struct vtnet_rxq *rxq; rxq = &sc->vtnet_rxqs[id]; snprintf(rxq->vtnrx_name, sizeof(rxq->vtnrx_name), "%s-rx%d", device_get_nameunit(sc->vtnet_dev), id); mtx_init(&rxq->vtnrx_mtx, rxq->vtnrx_name, NULL, MTX_DEF); rxq->vtnrx_sc = sc; rxq->vtnrx_id = id; rxq->vtnrx_sg = sglist_alloc(sc->vtnet_rx_nsegs, M_NOWAIT); if (rxq->vtnrx_sg == NULL) return (ENOMEM); TASK_INIT(&rxq->vtnrx_intrtask, 0, vtnet_rxq_tq_intr, rxq); rxq->vtnrx_tq = taskqueue_create(rxq->vtnrx_name, M_NOWAIT, taskqueue_thread_enqueue, &rxq->vtnrx_tq); return (rxq->vtnrx_tq == NULL ? 
ENOMEM : 0); } static int vtnet_init_txq(struct vtnet_softc *sc, int id) { struct vtnet_txq *txq; txq = &sc->vtnet_txqs[id]; snprintf(txq->vtntx_name, sizeof(txq->vtntx_name), "%s-tx%d", device_get_nameunit(sc->vtnet_dev), id); mtx_init(&txq->vtntx_mtx, txq->vtntx_name, NULL, MTX_DEF); txq->vtntx_sc = sc; txq->vtntx_id = id; txq->vtntx_sg = sglist_alloc(sc->vtnet_tx_nsegs, M_NOWAIT); if (txq->vtntx_sg == NULL) return (ENOMEM); txq->vtntx_br = buf_ring_alloc(VTNET_DEFAULT_BUFRING_SIZE, M_DEVBUF, M_NOWAIT, &txq->vtntx_mtx); if (txq->vtntx_br == NULL) return (ENOMEM); TASK_INIT(&txq->vtntx_defrtask, 0, vtnet_txq_tq_deferred, txq); TASK_INIT(&txq->vtntx_intrtask, 0, vtnet_txq_tq_intr, txq); txq->vtntx_tq = taskqueue_create(txq->vtntx_name, M_NOWAIT, taskqueue_thread_enqueue, &txq->vtntx_tq); if (txq->vtntx_tq == NULL) return (ENOMEM); return (0); } static int vtnet_alloc_rxtx_queues(struct vtnet_softc *sc) { int i, npairs, error; npairs = sc->vtnet_max_vq_pairs; sc->vtnet_rxqs = malloc(sizeof(struct vtnet_rxq) * npairs, M_DEVBUF, M_NOWAIT | M_ZERO); sc->vtnet_txqs = malloc(sizeof(struct vtnet_txq) * npairs, M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->vtnet_rxqs == NULL || sc->vtnet_txqs == NULL) return (ENOMEM); for (i = 0; i < npairs; i++) { error = vtnet_init_rxq(sc, i); if (error) return (error); error = vtnet_init_txq(sc, i); if (error) return (error); } vtnet_setup_queue_sysctl(sc); return (0); } static void vtnet_destroy_rxq(struct vtnet_rxq *rxq) { rxq->vtnrx_sc = NULL; rxq->vtnrx_id = -1; if (rxq->vtnrx_sg != NULL) { sglist_free(rxq->vtnrx_sg); rxq->vtnrx_sg = NULL; } if (mtx_initialized(&rxq->vtnrx_mtx) != 0) mtx_destroy(&rxq->vtnrx_mtx); } static void vtnet_destroy_txq(struct vtnet_txq *txq) { txq->vtntx_sc = NULL; txq->vtntx_id = -1; if (txq->vtntx_sg != NULL) { sglist_free(txq->vtntx_sg); txq->vtntx_sg = NULL; } if (txq->vtntx_br != NULL) { buf_ring_free(txq->vtntx_br, M_DEVBUF); txq->vtntx_br = NULL; } if (mtx_initialized(&txq->vtntx_mtx) != 0) mtx_destroy(&txq->vtntx_mtx); } static void vtnet_free_rxtx_queues(struct vtnet_softc *sc) { int i; if (sc->vtnet_rxqs != NULL) { for (i = 0; i < sc->vtnet_max_vq_pairs; i++) vtnet_destroy_rxq(&sc->vtnet_rxqs[i]); free(sc->vtnet_rxqs, M_DEVBUF); sc->vtnet_rxqs = NULL; } if (sc->vtnet_txqs != NULL) { for (i = 0; i < sc->vtnet_max_vq_pairs; i++) vtnet_destroy_txq(&sc->vtnet_txqs[i]); free(sc->vtnet_txqs, M_DEVBUF); sc->vtnet_txqs = NULL; } } static int vtnet_alloc_rx_filters(struct vtnet_softc *sc) { if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) { sc->vtnet_mac_filter = malloc(sizeof(struct vtnet_mac_filter), M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->vtnet_mac_filter == NULL) return (ENOMEM); } if (sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER) { sc->vtnet_vlan_filter = malloc(sizeof(uint32_t) * VTNET_VLAN_FILTER_NWORDS, M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->vtnet_vlan_filter == NULL) return (ENOMEM); } return (0); } static void vtnet_free_rx_filters(struct vtnet_softc *sc) { if (sc->vtnet_mac_filter != NULL) { free(sc->vtnet_mac_filter, M_DEVBUF); sc->vtnet_mac_filter = NULL; } if (sc->vtnet_vlan_filter != NULL) { free(sc->vtnet_vlan_filter, M_DEVBUF); sc->vtnet_vlan_filter = NULL; } } static int vtnet_alloc_virtqueues(struct vtnet_softc *sc) { device_t dev; struct vq_alloc_info *info; struct vtnet_rxq *rxq; struct vtnet_txq *txq; int i, idx, flags, nvqs, error; dev = sc->vtnet_dev; flags = 0; nvqs = sc->vtnet_max_vq_pairs * 2; if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) nvqs++; info = malloc(sizeof(struct vq_alloc_info) * nvqs, M_TEMP, M_NOWAIT); if (info == 
NULL) return (ENOMEM); for (i = 0, idx = 0; i < sc->vtnet_max_vq_pairs; i++, idx+=2) { rxq = &sc->vtnet_rxqs[i]; VQ_ALLOC_INFO_INIT(&info[idx], sc->vtnet_rx_nsegs, vtnet_rx_vq_intr, rxq, &rxq->vtnrx_vq, "%s-%d rx", device_get_nameunit(dev), rxq->vtnrx_id); txq = &sc->vtnet_txqs[i]; VQ_ALLOC_INFO_INIT(&info[idx+1], sc->vtnet_tx_nsegs, vtnet_tx_vq_intr, txq, &txq->vtntx_vq, "%s-%d tx", device_get_nameunit(dev), txq->vtntx_id); } if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) { VQ_ALLOC_INFO_INIT(&info[idx], 0, NULL, NULL, &sc->vtnet_ctrl_vq, "%s ctrl", device_get_nameunit(dev)); } /* * Enable interrupt binding if this is multiqueue. This only matters * when per-vq MSIX is available. */ if (sc->vtnet_flags & VTNET_FLAG_MULTIQ) flags |= 0; error = virtio_alloc_virtqueues(dev, flags, nvqs, info); free(info, M_TEMP); return (error); } static void vtnet_setup_interface(struct vtnet_softc *sc) { struct if_attach_args ifat = { .ifat_version = IF_ATTACH_VERSION, .ifat_drv = &vtnet_ifdrv, .ifat_softc = sc, .ifat_baudrate = IF_Gbps(10), /* Approx. */ .ifat_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST, .ifat_capabilities = IFCAP_JUMBO_MTU | IFCAP_VLAN_MTU, }; device_t dev; dev = sc->vtnet_dev; /* Read (or generate) the MAC address for the adapter. */ vtnet_get_hwaddr(sc); ifat.ifat_dunit = device_get_unit(dev); ifat.ifat_lla = sc->vtnet_hwaddr; if (virtio_with_feature(dev, VIRTIO_NET_F_STATUS)) ifat.ifat_capabilities |= IFCAP_LINKSTATE; if (virtio_with_feature(dev, VIRTIO_NET_F_CSUM)) { ifat.ifat_capabilities |= IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6; if (virtio_with_feature(dev, VIRTIO_NET_F_GSO)) { ifat.ifat_capabilities |= IFCAP_TSO4 | IFCAP_TSO6; sc->vtnet_flags |= VTNET_FLAG_TSO_ECN; } else { if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO4)) ifat.ifat_capabilities |= IFCAP_TSO4; if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO6)) ifat.ifat_capabilities |= IFCAP_TSO6; if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_ECN)) sc->vtnet_flags |= VTNET_FLAG_TSO_ECN; } if (ifat.ifat_capabilities & IFCAP_TSO) ifat.ifat_capabilities |= IFCAP_VLAN_HWTSO; } if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_CSUM)) { ifat.ifat_capabilities |= IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6; if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO4) || virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO6)) ifat.ifat_capabilities |= IFCAP_LRO; } if (ifat.ifat_capabilities & IFCAP_HWCSUM) { /* * VirtIO does not support VLAN tagging, but we can fake * it by inserting and removing the 802.1Q header during * transmit and receive. We are then able to do checksum * offloading of VLAN frames. */ ifat.ifat_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM; } ifat.ifat_capenable = ifat.ifat_capabilities; /* * Capabilities after here are not enabled by default. 
*/ if (sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER) { ifat.ifat_capabilities |= IFCAP_VLAN_HWFILTER; sc->vtnet_vlan_attach = EVENTHANDLER_REGISTER(vlan_config, vtnet_register_vlan, sc, EVENTHANDLER_PRI_FIRST); sc->vtnet_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, vtnet_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST); } vtnet_set_rx_process_limit(sc); vtnet_set_tx_intr_threshold(sc); ifmedia_init(&sc->vtnet_media, IFM_IMASK, vtnet_ifmedia_upd, vtnet_ifmedia_sts); ifmedia_add(&sc->vtnet_media, VTNET_MEDIATYPE, 0, NULL); ifmedia_set(&sc->vtnet_media, VTNET_MEDIATYPE); sc->vtnet_ifp = if_attach(&ifat); } static int vtnet_change_mtu(struct vtnet_softc *sc, int new_mtu) { if_t ifp; int frame_size, clsize; ifp = sc->vtnet_ifp; if (new_mtu < ETHERMIN || new_mtu > VTNET_MAX_MTU) return (EINVAL); frame_size = sc->vtnet_hdr_size + sizeof(struct ether_vlan_header) + new_mtu; /* * Based on the new MTU (and hence frame size) determine which * cluster size is most appropriate for the receive queues. */ if (frame_size <= MCLBYTES) { clsize = MCLBYTES; } else if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) { /* Avoid going past 9K jumbos. */ if (frame_size > MJUM9BYTES) return (EINVAL); clsize = MJUM9BYTES; } else clsize = MJUMPAGESIZE; sc->vtnet_rx_new_clsize = clsize; if (sc->vtnet_flags & VTNET_FLAG_RUNNING) { sc->vtnet_flags &= ~VTNET_FLAG_RUNNING; vtnet_init_locked(sc); } return (0); } static int vtnet_ioctl(if_t ifp, u_long cmd, void *data, struct thread *td) { struct vtnet_softc *sc; struct ifreq *ifr; int oflags, error; sc = if_getsoftc(ifp, IF_DRIVER_SOFTC); ifr = (struct ifreq *) data; error = 0; switch (cmd) { case SIOCSIFMTU: VTNET_CORE_LOCK(sc); error = vtnet_change_mtu(sc, ifr->ifr_mtu); VTNET_CORE_UNLOCK(sc); break; case SIOCSIFFLAGS: if ((ifr->ifr_flags & (IFF_PROMISC | IFF_ALLMULTI)) && (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) == 0) { error = EINVAL; break; } VTNET_CORE_LOCK(sc); oflags = sc->vtnet_if_flags; sc->vtnet_if_flags = ifr->ifr_flags; if ((sc->vtnet_if_flags & IFF_UP) == 0) { if (sc->vtnet_flags & VTNET_FLAG_RUNNING) vtnet_stop(sc); } else if (sc->vtnet_flags & VTNET_FLAG_RUNNING) { if ((oflags ^ sc->vtnet_if_flags) & (IFF_PROMISC | IFF_ALLMULTI)) vtnet_rx_filter(sc); } else vtnet_init_locked(sc); VTNET_CORE_UNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: if ((sc->vtnet_flags & VTNET_FLAG_CTRL_RX) == 0) break; VTNET_CORE_LOCK(sc); if (sc->vtnet_flags & VTNET_FLAG_RUNNING) vtnet_rx_filter_mac(sc); VTNET_CORE_UNLOCK(sc); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->vtnet_media, cmd); break; case SIOCSIFCAP: sc->vtnet_capenable = ifr->ifr_reqcap; /* These Rx features require us to renegotiate. 
*/ if ((ifr->ifr_reqcap ^ ifr->ifr_curcap) & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_LRO | IFCAP_VLAN_HWFILTER) && (sc->vtnet_flags & VTNET_FLAG_RUNNING)) { VTNET_CORE_LOCK(sc); sc->vtnet_flags &= ~VTNET_FLAG_RUNNING; vtnet_init_locked(sc); VTNET_CORE_UNLOCK(sc); } ifr->ifr_hwassist = 0; if (ifr->ifr_reqcap & IFCAP_TXCSUM) ifr->ifr_hwassist |= VTNET_CSUM_OFFLOAD; if (ifr->ifr_reqcap & IFCAP_TXCSUM_IPV6) ifr->ifr_hwassist |= VTNET_CSUM_OFFLOAD_IPV6; if (ifr->ifr_reqcap & IFCAP_TSO4) ifr->ifr_hwassist |= CSUM_TSO; if (ifr->ifr_reqcap & IFCAP_TSO6) ifr->ifr_hwassist |= CSUM_IP6_TSO; break; default: error = EOPNOTSUPP; break; } VTNET_CORE_LOCK_ASSERT_NOTOWNED(sc); return (error); } static int vtnet_rxq_populate(struct vtnet_rxq *rxq) { struct virtqueue *vq; int nbufs, error; vq = rxq->vtnrx_vq; error = ENOSPC; for (nbufs = 0; !virtqueue_full(vq); nbufs++) { error = vtnet_rxq_new_buf(rxq); if (error) break; } if (nbufs > 0) { virtqueue_notify(vq); /* * EMSGSIZE signifies the virtqueue did not have enough * entries available to hold the last mbuf. This is not * an error. */ if (error == EMSGSIZE) error = 0; } return (error); } static void vtnet_rxq_free_mbufs(struct vtnet_rxq *rxq) { struct virtqueue *vq; struct mbuf *m; int last; vq = rxq->vtnrx_vq; last = 0; while ((m = virtqueue_drain(vq, &last)) != NULL) m_freem(m); KASSERT(virtqueue_empty(vq), ("%s: mbufs remaining in rx queue %p", __func__, rxq)); } static struct mbuf * vtnet_rx_alloc_buf(struct vtnet_softc *sc, int nbufs, struct mbuf **m_tailp) { struct mbuf *m_head, *m_tail, *m; int i, clsize; clsize = sc->vtnet_rx_clsize; KASSERT(nbufs == 1 || sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG, ("%s: chained mbuf %d request without LRO_NOMRG", __func__, nbufs)); m_head = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, clsize); if (m_head == NULL) goto fail; m_head->m_len = clsize; m_tail = m_head; /* Allocate the rest of the chain. */ for (i = 1; i < nbufs; i++) { m = m_getjcl(M_NOWAIT, MT_DATA, 0, clsize); if (m == NULL) goto fail; m->m_len = clsize; m_tail->m_next = m; m_tail = m; } if (m_tailp != NULL) *m_tailp = m_tail; return (m_head); fail: sc->vtnet_stats.mbuf_alloc_failed++; m_freem(m_head); return (NULL); } /* * Slow path for when LRO without mergeable buffers is negotiated. */ static int vtnet_rxq_replace_lro_nomgr_buf(struct vtnet_rxq *rxq, struct mbuf *m0, int len0) { struct vtnet_softc *sc; struct mbuf *m, *m_prev; struct mbuf *m_new, *m_tail; int len, clsize, nreplace, error; sc = rxq->vtnrx_sc; clsize = sc->vtnet_rx_clsize; m_prev = NULL; m_tail = NULL; nreplace = 0; m = m0; len = len0; /* * Since these mbuf chains are so large, we avoid allocating an * entire replacement chain if possible. When the received frame * did not consume the entire chain, the unused mbufs are moved * to the replacement chain. */ while (len > 0) { /* * Something is seriously wrong if we received a frame * larger than the chain. Drop it. */ if (m == NULL) { sc->vtnet_stats.rx_frame_too_large++; return (EMSGSIZE); } /* We always allocate the same cluster size. 
*/ KASSERT(m->m_len == clsize, ("%s: mbuf size %d is not the cluster size %d", __func__, m->m_len, clsize)); m->m_len = MIN(m->m_len, len); len -= m->m_len; m_prev = m; m = m->m_next; nreplace++; } KASSERT(nreplace <= sc->vtnet_rx_nmbufs, ("%s: too many replacement mbufs %d max %d", __func__, nreplace, sc->vtnet_rx_nmbufs)); m_new = vtnet_rx_alloc_buf(sc, nreplace, &m_tail); if (m_new == NULL) { m_prev->m_len = clsize; return (ENOBUFS); } /* * Move any unused mbufs from the received chain onto the end * of the new chain. */ if (m_prev->m_next != NULL) { m_tail->m_next = m_prev->m_next; m_prev->m_next = NULL; } error = vtnet_rxq_enqueue_buf(rxq, m_new); if (error) { /* * BAD! We could not enqueue the replacement mbuf chain. We * must restore the m0 chain to the original state if it was * modified so we can subsequently discard it. * * NOTE: The replacement is supposed to be an identical copy * of the one just dequeued so this is an unexpected error. */ sc->vtnet_stats.rx_enq_replacement_failed++; if (m_tail->m_next != NULL) { m_prev->m_next = m_tail->m_next; m_tail->m_next = NULL; } m_prev->m_len = clsize; m_freem(m_new); } return (error); } static int vtnet_rxq_replace_buf(struct vtnet_rxq *rxq, struct mbuf *m, int len) { struct vtnet_softc *sc; struct mbuf *m_new; int error; sc = rxq->vtnrx_sc; KASSERT(sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG || m->m_next == NULL, ("%s: chained mbuf without LRO_NOMRG", __func__)); if (m->m_next == NULL) { /* Fast-path for the common case of just one mbuf. */ if (m->m_len < len) return (EINVAL); m_new = vtnet_rx_alloc_buf(sc, 1, NULL); if (m_new == NULL) return (ENOBUFS); error = vtnet_rxq_enqueue_buf(rxq, m_new); if (error) { /* * The new mbuf is supposed to be an identical * copy of the one just dequeued so this is an * unexpected error. */ m_freem(m_new); sc->vtnet_stats.rx_enq_replacement_failed++; } else m->m_len = len; } else error = vtnet_rxq_replace_lro_nomgr_buf(rxq, m, len); return (error); } static int vtnet_rxq_enqueue_buf(struct vtnet_rxq *rxq, struct mbuf *m) { struct vtnet_softc *sc; struct sglist *sg; struct vtnet_rx_header *rxhdr; uint8_t *mdata; int offset, error; sc = rxq->vtnrx_sc; sg = rxq->vtnrx_sg; mdata = mtod(m, uint8_t *); VTNET_RXQ_LOCK_ASSERT(rxq); KASSERT(sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG || m->m_next == NULL, ("%s: chained mbuf without LRO_NOMRG", __func__)); KASSERT(m->m_len == sc->vtnet_rx_clsize, ("%s: unexpected cluster size %d/%d", __func__, m->m_len, sc->vtnet_rx_clsize)); sglist_reset(sg); if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) { MPASS(sc->vtnet_hdr_size == sizeof(struct virtio_net_hdr)); rxhdr = (struct vtnet_rx_header *) mdata; sglist_append(sg, &rxhdr->vrh_hdr, sc->vtnet_hdr_size); offset = sizeof(struct vtnet_rx_header); } else offset = 0; sglist_append(sg, mdata + offset, m->m_len - offset); if (m->m_next != NULL) { error = sglist_append_mbuf(sg, m->m_next); MPASS(error == 0); } error = virtqueue_enqueue(rxq->vtnrx_vq, m, sg, 0, sg->sg_nseg); return (error); } static int vtnet_rxq_new_buf(struct vtnet_rxq *rxq) { struct vtnet_softc *sc; struct mbuf *m; int error; sc = rxq->vtnrx_sc; m = vtnet_rx_alloc_buf(sc, sc->vtnet_rx_nmbufs, NULL); if (m == NULL) return (ENOBUFS); error = vtnet_rxq_enqueue_buf(rxq, m); if (error) m_freem(m); return (error); } /* * Use the checksum offset in the VirtIO header to set the * correct CSUM_* flags.
*/ static int vtnet_rxq_csum_by_offset(struct vtnet_rxq *rxq, struct mbuf *m, uint16_t eth_type, int ip_start, struct virtio_net_hdr *hdr) { struct vtnet_softc *sc; #if defined(INET) || defined(INET6) int offset = hdr->csum_start + hdr->csum_offset; #endif sc = rxq->vtnrx_sc; /* Only do a basic sanity check on the offset. */ switch (eth_type) { #if defined(INET) case ETHERTYPE_IP: if (__predict_false(offset < ip_start + sizeof(struct ip))) return (1); break; #endif #if defined(INET6) case ETHERTYPE_IPV6: if (__predict_false(offset < ip_start + sizeof(struct ip6_hdr))) return (1); break; #endif default: sc->vtnet_stats.rx_csum_bad_ethtype++; return (1); } /* * Use the offset to determine the appropriate CSUM_* flags. This is * a bit dirty, but we can get by with it since the checksum offsets * happen to be different. We assume the host does not do IPv4 * header checksum offloading. */ switch (hdr->csum_offset) { case offsetof(struct udphdr, uh_sum): case offsetof(struct tcphdr, th_sum): m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xFFFF; break; case offsetof(struct sctphdr, checksum): m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; break; default: sc->vtnet_stats.rx_csum_bad_offset++; return (1); } return (0); } static int vtnet_rxq_csum_by_parse(struct vtnet_rxq *rxq, struct mbuf *m, uint16_t eth_type, int ip_start, struct virtio_net_hdr *hdr) { struct vtnet_softc *sc; int offset, proto; sc = rxq->vtnrx_sc; switch (eth_type) { #if defined(INET) case ETHERTYPE_IP: { struct ip *ip; if (__predict_false(m->m_len < ip_start + sizeof(struct ip))) return (1); ip = (struct ip *)(m->m_data + ip_start); proto = ip->ip_p; offset = ip_start + (ip->ip_hl << 2); break; } #endif #if defined(INET6) case ETHERTYPE_IPV6: if (__predict_false(m->m_len < ip_start + sizeof(struct ip6_hdr))) return (1); offset = ip6_lasthdr(m, ip_start, IPPROTO_IPV6, &proto); if (__predict_false(offset < 0)) return (1); break; #endif default: sc->vtnet_stats.rx_csum_bad_ethtype++; return (1); } switch (proto) { case IPPROTO_TCP: if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) return (1); m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xFFFF; break; case IPPROTO_UDP: if (__predict_false(m->m_len < offset + sizeof(struct udphdr))) return (1); m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xFFFF; break; case IPPROTO_SCTP: if (__predict_false(m->m_len < offset + sizeof(struct sctphdr))) return (1); m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; break; default: /* * For the remaining protocols, FreeBSD does not support * checksum offloading, so the checksum will be recomputed. */ #if 0 if_printf(sc->vtnet_ifp, "%s: cksum offload of unsupported " "protocol eth_type=%#x proto=%d csum_start=%d " "csum_offset=%d\n", __func__, eth_type, proto, hdr->csum_start, hdr->csum_offset); #endif break; } return (0); } /* * Set the appropriate CSUM_* flags. Unfortunately, the information * provided is not directly useful to us. The VirtIO header gives the * offset of the checksum, which is all Linux needs, but this is not * how FreeBSD does things. We are forced to peek inside the packet * a bit. * * It would be nice if VirtIO gave us the L4 protocol or if FreeBSD * could accept the offsets and let the stack figure it out.
*/ static int vtnet_rxq_csum(struct vtnet_rxq *rxq, struct mbuf *m, struct virtio_net_hdr *hdr) { struct ether_header *eh; struct ether_vlan_header *evh; uint16_t eth_type; int offset, error; eh = mtod(m, struct ether_header *); eth_type = ntohs(eh->ether_type); if (eth_type == ETHERTYPE_VLAN) { /* BMV: We should handle nested VLAN tags too. */ evh = mtod(m, struct ether_vlan_header *); eth_type = ntohs(evh->evl_proto); offset = sizeof(struct ether_vlan_header); } else offset = sizeof(struct ether_header); if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) error = vtnet_rxq_csum_by_offset(rxq, m, eth_type, offset, hdr); else error = vtnet_rxq_csum_by_parse(rxq, m, eth_type, offset, hdr); return (error); } static void vtnet_rxq_discard_merged_bufs(struct vtnet_rxq *rxq, int nbufs) { struct mbuf *m; while (--nbufs > 0) { m = virtqueue_dequeue(rxq->vtnrx_vq, NULL); if (m == NULL) break; vtnet_rxq_discard_buf(rxq, m); } } static void vtnet_rxq_discard_buf(struct vtnet_rxq *rxq, struct mbuf *m) { int error; /* * Requeue the discarded mbuf. This should always be successful * since it was just dequeued. */ error = vtnet_rxq_enqueue_buf(rxq, m); KASSERT(error == 0, ("%s: cannot requeue discarded mbuf %d", __func__, error)); } static int vtnet_rxq_merged_eof(struct vtnet_rxq *rxq, struct mbuf *m_head, int nbufs) { struct vtnet_softc *sc; struct virtqueue *vq; struct mbuf *m, *m_tail; int len; sc = rxq->vtnrx_sc; vq = rxq->vtnrx_vq; m_tail = m_head; while (--nbufs > 0) { m = virtqueue_dequeue(vq, &len); if (m == NULL) { rxq->vtnrx_stats.vrxs_ierrors++; goto fail; } if (vtnet_rxq_new_buf(rxq) != 0) { rxq->vtnrx_stats.vrxs_iqdrops++; vtnet_rxq_discard_buf(rxq, m); if (nbufs > 1) vtnet_rxq_discard_merged_bufs(rxq, nbufs); goto fail; } if (m->m_len < len) len = m->m_len; m->m_len = len; m->m_flags &= ~M_PKTHDR; m_head->m_pkthdr.len += len; m_tail->m_next = m; m_tail = m; } return (0); fail: sc->vtnet_stats.rx_mergeable_failed++; m_freem(m_head); return (1); } static void vtnet_rxq_input(struct vtnet_rxq *rxq, struct mbuf *m, struct virtio_net_hdr *hdr) { struct vtnet_softc *sc; if_t ifp; struct ether_header *eh; sc = rxq->vtnrx_sc; ifp = sc->vtnet_ifp; if (sc->vtnet_capenable & IFCAP_VLAN_HWTAGGING) { eh = mtod(m, struct ether_header *); if (eh->ether_type == htons(ETHERTYPE_VLAN)) { vtnet_vlan_tag_remove(m); /* * With the 802.1Q header removed, update the * checksum starting location accordingly. */ if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) hdr->csum_start -= ETHER_VLAN_ENCAP_LEN; } } m->m_pkthdr.flowid = rxq->vtnrx_id; M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); /* * BMV: FreeBSD does not have the UNNECESSARY and PARTIAL checksum * distinction that Linux does. Need to reevaluate if performing * offloading for the NEEDS_CSUM case is really appropriate. 
*/
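	/*
	 * Illustrative sketch, not part of the driver: example field values
	 * a host might supply for a checksum-offloaded TCP/IPv4 frame with
	 * no VLAN tag.  vtnet_rxq_csum_by_offset() above keys off csum_offset
	 * alone to recover CSUM_DATA_VALID | CSUM_PSEUDO_HDR for the stack.
	 */
#if 0
	hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
	hdr->csum_start = ETHER_HDR_LEN + sizeof(struct ip);	/* 14 + 20 = 34 */
	hdr->csum_offset = offsetof(struct tcphdr, th_sum);	/* 16 */
#endif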
*/ if (hdr->flags & (VIRTIO_NET_HDR_F_NEEDS_CSUM | VIRTIO_NET_HDR_F_DATA_VALID)) { if (vtnet_rxq_csum(rxq, m, hdr) == 0) rxq->vtnrx_stats.vrxs_csum++; else rxq->vtnrx_stats.vrxs_csum_failed++; } rxq->vtnrx_stats.vrxs_ipackets++; rxq->vtnrx_stats.vrxs_ibytes += m->m_pkthdr.len; VTNET_RXQ_UNLOCK(rxq); if_input(ifp, m); VTNET_RXQ_LOCK(rxq); } static int vtnet_rxq_eof(struct vtnet_rxq *rxq) { struct virtio_net_hdr lhdr, *hdr; struct vtnet_softc *sc; if_t ifp; struct virtqueue *vq; struct mbuf *m; struct virtio_net_hdr_mrg_rxbuf *mhdr; int len, deq, nbufs, adjsz, count; sc = rxq->vtnrx_sc; vq = rxq->vtnrx_vq; ifp = sc->vtnet_ifp; hdr = &lhdr; deq = 0; count = sc->vtnet_rx_process_limit; VTNET_RXQ_LOCK_ASSERT(rxq); #ifdef DEV_NETMAP if (netmap_rx_irq(ifp, 0, &deq)) { return (FALSE); } #endif /* DEV_NETMAP */ while (count-- > 0) { m = virtqueue_dequeue(vq, &len); if (m == NULL) break; deq++; if (len < sc->vtnet_hdr_size + ETHER_HDR_LEN) { rxq->vtnrx_stats.vrxs_ierrors++; vtnet_rxq_discard_buf(rxq, m); continue; } if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) { nbufs = 1; adjsz = sizeof(struct vtnet_rx_header); /* * Account for our pad inserted between the header * and the actual start of the frame. */ len += VTNET_RX_HEADER_PAD; } else { mhdr = mtod(m, struct virtio_net_hdr_mrg_rxbuf *); nbufs = mhdr->num_buffers; adjsz = sizeof(struct virtio_net_hdr_mrg_rxbuf); } if (vtnet_rxq_replace_buf(rxq, m, len) != 0) { rxq->vtnrx_stats.vrxs_iqdrops++; vtnet_rxq_discard_buf(rxq, m); if (nbufs > 1) vtnet_rxq_discard_merged_bufs(rxq, nbufs); continue; } m->m_pkthdr.len = len; m->m_pkthdr.rcvif = ifp; m->m_pkthdr.csum_flags = 0; if (nbufs > 1) { /* Dequeue the rest of chain. */ if (vtnet_rxq_merged_eof(rxq, m, nbufs) != 0) continue; } /* * Save copy of header before we strip it. For both mergeable * and non-mergeable, the header is at the beginning of the * mbuf data. We no longer need num_buffers, so always use a * regular header. * * BMV: Is this memcpy() expensive? We know the mbuf data is * still valid even after the m_adj(). */ memcpy(hdr, mtod(m, void *), sizeof(struct virtio_net_hdr)); m_adj(m, adjsz); vtnet_rxq_input(rxq, m, hdr); /* Must recheck after dropping the Rx lock. */ if ((sc->vtnet_flags & VTNET_FLAG_RUNNING) == 0) break; } if (deq > 0) virtqueue_notify(vq); return (count > 0 ? 0 : EAGAIN); } static void vtnet_rx_vq_intr(void *xrxq) { struct vtnet_softc *sc; struct vtnet_rxq *rxq; if_t ifp; int tries, more; rxq = xrxq; sc = rxq->vtnrx_sc; ifp = sc->vtnet_ifp; tries = 0; if (__predict_false(rxq->vtnrx_id >= sc->vtnet_act_vq_pairs)) { /* * Ignore this interrupt. Either this is a spurious interrupt * or multiqueue without per-VQ MSIX so every queue needs to * be polled (a brain dead configuration we could try harder * to avoid). */ vtnet_rxq_disable_intr(rxq); return; } VTNET_RXQ_LOCK(rxq); again: if ((sc->vtnet_flags & VTNET_FLAG_RUNNING) == 0) { VTNET_RXQ_UNLOCK(rxq); return; } more = vtnet_rxq_eof(rxq); if (more || vtnet_rxq_enable_intr(rxq) != 0) { if (!more) vtnet_rxq_disable_intr(rxq); /* * This is an occasional condition or race (when !more), * so retry a few times before scheduling the taskqueue. 
*/
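	/*
	 * Note that vtnet_rxq_enable_intr() returning nonzero means more
	 * buffers were posted between the final vtnet_rxq_eof() pass and
	 * re-enabling the interrupt; servicing them inline here avoids
	 * leaving that last batch unprocessed until the next interrupt.
	 */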
*/ if (tries++ < VTNET_INTR_DISABLE_RETRIES) goto again; VTNET_RXQ_UNLOCK(rxq); rxq->vtnrx_stats.vrxs_rescheduled++; taskqueue_enqueue(rxq->vtnrx_tq, &rxq->vtnrx_intrtask); } else VTNET_RXQ_UNLOCK(rxq); } static void vtnet_rxq_tq_intr(void *xrxq, int pending) { struct vtnet_softc *sc; struct vtnet_rxq *rxq; if_t ifp; int more; rxq = xrxq; sc = rxq->vtnrx_sc; ifp = sc->vtnet_ifp; VTNET_RXQ_LOCK(rxq); if ((sc->vtnet_flags & VTNET_FLAG_RUNNING) == 0) { VTNET_RXQ_UNLOCK(rxq); return; } more = vtnet_rxq_eof(rxq); if (more || vtnet_rxq_enable_intr(rxq) != 0) { if (!more) vtnet_rxq_disable_intr(rxq); rxq->vtnrx_stats.vrxs_rescheduled++; taskqueue_enqueue(rxq->vtnrx_tq, &rxq->vtnrx_intrtask); } VTNET_RXQ_UNLOCK(rxq); } static int vtnet_txq_below_threshold(struct vtnet_txq *txq) { struct vtnet_softc *sc; struct virtqueue *vq; sc = txq->vtntx_sc; vq = txq->vtntx_vq; return (virtqueue_nfree(vq) <= sc->vtnet_tx_intr_thresh); } static int vtnet_txq_notify(struct vtnet_txq *txq) { struct virtqueue *vq; vq = txq->vtntx_vq; txq->vtntx_watchdog = VTNET_TX_TIMEOUT; virtqueue_notify(vq); if (vtnet_txq_enable_intr(txq) == 0) return (0); /* * Drain frames that were completed since last checked. If this * causes the queue to go above the threshold, the caller should * continue transmitting. */ if (vtnet_txq_eof(txq) != 0 && vtnet_txq_below_threshold(txq) == 0) { virtqueue_disable_intr(vq); return (1); } return (0); } static void vtnet_txq_free_mbufs(struct vtnet_txq *txq) { struct virtqueue *vq; struct vtnet_tx_header *txhdr; int last; vq = txq->vtntx_vq; last = 0; while ((txhdr = virtqueue_drain(vq, &last)) != NULL) { m_freem(txhdr->vth_mbuf); uma_zfree(vtnet_tx_header_zone, txhdr); } KASSERT(virtqueue_empty(vq), ("%s: mbufs remaining in tx queue %p", __func__, txq)); } /* * BMV: Much of this can go away once we finally have offsets in * the mbuf packet header. Bug andre@. */ static int vtnet_txq_offload_ctx(struct vtnet_txq *txq, struct mbuf *m, int *etype, int *proto, int *start) { struct vtnet_softc *sc; struct ether_vlan_header *evh; int offset; sc = txq->vtntx_sc; evh = mtod(m, struct ether_vlan_header *); if (evh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { /* BMV: We should handle nested VLAN tags too. */ *etype = ntohs(evh->evl_proto); offset = sizeof(struct ether_vlan_header); } else { *etype = ntohs(evh->evl_encap_proto); offset = sizeof(struct ether_header); } switch (*etype) { #if defined(INET) case ETHERTYPE_IP: { struct ip *ip, iphdr; if (__predict_false(m->m_len < offset + sizeof(struct ip))) { m_copydata(m, offset, sizeof(struct ip), (caddr_t) &iphdr); ip = &iphdr; } else ip = (struct ip *)(m->m_data + offset); *proto = ip->ip_p; *start = offset + (ip->ip_hl << 2); break; } #endif #if defined(INET6) case ETHERTYPE_IPV6: *proto = -1; *start = ip6_lasthdr(m, offset, IPPROTO_IPV6, proto); /* Assert the network stack sent us a valid packet. 
*/
*/ KASSERT(*start > offset, ("%s: mbuf %p start %d offset %d proto %d", __func__, m, *start, offset, *proto)); break; #endif default: sc->vtnet_stats.tx_csum_bad_ethtype++; return (EINVAL); } return (0); } static int vtnet_txq_offload_tso(struct vtnet_txq *txq, struct mbuf *m, int eth_type, int offset, struct virtio_net_hdr *hdr) { static struct timeval lastecn; static int curecn; struct vtnet_softc *sc; struct tcphdr *tcp, tcphdr; sc = txq->vtntx_sc; if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) { m_copydata(m, offset, sizeof(struct tcphdr), (caddr_t) &tcphdr); tcp = &tcphdr; } else tcp = (struct tcphdr *)(m->m_data + offset); hdr->hdr_len = offset + (tcp->th_off << 2); hdr->gso_size = m->m_pkthdr.tso_segsz; hdr->gso_type = eth_type == ETHERTYPE_IP ? VIRTIO_NET_HDR_GSO_TCPV4 : VIRTIO_NET_HDR_GSO_TCPV6; if (tcp->th_flags & TH_CWR) { /* * Drop if VIRTIO_NET_F_HOST_ECN was not negotiated. In FreeBSD, * ECN support is not on a per-interface basis, but globally via * the net.inet.tcp.ecn.enable sysctl knob. The default is off. */ if ((sc->vtnet_flags & VTNET_FLAG_TSO_ECN) == 0) { if (ppsratecheck(&lastecn, &curecn, 1)) if_printf(sc->vtnet_ifp, "TSO with ECN not negotiated with host\n"); return (ENOTSUP); } hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; } txq->vtntx_stats.vtxs_tso++; return (0); } static struct mbuf * vtnet_txq_offload(struct vtnet_txq *txq, struct mbuf *m, struct virtio_net_hdr *hdr) { struct vtnet_softc *sc; int flags, etype, csum_start, proto, error; sc = txq->vtntx_sc; flags = m->m_pkthdr.csum_flags; error = vtnet_txq_offload_ctx(txq, m, &etype, &proto, &csum_start); if (error) goto drop; if ((etype == ETHERTYPE_IP && flags & VTNET_CSUM_OFFLOAD) || (etype == ETHERTYPE_IPV6 && flags & VTNET_CSUM_OFFLOAD_IPV6)) { /* * We could compare the IP protocol vs the CSUM_ flag too, * but that really should not be necessary. */ hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM; hdr->csum_start = csum_start; hdr->csum_offset = m->m_pkthdr.csum_data; txq->vtntx_stats.vtxs_csum++; } if (flags & CSUM_TSO) { if (__predict_false(proto != IPPROTO_TCP)) { /* Likely failed to correctly parse the mbuf. 
*/ sc->vtnet_stats.tx_tso_not_tcp++; goto drop; } KASSERT(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM, ("%s: mbuf %p TSO without checksum offload %#x", __func__, m, flags)); error = vtnet_txq_offload_tso(txq, m, etype, csum_start, hdr); if (error) goto drop; } return (m); drop: m_freem(m); return (NULL); } static int vtnet_txq_enqueue_buf(struct vtnet_txq *txq, struct mbuf **m_head, struct vtnet_tx_header *txhdr) { struct vtnet_softc *sc; struct virtqueue *vq; struct sglist *sg; struct mbuf *m; int error; sc = txq->vtntx_sc; vq = txq->vtntx_vq; sg = txq->vtntx_sg; m = *m_head; sglist_reset(sg); error = sglist_append(sg, &txhdr->vth_uhdr, sc->vtnet_hdr_size); KASSERT(error == 0 && sg->sg_nseg == 1, ("%s: error %d adding header to sglist", __func__, error)); error = sglist_append_mbuf(sg, m); if (error) { m = m_defrag(m, M_NOWAIT); if (m == NULL) goto fail; *m_head = m; sc->vtnet_stats.tx_defragged++; error = sglist_append_mbuf(sg, m); if (error) goto fail; } txhdr->vth_mbuf = m; error = virtqueue_enqueue(vq, txhdr, sg, sg->sg_nseg, 0); return (error); fail: sc->vtnet_stats.tx_defrag_failed++; m_freem(*m_head); *m_head = NULL; return (ENOBUFS); } static int vtnet_txq_encap(struct vtnet_txq *txq, struct mbuf **m_head) { struct vtnet_tx_header *txhdr; struct virtio_net_hdr *hdr; struct mbuf *m; int error; m = *m_head; M_ASSERTPKTHDR(m); txhdr = uma_zalloc(vtnet_tx_header_zone, M_NOWAIT | M_ZERO); if (txhdr == NULL) { m_freem(m); *m_head = NULL; return (ENOMEM); } /* * Always use the non-mergeable header, regardless if the feature * was negotiated. For transmit, num_buffers is always zero. The * vtnet_hdr_size is used to enqueue the correct header size. */ hdr = &txhdr->vth_uhdr.hdr; if (m->m_flags & M_VLANTAG) { m = ether_vlanencap(m, m->m_pkthdr.ether_vtag); if ((*m_head = m) == NULL) { error = ENOBUFS; goto fail; } m->m_flags &= ~M_VLANTAG; } if (m->m_pkthdr.csum_flags & VTNET_CSUM_ALL_OFFLOAD) { m = vtnet_txq_offload(txq, m, hdr); if ((*m_head = m) == NULL) { error = ENOBUFS; goto fail; } } error = vtnet_txq_enqueue_buf(txq, m_head, txhdr); if (error == 0) return (0); fail: uma_zfree(vtnet_tx_header_zone, txhdr); return (error); } static int vtnet_txq_mq_start_locked(struct vtnet_txq *txq, struct mbuf *m) { struct vtnet_softc *sc; struct virtqueue *vq; struct buf_ring *br; if_t ifp; int enq, tries, error; sc = txq->vtntx_sc; vq = txq->vtntx_vq; br = txq->vtntx_br; ifp = sc->vtnet_ifp; tries = 0; error = 0; VTNET_TXQ_LOCK_ASSERT(txq); if ((sc->vtnet_flags & VTNET_FLAG_RUNNING) == 0 || sc->vtnet_link_active == 0) { if (m != NULL) error = buf_ring_enqueue(br, m); return (error); } if (m != NULL) { error = buf_ring_enqueue(br, m); if (error) return (error); } vtnet_txq_eof(txq); again: enq = 0; while ((m = buf_ring_peek(br)) != NULL) { if (virtqueue_full(vq)) { buf_ring_putback_sc(br, m); break; } if (vtnet_txq_encap(txq, &m) != 0) { if (m != NULL) buf_ring_putback_sc(br, m); else buf_ring_advance_sc(br); break; } buf_ring_advance_sc(br); enq++; if_mtap(ifp, m, NULL, 0); } if (enq > 0 && vtnet_txq_notify(txq) != 0) { if (tries++ < VTNET_NOTIFY_RETRIES) goto again; txq->vtntx_stats.vtxs_rescheduled++; taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_intrtask); } return (0); } static int vtnet_txq_mq_start(if_t ifp, struct mbuf *m) { struct vtnet_softc *sc; struct vtnet_txq *txq; int i, npairs, error; sc = if_getsoftc(ifp, IF_DRIVER_SOFTC); npairs = sc->vtnet_act_vq_pairs; /* check if flowid is set */ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) i = m->m_pkthdr.flowid % npairs; else i = curcpu % npairs; 
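	/*
	 * Illustrative sketch, not part of the driver: a stack-supplied flow
	 * ID maps every packet of a flow to the same Tx queue, preserving
	 * per-flow ordering.  For example, with npairs == 4 and flowid ==
	 * 0x2a3b (10811), i == 10811 % 4 == 3 for every packet of that flow.
	 */
#if 0
	m->m_pkthdr.flowid = 0x2a3b;
	M_HASHTYPE_SET(m, M_HASHTYPE_RSS_TCP_IPV4);
	i = m->m_pkthdr.flowid % npairs;	/* always queue 3 for this flow */
#endif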
txq = &sc->vtnet_txqs[i]; if (VTNET_TXQ_TRYLOCK(txq) != 0) { error = vtnet_txq_mq_start_locked(txq, m); VTNET_TXQ_UNLOCK(txq); } else { error = buf_ring_enqueue(txq->vtntx_br, m); taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_defrtask); } return (error); } static void vtnet_txq_tq_deferred(void *xtxq, int pending) { struct vtnet_softc *sc; struct vtnet_txq *txq; txq = xtxq; sc = txq->vtntx_sc; VTNET_TXQ_LOCK(txq); if (!buf_ring_empty(txq->vtntx_br)) vtnet_txq_mq_start_locked(txq, NULL); VTNET_TXQ_UNLOCK(txq); } static void vtnet_txq_start(struct vtnet_txq *txq) { struct vtnet_softc *sc; sc = txq->vtntx_sc; if (!buf_ring_empty(txq->vtntx_br)) vtnet_txq_mq_start_locked(txq, NULL); } static void vtnet_txq_tq_intr(void *xtxq, int pending) { struct vtnet_softc *sc; struct vtnet_txq *txq; if_t ifp; txq = xtxq; sc = txq->vtntx_sc; ifp = sc->vtnet_ifp; VTNET_TXQ_LOCK(txq); if ((sc->vtnet_flags & VTNET_FLAG_RUNNING) == 0) { VTNET_TXQ_UNLOCK(txq); return; } vtnet_txq_eof(txq); vtnet_txq_start(txq); VTNET_TXQ_UNLOCK(txq); } static int vtnet_txq_eof(struct vtnet_txq *txq) { struct virtqueue *vq; struct vtnet_tx_header *txhdr; struct mbuf *m; int deq; vq = txq->vtntx_vq; deq = 0; VTNET_TXQ_LOCK_ASSERT(txq); #ifdef DEV_NETMAP if (netmap_tx_irq(txq->vtntx_sc->vtnet_ifp, txq->vtntx_id)) { virtqueue_disable_intr(vq); // XXX luigi return 0; // XXX or 1 ? } #endif /* DEV_NETMAP */ while ((txhdr = virtqueue_dequeue(vq, NULL)) != NULL) { m = txhdr->vth_mbuf; deq++; txq->vtntx_stats.vtxs_opackets++; txq->vtntx_stats.vtxs_obytes += m->m_pkthdr.len; if (m->m_flags & M_MCAST) txq->vtntx_stats.vtxs_omcasts++; m_freem(m); uma_zfree(vtnet_tx_header_zone, txhdr); } if (virtqueue_empty(vq)) txq->vtntx_watchdog = 0; return (deq); } static void vtnet_tx_vq_intr(void *xtxq) { struct vtnet_softc *sc; struct vtnet_txq *txq; if_t ifp; txq = xtxq; sc = txq->vtntx_sc; ifp = sc->vtnet_ifp; if (__predict_false(txq->vtntx_id >= sc->vtnet_act_vq_pairs)) { /* * Ignore this interrupt. Either this is a spurious interrupt * or multiqueue without per-VQ MSIX so every queue needs to * be polled (a brain dead configuration we could try harder * to avoid). */ vtnet_txq_disable_intr(txq); return; } VTNET_TXQ_LOCK(txq); if ((sc->vtnet_flags & VTNET_FLAG_RUNNING) == 0) { VTNET_TXQ_UNLOCK(txq); return; } vtnet_txq_eof(txq); vtnet_txq_start(txq); VTNET_TXQ_UNLOCK(txq); } static void vtnet_tx_start_all(struct vtnet_softc *sc) { struct vtnet_txq *txq; int i; VTNET_CORE_LOCK_ASSERT(sc); for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { txq = &sc->vtnet_txqs[i]; VTNET_TXQ_LOCK(txq); vtnet_txq_start(txq); VTNET_TXQ_UNLOCK(txq); } } static void vtnet_qflush(if_t ifp) { struct vtnet_softc *sc; struct vtnet_txq *txq; struct mbuf *m; int i; sc = if_getsoftc(ifp, IF_DRIVER_SOFTC); for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { txq = &sc->vtnet_txqs[i]; VTNET_TXQ_LOCK(txq); while ((m = buf_ring_dequeue_sc(txq->vtntx_br)) != NULL) m_freem(m); VTNET_TXQ_UNLOCK(txq); } } static int vtnet_watchdog(struct vtnet_txq *txq) { if_t ifp; ifp = txq->vtntx_sc->vtnet_ifp; VTNET_TXQ_LOCK(txq); if (txq->vtntx_watchdog == 1) { /* * Only drain completed frames if the watchdog is about to * expire. If any frames were drained, there may be enough * free descriptors now available to transmit queued frames. * In that case, the timer will immediately be decremented * below, but the timeout is generous enough that this should * not be a problem.
*/ if (vtnet_txq_eof(txq) != 0) vtnet_txq_start(txq); } if (txq->vtntx_watchdog == 0 || --txq->vtntx_watchdog) { VTNET_TXQ_UNLOCK(txq); return (0); } VTNET_TXQ_UNLOCK(txq); if_printf(ifp, "watchdog timeout on queue %d\n", txq->vtntx_id); return (1); } static void vtnet_accum_stats(struct vtnet_softc *sc, struct vtnet_rxq_stats *rxacc, struct vtnet_txq_stats *txacc) { bzero(rxacc, sizeof(struct vtnet_rxq_stats)); bzero(txacc, sizeof(struct vtnet_txq_stats)); for (int i = 0; i < sc->vtnet_max_vq_pairs; i++) { struct vtnet_rxq_stats *rxst; struct vtnet_txq_stats *txst; rxst = &sc->vtnet_rxqs[i].vtnrx_stats; rxacc->vrxs_ipackets += rxst->vrxs_ipackets; rxacc->vrxs_ibytes += rxst->vrxs_ibytes; rxacc->vrxs_iqdrops += rxst->vrxs_iqdrops; rxacc->vrxs_csum += rxst->vrxs_csum; rxacc->vrxs_csum_failed += rxst->vrxs_csum_failed; rxacc->vrxs_rescheduled += rxst->vrxs_rescheduled; txst = &sc->vtnet_txqs[i].vtntx_stats; txacc->vtxs_opackets += txst->vtxs_opackets; txacc->vtxs_obytes += txst->vtxs_obytes; txacc->vtxs_csum += txst->vtxs_csum; txacc->vtxs_tso += txst->vtxs_tso; txacc->vtxs_rescheduled += txst->vtxs_rescheduled; } } static uint64_t vtnet_get_counter(if_t ifp, ift_counter cnt) { struct vtnet_softc *sc; struct vtnet_rxq_stats rxaccum; struct vtnet_txq_stats txaccum; sc = if_getsoftc(ifp, IF_DRIVER_SOFTC); vtnet_accum_stats(sc, &rxaccum, &txaccum); switch (cnt) { case IFCOUNTER_IPACKETS: return (rxaccum.vrxs_ipackets); case IFCOUNTER_IQDROPS: return (rxaccum.vrxs_iqdrops); case IFCOUNTER_IERRORS: return (rxaccum.vrxs_ierrors); case IFCOUNTER_OPACKETS: return (txaccum.vtxs_opackets); case IFCOUNTER_OBYTES: return (txaccum.vtxs_obytes); case IFCOUNTER_OMCASTS: return (txaccum.vtxs_omcasts); default: return (if_get_counter_default(ifp, cnt)); } } static void vtnet_tick(void *xsc) { struct vtnet_softc *sc; if_t ifp; int i, timedout; sc = xsc; ifp = sc->vtnet_ifp; timedout = 0; VTNET_CORE_LOCK_ASSERT(sc); for (i = 0; i < sc->vtnet_act_vq_pairs; i++) timedout |= vtnet_watchdog(&sc->vtnet_txqs[i]); if (timedout != 0) { sc->vtnet_flags &= ~VTNET_FLAG_RUNNING; vtnet_init_locked(sc); } else callout_schedule(&sc->vtnet_tick_ch, hz); } static void vtnet_start_taskqueues(struct vtnet_softc *sc) { device_t dev; struct vtnet_rxq *rxq; struct vtnet_txq *txq; int i, error; dev = sc->vtnet_dev; /* * Errors here are very difficult to recover from - we cannot * easily fail because, if this is during boot, we will hang * when freeing any successfully started taskqueues because * the scheduler isn't up yet. * * Most drivers just ignore the return value - it only fails * with ENOMEM so an error is not likely. 
*/
	 */
	for (i = 0; i < sc->vtnet_max_vq_pairs; i++) {
		rxq = &sc->vtnet_rxqs[i];
		error = taskqueue_start_threads(&rxq->vtnrx_tq, 1, PI_NET,
		    "%s rxq %d", device_get_nameunit(dev), rxq->vtnrx_id);
		if (error) {
			device_printf(dev, "failed to start rx taskq %d\n",
			    rxq->vtnrx_id);
		}

		txq = &sc->vtnet_txqs[i];
		error = taskqueue_start_threads(&txq->vtntx_tq, 1, PI_NET,
		    "%s txq %d", device_get_nameunit(dev), txq->vtntx_id);
		if (error) {
			device_printf(dev, "failed to start tx taskq %d\n",
			    txq->vtntx_id);
		}
	}
}

static void
vtnet_free_taskqueues(struct vtnet_softc *sc)
{
	struct vtnet_rxq *rxq;
	struct vtnet_txq *txq;
	int i;

	for (i = 0; i < sc->vtnet_max_vq_pairs; i++) {
		rxq = &sc->vtnet_rxqs[i];
		if (rxq->vtnrx_tq != NULL) {
			taskqueue_free(rxq->vtnrx_tq);
			rxq->vtnrx_tq = NULL;
		}

		txq = &sc->vtnet_txqs[i];
		if (txq->vtntx_tq != NULL) {
			taskqueue_free(txq->vtntx_tq);
			txq->vtntx_tq = NULL;
		}
	}
}

static void
vtnet_drain_taskqueues(struct vtnet_softc *sc)
{
	struct vtnet_rxq *rxq;
	struct vtnet_txq *txq;
	int i;

	for (i = 0; i < sc->vtnet_max_vq_pairs; i++) {
		rxq = &sc->vtnet_rxqs[i];
		if (rxq->vtnrx_tq != NULL)
			taskqueue_drain(rxq->vtnrx_tq, &rxq->vtnrx_intrtask);

		txq = &sc->vtnet_txqs[i];
		if (txq->vtntx_tq != NULL) {
			taskqueue_drain(txq->vtntx_tq, &txq->vtntx_intrtask);
			taskqueue_drain(txq->vtntx_tq, &txq->vtntx_defrtask);
		}
	}
}

static void
vtnet_drain_rxtx_queues(struct vtnet_softc *sc)
{
	struct vtnet_rxq *rxq;
	struct vtnet_txq *txq;
	int i;

	for (i = 0; i < sc->vtnet_act_vq_pairs; i++) {
		rxq = &sc->vtnet_rxqs[i];
		vtnet_rxq_free_mbufs(rxq);

		txq = &sc->vtnet_txqs[i];
		vtnet_txq_free_mbufs(txq);
	}
}

static void
vtnet_stop_rendezvous(struct vtnet_softc *sc)
{
	struct vtnet_rxq *rxq;
	struct vtnet_txq *txq;
	int i;

	/*
	 * Lock and unlock the per-queue mutex so we know the stop
	 * state is visible. Doing only the active queues should be
	 * sufficient, but it does not cost much extra to do all the
	 * queues. Note we hold the core mutex here too.
	 */
	for (i = 0; i < sc->vtnet_max_vq_pairs; i++) {
		rxq = &sc->vtnet_rxqs[i];
		VTNET_RXQ_LOCK(rxq);
		VTNET_RXQ_UNLOCK(rxq);

		txq = &sc->vtnet_txqs[i];
		VTNET_TXQ_LOCK(txq);
		VTNET_TXQ_UNLOCK(txq);
	}
}

static void
vtnet_stop(struct vtnet_softc *sc)
{
	device_t dev;
	if_t ifp;

	dev = sc->vtnet_dev;
	ifp = sc->vtnet_ifp;

	VTNET_CORE_LOCK_ASSERT(sc);

	sc->vtnet_flags &= ~VTNET_FLAG_RUNNING;
	sc->vtnet_link_active = 0;
	callout_stop(&sc->vtnet_tick_ch);

	/* Only advisory. */
	vtnet_disable_interrupts(sc);

	/*
	 * Stop the host adapter. This resets it to the pre-initialized
	 * state. It will not generate any interrupts until after it is
	 * reinitialized.
	 */
	virtio_stop(dev);
	vtnet_stop_rendezvous(sc);

	/* Free any mbufs left in the virtqueues. */
	vtnet_drain_rxtx_queues(sc);
}

static int
vtnet_virtio_reinit(struct vtnet_softc *sc)
{
	device_t dev;
	if_t ifp;
	uint64_t features;
	uint32_t mask;
	int error;

	dev = sc->vtnet_dev;
	ifp = sc->vtnet_ifp;
	features = sc->vtnet_features;

	mask = 0;
#if defined(INET)
	mask |= IFCAP_RXCSUM;
#endif
#if defined (INET6)
	mask |= IFCAP_RXCSUM_IPV6;
#endif

	/*
	 * Re-negotiate with the host, removing any disabled receive
	 * features. Transmit features are disabled only on our side
	 * via if_capenable and if_hwassist.
	 *
	 * We require both IPv4 and IPv6 offloading to be enabled
	 * in order to negotiate it: VirtIO does not distinguish
	 * between the two.
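	 *
	 * For example, with INET and INET6 both compiled in, disabling
	 * receive checksums for either family (say via a hypothetical
	 * "ifconfig vtnet0 -rxcsum") leaves (vtnet_capenable & mask)
	 * short of the full mask, so the check below strips
	 * VIRTIO_NET_F_GUEST_CSUM for both families at once.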
*/ if ((sc->vtnet_capenable & mask) != mask) features &= ~VIRTIO_NET_F_GUEST_CSUM; if ((sc->vtnet_capenable & IFCAP_LRO) == 0) features &= ~VTNET_LRO_FEATURES; if ((sc->vtnet_capenable & IFCAP_VLAN_HWFILTER) == 0) features &= ~VIRTIO_NET_F_CTRL_VLAN; error = virtio_reinit(dev, features); if (error) device_printf(dev, "virtio reinit error %d\n", error); return (error); } static void vtnet_init_rx_filters(struct vtnet_softc *sc) { if_t ifp; ifp = sc->vtnet_ifp; if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) { /* Restore promiscuous and all-multicast modes. */ vtnet_rx_filter(sc); /* Restore filtered MAC addresses. */ vtnet_rx_filter_mac(sc); } if (sc->vtnet_capenable & IFCAP_VLAN_HWFILTER) vtnet_rx_filter_vlan(sc); } static int vtnet_init_rx_queues(struct vtnet_softc *sc) { device_t dev; struct vtnet_rxq *rxq; int i, clsize, error; dev = sc->vtnet_dev; /* * Use the new cluster size if one has been set (via a MTU * change). Otherwise, use the standard 2K clusters. * * BMV: It might make sense to use page sized clusters as * the default (depending on the features negotiated). */ if (sc->vtnet_rx_new_clsize != 0) { clsize = sc->vtnet_rx_new_clsize; sc->vtnet_rx_new_clsize = 0; } else clsize = MCLBYTES; sc->vtnet_rx_clsize = clsize; sc->vtnet_rx_nmbufs = VTNET_NEEDED_RX_MBUFS(sc, clsize); KASSERT(sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS || sc->vtnet_rx_nmbufs < sc->vtnet_rx_nsegs, ("%s: too many rx mbufs %d for %d segments", __func__, sc->vtnet_rx_nmbufs, sc->vtnet_rx_nsegs)); #ifdef DEV_NETMAP if (vtnet_netmap_init_rx_buffers(sc)) return 0; #endif /* DEV_NETMAP */ for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { rxq = &sc->vtnet_rxqs[i]; /* Hold the lock to satisfy asserts. */ VTNET_RXQ_LOCK(rxq); error = vtnet_rxq_populate(rxq); VTNET_RXQ_UNLOCK(rxq); if (error) { device_printf(dev, "cannot allocate mbufs for Rx queue %d\n", i); return (error); } } return (0); } static int vtnet_init_tx_queues(struct vtnet_softc *sc) { struct vtnet_txq *txq; int i; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) { txq = &sc->vtnet_txqs[i]; txq->vtntx_watchdog = 0; } return (0); } static int vtnet_init_rxtx_queues(struct vtnet_softc *sc) { int error; error = vtnet_init_rx_queues(sc); if (error) return (error); error = vtnet_init_tx_queues(sc); if (error) return (error); return (0); } static void vtnet_set_active_vq_pairs(struct vtnet_softc *sc) { device_t dev; int npairs; dev = sc->vtnet_dev; if ((sc->vtnet_flags & VTNET_FLAG_MULTIQ) == 0) { MPASS(sc->vtnet_max_vq_pairs == 1); sc->vtnet_act_vq_pairs = 1; return; } /* BMV: Just use the maximum configured for now. */ npairs = sc->vtnet_max_vq_pairs; if (vtnet_ctrl_mq_cmd(sc, npairs) != 0) { device_printf(dev, "cannot set active queue pairs to %d\n", npairs); npairs = 1; } sc->vtnet_act_vq_pairs = npairs; } static int vtnet_reinit(struct vtnet_softc *sc) { if_t ifp; int error; ifp = sc->vtnet_ifp; /* Use the current MAC address. */ bcopy(if_lladdr(ifp), sc->vtnet_hwaddr, ETHER_ADDR_LEN); vtnet_set_hwaddr(sc); vtnet_set_active_vq_pairs(sc); if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) vtnet_init_rx_filters(sc); error = vtnet_init_rxtx_queues(sc); if (error) return (error); vtnet_enable_interrupts(sc); sc->vtnet_flags |= VTNET_FLAG_RUNNING; return (0); } static void vtnet_init_locked(struct vtnet_softc *sc) { device_t dev; if_t ifp; dev = sc->vtnet_dev; ifp = sc->vtnet_ifp; VTNET_CORE_LOCK_ASSERT(sc); if (sc->vtnet_flags & VTNET_FLAG_RUNNING) return; vtnet_stop(sc); /* Reinitialize with the host. 
*/ if (vtnet_virtio_reinit(sc) != 0) goto fail; if (vtnet_reinit(sc) != 0) goto fail; virtio_reinit_complete(dev); vtnet_update_link_status(sc); callout_reset(&sc->vtnet_tick_ch, hz, vtnet_tick, sc); return; fail: vtnet_stop(sc); } static void vtnet_init(void *xsc) { struct vtnet_softc *sc; sc = xsc; #ifdef DEV_NETMAP if (!NA(sc->vtnet_ifp)) { D("try to attach again"); vtnet_netmap_attach(sc); } #endif /* DEV_NETMAP */ VTNET_CORE_LOCK(sc); vtnet_init_locked(sc); VTNET_CORE_UNLOCK(sc); } static void vtnet_free_ctrl_vq(struct vtnet_softc *sc) { struct virtqueue *vq; vq = sc->vtnet_ctrl_vq; /* * The control virtqueue is only polled and therefore it should * already be empty. */ KASSERT(virtqueue_empty(vq), ("%s: ctrl vq %p not empty", __func__, vq)); } static void vtnet_exec_ctrl_cmd(struct vtnet_softc *sc, void *cookie, struct sglist *sg, int readable, int writable) { struct virtqueue *vq; vq = sc->vtnet_ctrl_vq; VTNET_CORE_LOCK_ASSERT(sc); KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_VQ, ("%s: CTRL_VQ feature not negotiated", __func__)); if (!virtqueue_empty(vq)) return; if (virtqueue_enqueue(vq, cookie, sg, readable, writable) != 0) return; /* * Poll for the response, but the command is likely already * done when we return from the notify. */ virtqueue_notify(vq); virtqueue_poll(vq, NULL); } static int vtnet_ctrl_mac_cmd(struct vtnet_softc *sc, uint8_t *hwaddr) { struct virtio_net_ctrl_hdr hdr __aligned(2); struct sglist_seg segs[3]; struct sglist sg; uint8_t ack; int error; hdr.class = VIRTIO_NET_CTRL_MAC; hdr.cmd = VIRTIO_NET_CTRL_MAC_ADDR_SET; ack = VIRTIO_NET_ERR; sglist_init(&sg, 3, segs); error = 0; error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr)); error |= sglist_append(&sg, hwaddr, ETHER_ADDR_LEN); error |= sglist_append(&sg, &ack, sizeof(uint8_t)); KASSERT(error == 0 && sg.sg_nseg == 3, ("%s: error %d adding set MAC msg to sglist", __func__, error)); vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1); return (ack == VIRTIO_NET_OK ? 0 : EIO); } static int vtnet_ctrl_mq_cmd(struct vtnet_softc *sc, uint16_t npairs) { struct sglist_seg segs[3]; struct sglist sg; struct { struct virtio_net_ctrl_hdr hdr; uint8_t pad1; struct virtio_net_ctrl_mq mq; uint8_t pad2; uint8_t ack; } s __aligned(2); int error; s.hdr.class = VIRTIO_NET_CTRL_MQ; s.hdr.cmd = VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET; s.mq.virtqueue_pairs = npairs; s.ack = VIRTIO_NET_ERR; sglist_init(&sg, 3, segs); error = 0; error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr)); error |= sglist_append(&sg, &s.mq, sizeof(struct virtio_net_ctrl_mq)); error |= sglist_append(&sg, &s.ack, sizeof(uint8_t)); KASSERT(error == 0 && sg.sg_nseg == 3, ("%s: error %d adding MQ message to sglist", __func__, error)); vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1); return (s.ack == VIRTIO_NET_OK ? 
0 : EIO); } static int vtnet_ctrl_rx_cmd(struct vtnet_softc *sc, int cmd, int on) { struct sglist_seg segs[3]; struct sglist sg; struct { struct virtio_net_ctrl_hdr hdr; uint8_t pad1; uint8_t onoff; uint8_t pad2; uint8_t ack; } s __aligned(2); int error; KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_RX, ("%s: CTRL_RX feature not negotiated", __func__)); s.hdr.class = VIRTIO_NET_CTRL_RX; s.hdr.cmd = cmd; s.onoff = !!on; s.ack = VIRTIO_NET_ERR; sglist_init(&sg, 3, segs); error = 0; error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr)); error |= sglist_append(&sg, &s.onoff, sizeof(uint8_t)); error |= sglist_append(&sg, &s.ack, sizeof(uint8_t)); KASSERT(error == 0 && sg.sg_nseg == 3, ("%s: error %d adding Rx message to sglist", __func__, error)); vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1); return (s.ack == VIRTIO_NET_OK ? 0 : EIO); } static int vtnet_set_promisc(struct vtnet_softc *sc, int on) { return (vtnet_ctrl_rx_cmd(sc, VIRTIO_NET_CTRL_RX_PROMISC, on)); } static int vtnet_set_allmulti(struct vtnet_softc *sc, int on) { return (vtnet_ctrl_rx_cmd(sc, VIRTIO_NET_CTRL_RX_ALLMULTI, on)); } /* * The device defaults to promiscuous mode for backwards compatibility. * Turn it off at attach time if possible. */ static void vtnet_attach_disable_promisc(struct vtnet_softc *sc) { struct ifreq ifr; if_t ifp; ifp = sc->vtnet_ifp; VTNET_CORE_LOCK(sc); if ((sc->vtnet_flags & VTNET_FLAG_CTRL_RX) == 0) { (void )if_drvioctl(SIOCGIFFLAGS, ifp, &ifr, curthread); - ifr.ifr_flagslow |= IFF_PROMISC; + ifr.ifr_flags |= IFF_PROMISC; (void )if_drvioctl(SIOCSIFFLAGS, ifp, &ifr, curthread); } else if (vtnet_set_promisc(sc, 0) != 0) { (void )if_drvioctl(SIOCGIFFLAGS, ifp, &ifr, curthread); - ifr.ifr_flagslow |= IFF_PROMISC; + ifr.ifr_flags |= IFF_PROMISC; (void )if_drvioctl(SIOCSIFFLAGS, ifp, &ifr, curthread); device_printf(sc->vtnet_dev, "cannot disable default promiscuous mode\n"); } VTNET_CORE_UNLOCK(sc); } static void vtnet_rx_filter(struct vtnet_softc *sc) { device_t dev; if_t ifp; dev = sc->vtnet_dev; ifp = sc->vtnet_ifp; VTNET_CORE_LOCK_ASSERT(sc); if (vtnet_set_promisc(sc, sc->vtnet_if_flags & IFF_PROMISC) != 0) device_printf(dev, "cannot %s promiscuous mode\n", sc->vtnet_if_flags & IFF_PROMISC ? "enable" : "disable"); if (vtnet_set_allmulti(sc, sc->vtnet_if_flags & IFF_ALLMULTI) != 0) device_printf(dev, "cannot %s all-multicast mode\n", sc->vtnet_if_flags & IFF_ALLMULTI ? 
"enable" : "disable"); } static void vtnet_copy_unicast_mac(void *arg, struct sockaddr *addr, struct sockaddr *dstaddr, struct sockaddr *mask) { struct vtnet_softc *sc = arg; struct vtnet_mac_filter *filter = sc->vtnet_mac_filter; struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr; if (addr->sa_family != AF_LINK) return; if (memcmp(LLADDR(sdl), sc->vtnet_hwaddr, ETHER_ADDR_LEN) == 0) return; if (filter->vmf_unicast.nentries == VTNET_MAX_MAC_ENTRIES) { filter->vmf_unicast.nentries++; return; } bcopy(LLADDR(sdl), &filter->vmf_unicast.macs[filter->vmf_unicast.nentries++], ETHER_ADDR_LEN); } static void vtnet_copy_multicast_mac(void *arg, struct sockaddr *maddr) { struct vtnet_softc *sc = arg; struct vtnet_mac_filter *filter = sc->vtnet_mac_filter; struct sockaddr_dl *sdl = (struct sockaddr_dl *)maddr; if (maddr->sa_family != AF_LINK) return; if (filter->vmf_multicast.nentries == VTNET_MAX_MAC_ENTRIES) { filter->vmf_multicast.nentries++; return; } bcopy(LLADDR(sdl), &filter->vmf_multicast.macs[filter->vmf_multicast.nentries++], ETHER_ADDR_LEN); } static void vtnet_rx_filter_mac(struct vtnet_softc *sc) { struct virtio_net_ctrl_hdr hdr __aligned(2); struct vtnet_mac_filter *filter; struct sglist_seg segs[4]; struct sglist sg; if_t ifp; int promisc, allmulti, error; uint8_t ack; ifp = sc->vtnet_ifp; filter = sc->vtnet_mac_filter; promisc = 0; allmulti = 0; VTNET_CORE_LOCK_ASSERT(sc); KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_RX, ("%s: CTRL_RX feature not negotiated", __func__)); /* Unicast MAC addresses: */ if_foreach_addr(ifp, vtnet_copy_unicast_mac, sc); if (filter->vmf_unicast.nentries > VTNET_MAX_MAC_ENTRIES) { promisc = 1; filter->vmf_unicast.nentries = 0; if_printf(ifp, "more than %d MAC addresses assigned, " "falling back to promiscuous mode\n", VTNET_MAX_MAC_ENTRIES); } /* Multicast MAC addresses: */ if_foreach_maddr(ifp, vtnet_copy_multicast_mac, sc); if (filter->vmf_multicast.nentries > VTNET_MAX_MAC_ENTRIES) { allmulti = 1; filter->vmf_multicast.nentries = 0; if_printf(ifp, "more than %d multicast MAC addresses " "assigned, falling back to all-multicast mode\n", VTNET_MAX_MAC_ENTRIES); } if (promisc != 0 && allmulti != 0) goto out; hdr.class = VIRTIO_NET_CTRL_MAC; hdr.cmd = VIRTIO_NET_CTRL_MAC_TABLE_SET; ack = VIRTIO_NET_ERR; sglist_init(&sg, 4, segs); error = 0; error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr)); error |= sglist_append(&sg, &filter->vmf_unicast, sizeof(uint32_t) + filter->vmf_unicast.nentries * ETHER_ADDR_LEN); error |= sglist_append(&sg, &filter->vmf_multicast, sizeof(uint32_t) + filter->vmf_multicast.nentries * ETHER_ADDR_LEN); error |= sglist_append(&sg, &ack, sizeof(uint8_t)); KASSERT(error == 0 && sg.sg_nseg == 4, ("%s: error %d adding MAC filter msg to sglist", __func__, error)); vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1); if (ack != VIRTIO_NET_OK) if_printf(ifp, "error setting host MAC filter table\n"); out: if (promisc != 0 && vtnet_set_promisc(sc, 1) != 0) if_printf(ifp, "cannot enable promiscuous mode\n"); if (allmulti != 0 && vtnet_set_allmulti(sc, 1) != 0) if_printf(ifp, "cannot enable all-multicast mode\n"); } static int vtnet_exec_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag) { struct sglist_seg segs[3]; struct sglist sg; struct { struct virtio_net_ctrl_hdr hdr; uint8_t pad1; uint16_t tag; uint8_t pad2; uint8_t ack; } s __aligned(2); int error; s.hdr.class = VIRTIO_NET_CTRL_VLAN; s.hdr.cmd = add ? 
VIRTIO_NET_CTRL_VLAN_ADD : VIRTIO_NET_CTRL_VLAN_DEL; s.tag = tag; s.ack = VIRTIO_NET_ERR; sglist_init(&sg, 3, segs); error = 0; error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr)); error |= sglist_append(&sg, &s.tag, sizeof(uint16_t)); error |= sglist_append(&sg, &s.ack, sizeof(uint8_t)); KASSERT(error == 0 && sg.sg_nseg == 3, ("%s: error %d adding VLAN message to sglist", __func__, error)); vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1); return (s.ack == VIRTIO_NET_OK ? 0 : EIO); } static void vtnet_rx_filter_vlan(struct vtnet_softc *sc) { uint32_t w; uint16_t tag; int i, bit; VTNET_CORE_LOCK_ASSERT(sc); KASSERT(sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER, ("%s: VLAN_FILTER feature not negotiated", __func__)); /* Enable the filter for each configured VLAN. */ for (i = 0; i < VTNET_VLAN_FILTER_NWORDS; i++) { w = sc->vtnet_vlan_filter[i]; while ((bit = ffs(w) - 1) != -1) { w &= ~(1 << bit); tag = sizeof(w) * CHAR_BIT * i + bit; if (vtnet_exec_vlan_filter(sc, 1, tag) != 0) { device_printf(sc->vtnet_dev, "cannot enable VLAN %d filter\n", tag); } } } } static void vtnet_update_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag) { if_t ifp; int idx, bit; ifp = sc->vtnet_ifp; idx = (tag >> 5) & 0x7F; bit = tag & 0x1F; if (tag == 0 || tag > 4095) return; VTNET_CORE_LOCK(sc); if (add) sc->vtnet_vlan_filter[idx] |= (1 << bit); else sc->vtnet_vlan_filter[idx] &= ~(1 << bit); if ((sc->vtnet_capenable & IFCAP_VLAN_HWFILTER) && vtnet_exec_vlan_filter(sc, add, tag) != 0) { device_printf(sc->vtnet_dev, "cannot %s VLAN %d %s the host filter table\n", add ? "add" : "remove", tag, add ? "to" : "from"); } VTNET_CORE_UNLOCK(sc); } static void vtnet_register_vlan(void *arg, if_t ifp, uint16_t tag) { if (if_getsoftc(ifp, IF_DRIVER_SOFTC) != arg) return; vtnet_update_vlan_filter(arg, 1, tag); } static void vtnet_unregister_vlan(void *arg, if_t ifp, uint16_t tag) { if (if_getsoftc(ifp, IF_DRIVER_SOFTC) != arg) return; vtnet_update_vlan_filter(arg, 0, tag); } static int vtnet_is_link_up(struct vtnet_softc *sc) { device_t dev; if_t ifp; uint16_t status; dev = sc->vtnet_dev; ifp = sc->vtnet_ifp; if (!virtio_with_feature(dev, VIRTIO_NET_F_STATUS)) status = VIRTIO_NET_S_LINK_UP; else status = virtio_read_dev_config_2(dev, offsetof(struct virtio_net_config, status)); return ((status & VIRTIO_NET_S_LINK_UP) != 0); } static void vtnet_update_link_status(struct vtnet_softc *sc) { if_t ifp; int link; ifp = sc->vtnet_ifp; VTNET_CORE_LOCK_ASSERT(sc); link = vtnet_is_link_up(sc); /* Notify if the link status has changed. 
*/ if (link != 0 && sc->vtnet_link_active == 0) { sc->vtnet_link_active = 1; if_link_state_change(ifp, LINK_STATE_UP); } else if (link == 0 && sc->vtnet_link_active != 0) { sc->vtnet_link_active = 0; if_link_state_change(ifp, LINK_STATE_DOWN); } } static int vtnet_ifmedia_upd(if_t ifp) { struct vtnet_softc *sc; struct ifmedia *ifm; sc = if_getsoftc(ifp, IF_DRIVER_SOFTC); ifm = &sc->vtnet_media; if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) return (EINVAL); return (0); } static void vtnet_ifmedia_sts(if_t ifp, struct ifmediareq *ifmr) { struct vtnet_softc *sc; sc = if_getsoftc(ifp, IF_DRIVER_SOFTC); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; VTNET_CORE_LOCK(sc); if (vtnet_is_link_up(sc) != 0) { ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= VTNET_MEDIATYPE; } else ifmr->ifm_active |= IFM_NONE; VTNET_CORE_UNLOCK(sc); } static void vtnet_set_hwaddr(struct vtnet_softc *sc) { device_t dev; int i; dev = sc->vtnet_dev; if (sc->vtnet_flags & VTNET_FLAG_CTRL_MAC) { if (vtnet_ctrl_mac_cmd(sc, sc->vtnet_hwaddr) != 0) device_printf(dev, "unable to set MAC address\n"); } else if (sc->vtnet_flags & VTNET_FLAG_MAC) { for (i = 0; i < ETHER_ADDR_LEN; i++) { virtio_write_dev_config_1(dev, offsetof(struct virtio_net_config, mac) + i, sc->vtnet_hwaddr[i]); } } } static void vtnet_get_hwaddr(struct vtnet_softc *sc) { device_t dev; int i; dev = sc->vtnet_dev; if ((sc->vtnet_flags & VTNET_FLAG_MAC) == 0) { /* * Generate a random locally administered unicast address. * * It would be nice to generate the same MAC address across * reboots, but it seems all the hosts currently available * support the MAC feature, so this isn't too important. */ sc->vtnet_hwaddr[0] = 0xB2; arc4rand(&sc->vtnet_hwaddr[1], ETHER_ADDR_LEN - 1, 0); vtnet_set_hwaddr(sc); return; } for (i = 0; i < ETHER_ADDR_LEN; i++) { sc->vtnet_hwaddr[i] = virtio_read_dev_config_1(dev, offsetof(struct virtio_net_config, mac) + i); } } static void vtnet_vlan_tag_remove(struct mbuf *m) { struct ether_vlan_header *evh; evh = mtod(m, struct ether_vlan_header *); m->m_pkthdr.ether_vtag = ntohs(evh->evl_tag); m->m_flags |= M_VLANTAG; /* Strip the 802.1Q header. */ bcopy((char *) evh, (char *) evh + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); } static void vtnet_set_rx_process_limit(struct vtnet_softc *sc) { int limit; limit = vtnet_tunable_int(sc, "rx_process_limit", vtnet_rx_process_limit); if (limit < 0) limit = INT_MAX; sc->vtnet_rx_process_limit = limit; } static void vtnet_set_tx_intr_threshold(struct vtnet_softc *sc) { device_t dev; int size, thresh; dev = sc->vtnet_dev; size = virtqueue_size(sc->vtnet_txqs[0].vtntx_vq); /* * The Tx interrupt is disabled until the queue free count falls * below our threshold. Completed frames are drained from the Tx * virtqueue before transmitting new frames and in the watchdog * callout, so the frequency of Tx interrupts is greatly reduced, * at the cost of not freeing mbufs as quickly as they otherwise * would be. * * N.B. We assume all the Tx queues are the same size. */ thresh = size / 4; /* * Without indirect descriptors, leave enough room for the most * segments we handle. 
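	 *
	 * As a worked example, a 256-entry Tx virtqueue (an assumed,
	 * typical size) gives thresh = 256 / 4 = 64: completions are
	 * reaped opportunistically until no more than 64 descriptors
	 * remain free, and only then does vtnet_txq_enable_intr()
	 * request a postponed completion interrupt. Without indirect
	 * descriptors a single frame can consume up to vtnet_tx_nsegs
	 * descriptors, hence the clamp below.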
*/ if ((sc->vtnet_flags & VTNET_FLAG_INDIRECT) == 0 && thresh < sc->vtnet_tx_nsegs) thresh = sc->vtnet_tx_nsegs; sc->vtnet_tx_intr_thresh = thresh; } static void vtnet_setup_rxq_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct vtnet_rxq *rxq) { struct sysctl_oid *node; struct sysctl_oid_list *list; struct vtnet_rxq_stats *stats; char namebuf[16]; snprintf(namebuf, sizeof(namebuf), "rxq%d", rxq->vtnrx_id); node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Receive Queue"); list = SYSCTL_CHILDREN(node); stats = &rxq->vtnrx_stats; SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ipackets", CTLFLAG_RD, &stats->vrxs_ipackets, "Receive packets"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ibytes", CTLFLAG_RD, &stats->vrxs_ibytes, "Receive bytes"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "iqdrops", CTLFLAG_RD, &stats->vrxs_iqdrops, "Receive drops"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ierrors", CTLFLAG_RD, &stats->vrxs_ierrors, "Receive errors"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum", CTLFLAG_RD, &stats->vrxs_csum, "Receive checksum offloaded"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum_failed", CTLFLAG_RD, &stats->vrxs_csum_failed, "Receive checksum offload failed"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "rescheduled", CTLFLAG_RD, &stats->vrxs_rescheduled, "Receive interrupt handler rescheduled"); } static void vtnet_setup_txq_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct vtnet_txq *txq) { struct sysctl_oid *node; struct sysctl_oid_list *list; struct vtnet_txq_stats *stats; char namebuf[16]; snprintf(namebuf, sizeof(namebuf), "txq%d", txq->vtntx_id); node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD, NULL, "Transmit Queue"); list = SYSCTL_CHILDREN(node); stats = &txq->vtntx_stats; SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "opackets", CTLFLAG_RD, &stats->vtxs_opackets, "Transmit packets"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "obytes", CTLFLAG_RD, &stats->vtxs_obytes, "Transmit bytes"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "omcasts", CTLFLAG_RD, &stats->vtxs_omcasts, "Transmit multicasts"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum", CTLFLAG_RD, &stats->vtxs_csum, "Transmit checksum offloaded"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "tso", CTLFLAG_RD, &stats->vtxs_tso, "Transmit segmentation offloaded"); SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "rescheduled", CTLFLAG_RD, &stats->vtxs_rescheduled, "Transmit interrupt handler rescheduled"); } static void vtnet_setup_queue_sysctl(struct vtnet_softc *sc) { device_t dev; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; int i; dev = sc->vtnet_dev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); for (i = 0; i < sc->vtnet_max_vq_pairs; i++) { vtnet_setup_rxq_sysctl(ctx, child, &sc->vtnet_rxqs[i]); vtnet_setup_txq_sysctl(ctx, child, &sc->vtnet_txqs[i]); } } static void vtnet_setup_stat_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct vtnet_softc *sc) { struct vtnet_statistics *stats; struct vtnet_rxq_stats rxaccum; struct vtnet_txq_stats txaccum; vtnet_accum_stats(sc, &rxaccum, &txaccum); stats = &sc->vtnet_stats; stats->rx_csum_offloaded = rxaccum.vrxs_csum; stats->rx_csum_failed = rxaccum.vrxs_csum_failed; stats->rx_task_rescheduled = rxaccum.vrxs_rescheduled; stats->tx_csum_offloaded = txaccum.vtxs_csum; stats->tx_tso_offloaded = txaccum.vtxs_tso; stats->tx_task_rescheduled = txaccum.vtxs_rescheduled; SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, 
"mbuf_alloc_failed", CTLFLAG_RD, &stats->mbuf_alloc_failed, "Mbuf cluster allocation failures"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_frame_too_large", CTLFLAG_RD, &stats->rx_frame_too_large, "Received frame larger than the mbuf chain"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_enq_replacement_failed", CTLFLAG_RD, &stats->rx_enq_replacement_failed, "Enqueuing the replacement receive mbuf failed"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_mergeable_failed", CTLFLAG_RD, &stats->rx_mergeable_failed, "Mergeable buffers receive failures"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_ethtype", CTLFLAG_RD, &stats->rx_csum_bad_ethtype, "Received checksum offloaded buffer with unsupported " "Ethernet type"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_ipproto", CTLFLAG_RD, &stats->rx_csum_bad_ipproto, "Received checksum offloaded buffer with incorrect IP protocol"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_offset", CTLFLAG_RD, &stats->rx_csum_bad_offset, "Received checksum offloaded buffer with incorrect offset"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_proto", CTLFLAG_RD, &stats->rx_csum_bad_proto, "Received checksum offloaded buffer with incorrect protocol"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_failed", CTLFLAG_RD, &stats->rx_csum_failed, "Received buffer checksum offload failed"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_offloaded", CTLFLAG_RD, &stats->rx_csum_offloaded, "Received buffer checksum offload succeeded"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_task_rescheduled", CTLFLAG_RD, &stats->rx_task_rescheduled, "Times the receive interrupt task rescheduled itself"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_csum_bad_ethtype", CTLFLAG_RD, &stats->tx_csum_bad_ethtype, "Aborted transmit of checksum offloaded buffer with unknown " "Ethernet type"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_bad_ethtype", CTLFLAG_RD, &stats->tx_tso_bad_ethtype, "Aborted transmit of TSO buffer with unknown Ethernet type"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_not_tcp", CTLFLAG_RD, &stats->tx_tso_not_tcp, "Aborted transmit of TSO buffer with non TCP protocol"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_defragged", CTLFLAG_RD, &stats->tx_defragged, "Transmit mbufs defragged"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_defrag_failed", CTLFLAG_RD, &stats->tx_defrag_failed, "Aborted transmit of buffer because defrag failed"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_csum_offloaded", CTLFLAG_RD, &stats->tx_csum_offloaded, "Offloaded checksum of transmitted buffer"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_offloaded", CTLFLAG_RD, &stats->tx_tso_offloaded, "Segmentation offload of transmitted buffer"); SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_task_rescheduled", CTLFLAG_RD, &stats->tx_task_rescheduled, "Times the transmit interrupt task rescheduled itself"); } static void vtnet_setup_sysctl(struct vtnet_softc *sc) { device_t dev; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; dev = sc->vtnet_dev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "max_vq_pairs", CTLFLAG_RD, &sc->vtnet_max_vq_pairs, 0, "Maximum number of supported virtqueue pairs"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "act_vq_pairs", CTLFLAG_RD, &sc->vtnet_act_vq_pairs, 0, "Number of active virtqueue pairs"); vtnet_setup_stat_sysctl(ctx, child, sc); } static int vtnet_rxq_enable_intr(struct vtnet_rxq *rxq) { return 
(virtqueue_enable_intr(rxq->vtnrx_vq)); } static void vtnet_rxq_disable_intr(struct vtnet_rxq *rxq) { virtqueue_disable_intr(rxq->vtnrx_vq); } static int vtnet_txq_enable_intr(struct vtnet_txq *txq) { struct virtqueue *vq; vq = txq->vtntx_vq; if (vtnet_txq_below_threshold(txq) != 0) return (virtqueue_postpone_intr(vq, VQ_POSTPONE_LONG)); /* * The free count is above our threshold. Keep the Tx interrupt * disabled until the queue is fuller. */ return (0); } static void vtnet_txq_disable_intr(struct vtnet_txq *txq) { virtqueue_disable_intr(txq->vtntx_vq); } static void vtnet_enable_rx_interrupts(struct vtnet_softc *sc) { int i; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) vtnet_rxq_enable_intr(&sc->vtnet_rxqs[i]); } static void vtnet_enable_tx_interrupts(struct vtnet_softc *sc) { int i; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) vtnet_txq_enable_intr(&sc->vtnet_txqs[i]); } static void vtnet_enable_interrupts(struct vtnet_softc *sc) { vtnet_enable_rx_interrupts(sc); vtnet_enable_tx_interrupts(sc); } static void vtnet_disable_rx_interrupts(struct vtnet_softc *sc) { int i; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) vtnet_rxq_disable_intr(&sc->vtnet_rxqs[i]); } static void vtnet_disable_tx_interrupts(struct vtnet_softc *sc) { int i; for (i = 0; i < sc->vtnet_act_vq_pairs; i++) vtnet_txq_disable_intr(&sc->vtnet_txqs[i]); } static void vtnet_disable_interrupts(struct vtnet_softc *sc) { vtnet_disable_rx_interrupts(sc); vtnet_disable_tx_interrupts(sc); } static int vtnet_tunable_int(struct vtnet_softc *sc, const char *knob, int def) { char path[64]; snprintf(path, sizeof(path), "hw.vtnet.%d.%s", device_get_unit(sc->vtnet_dev), knob); TUNABLE_INT_FETCH(path, &def); return (def); } Index: projects/ifnet/sys/net/if.c =================================================================== --- projects/ifnet/sys/net/if.c (revision 277599) +++ projects/ifnet/sys/net/if.c (revision 277600) @@ -1,3670 +1,3676 @@ /*- * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)if.c 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #ifdef INET #include #endif /* INET */ #ifdef INET6 #include #include #endif /* INET6 */ #endif /* INET || INET6 */ #include #ifdef COMPAT_FREEBSD32 #include #include #endif SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management"); int ifqmaxlen = IFQ_MAXLEN; SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN, &ifqmaxlen, 0, "max send queue size"); /* Log link state change events */ static int log_link_state_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW, &log_link_state_change, 0, "log interface link state change events"); /* Interface description */ static unsigned int ifdescr_maxlen = 1024; SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW, &ifdescr_maxlen, 0, "administrative maximum length for interface description"); static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions"); /* global sx for non-critical path ifdescr */ static struct sx ifdescr_sx; SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr"); void (*bridge_linkstate_p)(struct ifnet *ifp); void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); void (*lagg_linkstate_p)(struct ifnet *ifp, int state); /* These are external hooks for CARP. */ void (*carp_linkstate_p)(struct ifnet *ifp); void (*carp_demote_adj_p)(int, char *); int (*carp_master_p)(struct ifaddr *); #if defined(INET) || defined(INET6) int (*carp_forus_p)(struct ifnet *ifp, u_char *dhost); int (*carp_output_p)(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa); int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); int (*carp_attach_p)(struct ifaddr *, int); void (*carp_detach_p)(struct ifaddr *); #endif #ifdef INET int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6); caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr); #endif struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL; /* * XXX: Style; these should be sorted alphabetically, and unprototyped * static functions should be prototyped. Currently they are sorted by * declaration order. 
 */
static void	if_attachdomain(void *);
static void	if_attachdomain1(struct ifnet *);
static int	ifconf(u_long, caddr_t);
static void	if_freemulti(struct ifmultiaddr *);
static void	if_grow(void);
static void	if_route(struct ifnet *, int flag, int fam);
static int	if_setflag(struct ifnet *, int, int, int *, int);
static void	if_unroute(struct ifnet *, int flag, int fam);
static void	link_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
static int	if_rtdel(struct radix_node *, void *);
static int	if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int);
static void	do_link_state_change(void *, int);
static int	if_getgroup(struct ifgroupreq *, struct ifnet *);
static int	if_getgroupmembers(struct ifgroupreq *);
static void	if_delgroups(struct ifnet *);
static void	if_attach_internal(struct ifnet *, int);
static void	if_detach_internal(struct ifnet *, int);
static struct ifqueue *if_snd_alloc(int);
static void	if_snd_free(struct ifqueue *);
static void	if_snd_qflush(if_t);

#ifdef INET6
/*
 * XXX: declare here to avoid including many inet6 related files..
 * should be more generalized?
 */
extern void	nd6_setmtu(struct ifnet *);
#endif

VNET_DEFINE(int, if_index);
VNET_DEFINE(struct ifnethead, ifnet);	/* depend on static init XXX */
VNET_DEFINE(struct ifgrouphead, ifg_head);

static VNET_DEFINE(int, if_indexlim) = 8;

/* Table of ifnet by index. */
VNET_DEFINE(struct ifnet **, ifindex_table);

#define	V_if_indexlim		VNET(if_indexlim)
#define	V_ifindex_table		VNET(ifindex_table)

static struct iftsomax default_tsomax = {
	/*
	 * The TSO defaults need to be such that an NFS mbuf list of 35
	 * mbufs totalling just below 64K works and that a chain of mbufs
	 * can be defragged into at most 32 segments.
	 */
	.tsomax_bytes = MIN(IP_MAXPACKET, (32 * MCLBYTES) -
	    (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)),
	.tsomax_segcount = 35,
	.tsomax_segsize = 2048,
};

/*
 * The global network interface list (V_ifnet) and related state (such as
 * if_index, if_indexlim, and ifindex_table) are protected by an sxlock and
 * an rwlock. Either may be acquired shared to stabilize the list, but both
 * must be acquired writable to modify the list. This model allows us both
 * to stabilize the interface list during interrupt thread processing and
 * to stabilize it over long-running ioctls, without introducing priority
 * inversions and deadlocks.
 */
struct rwlock ifnet_rwlock;
RW_SYSINIT_FLAGS(ifnet_rw, &ifnet_rwlock, "ifnet_rw", RW_RECURSE);
struct sx ifnet_sxlock;
SX_SYSINIT_FLAGS(ifnet_sx, &ifnet_sxlock, "ifnet_sx", SX_RECURSE);

/*
 * The allocation of network interfaces is a rather non-atomic affair; we
 * need to select an index before we are ready to expose the interface for
 * use, so will use this pointer value to indicate reservation.
 */
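
/*
 * A reserved slot reads as empty: ifnet_byindex_locked() below returns
 * NULL both for a NULL entry and for an entry still set to IFNET_HOLD,
 * so a half-constructed interface can never be looked up by its index.
 */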
*/ #define IFNET_HOLD (void *)(uintptr_t)(-1) static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals"); MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); static struct ifops ifdead_ops; struct ifnet * ifnet_byindex_locked(u_short idx) { if (idx > V_if_index) return (NULL); if (V_ifindex_table[idx] == IFNET_HOLD) return (NULL); return (V_ifindex_table[idx]); } struct ifnet * ifnet_byindex(u_short idx) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } struct ifnet * ifnet_byindex_ref(u_short idx) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); if (ifp == NULL || (ifp->if_flags & IFF_DYING)) { IFNET_RUNLOCK_NOSLEEP(); return (NULL); } if_ref(ifp); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } /* * Allocate an ifindex array entry. */ static void ifindex_alloc(struct ifnet *ifp) { u_short idx; IFNET_WLOCK(); retry: /* * Try to find an empty slot below V_if_index. If we fail, take the * next slot. */ for (idx = 1; idx <= V_if_index; idx++) { if (V_ifindex_table[idx] == NULL) break; } /* Catch if_index overflow. */ if (idx >= V_if_indexlim) { if_grow(); goto retry; } if (idx > V_if_index) V_if_index = idx; V_ifindex_table[idx] = ifp; ifp->if_index = idx; IFNET_WUNLOCK(); } static void ifindex_free(u_short idx) { IFNET_WLOCK_ASSERT(); V_ifindex_table[idx] = NULL; while (V_if_index > 0 && V_ifindex_table[V_if_index] == NULL) V_if_index--; } struct ifaddr * ifaddr_byindex(u_short idx) { struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); ifa = ifnet_byindex_locked(idx)->if_addr; if (ifa != NULL) ifa_ref(ifa); IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Network interface utility routines. * * Routines with ifa_ifwith* names take sockaddr *'s as * parameters. */ static void vnet_if_init(const void *unused __unused) { TAILQ_INIT(&V_ifnet); TAILQ_INIT(&V_ifg_head); IFNET_WLOCK(); if_grow(); /* create initial table */ IFNET_WUNLOCK(); vnet_if_clone_init(); } VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init, NULL); #ifdef VIMAGE static void vnet_if_uninit(const void *unused __unused) { VNET_ASSERT(TAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p " "not empty", __func__, __LINE__, &V_ifnet)); VNET_ASSERT(TAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p " "not empty", __func__, __LINE__, &V_ifg_head)); free((caddr_t)V_ifindex_table, M_IFNET); } VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_if_uninit, NULL); #endif static void if_grow(void) { int oldlim; u_int n; struct ifnet **e; IFNET_WLOCK_ASSERT(); oldlim = V_if_indexlim; IFNET_WUNLOCK(); n = (oldlim << 1) * sizeof(*e); e = malloc(n, M_IFNET, M_WAITOK | M_ZERO); IFNET_WLOCK(); if (V_if_indexlim != oldlim) { free(e, M_IFNET); return; } if (V_ifindex_table != NULL) { memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2); free((caddr_t)V_ifindex_table, M_IFNET); } V_if_indexlim <<= 1; V_ifindex_table = e; } /* * Registration/deregistration of interface types. A type can carry * common methods. Certain drivers depend on types to be loaded. 
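 *
 * A hypothetical sketch of how an L2 module would plug itself in (the
 * fooether_* names are illustrative; the members are the ones consumed
 * by ifdriver_bless() and if_attach() below):
 *
 *	static struct iftype fooether_iftype = {
 *		.ift_type = IFT_ETHER,
 *		.ift_ops = { .ifop_input = fooether_input },
 *		.ift_attach = fooether_attach,
 *		.ift_detach = fooether_detach,
 *	};
 *
 *	iftype_register(&fooether_iftype);	(at MOD_LOAD time)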
 */
static SLIST_HEAD(, iftype) iftypehead = SLIST_HEAD_INITIALIZER(iftypehead);

void
iftype_register(struct iftype *ift)
{

	IFNET_WLOCK();
	SLIST_INSERT_HEAD(&iftypehead, ift, ift_next);
	IFNET_WUNLOCK();
}

void
iftype_unregister(struct iftype *ift)
{

	IFNET_WLOCK();
	SLIST_REMOVE(&iftypehead, ift, iftype, ift_next);
	IFNET_WUNLOCK();
}

static struct iftype *
iftype_find(ifType type)
{
	struct iftype *ift;

	IFNET_RLOCK();
	SLIST_FOREACH(ift, &iftypehead, ift_next)
		if (ift->ift_type == type)
			break;
	IFNET_RUNLOCK();

	return (ift);
}

#define	ifdrv_flags	__ifdrv_stack_owned
#define	IFDRV_BLESSED	0x00000001

static void
ifdriver_bless(struct ifdriver *ifdrv, struct iftype *ift)
{

	if (ift != NULL) {
#define	COPY(op)	if (ifdrv->ifdrv_ops.op == NULL)	\
			ifdrv->ifdrv_ops.op = ift->ift_ops.op
		COPY(ifop_input);
		COPY(ifop_transmit);
		COPY(ifop_output);
		COPY(ifop_ioctl);
		COPY(ifop_get_counter);
		COPY(ifop_init);
		COPY(ifop_qflush);
		COPY(ifop_resolvemulti);
		COPY(ifop_reassign);
#undef COPY
#define	COPY(f)	if (ifdrv->ifdrv_ ## f == 0)	\
		ifdrv->ifdrv_ ## f = ift->ift_ ## f
		COPY(hdrlen);
		COPY(addrlen);
		COPY(dlt);
		COPY(dlt_hdrlen);
#undef COPY
	}

	/*
	 * If the driver has ifdrv_maxqlen defined, then it opts in for
	 * the generic software queue, and thus for the default
	 * ifop_qflush.
	 */
	if (ifdrv->ifdrv_maxqlen > 0) {
		KASSERT(ifdrv->ifdrv_ops.ifop_qflush == NULL,
		    ("%s: ifdrv_maxqlen > 0 and ifop_qflush",
		    ifdrv->ifdrv_name));
		ifdrv->ifdrv_ops.ifop_qflush = if_snd_qflush;
	}

	if (ifdrv->ifdrv_ops.ifop_get_counter == NULL)
		ifdrv->ifdrv_ops.ifop_get_counter = if_get_counter_default;

#if defined(INET) || defined(INET6)
	/* Use defaults for TSO, if nothing is set. */
	if (ifdrv->ifdrv_tsomax == NULL)
		ifdrv->ifdrv_tsomax = &default_tsomax;
	else
		KASSERT(ifdrv->ifdrv_tsomax->tsomax_bytes == 0 ||
		    ifdrv->ifdrv_tsomax->tsomax_bytes >= (IP_MAXPACKET / 8),
		    ("%s: tsomax_bytes is outside of range",
		    ifdrv->ifdrv_name));
#endif

	ifdrv->ifdrv_flags |= IFDRV_BLESSED;
}

/*
 * Allocate a struct ifnet and an index for an interface. A layer 2
 * common structure will also be allocated if an allocation routine is
 * registered for the passed type.
 *
 * The only reason for this function to fail is failure to allocate a
 * unit number, which is possible only if the driver does cloning.
 */
if_t
if_attach(struct if_attach_args *ifat)
{
	struct ifdriver *ifdrv;
	struct iftype *ift;
	struct ifnet *ifp;
	struct ifaddr *ifa;
	struct sockaddr_dl *sdl;
	int socksize, ifasize, namelen, masklen;

	KASSERT(ifat->ifat_version == IF_ATTACH_VERSION,
	    ("%s: version %d, expected %d",
	    __func__, ifat->ifat_version, IF_ATTACH_VERSION));

	ifdrv = ifat->ifat_drv;
	ift = iftype_find(ifdrv->ifdrv_type);
	if ((ifdrv->ifdrv_flags & IFDRV_BLESSED) == 0)
		ifdriver_bless(ifdrv, ift);

	if (ifdrv->ifdrv_clone != NULL) {
		int error;

		error = ifc_alloc_unit(ifdrv->ifdrv_clone, &ifat->ifat_dunit);
		if (error) {
			log(LOG_WARNING, "%s unit allocation failure: %d\n",
			    ifdrv->ifdrv_name, error);
			ifat->ifat_error = error;
			return (NULL);
		}
	}

	ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK | M_ZERO);
	for (int i = 0; i < IFCOUNTERS; i++)
		ifp->if_counters[i] = counter_u64_alloc(M_WAITOK);
#ifdef MAC
	mac_ifnet_init(ifp);
	mac_ifnet_create(ifp);
#endif

	ifp->if_ops = &ifdrv->ifdrv_ops;
	ifp->if_drv = ifdrv;
	ifp->if_type = ift;
#define	COPY(f)	ifp->if_ ## f = ifat->ifat_ ## f
	COPY(softc);
	COPY(mtu);
	COPY(flags);
	COPY(capabilities);
	COPY(capenable);
	COPY(hwassist);
	COPY(baudrate);
#undef COPY
	if (ifat->ifat_tsomax) {
		/*
		 * Driver wants dynamic tsomax on this interface; we
		 * will allocate one and are responsible for freeing
		 * it on detach.
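		 *
		 * A driver opts in by pointing ifat_tsomax at an initial
		 * limit before calling if_attach(), e.g. (hypothetical
		 * values, within the KASSERTed range below):
		 *
		 *	struct iftsomax tso = {
		 *		.tsomax_bytes = 65535,
		 *		.tsomax_segcount = 32,
		 *		.tsomax_segsize = 4096,
		 *	};
		 *	ifat.ifat_tsomax = &tso;
		 *
		 * and may later adjust the private copy with
		 * if_tsomax_update().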
		 */
		KASSERT(ifat->ifat_tsomax->tsomax_bytes == 0 ||
		    ifat->ifat_tsomax->tsomax_bytes >= (IP_MAXPACKET / 8),
		    ("%s: tsomax_bytes is outside of range",
		    ifdrv->ifdrv_name));
		ifp->if_tsomax = malloc(sizeof(struct iftsomax),
		    M_IFNET, M_WAITOK);
		bcopy(ifat->ifat_tsomax, ifp->if_tsomax,
		    sizeof(struct iftsomax));
	} else
		ifp->if_tsomax = ifdrv->ifdrv_tsomax;

	if (ifdrv->ifdrv_maxqlen > 0)
		ifp->if_snd = if_snd_alloc(ifdrv->ifdrv_maxqlen);

	IF_ADDR_LOCK_INIT(ifp);
	IF_AFDATA_LOCK_INIT(ifp);
	TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp);
	TAILQ_INIT(&ifp->if_addrhead);
	TAILQ_INIT(&ifp->if_multiaddrs);
	TAILQ_INIT(&ifp->if_groups);

	/* XXXGL: there is no check that name is unique. */
	ifp->if_dunit = ifat->ifat_dunit;
	if (ifat->ifat_name)
		strlcpy(ifp->if_xname, ifat->ifat_name, IFNAMSIZ);
	else if (ifat->ifat_dunit != IFAT_DUNIT_NONE)
		snprintf(ifp->if_xname, IFNAMSIZ, "%s%d",
		    ifdrv->ifdrv_name, ifat->ifat_dunit);
	else
		strlcpy(ifp->if_xname, ifdrv->ifdrv_name, IFNAMSIZ);

	ifindex_alloc(ifp);
	refcount_init(&ifp->if_refcount, 1);

	/*
	 * Allocate ifaddr to store link level address and name for this
	 * interface. Always save enough space for any possible name so
	 * we can do a rename in place later.
	 */
	namelen = strlen(ifp->if_xname);
	masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ;
	socksize = masklen + ifdrv->ifdrv_addrlen;
	if (socksize < sizeof(*sdl))
		socksize = sizeof(*sdl);
	socksize = roundup2(socksize, sizeof(long));
	ifasize = sizeof(*ifa) + 2 * socksize;
	ifa = ifa_alloc(ifasize, M_WAITOK);
	sdl = (struct sockaddr_dl *)(ifa + 1);
	sdl->sdl_len = socksize;
	sdl->sdl_family = AF_LINK;
	bcopy(ifp->if_xname, sdl->sdl_data, namelen);
	sdl->sdl_nlen = namelen;
	sdl->sdl_index = ifp->if_index;
	sdl->sdl_type = ifdrv->ifdrv_type;
	ifp->if_addr = ifa;
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = (struct sockaddr *)sdl;
	sdl = (struct sockaddr_dl *)(socksize + (char *)sdl);
	ifa->ifa_netmask = (struct sockaddr *)sdl;
	sdl->sdl_len = masklen;
	while (namelen != 0)
		sdl->sdl_data[--namelen] = 0xff;
	TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link);

	if (ift)
		ift->ift_attach(ifp, ifat);

	bpfattach(ifp, ifdrv->ifdrv_dlt, ifdrv->ifdrv_dlt_hdrlen);

	if_attach_internal(ifp, 0);

	return (ifp);
}

/*
 * Do the actual work of freeing a struct ifnet and its layer 2 common
 * structure. This call is made when the last reference to an
 * interface is released.
 */
static void
if_free_internal(struct ifnet *ifp)
{

	KASSERT((ifp->if_flags & IFF_DYING),
	    ("if_free_internal: interface not dying"));

#ifdef MAC
	mac_ifnet_destroy(ifp);
#endif /* MAC */
	if (ifp->if_description != NULL)
		free(ifp->if_description, M_IFDESCR);
	IF_AFDATA_DESTROY(ifp);
	IF_ADDR_LOCK_DESTROY(ifp);
	if (ifp->if_snd)
		if_snd_free(ifp->if_snd);
	for (int i = 0; i < IFCOUNTERS; i++)
		counter_u64_free(ifp->if_counters[i]);
	if (ifp->if_tsomax != ifp->if_drv->ifdrv_tsomax)
		free(ifp->if_tsomax, M_IFNET);
	free(ifp, M_IFNET);
}

void
if_mtap(if_t ifp, struct mbuf *m, void *data, u_int dlen)
{

	if (!bpf_peers_present(ifp->if_bpf))
		return;

	if (dlen == 0) {
		if (m->m_flags & M_VLANTAG)
			ether_vlan_mtap(ifp->if_bpf, m, NULL, 0);
		else
			bpf_mtap(ifp->if_bpf, m);
	} else
		bpf_mtap2(ifp->if_bpf, data, dlen, m);
}

/*
 * Interfaces to keep an ifnet type-stable despite the possibility of the
 * driver calling if_free(). If there are additional references, we defer
 * freeing the underlying data structure.
 */
void
if_ref(struct ifnet *ifp)
{

	/*
	 * We don't assert the ifnet list lock here, but arguably should.
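	 *
	 * Callers normally obtain their reference through a lookup and
	 * must pair it with if_rele() when done, e.g.:
	 *
	 *	ifp = ifnet_byindex_ref(idx);
	 *	if (ifp != NULL) {
	 *		... use ifp, safely across a concurrent detach ...
	 *		if_rele(ifp);
	 *	}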
	 */
	refcount_acquire(&ifp->if_refcount);
}

void
if_rele(struct ifnet *ifp)
{

	if (!refcount_release(&ifp->if_refcount))
		return;
	if_free_internal(ifp);
}

/*
 * Compute the least common TSO limit.
 */
void
if_tsomax_common(const struct iftsomax *from, struct iftsomax *to)
{

	/*
	 * 1) If there is no limit currently, take the limit from
	 * the network adapter.
	 *
	 * 2) If the network adapter has a limit below the current
	 * limit, apply it.
	 */
	if (to->tsomax_bytes == 0 || (from->tsomax_bytes != 0 &&
	    from->tsomax_bytes < to->tsomax_bytes)) {
		to->tsomax_bytes = from->tsomax_bytes;
	}
	if (to->tsomax_segcount == 0 || (from->tsomax_segcount != 0 &&
	    from->tsomax_segcount < to->tsomax_segcount)) {
		to->tsomax_segcount = from->tsomax_segcount;
	}
	if (to->tsomax_segsize == 0 || (from->tsomax_segsize != 0 &&
	    from->tsomax_segsize < to->tsomax_segsize)) {
		to->tsomax_segsize = from->tsomax_segsize;
	}
}

/*
 * Update TSO limit of a network adapter.
 *
 * Returns zero if no change. Else non-zero.
 */
int
if_tsomax_update(if_t ifp, const struct iftsomax *new)
{
	int retval = 0;

	KASSERT(ifp->if_tsomax != ifp->if_drv->ifdrv_tsomax,
	    ("%s: interface %s (driver %s) has static if_tsomax",
	    __func__, ifp->if_xname, ifp->if_drv->ifdrv_name));

	if (ifp->if_tsomax->tsomax_bytes != new->tsomax_bytes) {
		ifp->if_tsomax->tsomax_bytes = new->tsomax_bytes;
		retval++;
	}
	if (ifp->if_tsomax->tsomax_segsize != new->tsomax_segsize) {
		ifp->if_tsomax->tsomax_segsize = new->tsomax_segsize;
		retval++;
	}
	if (ifp->if_tsomax->tsomax_segcount != new->tsomax_segcount) {
		ifp->if_tsomax->tsomax_segcount = new->tsomax_segcount;
		retval++;
	}

	KASSERT(ifp->if_tsomax->tsomax_bytes == 0 ||
	    ifp->if_tsomax->tsomax_bytes >= (IP_MAXPACKET / 8),
	    ("%s: tsomax_bytes is outside of range", ifp->if_xname));

	return (retval);
}

static void
if_attach_internal(struct ifnet *ifp, int vmove)
{

	if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index))
		panic ("%s: BUG: if_attach called without if_alloc'd input()\n",
		    ifp->if_xname);

#ifdef VIMAGE
	ifp->if_vnet = curvnet;
	if (ifp->if_home_vnet == NULL)
		ifp->if_home_vnet = curvnet;
#endif

	if_addgroup(ifp, IFG_ALL);

	getmicrotime(&ifp->if_lastchange);
	ifp->if_epoch = time_uptime;

#ifdef VIMAGE
	/*
	 * Update the interface index in the link layer address
	 * of the interface.
	 */
	struct ifaddr *ifa;
	struct sockaddr_dl *sdl;

	for (ifa = ifp->if_addr; ifa != NULL;
	    ifa = TAILQ_NEXT(ifa, ifa_link)) {
		if (ifa->ifa_addr->sa_family == AF_LINK) {
			sdl = (struct sockaddr_dl *)ifa->ifa_addr;
			sdl->sdl_index = ifp->if_index;
		}
	}
#endif

	IFNET_WLOCK();
	TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link);
#ifdef VIMAGE
	curvnet->vnet_ifcnt++;
#endif
	IFNET_WUNLOCK();

	if (domain_init_status >= 2)
		if_attachdomain1(ifp);

	EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
	if (IS_DEFAULT_VNET(curvnet))
		devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);

	/* Announce the interface. */
	rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
}

static void
if_attachdomain(void *dummy)
{
	struct ifnet *ifp;

	TAILQ_FOREACH(ifp, &V_ifnet, if_link)
		if_attachdomain1(ifp);
}
SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND,
    if_attachdomain, NULL);

static void
if_attachdomain1(struct ifnet *ifp)
{
	struct domain *dp;

	/*
	 * Since dp->dom_ifattach calls malloc() with M_WAITOK, we
	 * cannot entirely lock ifp->if_afdata initialization.
*/ if (IF_AFDATA_TRYLOCK(ifp) == 0) return; if (ifp->if_afdata_initialized >= domain_init_status) { IF_AFDATA_UNLOCK(ifp); log(LOG_WARNING, "%s called more than once on %s\n", __func__, ifp->if_xname); return; } ifp->if_afdata_initialized = domain_init_status; IF_AFDATA_UNLOCK(ifp); /* address family dependent data region */ bzero(ifp->if_afdata, sizeof(ifp->if_afdata)); for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_ifattach) ifp->if_afdata[dp->dom_family] = (*dp->dom_ifattach)(ifp); } } /* * Remove any unicast or broadcast network addresses from an interface. */ void if_purgeaddrs(struct ifnet *ifp) { struct ifaddr *ifa, *next; TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family == AF_LINK) continue; #ifdef INET /* XXX: Ugly!! ad hoc just for INET */ if (ifa->ifa_addr->sa_family == AF_INET) { struct ifaliasreq ifr; bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL) == 0) continue; } #endif /* INET */ #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) { in6_purgeaddr(ifa); /* ifp_addrhead is already updated */ continue; } #endif /* INET6 */ TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); ifa_free(ifa); } } /* * Remove any multicast network addresses from an interface when an ifnet * is going away. */ static void if_purgemaddrs(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 1); IF_ADDR_WUNLOCK(ifp); } /* * Detach an interface, removing it from the list of "active" interfaces. * If vmove flag is set on entry to if_detach_internal(), perform only a * limited subset of cleanup tasks, given that we are moving an ifnet from * one vnet to another, where it must be fully operational. * * XXXRW: There are some significant questions about event ordering, and * how to prevent things from starting to use the interface during detach. */ void if_detach(if_t ifp) { ifp->if_flags |= IFF_DYING; /* XXX: Locking */ bpfdetach(ifp); #ifdef DEVICE_POLLING if (ifp->if_capenable & IFCAP_POLLING) if_poll_deregister(ifp); #endif CURVNET_SET_QUIET(ifp->if_vnet); if_detach_internal(ifp, 0); IFNET_WLOCK(); KASSERT(ifp == ifnet_byindex_locked(ifp->if_index), ("%s: freeing unallocated ifnet", ifp->if_xname)); ifindex_free(ifp->if_index); IFNET_WUNLOCK(); if (ifp->if_drv->ifdrv_clone != NULL) ifc_free_unit(ifp->if_drv->ifdrv_clone, ifp->if_dunit); if (refcount_release(&ifp->if_refcount)) if_free_internal(ifp); CURVNET_RESTORE(); } static void if_detach_internal(struct ifnet *ifp, int vmove) { struct ifaddr *ifa; struct radix_node_head *rnh; int i, j; struct domain *dp; struct ifnet *iter; int found = 0; IFNET_WLOCK(); TAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { TAILQ_REMOVE(&V_ifnet, ifp, if_link); found = 1; break; } #ifdef VIMAGE if (found) curvnet->vnet_ifcnt--; #endif IFNET_WUNLOCK(); if (!found) { if (vmove) panic("%s: ifp=%p not on the ifnet tailq %p", __func__, ifp, &V_ifnet); else return; /* XXX this should panic as well? */ } /* * Remove/wait for pending events. */ taskqueue_drain(taskqueue_swi, &ifp->if_linktask); /* * Remove routes and flush queues. 
	 */
	if_down(ifp);

#ifdef ALTQ
	if (ALTQ_IS_ENABLED(&ifp->if_snd))
		altq_disable(&ifp->if_snd);
	if (ALTQ_IS_ATTACHED(&ifp->if_snd))
		altq_detach(&ifp->if_snd);
#endif

	if_purgeaddrs(ifp);

#ifdef INET
	in_ifdetach(ifp);
#endif

#ifdef INET6
	/*
	 * Remove all IPv6 kernel structs related to ifp. This should be done
	 * before removing routing entries below, since IPv6 interface direct
	 * routes are expected to be removed by the IPv6-specific kernel API.
	 * Otherwise, the kernel will detect some inconsistency and complain
	 * about it.
	 */
	in6_ifdetach(ifp);
#endif
	if_purgemaddrs(ifp);

	/* Announce that the interface is gone. */
	rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
	EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
	if (IS_DEFAULT_VNET(curvnet))
		devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);

	if (!vmove) {
		struct iftype *ift = ifp->if_type;

		if (ift != NULL && ift->ift_detach != NULL)
			ift->ift_detach(ifp);

		/*
		 * Prevent further calls into the device driver via ifnet.
		 */
		ifp->if_ops = &ifdead_ops;

		/*
		 * Remove link ifaddr pointer and maybe decrement if_index.
		 * Clean up all addresses.
		 */
		ifp->if_addr = NULL;

		/* We can now free link ifaddr. */
		if (!TAILQ_EMPTY(&ifp->if_addrhead)) {
			ifa = TAILQ_FIRST(&ifp->if_addrhead);
			TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link);
			ifa_free(ifa);
		}
	}

	/*
	 * Delete all remaining routes using this interface.
	 * Unfortunately the only way to do this is to slog through
	 * the entire routing table looking for routes which point
	 * to this interface...oh well...
	 */
	for (i = 1; i <= AF_MAX; i++) {
		for (j = 0; j < rt_numfibs; j++) {
			rnh = rt_tables_get_rnh(j, i);
			if (rnh == NULL)
				continue;
			RADIX_NODE_HEAD_LOCK(rnh);
			(void) rnh->rnh_walktree(rnh, if_rtdel, ifp);
			RADIX_NODE_HEAD_UNLOCK(rnh);
		}
	}

	if_delgroups(ifp);

	/*
	 * We cannot hold the lock over dom_ifdetach calls as they might
	 * sleep, for example trying to drain a callout, thus open up the
	 * theoretical race with re-attaching.
	 */
	IF_AFDATA_LOCK(ifp);
	i = ifp->if_afdata_initialized;
	ifp->if_afdata_initialized = 0;
	IF_AFDATA_UNLOCK(ifp);
	for (dp = domains; i > 0 && dp; dp = dp->dom_next) {
		if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family])
			(*dp->dom_ifdetach)(ifp,
			    ifp->if_afdata[dp->dom_family]);
	}
}

#ifdef VIMAGE
/*
 * if_vmove() performs a limited version of if_detach() in current
 * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg.
 * An attempt is made to shrink if_index in the current vnet, find an
 * unused if_index in the target vnet, calling if_grow() if necessary,
 * and finally find an unused if_xname for the target vnet.
 */
void
if_vmove(struct ifnet *ifp, struct vnet *new_vnet)
{

	/*
	 * Detach from current vnet, but preserve LLADDR info, do not
	 * mark as dead etc. so that the ifnet can be reattached later.
	 */
	if_detach_internal(ifp, 1);

	/*
	 * Unlink the ifnet from ifindex_table[] in current vnet, and shrink
	 * the if_index for that vnet if possible.
	 *
	 * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized,
	 * or we'd lock on one vnet and unlock on another.
	 */
	IFNET_WLOCK();
	ifindex_free(ifp->if_index);
	IFNET_WUNLOCK();

	/*
	 * Perform interface-specific reassignment tasks, if provided by
	 * the driver.
	 */
	if (ifp->if_reassign != NULL)
		ifp->if_reassign(ifp, new_vnet, NULL);

	/*
	 * Switch to the context of the target vnet. ifindex_alloc()
	 * takes the write lock itself and enters the ifnet into the
	 * index table.
	 */
	CURVNET_SET_QUIET(new_vnet);
	ifindex_alloc(ifp);
	if_attach_internal(ifp, 1);
	CURVNET_RESTORE();
}

/*
 * Move an ifnet to or from another child prison/vnet, specified by the jail id.
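 *
 * In the stock tree these back the SIOCSIFVNET and SIOCSIFRVNET ioctls:
 * lending an interface to a jail (what "ifconfig epair0b vnet <jail>"
 * issues) lands in if_vmove_loan(), and reclaiming it (the "-vnet"
 * form) in if_vmove_reclaim().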
*/ static int if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid) { struct prison *pr; struct ifnet *difp; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Do not try to move the iface from and to the same prison. */ if (pr->pr_vnet == ifp->if_vnet) { prison_free(pr); return (EEXIST); } /* Make sure the named iface does not exist in the destination prison/vnet. */ /* XXX Lock interfaces to avoid races. */ CURVNET_SET_QUIET(pr->pr_vnet); difp = ifunit(ifname); CURVNET_RESTORE(); if (difp != NULL) { prison_free(pr); return (EEXIST); } /* Move the interface into the child jail/vnet. */ if_vmove(ifp, pr->pr_vnet); /* Report the new if_xname back to userland. */ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } static int if_vmove_reclaim(struct thread *td, char *ifname, int jid) { struct prison *pr; struct vnet *vnet_dst; struct ifnet *ifp; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Make sure the named iface exists in the source prison/vnet. */ CURVNET_SET(pr->pr_vnet); ifp = ifunit(ifname); /* XXX Lock to avoid races. */ if (ifp == NULL) { CURVNET_RESTORE(); prison_free(pr); return (ENXIO); } /* Do not try to move the iface from and to the same prison. */ vnet_dst = TD_TO_VNET(td); if (vnet_dst == ifp->if_vnet) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Get interface back from child jail/vnet. */ if_vmove(ifp, vnet_dst); CURVNET_RESTORE(); /* Report the new if_xname back to userland.
*/ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } #endif /* VIMAGE */ /* * Add a group to an interface */ int if_addgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_group *ifg = NULL; struct ifg_member *ifgm; int new = 0; if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' && groupname[strlen(groupname) - 1] <= '9') return (EINVAL); IFNET_WLOCK(); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) { IFNET_WUNLOCK(); return (EEXIST); } if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP, M_NOWAIT)) == NULL) { IFNET_WUNLOCK(); return (ENOMEM); } if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, groupname)) break; if (ifg == NULL) { if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); free(ifgm, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group)); ifg->ifg_refcnt = 0; TAILQ_INIT(&ifg->ifg_members); TAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); new = 1; } ifg->ifg_refcnt++; ifgl->ifgl_group = ifg; ifgm->ifgm_ifp = ifp; IF_ADDR_WLOCK(ifp); TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next); TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); IFNET_WUNLOCK(); if (new) EVENTHANDLER_INVOKE(group_attach_event, ifg); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove a group from an interface */ int if_delgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_member *ifgm; IFNET_WLOCK(); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) break; if (ifgl == NULL) { IFNET_WUNLOCK(); return (ENOENT); } IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) { TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next); free(ifgm, M_TEMP); } if (--ifgl->ifgl_group->ifg_refcnt == 0) { TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); IFNET_WUNLOCK(); EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } else IFNET_WUNLOCK(); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove an interface from all groups */ static void if_delgroups(struct ifnet *ifp) { struct ifg_list *ifgl; struct ifg_member *ifgm; char groupname[IFNAMSIZ]; IFNET_WLOCK(); while (!TAILQ_EMPTY(&ifp->if_groups)) { ifgl = TAILQ_FIRST(&ifp->if_groups); strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ); IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) { TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next); free(ifgm, M_TEMP); } if (--ifgl->ifgl_group->ifg_refcnt == 0) { TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); IFNET_WUNLOCK(); EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } else IFNET_WUNLOCK(); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); IFNET_WLOCK(); } IFNET_WUNLOCK(); } /* * Stores all groups from an 
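interface in memory pointed
 * to by data.
 *
 * The request is two-phase: a call with ifgr_len of zero only reports
 * the buffer size needed, then a second call fills the caller-supplied
 * buffer.  A hedged userland sketch (illustrative only, "em0" is a
 * made-up interface name and error handling is trimmed):
 *
 *	struct ifgroupreq ifgr;
 *
 *	memset(&ifgr, 0, sizeof(ifgr));
 *	strlcpy(ifgr.ifgr_name, "em0", sizeof(ifgr.ifgr_name));
 *	ioctl(s, SIOCGIFGROUP, &ifgr);          (sizing pass)
 *	ifgr.ifgr_groups = malloc(ifgr.ifgr_len);
 *	ioctl(s, SIOCGIFGROUP, &ifgr);          (data pass)
 *
 * After the second call, ifgr_len / sizeof(struct ifg_req) entries of
 * ifgr_groups are valid.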
*/ static int if_getgroup(struct ifgroupreq *data, struct ifnet *ifp) { int len, error; struct ifg_list *ifgl; struct ifg_req ifgrq, *ifgp; struct ifgroupreq *ifgr = data; if (ifgr->ifgr_len == 0) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) ifgr->ifgr_len += sizeof(struct ifg_req); IF_ADDR_RUNLOCK(ifp); return (0); } len = ifgr->ifgr_len; ifgp = ifgr->ifgr_groups; /* XXX: wire */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { if (len < sizeof(ifgrq)) { IF_ADDR_RUNLOCK(ifp); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group, sizeof(ifgrq.ifgrq_group)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IF_ADDR_RUNLOCK(ifp); return (error); } len -= sizeof(ifgrq); ifgp++; } IF_ADDR_RUNLOCK(ifp); return (0); } /* * Stores all members of a group in memory pointed to by data. */ static int if_getgroupmembers(struct ifgroupreq *data) { struct ifgroupreq *ifgr = data; struct ifg_group *ifg; struct ifg_member *ifgm; struct ifg_req ifgrq, *ifgp; int len, error; IFNET_RLOCK(); TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, ifgr->ifgr_name)) break; if (ifg == NULL) { IFNET_RUNLOCK(); return (ENOENT); } if (ifgr->ifgr_len == 0) { TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) ifgr->ifgr_len += sizeof(ifgrq); IFNET_RUNLOCK(); return (0); } len = ifgr->ifgr_len; ifgp = ifgr->ifgr_groups; TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) { if (len < sizeof(ifgrq)) { IFNET_RUNLOCK(); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname, sizeof(ifgrq.ifgrq_member)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IFNET_RUNLOCK(); return (error); } len -= sizeof(ifgrq); ifgp++; } IFNET_RUNLOCK(); return (0); } /* * Delete Routes for a Network Interface * * Called for each routing entry via the rnh->rnh_walktree() call above * to delete all route entries referencing a detaching network interface. * * Arguments: * rn pointer to node in the routing table * arg argument passed to rnh->rnh_walktree() - detaching interface * * Returns: * 0 successful * errno failed - reason indicated * */ static int if_rtdel(struct radix_node *rn, void *arg) { struct rtentry *rt = (struct rtentry *)rn; struct ifnet *ifp = arg; int err; if (rt->rt_ifp == ifp) { /* * Protect (sorta) against walktree recursion problems * with cloned routes */ if ((rt->rt_flags & RTF_UP) == 0) return (0); err = rtrequest_fib(RTM_DELETE, rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags|RTF_RNH_LOCKED|RTF_PINNED, (struct rtentry **) NULL, rt->rt_fibnum); if (err) { log(LOG_WARNING, "if_rtdel: error %d\n", err); } } return (0); } /* * Return the different software contexts associated with an ifnet. */ void * if_getsoftc(struct ifnet *ifp, ift_feature f) { switch (f) { case IF_DRIVER_SOFTC: return (ifp->if_softc); case IF_LLADDR: return (LLADDR((struct sockaddr_dl *)(ifp->if_addr->ifa_addr))); case IF_BPF: return (ifp->if_bpf); case IF_NAME: return (ifp->if_xname); default: panic("%s: unknown feature %d", __func__, f); } } /* * Return counter values from counter(9)s stored in ifnet. */ uint64_t if_get_counter_default(struct ifnet *ifp, ift_counter cnt) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); return (counter_u64_fetch(ifp->if_counters[cnt])); } /* * Increase an ifnet counter. Usually used for counters shared * between the stack and a driver, but the function supports them all.
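 *
 * A minimal driver-side sketch (hypothetical receive path, illustrative
 * only): account one received frame and, on failure, one input error:
 *
 *	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 *	if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 *	...
 *	if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);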
*/ void if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); counter_u64_add(ifp->if_counters[cnt], inc); } /* * Account successful transmission of an mbuf. */ void if_inc_txcounters(struct ifnet *ifp, struct mbuf *m) { counter_u64_add(ifp->if_counters[IFCOUNTER_OBYTES], m->m_pkthdr.len); counter_u64_add(ifp->if_counters[IFCOUNTER_OPACKETS], 1); if (m->m_flags & M_MCAST) counter_u64_add(ifp->if_counters[IFCOUNTER_OMCASTS], 1); } /* * Set the baudrate. */ void if_setbaudrate(struct ifnet *ifp, uint64_t baudrate) { ifp->if_baudrate = baudrate; } /* * Copy data from ifnet to userland API structure if_data. */ void if_data_copy(struct ifnet *ifp, struct if_data *ifd) { ifd->ifi_type = if_type(ifp); ifd->ifi_physical = 0; ifd->ifi_addrlen = if_addrlen(ifp); ifd->ifi_hdrlen = ifp->if_drv->ifdrv_hdrlen; ifd->ifi_link_state = ifp->if_link_state; ifd->ifi_vhid = 0; ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_mtu = ifp->if_mtu; ifd->ifi_metric = ifp->if_metric; ifd->ifi_baudrate = ifp->if_baudrate; ifd->ifi_hwassist = ifp->if_hwassist; ifd->ifi_epoch = ifp->if_epoch; ifd->ifi_lastchange = ifp->if_lastchange; ifd->ifi_ipackets = if_get_counter(ifp, IFCOUNTER_IPACKETS); ifd->ifi_ierrors = if_get_counter(ifp, IFCOUNTER_IERRORS); ifd->ifi_opackets = if_get_counter(ifp, IFCOUNTER_OPACKETS); ifd->ifi_oerrors = if_get_counter(ifp, IFCOUNTER_OERRORS); ifd->ifi_collisions = if_get_counter(ifp, IFCOUNTER_COLLISIONS); ifd->ifi_ibytes = if_get_counter(ifp, IFCOUNTER_IBYTES); ifd->ifi_obytes = if_get_counter(ifp, IFCOUNTER_OBYTES); ifd->ifi_imcasts = if_get_counter(ifp, IFCOUNTER_IMCASTS); ifd->ifi_omcasts = if_get_counter(ifp, IFCOUNTER_OMCASTS); ifd->ifi_iqdrops = if_get_counter(ifp, IFCOUNTER_IQDROPS); ifd->ifi_oqdrops = if_get_counter(ifp, IFCOUNTER_OQDROPS); ifd->ifi_noproto = if_get_counter(ifp, IFCOUNTER_NOPROTO); } /* * Initialization, destruction and refcounting functions for ifaddrs. 
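 *
 * The expected life cycle (sketch): ifa_alloc() returns the ifaddr with
 * one reference held by the caller; every other consumer brackets its
 * use with ifa_ref()/ifa_free(), and the storage is freed when the last
 * reference is dropped:
 *
 *	ifa = ifa_alloc(sizeof(struct in_ifaddr), M_WAITOK);
 *	ifa_ref(ifa);           (extra reference, e.g. for a table entry)
 *	...
 *	ifa_free(ifa);          (drop the table reference)
 *	ifa_free(ifa);          (drop the initial reference, frees ifa)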
*/ struct ifaddr * ifa_alloc(size_t size, int flags) { struct ifaddr *ifa; KASSERT(size >= sizeof(struct ifaddr), ("%s: invalid size %zu", __func__, size)); ifa = malloc(size, M_IFADDR, M_ZERO | flags); if (ifa == NULL) return (NULL); if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL) goto fail; refcount_init(&ifa->ifa_refcnt, 1); return (ifa); fail: /* free(NULL) is okay */ counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); return (NULL); } void ifa_ref(struct ifaddr *ifa) { refcount_acquire(&ifa->ifa_refcnt); } void ifa_free(struct ifaddr *ifa) { if (refcount_release(&ifa->ifa_refcnt)) { counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); } } int ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { int error = 0; struct rtentry *rt = NULL; struct rt_addrinfo info; static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; bzero(&info, sizeof(info)); info.rti_ifp = V_loif; info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC; info.rti_info[RTAX_DST] = ia; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; error = rtrequest1_fib(RTM_ADD, &info, &rt, ifa->ifa_ifp->if_fib); if (error == 0 && rt != NULL) { RT_LOCK(rt); ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type = if_type(ifa->ifa_ifp); ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index = ifa->ifa_ifp->if_index; RT_REMREF(rt); RT_UNLOCK(rt); } else if (error != 0) log(LOG_DEBUG, "%s: insertion failed: %u\n", __func__, error); return (error); } int ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { int error = 0; struct rt_addrinfo info; struct sockaddr_dl null_sdl; bzero(&null_sdl, sizeof(null_sdl)); null_sdl.sdl_len = sizeof(null_sdl); null_sdl.sdl_family = AF_LINK; null_sdl.sdl_type = if_type(ifa->ifa_ifp); null_sdl.sdl_index = ifa->ifa_ifp->if_index; bzero(&info, sizeof(info)); info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC; info.rti_info[RTAX_DST] = ia; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; error = rtrequest1_fib(RTM_DELETE, &info, NULL, ifa->ifa_ifp->if_fib); if (error != 0) log(LOG_DEBUG, "%s: deletion failed: %u\n", __func__, error); return (error); } int ifa_switch_loopback_route(struct ifaddr *ifa, struct sockaddr *sa, int fib) { struct rtentry *rt; rt = rtalloc1_fib(sa, 0, 0, fib); if (rt == NULL) { log(LOG_DEBUG, "%s: fail", __func__); return (EHOSTUNREACH); } ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type = if_type(ifa->ifa_ifp); ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index = ifa->ifa_ifp->if_index; RTFREE_LOCKED(rt); return (0); } /* * XXX: Because sockaddr_dl has deeper structure than the sockaddr * structs used to represent other address families, it is necessary * to perform a different comparison. */ #define sa_dl_equal(a1, a2) \ ((((struct sockaddr_dl *)(a1))->sdl_len == \ ((struct sockaddr_dl *)(a2))->sdl_len) && \ (bcmp(LLADDR((struct sockaddr_dl *)(a1)), \ LLADDR((struct sockaddr_dl *)(a2)), \ ((struct sockaddr_dl *)(a1))->sdl_alen) == 0)) /* * Locate an interface based on a complete address. 
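 *
 * On success ifa_ifwithaddr() returns the ifaddr with a reference held,
 * which the caller must drop; ifa_ifwithaddr_check() merely tests for
 * existence.  Sketch of the expected calling pattern:
 *
 *	ifa = ifa_ifwithaddr(addr);
 *	if (ifa != NULL) {
 *		... use ifa ...
 *		ifa_free(ifa);
 *	}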
*/ /*ARGSUSED*/ static struct ifaddr * ifa_ifwithaddr_internal(struct sockaddr *addr, int getref) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (sa_equal(addr, ifa->ifa_addr)) { if (getref) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } /* IP6 doesn't have broadcast */ if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { if (getref) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } struct ifaddr * ifa_ifwithaddr(struct sockaddr *addr) { return (ifa_ifwithaddr_internal(addr, 1)); } int ifa_ifwithaddr_check(struct sockaddr *addr) { return (ifa_ifwithaddr_internal(addr, 0) != NULL); } /* * Locate an interface based on the broadcast address. */ /* ARGSUSED */ struct ifaddr * ifa_ifwithbroadaddr(struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Locate the point-to-point interface with a given destination address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithdstaddr(struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Find an interface on a specific network. If many match, the choice * is the most specific one found. */ struct ifaddr * ifa_ifwithnet(struct sockaddr *addr, int ignore_ptp, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; char *addr_data = addr->sa_data, *cplim; /* * AF_LINK addresses can be looked up directly by their index number, * so do that if we can. */ if (af == AF_LINK) { struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr; if (sdl->sdl_index && sdl->sdl_index <= V_if_index) return (ifaddr_byindex(sdl->sdl_index)); } /* * Scan through each interface, looking for ones that have addresses * in this address family and the requested fib. Maintain a reference * on ifa_maybe once we find one, as we release the IF_ADDR_RLOCK() that * kept it stable when we move on to the next interface.
*/ IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { char *cp, *cp2, *cp3; if (ifa->ifa_addr->sa_family != af) next: continue; if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) { /* * This is a bit broken as it doesn't * take into account that the remote end may * be a single node in the network we are * looking for. * The trouble is that we don't know the * netmask for the remote end. */ if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } else { /* * Scan all the bits in the ifa's address. * If a bit disagrees with what we are * looking for, mask it with the netmask * to see if it really matters. * (A byte at a time) */ if (ifa->ifa_netmask == 0) continue; cp = addr_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; while (cp3 < cplim) if ((*cp++ ^ *cp2++) & *cp3++) goto next; /* next address! */ /* * If the netmask of what we just found * is more specific than what we had before * (if we had one), or if the virtual status * of the new prefix is better than that of the * old one, then remember the new one before * continuing to search for an even better one. */ if (ifa_maybe == NULL || ifa_preferred(ifa_maybe, ifa) || rn_refines((caddr_t)ifa->ifa_netmask, (caddr_t)ifa_maybe->ifa_netmask)) { if (ifa_maybe != NULL) ifa_free(ifa_maybe); ifa_maybe = ifa; ifa_ref(ifa_maybe); } } } IF_ADDR_RUNLOCK(ifp); } ifa = ifa_maybe; ifa_maybe = NULL; done: IFNET_RUNLOCK_NOSLEEP(); if (ifa_maybe != NULL) ifa_free(ifa_maybe); return (ifa); } /* * Find an interface address specific to an interface best matching * a given address. */ struct ifaddr * ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp) { struct ifaddr *ifa; char *cp, *cp2, *cp3; char *cplim; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; if (af >= AF_MAX) return (NULL); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != af) continue; if (ifa_maybe == NULL) ifa_maybe = ifa; if (ifa->ifa_netmask == 0) { if (sa_equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && sa_equal(addr, ifa->ifa_dstaddr))) goto done; continue; } if (ifp->if_flags & IFF_POINTOPOINT) { if (sa_equal(addr, ifa->ifa_dstaddr)) goto done; } else { cp = addr->sa_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; for (; cp3 < cplim; cp3++) if ((*cp++ ^ *cp2++) & *cp3) break; if (cp3 == cplim) goto done; } } ifa = ifa_maybe; done: if (ifa != NULL) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); return (ifa); } /* * See whether the new ifa is better than the current one: * 1) A non-virtual one is preferred over virtual. * 2) A virtual one in master state is preferred over any other state. * * Used in several address-selecting functions. */ int ifa_preferred(struct ifaddr *cur, struct ifaddr *next) { return (cur->ifa_carp && (!next->ifa_carp || ((*carp_master_p)(next) && !(*carp_master_p)(cur)))); } #include /* * Default action when installing a route with a Link Level gateway. * Look up an appropriate real ifa to point to. * This should be moved to /sys/net/link.c eventually.
*/ static void link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) { struct ifaddr *ifa, *oifa; struct sockaddr *dst; struct ifnet *ifp; if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == 0) || ((ifp = ifa->ifa_ifp) == 0) || ((dst = rt_key(rt)) == 0)) return; ifa = ifaof_ifpforaddr(dst, ifp); if (ifa) { oifa = rt->rt_ifa; rt->rt_ifa = ifa; ifa_free(oifa); if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest) ifa->ifa_rtrequest(cmd, rt, info); } } struct sockaddr_dl * link_alloc_sdl(size_t size, int flags) { return (malloc(size, M_TEMP, flags)); } void link_free_sdl(struct sockaddr *sa) { free(sa, M_TEMP); } /* * Fill in the given sdl with basic interface info. * Return a pointer to the filled sdl. */ struct sockaddr_dl * link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)paddr; memset(sdl, 0, sizeof(struct sockaddr_dl)); sdl->sdl_len = sizeof(struct sockaddr_dl); sdl->sdl_family = AF_LINK; sdl->sdl_index = ifp->if_index; sdl->sdl_type = iftype; return (sdl); } /* * Mark an interface down and notify protocols of * the transition. */ static void if_unroute(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP")); ifp->if_flags &= ~flag; getmicrotime(&ifp->if_lastchange); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFDOWN, ifa->ifa_addr); if_qflush(ifp); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); } /* * Mark an interface up and notify protocols of * the transition. */ static void if_route(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP")); ifp->if_flags |= flag; getmicrotime(&ifp->if_lastchange); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFUP, ifa->ifa_addr); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); #ifdef INET6 in6_if_up(ifp); #endif } void (*vlan_link_state_p)(struct ifnet *); /* XXX: private from if_vlan */ void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */ struct ifnet *(*vlan_trunkdev_p)(struct ifnet *); struct ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t); int (*vlan_tag_p)(struct ifnet *, uint16_t *); int (*vlan_setcookie_p)(struct ifnet *, void *); void *(*vlan_cookie_p)(struct ifnet *); /* * Handle a change in the interface link state. To avoid LORs * between driver lock and upper layer locks, as well as possible * recursions, we post the event to a taskqueue, and all the work * is done in static do_link_state_change(). */ void if_link_state_change(struct ifnet *ifp, int link_state) { /* Return if state hasn't changed. */ if (ifp->if_link_state == link_state) return; ifp->if_link_state = link_state; taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask); } static void do_link_state_change(void *arg, int pending) { struct ifnet *ifp = (struct ifnet *)arg; int link_state = ifp->if_link_state; CURVNET_SET(ifp->if_vnet); /* Notify that the link state has changed.
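 *
 * Everything below runs in taskqueue context, which keeps the driver
 * side cheap: a (hypothetical) driver reports a transition, e.g. from
 * its interrupt handler, with a single call and returns immediately:
 *
 *	if_link_state_change(sc->sc_ifp, LINK_STATE_UP);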
*/ rt_ifmsg(ifp); if (ifp->if_vlantrunk != NULL) (*vlan_link_state_p)(ifp); /* XXXGL: make ng_ether softc pointer */ if ((if_type(ifp) == IFT_ETHER || if_type(ifp) == IFT_L2VLAN) && ifp->if_l2com != NULL) (*ng_ether_link_state_p)(ifp, link_state); if (ifp->if_carp) (*carp_linkstate_p)(ifp); if (ifp->if_bridge) (*bridge_linkstate_p)(ifp); if (ifp->if_lagg) (*lagg_linkstate_p)(ifp, link_state); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL); if (pending > 1) if_printf(ifp, "%d link states coalesced\n", pending); if (log_link_state_change) log(LOG_NOTICE, "%s: link state changed to %s\n", ifp->if_xname, (link_state == LINK_STATE_UP) ? "UP" : "DOWN" ); EVENTHANDLER_INVOKE(ifnet_link_event, ifp, ifp->if_link_state); CURVNET_RESTORE(); } /* * Mark an interface down and notify protocols of * the transition. */ void if_down(struct ifnet *ifp) { if_unroute(ifp, IFF_UP, AF_UNSPEC); } /* * Mark an interface up and notify protocols of * the transition. */ void if_up(struct ifnet *ifp) { if_route(ifp, IFF_UP, AF_UNSPEC); } /* * Map interface name to interface structure pointer, with or without * returning a reference. */ struct ifnet * ifunit_ref(const char *name) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 && !(ifp->if_flags & IFF_DYING)) break; } if (ifp != NULL) if_ref(ifp); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } struct ifnet * ifunit(const char *name) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } IFNET_RUNLOCK_NOSLEEP(); return (ifp); } /* * Hardware-specific interface ioctls. */ int if_drvioctl(u_long cmd, struct ifnet *ifp, void *data, struct thread *td) { struct ifreq *ifr; size_t namelen, onamelen; size_t descrlen; char *descrbuf, *odescrbuf; char new_name[IFNAMSIZ]; struct ifaddr *ifa; struct sockaddr_dl *sdl; uint32_t flags; int error = 0; ifr = (struct ifreq *)data; switch (cmd) { case SIOCGIFINDEX: ifr->ifr_index = ifp->if_index; break; case SIOCGIFFLAGS: - ifr->ifr_flagslow = ifp->if_flags & 0xffff; + ifr->ifr_flags = ifp->if_flags & 0xffff; ifr->ifr_flagshigh = ifp->if_flags >> 16; + /* + * Some software may care about IFF_RUNNING, so make + * it happy. + */ + if (ifp->if_flags & IFF_UP) + ifr->ifr_flags |= IFF_RUNNING; break; case SIOCGIFCAP: ifr->ifr_reqcap = ifp->if_capabilities; ifr->ifr_curcap = ifp->if_capenable; break; #ifdef MAC case SIOCGIFMAC: error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp); break; #endif case SIOCGIFMETRIC: ifr->ifr_metric = ifp->if_metric; break; case SIOCGIFMTU: ifr->ifr_mtu = ifp->if_mtu; break; case SIOCGIFPHYS: /* XXXGL: did this ever work? */ ifr->ifr_phys = 0; break; case SIOCGIFDESCR: error = 0; sx_slock(&ifdescr_sx); if (ifp->if_description == NULL) error = ENOMSG; else { /* space for terminating nul */ descrlen = strlen(ifp->if_description) + 1; if (ifr->ifr_buffer.length < descrlen) ifr->ifr_buffer.buffer = NULL; else error = copyout(ifp->if_description, ifr->ifr_buffer.buffer, descrlen); ifr->ifr_buffer.length = descrlen; } sx_sunlock(&ifdescr_sx); break; case SIOCSIFDESCR: error = priv_check(td, PRIV_NET_SETIFDESCR); if (error) return (error); /* * Copy only (length-1) bytes to make sure that * if_description is always nul terminated. The * length parameter is expected to include the * terminating nul.
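 *
 * A hedged userland sketch of the matching request (illustrative only,
 * "uplink" is a made-up description and error handling is trimmed):
 *
 *	ifr.ifr_buffer.buffer = strdup("uplink");
 *	ifr.ifr_buffer.length = strlen("uplink") + 1;   (counts the nul)
 *	ioctl(s, SIOCSIFDESCR, &ifr);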
*/ if (ifr->ifr_buffer.length > ifdescr_maxlen) return (ENAMETOOLONG); else if (ifr->ifr_buffer.length == 0) descrbuf = NULL; else { descrbuf = malloc(ifr->ifr_buffer.length, M_IFDESCR, M_WAITOK | M_ZERO); error = copyin(ifr->ifr_buffer.buffer, descrbuf, ifr->ifr_buffer.length - 1); if (error) { free(descrbuf, M_IFDESCR); break; } } sx_xlock(&ifdescr_sx); odescrbuf = ifp->if_description; ifp->if_description = descrbuf; sx_xunlock(&ifdescr_sx); getmicrotime(&ifp->if_lastchange); free(odescrbuf, M_IFDESCR); break; case SIOCGIFFIB: ifr->ifr_fib = ifp->if_fib; break; case SIOCSIFFIB: error = priv_check(td, PRIV_NET_SETIFFIB); if (error) return (error); if (ifr->ifr_fib >= rt_numfibs) return (EINVAL); ifp->if_fib = ifr->ifr_fib; (void)if_ioctl(ifp, cmd, data, td); break; case SIOCSIFFLAGS: error = priv_check(td, PRIV_NET_SETIFFLAGS); if (error) return (error); /* * Historically if_flags were 16-bit, and thus * they come from userland in two parts, which * we need to combine. */ - flags = (ifr->ifr_flagslow & 0xffff) | + flags = (ifr->ifr_flags & 0xffff) | (ifr->ifr_flagshigh << 16); if ((flags & IFF_CANTCHANGE) != (ifp->if_flags & IFF_CANTCHANGE)) return (EINVAL); /* * Pass new flags down to driver and see if it accepts them. */ - ifr->ifr_flags = flags; error = if_ioctl(ifp, cmd, data, td); if (error) return (error); - flags = ifr->ifr_flags; + flags = (ifr->ifr_flags & 0xffff) | + (ifr->ifr_flagshigh << 16); /* * Manage IFF_UP flip. */ if (ifp->if_flags & IFF_UP && (flags & IFF_UP) == 0) if_down(ifp); else if (flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) if_up(ifp); /* See if permanently promiscuous mode bit is about to flip */ if ((ifp->if_flags ^ flags) & IFF_PPROMISC) { if (flags & IFF_PPROMISC) ifp->if_flags |= IFF_PROMISC; else if (ifp->if_pcount == 0) ifp->if_flags &= ~IFF_PROMISC; log(LOG_INFO, "%s: permanently promiscuous mode %s\n", ifp->if_xname, (flags & IFF_PPROMISC) ? "enabled" : "disabled"); } ifp->if_flags = flags; getmicrotime(&ifp->if_lastchange); break; case SIOCSIFCAP: error = priv_check(td, PRIV_NET_SETIFCAP); if (error) return (error); if ((ifr->ifr_reqcap & IFCAP_VLAN_HWTSO) != 0) ifr->ifr_reqcap |= IFCAP_VLAN_HWTAGGING; if (ifr->ifr_reqcap & ~ifp->if_capabilities) return (EINVAL); if (ifr->ifr_reqcap == ifp->if_capenable) return (0); ifr->ifr_curcap = ifp->if_capenable; error = if_ioctl(ifp, cmd, data, td); if (error != 0) break; #ifdef DEVICE_POLLING if ((ifr->ifr_reqcap ^ ifr->ifr_curcap) & IFCAP_POLLING) { if (ifr->ifr_reqcap & IFCAP_POLLING) if_poll_register(ifp); else if_poll_deregister(ifp); } #endif ifp->if_capenable = ifr->ifr_reqcap; ifp->if_hwassist = ifr->ifr_hwassist; getmicrotime(&ifp->if_lastchange); if (ifp->if_vlantrunk != NULL) (*vlan_trunk_cap_p)(ifp); break; #ifdef MAC case SIOCSIFMAC: error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp); break; #endif case SIOCSIFNAME: error = priv_check(td, PRIV_NET_SETIFNAME); if (error) return (error); error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL); if (error != 0) return (error); if (new_name[0] == '\0') return (EINVAL); if (ifunit(new_name) != NULL) return (EEXIST); /* * XXX: Locking. Nothing else seems to lock if_flags, * and there are numerous other races with the * ifunit() checks not being atomic with namespace * changes (renames, vmoves, if_attach, etc). */ ifp->if_flags |= IFF_RENAMING; /* Announce the departure of the interface.
*/ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); log(LOG_INFO, "%s: changing name to '%s'\n", ifp->if_xname, new_name); IF_ADDR_WLOCK(ifp); strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname)); ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; namelen = strlen(new_name); onamelen = sdl->sdl_nlen; /* * Move the address if needed. This is safe because we * allocate space for a name of length IFNAMSIZ when we * create this in if_attach(). */ if (namelen != onamelen) { bcopy(sdl->sdl_data + onamelen, sdl->sdl_data + namelen, sdl->sdl_alen); } bcopy(new_name, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl = (struct sockaddr_dl *)ifa->ifa_netmask; bzero(sdl->sdl_data, onamelen); while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); /* Announce the return of the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); ifp->if_flags &= ~IFF_RENAMING; break; #ifdef VIMAGE case SIOCSIFVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error) return (error); error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid); break; #endif case SIOCSIFMETRIC: error = priv_check(td, PRIV_NET_SETIFMETRIC); if (error) return (error); ifp->if_metric = ifr->ifr_metric; getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYS: error = priv_check(td, PRIV_NET_SETIFPHYS); if (error) return (error); error = if_ioctl(ifp, cmd, data, td); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFMTU: error = priv_check(td, PRIV_NET_SETIFMTU); if (error) return (error); if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) return (EINVAL); if (ifr->ifr_mtu == ifp->if_mtu) return (0); error = if_ioctl(ifp, cmd, data, td); if (error == 0) { ifp->if_mtu = ifr->ifr_mtu; getmicrotime(&ifp->if_lastchange); rt_ifmsg(ifp); #ifdef INET6 nd6_setmtu(ifp); #endif rt_updatemtu(ifp); } break; case SIOCADDMULTI: case SIOCDELMULTI: if (cmd == SIOCADDMULTI) error = priv_check(td, PRIV_NET_ADDMULTI); else error = priv_check(td, PRIV_NET_DELMULTI); if (error) return (error); /* Don't allow group membership on non-multicast interfaces. */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); /* Don't let users screw up protocols' entries. */ if (ifr->ifr_addr.sa_family != AF_LINK) return (EINVAL); if (cmd == SIOCADDMULTI) { struct ifmultiaddr *ifma; /* * Userland is only permitted to join groups once * via the if_addmulti() KPI, because it cannot hold * struct ifmultiaddr * between calls. It may also * lose a race while we check if the membership * already exists. 
*/ IF_ADDR_RLOCK(ifp); ifma = if_findmulti(ifp, &ifr->ifr_addr); IF_ADDR_RUNLOCK(ifp); if (ifma != NULL) error = EADDRINUSE; else error = if_addmulti(ifp, &ifr->ifr_addr, &ifma); } else { error = if_delmulti(ifp, &ifr->ifr_addr); } if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYADDR: case SIOCDIFPHYADDR: #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif case SIOCSIFMEDIA: case SIOCSIFGENERIC: error = priv_check(td, PRIV_NET_HWIOCTL); if (error) return (error); error = if_ioctl(ifp, cmd, data, td); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCGIFSTATUS: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: case SIOCGIFMEDIA: case SIOCGIFGENERIC: error = if_ioctl(ifp, cmd, data, td); break; case SIOCSIFLLADDR: error = priv_check(td, PRIV_NET_SETLLADDR); if (error) return (error); error = if_setlladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); EVENTHANDLER_INVOKE(iflladdr_event, ifp); break; case SIOCAIFGROUP: { struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr; error = priv_check(td, PRIV_NET_ADDIFGROUP); if (error) return (error); if ((error = if_addgroup(ifp, ifgr->ifgr_group))) return (error); break; } case SIOCGIFGROUP: if ((error = if_getgroup((struct ifgroupreq *)ifr, ifp))) return (error); break; case SIOCDIFGROUP: { struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr; error = priv_check(td, PRIV_NET_DELIFGROUP); if (error) return (error); if ((error = if_delgroup(ifp, ifgr->ifgr_group))) return (error); break; } default: error = ENOIOCTL; break; } return (error); } #ifdef COMPAT_FREEBSD32 struct ifconf32 { int32_t ifc_len; union { uint32_t ifcu_buf; uint32_t ifcu_req; } ifc_ifcu; }; #define SIOCGIFCONF32 _IOWR('i', 36, struct ifconf32) #endif /* * Interface ioctls. */ int ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) { struct ifnet *ifp; struct ifreq *ifr; int error; int oif_flags; CURVNET_SET(so->so_vnet); switch (cmd) { case SIOCGIFCONF: error = ifconf(cmd, data); CURVNET_RESTORE(); return (error); #ifdef COMPAT_FREEBSD32 case SIOCGIFCONF32: { struct ifconf32 *ifc32; struct ifconf ifc; ifc32 = (struct ifconf32 *)data; ifc.ifc_len = ifc32->ifc_len; ifc.ifc_buf = PTRIN(ifc32->ifc_buf); error = ifconf(SIOCGIFCONF, (void *)&ifc); CURVNET_RESTORE(); if (error == 0) ifc32->ifc_len = ifc.ifc_len; return (error); } #endif } ifr = (struct ifreq *)data; switch (cmd) { #ifdef VIMAGE case SIOCSIFRVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error == 0) error = if_vmove_reclaim(td, ifr->ifr_name, ifr->ifr_jid); CURVNET_RESTORE(); return (error); #endif case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); if (error == 0) error = if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ? 
ifr->ifr_data : NULL); CURVNET_RESTORE(); return (error); case SIOCIFDESTROY: error = priv_check(td, PRIV_NET_IFDESTROY); if (error == 0) error = if_clone_destroy(ifr->ifr_name); CURVNET_RESTORE(); return (error); case SIOCIFGCLONERS: error = if_clone_list((struct if_clonereq *)data); CURVNET_RESTORE(); return (error); case SIOCGIFGMEMB: error = if_getgroupmembers((struct ifgroupreq *)data); CURVNET_RESTORE(); return (error); #if defined(INET) || defined(INET6) case SIOCSVH: case SIOCGVH: if (carp_ioctl_p == NULL) error = EPROTONOSUPPORT; else error = (*carp_ioctl_p)(ifr, cmd, td); CURVNET_RESTORE(); return (error); #endif } ifp = ifunit_ref(ifr->ifr_name); if (ifp == NULL) { CURVNET_RESTORE(); return (ENXIO); } error = if_drvioctl(cmd, ifp, data, td); if (error != ENOIOCTL) { if_rele(ifp); CURVNET_RESTORE(); return (error); } oif_flags = ifp->if_flags; if (so->so_proto == NULL) { if_rele(ifp); CURVNET_RESTORE(); return (EOPNOTSUPP); } /* * Pass the request on to the socket control method, and if the * latter returns EOPNOTSUPP, directly to the interface. * * Make an exception for the legacy SIOCSIF* requests. Drivers * trust SIOCSIFADDR et al to come from an already privileged * layer, and do not perform any credentials checks or input * validation. */ error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, ifp, td)); if (error == EOPNOTSUPP && ifp != NULL && cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR && cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK) error = if_ioctl(ifp, cmd, data, td); if ((oif_flags ^ ifp->if_flags) & IFF_UP) { #ifdef INET6 if (ifp->if_flags & IFF_UP) in6_if_up(ifp); #endif } if_rele(ifp); CURVNET_RESTORE(); return (error); } /* * The code common to handling reference counted flags, * e.g., in ifpromisc() and if_allmulti(). * The "pflag" argument can specify a permanent mode flag to check, * such as IFF_PPROMISC for promiscuous mode; should be 0 if none. * * Only to be used on stack-owned flags, not driver-owned flags. */ static int if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch) { struct ifreq ifr; int error; int oldflags, oldcount; if (onswitch) KASSERT(*refcount >= 0, ("%s: increment negative refcount %d for flag %d", __func__, *refcount, flag)); else KASSERT(*refcount > 0, ("%s: decrement non-positive refcount %d for flag %d", __func__, *refcount, flag)); /* In case this mode is permanent, just touch refcount */ if (ifp->if_flags & pflag) { *refcount += onswitch ? 1 : -1; return (0); } /* Save ifnet parameters, as if_ioctl() may fail */ oldcount = *refcount; oldflags = ifp->if_flags; /* * See if we are not the only one, so that touching the refcount is * enough. Actually toggle the interface flag if we are the first or * the last. */ if (onswitch) { if ((*refcount)++) return (0); ifp->if_flags |= flag; } else { if (--(*refcount)) return (0); ifp->if_flags &= ~flag; } /* Call down to the driver since we've changed interface flags */ ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; error = if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr, curthread); if (error) goto recover; /* Notify userland that interface flags have changed */ rt_ifmsg(ifp); return (0); recover: /* Recover after driver error */ *refcount = oldcount; ifp->if_flags = oldflags; return (error); } /* * Set/clear promiscuous mode on interface ifp based on the truth value * of pswitch. The calls are reference counted so that only the first * "on" request actually has an effect, as does the final "off" request.
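 *
 * E.g. with two bpf(4) consumers sharing one interface (sketch):
 *
 *	ifpromisc(ifp, 1);      (count 0 -> 1, driver reprogrammed)
 *	ifpromisc(ifp, 1);      (count 1 -> 2, no driver call)
 *	ifpromisc(ifp, 0);      (count 2 -> 1, no driver call)
 *	ifpromisc(ifp, 0);      (count 1 -> 0, driver reprogrammed)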
* Results are undefined if the "off" and "on" requests are not matched. */ int ifpromisc(struct ifnet *ifp, int pswitch) { int error; int oldflags = ifp->if_flags; error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC, &ifp->if_pcount, pswitch); /* If promiscuous mode status has changed, log a message */ if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC)) log(LOG_INFO, "%s: promiscuous mode %s\n", ifp->if_xname, (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled"); return (error); } /* * Return interface configuration * of system. List may be used * in later ioctl's (above) to get * other information. */ /*ARGSUSED*/ static int ifconf(u_long cmd, caddr_t data) { struct ifconf *ifc = (struct ifconf *)data; struct ifnet *ifp; struct ifaddr *ifa; struct ifreq ifr; struct sbuf *sb; int error, full = 0, valid_len, max_len; /* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */ max_len = MAXPHYS - 1; /* Prevent hostile input from being able to crash the system */ if (ifc->ifc_len <= 0) return (EINVAL); again: if (ifc->ifc_len <= max_len) { max_len = ifc->ifc_len; full = 1; } sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN); max_len = 0; valid_len = 0; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { int addrs; /* * Zero the ifr_name buffer to make sure we don't * disclose the contents of the stack. */ memset(ifr.ifr_name, 0, sizeof(ifr.ifr_name)); if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)) >= sizeof(ifr.ifr_name)) { sbuf_delete(sb); IFNET_RUNLOCK(); return (ENAMETOOLONG); } addrs = 0; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (prison_if(curthread->td_ucred, sa) != 0) continue; addrs++; if (sa->sa_len <= sizeof(*sa)) { ifr.ifr_addr = *sa; sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); } else { sbuf_bcat(sb, &ifr, offsetof(struct ifreq, ifr_addr)); max_len += offsetof(struct ifreq, ifr_addr); sbuf_bcat(sb, sa, sa->sa_len); max_len += sa->sa_len; } if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } IF_ADDR_RUNLOCK(ifp); if (addrs == 0) { bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr)); sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } } IFNET_RUNLOCK(); /* * If we didn't allocate enough space (uncommon), try again. If * we have already allocated as much space as we are allowed, * return what we've got. */ if (valid_len != max_len && !full) { sbuf_delete(sb); goto again; } ifc->ifc_len = valid_len; sbuf_finish(sb); error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len); sbuf_delete(sb); return (error); } /* * Just like ifpromisc(), but for all-multicast-reception mode. */ int if_allmulti(struct ifnet *ifp, int onswitch) { return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch)); } struct ifmultiaddr * if_findmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; IF_ADDR_LOCK_ASSERT(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (sa->sa_family == AF_LINK) { if (sa_dl_equal(ifma->ifma_addr, sa)) break; } else { if (sa_equal(ifma->ifma_addr, sa)) break; } } return ifma; } /* * Allocate a new ifmultiaddr and initialize based on passed arguments. We * make copies of passed sockaddrs. The ifmultiaddr will not be added to * the ifnet multicast address list here, so the caller must do that and * other setup work (such as notifying the device driver). The reference * count is initialized to 1. 
*/ static struct ifmultiaddr * if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa, int mflags) { struct ifmultiaddr *ifma; struct sockaddr *dupsa; ifma = malloc(sizeof *ifma, M_IFMADDR, mflags | M_ZERO); if (ifma == NULL) return (NULL); dupsa = malloc(sa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma, M_IFMADDR); return (NULL); } bcopy(sa, dupsa, sa->sa_len); ifma->ifma_addr = dupsa; ifma->ifma_ifp = ifp; ifma->ifma_refcount = 1; ifma->ifma_protospec = NULL; if (llsa == NULL) { ifma->ifma_lladdr = NULL; return (ifma); } dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); return (NULL); } bcopy(llsa, dupsa, llsa->sa_len); ifma->ifma_lladdr = dupsa; return (ifma); } /* * if_freemulti: free ifmultiaddr structure and possibly attached related * addresses. The caller is responsible for implementing reference * counting, notifying the driver, handling routing messages, and releasing * any dependent link layer state. */ static void if_freemulti(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d", ifma->ifma_refcount)); if (ifma->ifma_lladdr != NULL) free(ifma->ifma_lladdr, M_IFMADDR); free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); } /* * Register an additional multicast address with a network interface. * * - If the address is already present, bump the reference count on the * address and return. * - If the address is not link-layer, look up a link layer address. * - Allocate address structures for one or both addresses, and attach to the * multicast address list on the interface. If automatically adding a link * layer address, the protocol address will own a reference to the link * layer address, to be freed when it is freed. * - Notify the network device driver of an addition to the multicast address * list. * * 'sa' points to caller-owned memory with the desired multicast address. * * 'retifma' will be used to return a pointer to the resulting multicast * address reference, if desired. */ int if_addmulti(struct ifnet *ifp, struct sockaddr *sa, struct ifmultiaddr **retifma) { struct ifmultiaddr *ifma, *ll_ifma; struct sockaddr *llsa; struct sockaddr_dl sdl; int error; /* * If the address is already present, return a new reference to it; * otherwise, allocate storage and set up a new address. */ IF_ADDR_WLOCK(ifp); ifma = if_findmulti(ifp, sa); if (ifma != NULL) { ifma->ifma_refcount++; if (retifma != NULL) *retifma = ifma; IF_ADDR_WUNLOCK(ifp); return (0); } /* * The address isn't already present; resolve the protocol address * into a link layer address, and then look that up, bump its * refcount or allocate an ifma for that also. * Most link layer resolving functions return address data which * fits inside the default sockaddr_dl structure. However, the callback * can allocate another sockaddr structure, in which case we need to * free it later. */ sdl.sdl_len = sizeof(sdl); llsa = (struct sockaddr *)&sdl; error = if_resolvemulti(ifp, &llsa, sa); if (error == EOPNOTSUPP) llsa = NULL; else if (error) goto unlock_out; /* * Allocate the new address. Don't hook it up yet, as we may also * need to allocate a link layer multicast address. */ ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT); if (ifma == NULL) { error = ENOMEM; goto free_llsa_out; } /* * If a link layer address is found, we'll need to see if it's * already present in the address list, or allocate it as well. * When this block finishes, the link layer address will be on the * list.
*/ if (llsa != NULL) { ll_ifma = if_findmulti(ifp, llsa); if (ll_ifma == NULL) { ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT); if (ll_ifma == NULL) { --ifma->ifma_refcount; if_freemulti(ifma); error = ENOMEM; goto free_llsa_out; } TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma, ifma_link); } else ll_ifma->ifma_refcount++; ifma->ifma_llifma = ll_ifma; } /* * We now have a new multicast address, ifma, and possibly a new or * referenced link layer address. Add the primary address to the * ifnet address list. */ TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); if (retifma != NULL) *retifma = ifma; /* * Must generate the message while holding the lock so that 'ifma' * pointer is still valid. */ rt_newmaddrmsg(RTM_NEWMADDR, ifma); IF_ADDR_WUNLOCK(ifp); /* * We are certain we have added something, so call down to the * interface to let them know about it. */ if_ioctl(ifp, SIOCADDMULTI, 0, curthread); if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); return (0); free_llsa_out: if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); unlock_out: IF_ADDR_WUNLOCK(ifp); return (error); } /* * Delete a multicast group membership by network-layer group address. * * Returns ENOENT if the entry could not be found. If ifp no longer * exists, results are undefined. This entry point should only be used * from subsystems which do appropriate locking to hold ifp for the * duration of the call. * Network-layer protocol domains must use if_delmulti_ifma(). */ int if_delmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; int lastref; #ifdef INVARIANTS struct ifnet *oifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) ifp = NULL; IFNET_RUNLOCK_NOSLEEP(); KASSERT(ifp != NULL, ("%s: ifnet went away", __func__)); #endif if (ifp == NULL) return (ENOENT); IF_ADDR_WLOCK(ifp); lastref = 0; ifma = if_findmulti(ifp, sa); if (ifma != NULL) lastref = if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); if (ifma == NULL) return (ENOENT); if (lastref) if_ioctl(ifp, SIOCDELMULTI, 0, curthread); return (0); } /* * Delete all multicast group membership for an interface. * Should be used to quickly flush all multicast filters. */ void if_delallmulti(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); } /* * Delete a multicast group membership by group membership pointer. * Network-layer protocol domains must use this routine. * * It is safe to call this routine if the ifp disappeared. */ void if_delmulti_ifma(struct ifmultiaddr *ifma) { struct ifnet *ifp; int lastref; ifp = ifma->ifma_ifp; #ifdef DIAGNOSTIC if (ifp == NULL) { printf("%s: ifma_ifp seems to be detached\n", __func__); } else { struct ifnet *oifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) { printf("%s: ifnet %p disappeared\n", __func__, ifp); ifp = NULL; } IFNET_RUNLOCK_NOSLEEP(); } #endif /* * If and only if the ifnet instance exists: Acquire the address lock. */ if (ifp != NULL) IF_ADDR_WLOCK(ifp); lastref = if_delmulti_locked(ifp, ifma, 0); if (ifp != NULL) { /* * If and only if the ifnet instance exists: * Release the address lock. * If the group was left: update the hardware hash filter. 
*/ IF_ADDR_WUNLOCK(ifp); if (lastref) if_ioctl(ifp, SIOCDELMULTI, 0, curthread); } } /* * Perform deletion of network-layer and/or link-layer multicast address. * * Return 0 if the reference count was decremented. * Return 1 if the final reference was released, indicating that the * hardware hash filter should be reprogrammed. */ static int if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching) { struct ifmultiaddr *ll_ifma; if (ifp != NULL && ifma->ifma_ifp != NULL) { KASSERT(ifma->ifma_ifp == ifp, ("%s: inconsistent ifp %p", __func__, ifp)); IF_ADDR_WLOCK_ASSERT(ifp); } ifp = ifma->ifma_ifp; /* * If the ifnet is detaching, null out references to ifnet, * so that upper protocol layers will notice, and not attempt * to obtain locks for an ifnet which no longer exists. The * routing socket announcement must happen before the ifnet * instance is detached from the system. */ if (detaching) { #ifdef DIAGNOSTIC printf("%s: detaching ifnet instance %p\n", __func__, ifp); #endif /* * ifp may already be nulled out if we are being reentered * to delete the ll_ifma. */ if (ifp != NULL) { rt_newmaddrmsg(RTM_DELMADDR, ifma); ifma->ifma_ifp = NULL; } } if (--ifma->ifma_refcount > 0) return 0; /* * If this ifma is a network-layer ifma, a link-layer ifma may * have been associated with it. Release it first if so. */ ll_ifma = ifma->ifma_llifma; if (ll_ifma != NULL) { KASSERT(ifma->ifma_lladdr != NULL, ("%s: llifma w/o lladdr", __func__)); if (detaching) ll_ifma->ifma_ifp = NULL; /* XXX */ if (--ll_ifma->ifma_refcount == 0) { if (ifp != NULL) { TAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifma_link); } if_freemulti(ll_ifma); } } if (ifp != NULL) TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link); if_freemulti(ifma); /* * The last reference to this instance of struct ifmultiaddr * was released; the hardware should be notified of this change. */ return 1; } /* * Set the link layer address on an interface. * * At this time we only support certain types of interfaces, * and we don't allow the length of the address to change. */ int if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) { struct sockaddr_dl *sdl; struct ifaddr *ifa; struct ifreq ifr; IF_ADDR_RLOCK(ifp); ifa = ifp->if_addr; if (ifa == NULL) { IF_ADDR_RUNLOCK(ifp); return (EINVAL); } ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) { ifa_free(ifa); return (EINVAL); } if (len != sdl->sdl_alen) { /* don't allow length to change */ ifa_free(ifa); return (EINVAL); } switch (if_type(ifp)) { case IFT_ETHER: case IFT_FDDI: case IFT_XETHER: case IFT_ISO88025: case IFT_L2VLAN: case IFT_BRIDGE: case IFT_ARCNET: case IFT_IEEE8023ADLAG: case IFT_IEEE80211: bcopy(lladdr, LLADDR(sdl), len); ifa_free(ifa); break; default: ifa_free(ifa); return (ENODEV); } /* * If the interface is already up, we need * to re-init it in order to reprogram its * address filter. */ if ((ifp->if_flags & IFF_UP) != 0) { ifp->if_flags &= ~IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; if_ioctl(ifp, SIOCSIFFLAGS, &ifr, curthread); ifp->if_flags |= IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; if_ioctl(ifp, SIOCSIFFLAGS, &ifr, curthread); #ifdef INET /* * Also send gratuitous ARPs to notify other nodes about * the address change. */ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET) arp_ifinit(ifp, ifa); } #endif } return (0); } int if_printf(struct ifnet *ifp, const char * fmt, ...) 
{ va_list ap; int retval; retval = printf("%s: ", ifp->if_xname); va_start(ap, fmt); retval += vprintf(fmt, ap); va_end(ap); return (retval); } int if_getmtu_family(if_t ifp, int family) { struct domain *dp; for (dp = domains; dp; dp = dp->dom_next) if (dp->dom_family == family && dp->dom_ifmtu != NULL) return (dp->dom_ifmtu(ifp)); return (ifp->if_mtu); } /* * Methods for drivers to access interface unicast and multicast * addresses. Drivers know neither 'struct ifaddr' nor 'struct ifmultiaddr'. */ void if_foreach_addr(if_t ifp, ifaddr_cb_t cb, void *cb_arg) { struct ifaddr *ifa; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) (*cb)(cb_arg, ifa->ifa_addr, ifa->ifa_dstaddr, ifa->ifa_netmask); IF_ADDR_RUNLOCK(ifp); } void if_foreach_maddr(if_t ifp, ifmaddr_cb_t cb, void *cb_arg) { struct ifmultiaddr *ifma; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) (*cb)(cb_arg, ifma->ifma_addr); IF_ADDR_RUNLOCK(ifp); } /* * Generic software queue that many non-high-end drivers use. For now * it is a minimalistic version of the classic BSD ifqueue, but we can * swap it for any other implementation later. */ struct ifqueue { struct mbufq ifq_mbq; struct mtx ifq_mtx; }; static struct ifqueue * if_snd_alloc(int maxlen) { struct ifqueue *ifq; ifq = malloc(sizeof(struct ifqueue), M_IFNET, M_ZERO | M_WAITOK); mbufq_init(&ifq->ifq_mbq, maxlen); mtx_init(&ifq->ifq_mtx, "ifqueue", NULL, MTX_DEF); return (ifq); } static void if_snd_free(struct ifqueue *ifq) { mtx_destroy(&ifq->ifq_mtx); free(ifq, M_IFNET); } /* * Flush software interface queue. */ static void if_snd_qflush(if_t ifp) { struct ifqueue *ifq; struct mbuf *m, *n; ifq = ifp->if_snd; mtx_lock(&ifq->ifq_mtx); n = mbufq_flush(&ifq->ifq_mbq); mtx_unlock(&ifq->ifq_mtx); while ((m = n) != NULL) { n = m->m_nextpkt; m_freem(m); } } int if_snd_len(if_t ifp) { struct ifqueue *ifq = ifp->if_snd; return (mbufq_len(&ifq->ifq_mbq)); } int if_snd_enqueue(struct ifnet *ifp, struct mbuf *m) { struct ifqueue *ifq = ifp->if_snd; int error; mtx_lock(&ifq->ifq_mtx); error = mbufq_enqueue(&ifq->ifq_mbq, m); mtx_unlock(&ifq->ifq_mtx); if (error) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); } return (error); } struct mbuf * if_snd_dequeue(if_t ifp) { struct ifqueue *ifq = ifp->if_snd; struct mbuf *m; mtx_lock(&ifq->ifq_mtx); m = mbufq_dequeue(&ifq->ifq_mbq); mtx_unlock(&ifq->ifq_mtx); return (m); } void if_snd_prepend(if_t ifp, struct mbuf *m) { struct ifqueue *ifq = ifp->if_snd; mtx_lock(&ifq->ifq_mtx); mbufq_prepend(&ifq->ifq_mbq, m); mtx_unlock(&ifq->ifq_mtx); } /* * Implementation of if ops that can be called from drivers. */ void if_input_noinline(if_t ifp, struct mbuf *m) { return (if_input(ifp, m)); } int if_transmit_noinline(if_t ifp, struct mbuf *m) { return (if_transmit(ifp, m)); } Index: projects/ifnet/sys/net/if.h =================================================================== --- projects/ifnet/sys/net/if.h (revision 277599) +++ projects/ifnet/sys/net/if.h (revision 277600) @@ -1,766 +1,764 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Index: projects/ifnet/sys/net/if.h
===================================================================
--- projects/ifnet/sys/net/if.h	(revision 277599)
+++ projects/ifnet/sys/net/if.h	(revision 277600)
@@ -1,766 +1,764 @@
/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)if.h	8.1 (Berkeley) 6/10/93
 * $FreeBSD$
 */

#ifndef _NET_IF_H_
#define _NET_IF_H_

#include <sys/cdefs.h>

#if __BSD_VISIBLE
/*
 * <net/if.h> does not depend on <sys/time.h> on most other systems.  This
 * helps userland compatibility.  (struct timeval ifi_lastchange)
 * The same holds for <sys/socket.h>.  (struct sockaddr ifru_addr)
 */
#ifndef _KERNEL
#include <sys/time.h>
#include <sys/socket.h>
#endif
#endif

/*
 * Length of interface external name, including terminating '\0'.
 * Note: this is the same size as a generic device's external name.
 */
#define IF_NAMESIZE	16
#if __BSD_VISIBLE
#define IFNAMSIZ	IF_NAMESIZE
#define IF_MAXUNIT	0x7fff	/* historical value */
#endif
#if __BSD_VISIBLE

/*
 * Structure used to query names of interface cloners.
 */
struct if_clonereq {
	int	ifcr_total;		/* total cloners (out) */
	int	ifcr_count;		/* room for this many in user buffer */
	char	*ifcr_buffer;		/* buffer for cloner names */
};

/*
 * Structure describing information about an interface
 * which may be of interest to management entities.
 */
struct if_data {
	/* generic interface information */
	uint8_t		ifi_type;	/* ethernet, tokenring, etc */
	uint8_t		ifi_physical;	/* e.g., AUI, Thinnet, 10base-T, etc */
	uint8_t		ifi_addrlen;	/* media address length */
	uint8_t		ifi_hdrlen;	/* media header length */
	uint8_t		ifi_link_state;	/* current link state */
	uint8_t		ifi_vhid;	/* carp vhid */
	uint16_t	ifi_datalen;	/* length of this data struct */
	uint32_t	ifi_mtu;	/* maximum transmission unit */
	uint32_t	ifi_metric;	/* routing metric (external only) */
	uint64_t	ifi_baudrate;	/* linespeed */
	/* volatile statistics */
	uint64_t	ifi_ipackets;	/* packets received on interface */
	uint64_t	ifi_ierrors;	/* input errors on interface */
	uint64_t	ifi_opackets;	/* packets sent on interface */
	uint64_t	ifi_oerrors;	/* output errors on interface */
	uint64_t	ifi_collisions;	/* collisions on csma interfaces */
	uint64_t	ifi_ibytes;	/* total number of octets received */
	uint64_t	ifi_obytes;	/* total number of octets sent */
	uint64_t	ifi_imcasts;	/* packets received via multicast */
	uint64_t	ifi_omcasts;	/* packets sent via multicast */
	uint64_t	ifi_iqdrops;	/* dropped on input */
	uint64_t	ifi_oqdrops;	/* dropped on output */
	uint64_t	ifi_noproto;	/* destined for unsupported protocol */
	uint64_t	ifi_hwassist;	/* HW offload capabilities, see IFCAP */

	/* Unions are here to make sizes MI. */
	union {				/* uptime at attach or stat reset */
		time_t		tt;
		uint64_t	ph;
	} __ifi_epoch;
#define ifi_epoch	__ifi_epoch.tt
	union {				/* time of last administrative change */
		struct timeval	tv;
		struct {
			uint64_t	ph1;
			uint64_t	ph2;
		} ph;
	} __ifi_lastchange;
#define ifi_lastchange	__ifi_lastchange.tv
};

/*-
 * Interface flags are of two types: network stack owned flags, and driver
 * owned flags.  Historically, these values were stored in the same ifnet
 * flags field, but with the advent of fine-grained locking, they have been
 * broken out such that the network stack is responsible for synchronizing
 * the stack-owned fields, and the device driver the device-owned fields.
 * Both halves can perform lockless reads of the other half's field, subject
 * to accepting the involved races.
 *
 * Both sets of flags come from the same number space, and should not be
 * permitted to conflict, as they are exposed to user space via a single
 * field.
 *
 * The following symbols identify read and write requirements for fields:
 *
 * (i) if_flags field set by device driver before attach, read-only there
 *     after.
 * (n) if_flags field written only by the network stack, read by either the
 *     stack or driver.
 * (d) if_drv_flags field written only by the device driver, read by either
 *     the stack or driver.
 */
#define IFF_UP		0x1		/* (n) interface is up */
#define IFF_BROADCAST	0x2		/* (i) broadcast address valid */
#define IFF_DEBUG	0x4		/* (n) turn on debugging */
#define IFF_LOOPBACK	0x8		/* (i) is a loopback net */
#define IFF_POINTOPOINT	0x10		/* (i) is a point-to-point link */
/*			0x20		   was IFF_SMART */
#define IFF_RUNNING	0x40		/* (d) resources allocated */
#define IFF_NOARP	0x80		/* (n) no address resolution protocol */
#define IFF_PROMISC	0x100		/* (n) receive all packets */
#define IFF_ALLMULTI	0x200		/* (n) receive all multicast packets */
#define IFF_OACTIVE	0x400		/* (d) tx hardware queue is full */
#define IFF_SIMPLEX	0x800		/* (i) can't hear own transmissions */
#define IFF_LINK0	0x1000		/* per link layer defined bit */
#define IFF_LINK1	0x2000		/* per link layer defined bit */
#define IFF_LINK2	0x4000		/* per link layer defined bit */
#define IFF_ALTPHYS	IFF_LINK2	/* use alternate physical connection */
#define IFF_MULTICAST	0x8000		/* (i) supports multicast */
#define IFF_CANTCONFIG	0x10000		/* (i) unconfigurable using ioctl(2) */
#define IFF_PPROMISC	0x20000		/* (n) user-requested promisc mode */
#define IFF_MONITOR	0x40000		/* (n) user-requested monitor mode */
#define IFF_STATICARP	0x80000		/* (n) static ARP */
#define IFF_DYING	0x200000	/* (n) interface is winding down */
#define IFF_RENAMING	0x400000	/* (n) interface is being renamed */

/* flags set internally only: */
#define IFF_CANTCHANGE \
	(IFF_BROADCAST|IFF_POINTOPOINT|IFF_RUNNING|IFF_OACTIVE|\
	    IFF_SIMPLEX|IFF_MULTICAST|IFF_ALLMULTI|IFF_PROMISC|\
	    IFF_DYING|IFF_CANTCONFIG)

/*
 * Values for if_link_state.
 */
enum {
	LINK_STATE_UNKNOWN = 0,	/* link invalid/unknown */
	LINK_STATE_DOWN,	/* link is down */
	LINK_STATE_UP,		/* link is up */
};

/*
 * Some convenience macros used for setting ifi_baudrate.
 * XXX 1000 vs. 1024? --thorpej@netbsd.org
 */
#define IF_Kbps(x)	((uintmax_t)(x) * 1000)	/* kilobits/sec. */
#define IF_Mbps(x)	(IF_Kbps((x) * 1000))	/* megabits/sec. */
#define IF_Gbps(x)	(IF_Mbps((x) * 1000))	/* gigabits/sec. */

/*
 * Capabilities that interfaces can advertise.
 *
 * struct ifnet.if_capabilities
 *   contains the optional features & capabilities a particular interface
 *   supports (not only the driver but also the detected hw revision).
 *   Capabilities are defined by IFCAP_* below.
 * struct ifnet.if_capenable
 *   contains the enabled (either by default or through ifconfig) optional
 *   features & capabilities on this interface.
 *   Capabilities are defined by IFCAP_* below.
 * struct if_data.ifi_hwassist in mbuf CSUM_ flag form, controlled by above
 *   contains the enabled optional feature & capabilities that can be used
 *   individually per packet and are specified in the mbuf pkthdr.csum_flags
 *   field.  IFCAP_* and CSUM_* do not match one to one and CSUM_* may be
 *   more detailed or differentiated than IFCAP_*.
 *   Hwassist features are defined as CSUM_* in sys/mbuf.h
 *
 * Capabilities that cannot be arbitrarily changed with ifconfig/ioctl
 * are listed in IFCAP_CANTCHANGE, similar to IFF_CANTCHANGE.
 * This is not strictly necessary because the common code never
 * changes capabilities, and it is left to the individual driver
 * to do the right thing.  However, having the filter here
 * avoids replication of the same code in all individual drivers.
 */
#define IFCAP_RXCSUM		0x00001	/* can offload checksum on RX */
#define IFCAP_TXCSUM		0x00002	/* can offload checksum on TX */
#define IFCAP_NETCONS		0x00004	/* can be a network console */
#define IFCAP_VLAN_MTU		0x00008	/* VLAN-compatible MTU */
#define IFCAP_VLAN_HWTAGGING	0x00010	/* hardware VLAN tag support */
#define IFCAP_JUMBO_MTU		0x00020	/* 9000 byte MTU supported */
#define IFCAP_POLLING		0x00040	/* driver supports polling */
#define IFCAP_VLAN_HWCSUM	0x00080	/* can do IFCAP_HWCSUM on VLANs */
#define IFCAP_TSO4		0x00100	/* can do TCP Segmentation Offload */
#define IFCAP_TSO6		0x00200	/* can do TCP6 Segmentation Offload */
#define IFCAP_LRO		0x00400	/* can do Large Receive Offload */
#define IFCAP_WOL_UCAST		0x00800	/* wake on any unicast frame */
#define IFCAP_WOL_MCAST		0x01000	/* wake on any multicast frame */
#define IFCAP_WOL_MAGIC		0x02000	/* wake on any Magic Packet */
#define IFCAP_TOE4		0x04000	/* interface can offload TCP */
#define IFCAP_TOE6		0x08000	/* interface can offload TCP6 */
#define IFCAP_VLAN_HWFILTER	0x10000	/* interface hw can filter vlan tag */
#define IFCAP_POLLING_NOCOUNT	0x20000	/* polling ticks cannot be fragmented */
#define IFCAP_VLAN_HWTSO	0x40000	/* can do IFCAP_TSO on VLANs */
#define IFCAP_LINKSTATE		0x80000	/* the runtime link state is dynamic */
#define IFCAP_NETMAP		0x100000 /* netmap mode supported/enabled */
#define IFCAP_RXCSUM_IPV6	0x200000 /* can offload checksum on IPv6 RX */
#define IFCAP_TXCSUM_IPV6	0x400000 /* can offload checksum on IPv6 TX */
#define IFCAP_HWSTATS		0x800000 /* manages counters internally */

#define IFCAP_HWCSUM_IPV6	(IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6)

#define IFCAP_HWCSUM	(IFCAP_RXCSUM | IFCAP_TXCSUM)
#define IFCAP_TSO	(IFCAP_TSO4 | IFCAP_TSO6)
#define IFCAP_WOL	(IFCAP_WOL_UCAST | IFCAP_WOL_MCAST | IFCAP_WOL_MAGIC)
#define IFCAP_TOE	(IFCAP_TOE4 | IFCAP_TOE6)

#define IFCAP_CANTCHANGE	(IFCAP_NETMAP)

#define IFQ_MAXLEN	50
#define IFNET_SLOWHZ	1		/* granularity is 1 second */

/*
 * Message format for use in obtaining information about interfaces
 * from getkerninfo and the routing socket
 * For the new, extensible interface see struct if_msghdrl below.
 */
struct if_msghdr {
	u_short	ifm_msglen;	/* to skip over non-understood messages */
	u_char	ifm_version;	/* future binary compatibility */
	u_char	ifm_type;	/* message type */
	int	ifm_addrs;	/* like rtm_addrs */
	int	ifm_flags;	/* value of if_flags */
	u_short	ifm_index;	/* index for associated ifp */
	struct	if_data ifm_data;/* statistics and other data about if */
};

/*
 * The 'l' version shall be used by new interfaces, like NET_RT_IFLISTL.
 * It is extensible after ifm_data_off or within ifm_data.  Both the
 * if_msghdr and if_data now have a member field detailing the struct length
 * in addition to the routing message length.  Macros are provided to find
 * the start of ifm_data and the start of the socket address structures
 * immediately following struct if_msghdrl given a pointer to struct
 * if_msghdrl.
 */
#define IF_MSGHDRL_IFM_DATA(_l) \
	(struct if_data *)((char *)(_l) + (_l)->ifm_data_off)
#define IF_MSGHDRL_RTA(_l) \
	(void *)((uintptr_t)(_l) + (_l)->ifm_len)
struct if_msghdrl {
	u_short	ifm_msglen;	/* to skip over non-understood messages */
	u_char	ifm_version;	/* future binary compatibility */
	u_char	ifm_type;	/* message type */
	int	ifm_addrs;	/* like rtm_addrs */
	int	ifm_flags;	/* value of if_flags */
	u_short	ifm_index;	/* index for associated ifp */
	u_short	_ifm_spare1;	/* spare space to grow if_index, see if_var.h */
	u_short	ifm_len;	/* length of if_msghdrl incl. if_data */
	u_short	ifm_data_off;	/* offset of if_data from beginning */
	struct	if_data ifm_data;/* statistics and other data about if */
};

/*
 * Message format for use in obtaining information about interface addresses
 * from getkerninfo and the routing socket
 * For the new, extensible interface see struct ifa_msghdrl below.
 */
struct ifa_msghdr {
	u_short	ifam_msglen;	/* to skip over non-understood messages */
	u_char	ifam_version;	/* future binary compatibility */
	u_char	ifam_type;	/* message type */
	int	ifam_addrs;	/* like rtm_addrs */
	int	ifam_flags;	/* value of ifa_flags */
	u_short	ifam_index;	/* index for associated ifp */
	int	ifam_metric;	/* value of ifa_ifp->if_metric */
};

/*
 * The 'l' version shall be used by new interfaces, like NET_RT_IFLISTL.
 * It is extensible after ifam_metric or within ifam_data.  Both the
 * ifa_msghdrl and if_data now have a member field detailing the struct
 * length in addition to the routing message length.  Macros are provided
 * to find the start of ifam_data and the start of the socket address
 * structures immediately following struct ifa_msghdrl given a pointer to
 * struct ifa_msghdrl.
 */
#define IFA_MSGHDRL_IFAM_DATA(_l) \
	(struct if_data *)((char *)(_l) + (_l)->ifam_data_off)
#define IFA_MSGHDRL_RTA(_l) \
	(void *)((uintptr_t)(_l) + (_l)->ifam_len)
struct ifa_msghdrl {
	u_short	ifam_msglen;	/* to skip over non-understood messages */
	u_char	ifam_version;	/* future binary compatibility */
	u_char	ifam_type;	/* message type */
	int	ifam_addrs;	/* like rtm_addrs */
	int	ifam_flags;	/* value of ifa_flags */
	u_short	ifam_index;	/* index for associated ifp */
	u_short	_ifam_spare1;	/* spare space to grow if_index, see if_var.h */
	u_short	ifam_len;	/* length of ifa_msghdrl incl. if_data */
	u_short	ifam_data_off;	/* offset of if_data from beginning */
	int	ifam_metric;	/* value of ifa_ifp->if_metric */
	struct	if_data ifam_data;/* statistics and other data about if or
				 * address */
};

/*
 * Message format for use in obtaining information about multicast addresses
 * from the routing socket
 */
struct ifma_msghdr {
	u_short	ifmam_msglen;	/* to skip over non-understood messages */
	u_char	ifmam_version;	/* future binary compatibility */
	u_char	ifmam_type;	/* message type */
	int	ifmam_addrs;	/* like rtm_addrs */
	int	ifmam_flags;	/* value of ifa_flags */
	u_short	ifmam_index;	/* index for associated ifp */
};

/*
 * Message format announcing the arrival or departure of a network interface.
 */
struct if_announcemsghdr {
	u_short	ifan_msglen;	/* to skip over non-understood messages */
	u_char	ifan_version;	/* future binary compatibility */
	u_char	ifan_type;	/* message type */
	u_short	ifan_index;	/* index for associated ifp */
	char	ifan_name[IFNAMSIZ]; /* if name, e.g. "en0" */
	u_short	ifan_what;	/* what type of announcement */
};

#define IFAN_ARRIVAL	0	/* interface arrival */
#define IFAN_DEPARTURE	1	/* interface departure */

/*
 * Buffer with length to be used in SIOCGIFDESCR/SIOCSIFDESCR requests
 */
struct ifreq_buffer {
	size_t	length;
	void	*buffer;
};

/*
 * Interface request structure used for socket
 * ioctl's.  All interface ioctl's must have parameter
 * definitions which begin with ifr_name.  The
 * remainder may be interface specific.
 */
struct ifreq {
	char	ifr_name[IFNAMSIZ];	/* if name, e.g. "en0" */
	union {
		struct	sockaddr ifru_addr;
		struct	sockaddr ifru_dstaddr;
		struct	sockaddr ifru_broadaddr;
		struct	ifreq_buffer ifru_buffer;
		struct {
			uint32_t	ifrucap_reqcap;	/* requested/returned */
			uint32_t	ifrucap_curcap;	/* current values */
			uint64_t	ifrucap_hwassist; /* returned hwassist */
		} ifru_cap;
-		u_int	ifru_flags;
-		short	ifru_sflags[2];
+		short	ifru_flags[2];
		short	ifru_index;
		int	ifru_jid;
		int	ifru_metric;
		int	ifru_mtu;
		int	ifru_phys;
		int	ifru_media;
		caddr_t	ifru_data;
		u_int	ifru_fib;
	} ifr_ifru;
#define ifr_addr	ifr_ifru.ifru_addr	/* address */
#define ifr_dstaddr	ifr_ifru.ifru_dstaddr	/* other end of p-to-p link */
#define ifr_broadaddr	ifr_ifru.ifru_broadaddr	/* broadcast address */
#define ifr_buffer	ifr_ifru.ifru_buffer	/* user supplied buffer with its length */
-#define ifr_flags	ifr_ifru.ifru_flags	/* flags (after fixup) */
-#define ifr_flagslow	ifr_ifru.ifru_sflags[0]	/* flags (low 16 bits) */
-#define ifr_flagshigh	ifr_ifru.ifru_sflags[1]	/* flags (high 16 bits) */
+#define ifr_flags	ifr_ifru.ifru_flags[0]	/* flags (low 16 bits) */
+#define ifr_flagshigh	ifr_ifru.ifru_flags[1]	/* flags (high 16 bits) */
#define ifr_jid		ifr_ifru.ifru_jid	/* jail/vnet */
#define ifr_metric	ifr_ifru.ifru_metric	/* metric */
#define ifr_mtu		ifr_ifru.ifru_mtu	/* mtu */
#define ifr_phys	ifr_ifru.ifru_phys	/* physical wire */
#define ifr_media	ifr_ifru.ifru_media	/* physical media */
#define ifr_data	ifr_ifru.ifru_data	/* for use by interface */
#define ifr_reqcap	ifr_ifru.ifru_cap.ifrucap_reqcap
#define ifr_curcap	ifr_ifru.ifru_cap.ifrucap_curcap
#define ifr_hwassist	ifr_ifru.ifru_cap.ifrucap_hwassist
#define ifr_index	ifr_ifru.ifru_index	/* interface index */
#define ifr_fib		ifr_ifru.ifru_fib	/* interface fib */
};

#define _SIZEOF_ADDR_IFREQ(ifr) \
	((ifr).ifr_addr.sa_len > sizeof(struct sockaddr) ? \
	 (sizeof(struct ifreq) - sizeof(struct sockaddr) + \
	  (ifr).ifr_addr.sa_len) : sizeof(struct ifreq))

struct ifaliasreq {
	char	ifra_name[IFNAMSIZ];	/* if name, e.g. "en0" */
	struct	sockaddr ifra_addr;
	struct	sockaddr ifra_broadaddr;
	struct	sockaddr ifra_mask;
	int	ifra_vhid;
};

/* 9.x compat */
struct oifaliasreq {
	char	ifra_name[IFNAMSIZ];
	struct	sockaddr ifra_addr;
	struct	sockaddr ifra_broadaddr;
	struct	sockaddr ifra_mask;
};

struct ifmediareq {
	char	ifm_name[IFNAMSIZ];	/* if name, e.g. "en0" */
	int	ifm_current;		/* current media options */
	int	ifm_mask;		/* don't care mask */
	int	ifm_status;		/* media status */
	int	ifm_active;		/* active options */
	int	ifm_count;		/* # entries in ifm_ulist array */
	int	*ifm_ulist;		/* media words */
};

struct ifdrv {
	char		ifd_name[IFNAMSIZ];	/* if name, e.g. "en0" */
"en0" */ unsigned long ifd_cmd; size_t ifd_len; void *ifd_data; }; /* * Structure used to retrieve aux status data from interfaces. * Kernel suppliers to this interface should respect the formatting * needed by ifconfig(8): each line starts with a TAB and ends with * a newline. The canonical example to copy and paste is in if_tun.c. */ #define IFSTATMAX 800 /* 10 lines of text */ struct ifstat { char ifs_name[IFNAMSIZ]; /* if name, e.g. "en0" */ char ascii[IFSTATMAX + 1]; }; /* * Structure used in SIOCGIFCONF request. * Used to retrieve interface configuration * for machine (useful for programs which * must know all networks accessible). */ struct ifconf { int ifc_len; /* size of associated buffer */ union { caddr_t ifcu_buf; struct ifreq *ifcu_req; } ifc_ifcu; #define ifc_buf ifc_ifcu.ifcu_buf /* buffer address */ #define ifc_req ifc_ifcu.ifcu_req /* array of structures returned */ }; /* * interface groups */ #define IFG_ALL "all" /* group contains all interfaces */ /* XXX: will we implement this? */ #define IFG_EGRESS "egress" /* if(s) default route(s) point to */ struct ifg_req { union { char ifgrqu_group[IFNAMSIZ]; char ifgrqu_member[IFNAMSIZ]; } ifgrq_ifgrqu; #define ifgrq_group ifgrq_ifgrqu.ifgrqu_group #define ifgrq_member ifgrq_ifgrqu.ifgrqu_member }; /* * Used to lookup groups for an interface */ struct ifgroupreq { char ifgr_name[IFNAMSIZ]; u_int ifgr_len; union { char ifgru_group[IFNAMSIZ]; struct ifg_req *ifgru_groups; } ifgr_ifgru; #define ifgr_group ifgr_ifgru.ifgru_group #define ifgr_groups ifgr_ifgru.ifgru_groups }; /* * Structure used to request i2c data * from interface transceivers. */ struct ifi2creq { uint8_t dev_addr; /* i2c address (0xA0, 0xA2) */ uint8_t offset; /* read offset */ uint8_t len; /* read length */ uint8_t spare0; uint32_t spare1; uint8_t data[8]; /* read buffer */ }; #endif /* __BSD_VISIBLE */ #ifndef _KERNEL struct if_nameindex { unsigned int if_index; /* 1, 2, ... */ char *if_name; /* null terminated name: "le0", ... */ }; __BEGIN_DECLS void if_freenameindex(struct if_nameindex *); char *if_indextoname(unsigned int, char *); struct if_nameindex *if_nameindex(void); unsigned int if_nametoindex(const char *); __END_DECLS #endif #ifdef _KERNEL #include /* * Under _KERNEL there live declarations from net/if.c, that are public * and available to network device drivers. Declarations that are protected * from drivers, but available to the stack live in if_var.h. */ /* Some forward declarations are required. */ struct mbuf; /* if_input, if_output, if_transmit */ struct route; /* if_output */ struct vnet; /* if_reassign */ #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_IFADDR); MALLOC_DECLARE(M_IFMADDR); #endif typedef enum { IFCOUNTER_IPACKETS = 0, IFCOUNTER_IERRORS, IFCOUNTER_OPACKETS, IFCOUNTER_OERRORS, IFCOUNTER_COLLISIONS, IFCOUNTER_IBYTES, IFCOUNTER_OBYTES, IFCOUNTER_IMCASTS, IFCOUNTER_OMCASTS, IFCOUNTER_IQDROPS, IFCOUNTER_OQDROPS, IFCOUNTER_NOPROTO, IFCOUNTERS /* Array size (used internally). 
typedef enum {
	IF_DRIVER_SOFTC,
	IF_LLADDR,
	IF_BPF,
	IF_NAME,
} ift_feature;

typedef struct ifnet * if_t;

typedef void	(*if_init_t)(void *);
typedef void	(*if_input_t)(if_t, struct mbuf *);
typedef int	(*if_transmit_t)(if_t, struct mbuf *);
typedef int	(*if_output_t)(if_t, struct mbuf *,
		    const struct sockaddr *, struct route *);
typedef int	(*if_ioctl_t)(if_t, u_long, void *, struct thread *);
typedef uint64_t (*if_get_counter_t)(if_t, ift_counter);
typedef void	(*if_qflush_t)(if_t);
typedef int	(*if_resolvemulti_t)(if_t, struct sockaddr **,
		    struct sockaddr *);
typedef void	(*if_reassign_t)(if_t, struct vnet *);

#ifdef DEVICE_POLLING
enum poll_cmd { POLL_ONLY, POLL_AND_CHECK_STATUS };
typedef int (*if_poll_t)(if_t ifp, enum poll_cmd cmd, int count);
#endif

/*
 * Interface methods.  Usually stored in the ifdriver definition; however,
 * some subsystems like lagg(4) or altq(4) may put a shim ifops before the
 * native ones.
 */
struct ifops {
	if_input_t	ifop_input;	/* input routine (from h/w driver) */
	if_transmit_t	ifop_transmit;	/* initiate output routine */
	if_output_t	ifop_output;
#ifdef DEVICE_POLLING
	if_poll_t	ifop_poll;
#endif
	if_ioctl_t	ifop_ioctl;	/* ioctl routine */
	if_get_counter_t ifop_get_counter; /* get counter values */
	if_init_t	ifop_init;	/* init routine */
	if_qflush_t	ifop_qflush;	/* flush any queue */
	if_resolvemulti_t ifop_resolvemulti; /* validate/resolve multicast */
	if_reassign_t	ifop_reassign;	/* reassign to vnet routine */
	struct ifops	*ifop_next;
	uint8_t		ifop_origin;
};

enum {
	IFOP_ORIGIN_DRIVER = 1,
	IFOP_ORIGIN_IFTYPE = 2,
};

/*
 * Structure describing TSO properties of an interface.  Known both to the
 * ifnet layer and to TCP.  Most interfaces point to a static tsomax in the
 * ifdriver definition.  However, vlan(4) and lagg(4) require a dynamic
 * tsomax.
 */
struct iftsomax {
	uint32_t tsomax_bytes;	/* TSO total burst length limit in bytes */
	uint32_t tsomax_segcount; /* TSO maximum segment count */
	uint32_t tsomax_segsize; /* TSO maximum segment size in bytes */
};

/*
 * Driver description.  All instances of a driver share common properties
 * that are stable during runtime.  The stack can bless them, which means
 * modify them, when attaching the first instance of a given driver.
 */
struct ifdriver {
	struct ifops	ifdrv_ops;
	struct iftsomax	*ifdrv_tsomax;
	/*
	 * The ifdrv_name must be a pointer to storage which will last as
	 * long as any interface does.  For physical devices, the result of
	 * device_get_name(dev) is a good choice and for pseudo-devices a
	 * static string works well.
	 */
	const char	*ifdrv_name;
	struct if_clone	*ifdrv_clone;
	ifType		ifdrv_type;	/* from if_types.h */
	uint8_t		ifdrv_hdrlen;	/* media header length */
	uint8_t		ifdrv_addrlen;	/* media address length */
	uint32_t	ifdrv_dlt;	/* from net/bpf.h */
	uint32_t	ifdrv_dlt_hdrlen;
	uint32_t	ifdrv_maxqlen;	/* max queue length for if_snd */
	/*
	 * Owned by stack.  Drivers shouldn't initialize these!
	 */
	uint32_t	__ifdrv_stack_owned;
};

/*
 * Arguments for if_attach().  Usually stored on the stack of the
 * device_attach function in the driver.  In the future this structure
 * will probably have different versions, so that we can support older
 * ABIs for drivers.
 */
struct if_attach_args {
	uint8_t		ifat_version;	/* must be IF_ATTACH_VERSION */
#define IF_ATTACH_VERSION	1
	uint8_t		ifat_spare8;
	uint16_t	ifat_spare16;
	uint32_t	ifat_spare32;
	int		ifat_error;	/* Filled on return. */
	struct ifdriver	*ifat_drv;
	void		*ifat_softc;	/* Driver private softc. */
	const uint8_t	*ifat_lla;	/* Link-level address. */
	int32_t		ifat_dunit;	/* Specific unit or a hint. */
#define IFAT_DUNIT_NONE	(-1)
	char		*ifat_name;	/* If driver wants a specific name. */
	/*
	 * Variables that may differ between two instances of the same
	 * driver, but are constant within instance lifetime.
	 */
	uint64_t	ifat_capabilities;
	/*
	 * MTU, flags, capabilities at attach time.  Driver
	 * can change them later.
	 */
	uint32_t	ifat_mtu;
	uint64_t	ifat_flags;
	uint64_t	ifat_capenable;
	uint64_t	ifat_hwassist;
	uint64_t	ifat_baudrate;
	/*
	 * If the ifat_tsomax pointer is non-zero, then the interface will
	 * have a dynamically allocated ifdrv_tsomax that can be changed
	 * later.  Otherwise it inherits the static iftsomax from the
	 * ifdriver.
	 */
	struct iftsomax	*ifat_tsomax;
};

/*
 * Interface manipulation functions available to drivers.
 */
if_t	if_attach(struct if_attach_args *);
void	if_detach(if_t);
void	if_mtap(if_t, struct mbuf *, void *, u_int);
void	if_inc_counter(if_t, ift_counter, int64_t);
void	if_inc_txcounters(if_t, struct mbuf *);
void	if_setbaudrate(if_t, uint64_t);
void	if_link_state_change(if_t, int);
void *	if_getsoftc(if_t, ift_feature);
int	if_printf(if_t, const char *, ...) __printflike(2, 3);
int	if_drvioctl(u_long, struct ifnet *, void *, struct thread *);
uint64_t if_get_counter_default(if_t, ift_counter);

/*
 * Interface if_ops that are available for drivers.
 */
void	if_input_noinline(if_t, struct mbuf *);
#define if_input(ifp, m)	if_input_noinline(ifp, m)
int	if_transmit_noinline(if_t, struct mbuf *);
#define if_transmit(ifp, m)	if_transmit_noinline(ifp, m)

/*
 * Traversing interface address lists.
 */
typedef void ifaddr_cb_t(void *, struct sockaddr *, struct sockaddr *,
	    struct sockaddr *);
typedef void ifmaddr_cb_t(void *, struct sockaddr *);
void	if_foreach_addr(if_t, ifaddr_cb_t, void *);
void	if_foreach_maddr(if_t, ifmaddr_cb_t, void *);

/*
 * Generic software send queue manipulation.
 */
int	if_snd_len(if_t);
int	if_snd_enqueue(if_t, struct mbuf *);
struct mbuf * if_snd_dequeue(if_t);
void	if_snd_prepend(if_t, struct mbuf *);

/*
 * Type-enforcing inliners over if_getsoftc().
 */
static inline char *
if_lladdr(if_t ifp)
{

	return ((char *)(if_getsoftc(ifp, IF_LLADDR)));
}

static inline const char *
if_name(if_t ifp)
{

	return ((char *)(if_getsoftc(ifp, IF_NAME)));
}

#endif /* _KERNEL */

#endif /* !_NET_IF_H_ */
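[Editor's note: a minimal attach sketch, not part of the diff, tying together struct ifdriver, struct if_attach_args, and if_attach() as declared above. All mydrv_* names and struct mydrv_softc are hypothetical; only the ifdriver/if_attach_args usage mirrors the header.]

static struct ifdriver mydrv_ifdrv = {
	.ifdrv_ops = {
		.ifop_ioctl	= mydrv_ioctl,		/* hypothetical */
		.ifop_init	= mydrv_init,		/* hypothetical */
		.ifop_transmit	= mydrv_transmit,	/* hypothetical */
		.ifop_get_counter = if_get_counter_default,
	},
	.ifdrv_name	= "mydrv",
	.ifdrv_type	= IFT_ETHER,
	.ifdrv_maxqlen	= IFQ_MAXLEN,
};

static int
mydrv_attach(device_t dev, struct mydrv_softc *sc)
{
	struct if_attach_args ifat = {
		.ifat_version	= IF_ATTACH_VERSION,
		.ifat_drv	= &mydrv_ifdrv,
		.ifat_softc	= sc,
		.ifat_dunit	= device_get_unit(dev),
		.ifat_lla	= sc->sc_lladdr,	/* 6-byte MAC, hypothetical field */
		.ifat_mtu	= 1500,
		.ifat_flags	= IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST,
	};

	sc->sc_ifp = if_attach(&ifat);	/* ifat_error is filled on return */
	return (ifat.ifat_error);
}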