Index: head/sys/net/if_vxlan.c
===================================================================
--- head/sys/net/if_vxlan.c	(revision 349368)
+++ head/sys/net/if_vxlan.c	(revision 349369)
@@ -1,3176 +1,3176 @@
 /*-
  * Copyright (c) 2014, Bryan Venteicher <bryanv@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/hash.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/refcount.h>
 #include <sys/rmlock.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_clone.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
 #include <net/if_types.h>
 #include <net/if_vxlan.h>
 #include <net/netisr.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/ip_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 
 struct vxlan_softc;
 LIST_HEAD(vxlan_softc_head, vxlan_softc);
 
 struct vxlan_socket_mc_info {
 	union vxlan_sockaddr		 vxlsomc_saddr;
 	union vxlan_sockaddr		 vxlsomc_gaddr;
 	int				 vxlsomc_ifidx;
 	int				 vxlsomc_users;
 };
 
 #define VXLAN_SO_MC_MAX_GROUPS		32
 
 #define VXLAN_SO_VNI_HASH_SHIFT		6
 #define VXLAN_SO_VNI_HASH_SIZE		(1 << VXLAN_SO_VNI_HASH_SHIFT)
 #define VXLAN_SO_VNI_HASH(_vni)		((_vni) % VXLAN_SO_VNI_HASH_SIZE)
 
 struct vxlan_socket {
 	struct socket			*vxlso_sock;
 	struct rmlock			 vxlso_lock;
 	u_int				 vxlso_refcnt;
 	union vxlan_sockaddr		 vxlso_laddr;
 	LIST_ENTRY(vxlan_socket)	 vxlso_entry;
 	struct vxlan_softc_head		 vxlso_vni_hash[VXLAN_SO_VNI_HASH_SIZE];
 	struct vxlan_socket_mc_info	 vxlso_mc[VXLAN_SO_MC_MAX_GROUPS];
 };
 
 #define VXLAN_SO_RLOCK(_vso, _p)	rm_rlock(&(_vso)->vxlso_lock, (_p))
 #define VXLAN_SO_RUNLOCK(_vso, _p)	rm_runlock(&(_vso)->vxlso_lock, (_p))
 #define VXLAN_SO_WLOCK(_vso)		rm_wlock(&(_vso)->vxlso_lock)
 #define VXLAN_SO_WUNLOCK(_vso)		rm_wunlock(&(_vso)->vxlso_lock)
 #define VXLAN_SO_LOCK_ASSERT(_vso) \
     rm_assert(&(_vso)->vxlso_lock, RA_LOCKED)
 #define VXLAN_SO_LOCK_WASSERT(_vso) \
     rm_assert(&(_vso)->vxlso_lock, RA_WLOCKED)
 
 #define VXLAN_SO_ACQUIRE(_vso)		refcount_acquire(&(_vso)->vxlso_refcnt)
 #define VXLAN_SO_RELEASE(_vso)		refcount_release(&(_vso)->vxlso_refcnt)
 
 struct vxlan_ftable_entry {
 	LIST_ENTRY(vxlan_ftable_entry)	 vxlfe_hash;
 	uint16_t			 vxlfe_flags;
 	uint8_t				 vxlfe_mac[ETHER_ADDR_LEN];
 	union vxlan_sockaddr		 vxlfe_raddr;
 	time_t				 vxlfe_expire;
 };
 
 #define VXLAN_FE_FLAG_DYNAMIC		0x01
 #define VXLAN_FE_FLAG_STATIC		0x02
 
 #define VXLAN_FE_IS_DYNAMIC(_fe) \
     ((_fe)->vxlfe_flags & VXLAN_FE_FLAG_DYNAMIC)
 
 #define VXLAN_SC_FTABLE_SHIFT		9
 #define VXLAN_SC_FTABLE_SIZE		(1 << VXLAN_SC_FTABLE_SHIFT)
 #define VXLAN_SC_FTABLE_MASK		(VXLAN_SC_FTABLE_SIZE - 1)
 #define VXLAN_SC_FTABLE_HASH(_sc, _mac)	\
     (vxlan_mac_hash(_sc, _mac) % VXLAN_SC_FTABLE_SIZE)
 
 LIST_HEAD(vxlan_ftable_head, vxlan_ftable_entry);
 
 struct vxlan_statistics {
 	uint32_t	ftable_nospace;
 	uint32_t	ftable_lock_upgrade_failed;
 };
 
 struct vxlan_softc {
 	struct ifnet			*vxl_ifp;
 	struct vxlan_socket		*vxl_sock;
 	uint32_t			 vxl_vni;
 	union vxlan_sockaddr		 vxl_src_addr;
 	union vxlan_sockaddr		 vxl_dst_addr;
 	uint32_t			 vxl_flags;
 #define VXLAN_FLAG_INIT		0x0001
 #define VXLAN_FLAG_TEARDOWN	0x0002
 #define VXLAN_FLAG_LEARN	0x0004
 
 	uint32_t			 vxl_port_hash_key;
 	uint16_t			 vxl_min_port;
 	uint16_t			 vxl_max_port;
 	uint8_t				 vxl_ttl;
 
 	/* Lookup table from MAC address to forwarding entry. */
 	uint32_t			 vxl_ftable_cnt;
 	uint32_t			 vxl_ftable_max;
 	uint32_t			 vxl_ftable_timeout;
 	uint32_t			 vxl_ftable_hash_key;
 	struct vxlan_ftable_head	*vxl_ftable;
 
 	/* Derived from vxl_dst_addr. */
 	struct vxlan_ftable_entry	 vxl_default_fe;
 
 	struct ip_moptions		*vxl_im4o;
 	struct ip6_moptions		*vxl_im6o;
 
 	struct rmlock			 vxl_lock;
 	volatile u_int			 vxl_refcnt;
 
 	int				 vxl_unit;
 	int				 vxl_vso_mc_index;
 	struct vxlan_statistics		 vxl_stats;
 	struct sysctl_oid		*vxl_sysctl_node;
 	struct sysctl_ctx_list		 vxl_sysctl_ctx;
 	struct callout			 vxl_callout;
 	struct ether_addr		 vxl_hwaddr;
 	int				 vxl_mc_ifindex;
 	struct ifnet			*vxl_mc_ifp;
 	struct ifmedia 			 vxl_media;
 	char				 vxl_mc_ifname[IFNAMSIZ];
 	LIST_ENTRY(vxlan_softc)		 vxl_entry;
 	LIST_ENTRY(vxlan_softc)		 vxl_ifdetach_list;
 };
 
 #define VXLAN_RLOCK(_sc, _p)	rm_rlock(&(_sc)->vxl_lock, (_p))
 #define VXLAN_RUNLOCK(_sc, _p)	rm_runlock(&(_sc)->vxl_lock, (_p))
 #define VXLAN_WLOCK(_sc)	rm_wlock(&(_sc)->vxl_lock)
 #define VXLAN_WUNLOCK(_sc)	rm_wunlock(&(_sc)->vxl_lock)
 #define VXLAN_LOCK_WOWNED(_sc)	rm_wowned(&(_sc)->vxl_lock)
 #define VXLAN_LOCK_ASSERT(_sc)	rm_assert(&(_sc)->vxl_lock, RA_LOCKED)
 #define VXLAN_LOCK_WASSERT(_sc) rm_assert(&(_sc)->vxl_lock, RA_WLOCKED)
 #define VXLAN_UNLOCK(_sc, _p) do {		\
     if (VXLAN_LOCK_WOWNED(_sc))			\
 	VXLAN_WUNLOCK(_sc);			\
     else					\
 	VXLAN_RUNLOCK(_sc, _p);			\
 } while (0)
 
 #define VXLAN_ACQUIRE(_sc)	refcount_acquire(&(_sc)->vxl_refcnt)
 #define VXLAN_RELEASE(_sc)	refcount_release(&(_sc)->vxl_refcnt)
 
 #define	satoconstsin(sa)	((const struct sockaddr_in *)(sa))
 #define	satoconstsin6(sa)	((const struct sockaddr_in6 *)(sa))
 
 struct vxlanudphdr {
 	struct udphdr		vxlh_udp;
 	struct vxlan_header	vxlh_hdr;
 } __packed;
 
 static int	vxlan_ftable_addr_cmp(const uint8_t *, const uint8_t *);
 static void	vxlan_ftable_init(struct vxlan_softc *);
 static void	vxlan_ftable_fini(struct vxlan_softc *);
 static void	vxlan_ftable_flush(struct vxlan_softc *, int);
 static void	vxlan_ftable_expire(struct vxlan_softc *);
 static int	vxlan_ftable_update_locked(struct vxlan_softc *,
 		    const union vxlan_sockaddr *, const uint8_t *,
 		    struct rm_priotracker *);
 static int	vxlan_ftable_learn(struct vxlan_softc *,
 		    const struct sockaddr *, const uint8_t *);
 static int	vxlan_ftable_sysctl_dump(SYSCTL_HANDLER_ARGS);
 
 static struct vxlan_ftable_entry *
 		vxlan_ftable_entry_alloc(void);
 static void	vxlan_ftable_entry_free(struct vxlan_ftable_entry *);
 static void	vxlan_ftable_entry_init(struct vxlan_softc *,
 		    struct vxlan_ftable_entry *, const uint8_t *,
 		    const struct sockaddr *, uint32_t);
 static void	vxlan_ftable_entry_destroy(struct vxlan_softc *,
 		    struct vxlan_ftable_entry *);
 static int	vxlan_ftable_entry_insert(struct vxlan_softc *,
 		    struct vxlan_ftable_entry *);
 static struct vxlan_ftable_entry *
 		vxlan_ftable_entry_lookup(struct vxlan_softc *,
 		    const uint8_t *);
 static void	vxlan_ftable_entry_dump(struct vxlan_ftable_entry *,
 		    struct sbuf *);
 
 static struct vxlan_socket *
 		vxlan_socket_alloc(const union vxlan_sockaddr *);
 static void	vxlan_socket_destroy(struct vxlan_socket *);
 static void	vxlan_socket_release(struct vxlan_socket *);
 static struct vxlan_socket *
 		vxlan_socket_lookup(union vxlan_sockaddr *vxlsa);
 static void	vxlan_socket_insert(struct vxlan_socket *);
 static int	vxlan_socket_init(struct vxlan_socket *, struct ifnet *);
 static int	vxlan_socket_bind(struct vxlan_socket *, struct ifnet *);
 static int	vxlan_socket_create(struct ifnet *, int,
 		    const union vxlan_sockaddr *, struct vxlan_socket **);
 static void	vxlan_socket_ifdetach(struct vxlan_socket *,
 		    struct ifnet *, struct vxlan_softc_head *);
 
 static struct vxlan_socket *
 		vxlan_socket_mc_lookup(const union vxlan_sockaddr *);
 static int	vxlan_sockaddr_mc_info_match(
 		    const struct vxlan_socket_mc_info *,
 		    const union vxlan_sockaddr *,
 		    const union vxlan_sockaddr *, int);
 static int	vxlan_socket_mc_join_group(struct vxlan_socket *,
 		    const union vxlan_sockaddr *, const union vxlan_sockaddr *,
 		    int *, union vxlan_sockaddr *);
 static int	vxlan_socket_mc_leave_group(struct vxlan_socket *,
 		    const union vxlan_sockaddr *,
 		    const union vxlan_sockaddr *, int);
 static int	vxlan_socket_mc_add_group(struct vxlan_socket *,
 		    const union vxlan_sockaddr *, const union vxlan_sockaddr *,
 		    int, int *);
 static void	vxlan_socket_mc_release_group_by_idx(struct vxlan_socket *,
 		    int);
 
 static struct vxlan_softc *
 		vxlan_socket_lookup_softc_locked(struct vxlan_socket *,
 		    uint32_t);
 static struct vxlan_softc *
 		vxlan_socket_lookup_softc(struct vxlan_socket *, uint32_t);
 static int	vxlan_socket_insert_softc(struct vxlan_socket *,
 		    struct vxlan_softc *);
 static void	vxlan_socket_remove_softc(struct vxlan_socket *,
 		    struct vxlan_softc *);
 
 static struct ifnet *
 		vxlan_multicast_if_ref(struct vxlan_softc *, int);
 static void	vxlan_free_multicast(struct vxlan_softc *);
 static int	vxlan_setup_multicast_interface(struct vxlan_softc *);
 
 static int	vxlan_setup_multicast(struct vxlan_softc *);
 static int	vxlan_setup_socket(struct vxlan_softc *);
 static void	vxlan_setup_interface(struct vxlan_softc *);
 static int	vxlan_valid_init_config(struct vxlan_softc *);
 static void	vxlan_init_wait(struct vxlan_softc *);
 static void	vxlan_init_complete(struct vxlan_softc *);
 static void	vxlan_init(void *);
 static void	vxlan_release(struct vxlan_softc *);
 static void	vxlan_teardown_wait(struct vxlan_softc *);
 static void	vxlan_teardown_complete(struct vxlan_softc *);
 static void	vxlan_teardown_locked(struct vxlan_softc *);
 static void	vxlan_teardown(struct vxlan_softc *);
 static void	vxlan_ifdetach(struct vxlan_softc *, struct ifnet *,
 		    struct vxlan_softc_head *);
 static void	vxlan_timer(void *);
 
 static int	vxlan_ctrl_get_config(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_set_vni(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_set_local_addr(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_set_remote_addr(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_set_local_port(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_set_remote_port(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_set_port_range(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_set_ftable_timeout(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_set_ftable_max(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_set_multicast_if(struct vxlan_softc * , void *);
 static int	vxlan_ctrl_set_ttl(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_set_learn(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_ftable_entry_add(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_ftable_entry_rem(struct vxlan_softc *, void *);
 static int	vxlan_ctrl_flush(struct vxlan_softc *, void *);
 static int	vxlan_ioctl_drvspec(struct vxlan_softc *,
 		    struct ifdrv *, int);
 static int	vxlan_ioctl_ifflags(struct vxlan_softc *);
 static int	vxlan_ioctl(struct ifnet *, u_long, caddr_t);
 
 #if defined(INET) || defined(INET6)
 static uint16_t vxlan_pick_source_port(struct vxlan_softc *, struct mbuf *);
 static void	vxlan_encap_header(struct vxlan_softc *, struct mbuf *,
 		    int, uint16_t, uint16_t);
 #endif
 static int	vxlan_encap4(struct vxlan_softc *,
 		    const union vxlan_sockaddr *, struct mbuf *);
 static int	vxlan_encap6(struct vxlan_softc *,
 		    const union vxlan_sockaddr *, struct mbuf *);
 static int	vxlan_transmit(struct ifnet *, struct mbuf *);
 static void	vxlan_qflush(struct ifnet *);
 static void	vxlan_rcv_udp_packet(struct mbuf *, int, struct inpcb *,
 		    const struct sockaddr *, void *);
 static int	vxlan_input(struct vxlan_socket *, uint32_t, struct mbuf **,
 		    const struct sockaddr *);
 
 static void	vxlan_set_default_config(struct vxlan_softc *);
 static int	vxlan_set_user_config(struct vxlan_softc *,
 		     struct ifvxlanparam *);
 static int	vxlan_clone_create(struct if_clone *, int, caddr_t);
 static void	vxlan_clone_destroy(struct ifnet *);
 
 static uint32_t vxlan_mac_hash(struct vxlan_softc *, const uint8_t *);
 static int	vxlan_media_change(struct ifnet *);
 static void	vxlan_media_status(struct ifnet *, struct ifmediareq *);
 
 static int	vxlan_sockaddr_cmp(const union vxlan_sockaddr *,
 		    const struct sockaddr *);
 static void	vxlan_sockaddr_copy(union vxlan_sockaddr *,
 		    const struct sockaddr *);
 static int	vxlan_sockaddr_in_equal(const union vxlan_sockaddr *,
 		    const struct sockaddr *);
 static void	vxlan_sockaddr_in_copy(union vxlan_sockaddr *,
 		    const struct sockaddr *);
 static int	vxlan_sockaddr_supported(const union vxlan_sockaddr *, int);
 static int	vxlan_sockaddr_in_any(const union vxlan_sockaddr *);
 static int	vxlan_sockaddr_in_multicast(const union vxlan_sockaddr *);
 static int	vxlan_sockaddr_in6_embedscope(union vxlan_sockaddr *);
 
 static int	vxlan_can_change_config(struct vxlan_softc *);
 static int	vxlan_check_vni(uint32_t);
 static int	vxlan_check_ttl(int);
 static int	vxlan_check_ftable_timeout(uint32_t);
 static int	vxlan_check_ftable_max(uint32_t);
 
 static void	vxlan_sysctl_setup(struct vxlan_softc *);
 static void	vxlan_sysctl_destroy(struct vxlan_softc *);
 static int	vxlan_tunable_int(struct vxlan_softc *, const char *, int);
 
 static void	vxlan_ifdetach_event(void *, struct ifnet *);
 static void	vxlan_load(void);
 static void	vxlan_unload(void);
 static int	vxlan_modevent(module_t, int, void *);
 
 static const char vxlan_name[] = "vxlan";
 static MALLOC_DEFINE(M_VXLAN, vxlan_name,
     "Virtual eXtensible LAN Interface");
 static struct if_clone *vxlan_cloner;
 
 static struct mtx vxlan_list_mtx;
 #define VXLAN_LIST_LOCK()	mtx_lock(&vxlan_list_mtx)
 #define VXLAN_LIST_UNLOCK()	mtx_unlock(&vxlan_list_mtx)
 
 static LIST_HEAD(, vxlan_socket) vxlan_socket_list;
 
 static eventhandler_tag vxlan_ifdetach_event_tag;
 
 SYSCTL_DECL(_net_link);
 SYSCTL_NODE(_net_link, OID_AUTO, vxlan, CTLFLAG_RW, 0,
     "Virtual eXtensible Local Area Network");
 
 static int vxlan_legacy_port = 0;
 TUNABLE_INT("net.link.vxlan.legacy_port", &vxlan_legacy_port);
 static int vxlan_reuse_port = 0;
 TUNABLE_INT("net.link.vxlan.reuse_port", &vxlan_reuse_port);
 
 /* Default maximum number of addresses in the forwarding table. */
 #ifndef VXLAN_FTABLE_MAX
 #define VXLAN_FTABLE_MAX	2000
 #endif
 
 /* Timeout (in seconds) of addresses learned in the forwarding table. */
 #ifndef VXLAN_FTABLE_TIMEOUT
 #define VXLAN_FTABLE_TIMEOUT	(20 * 60)
 #endif
 
 /*
  * Maximum timeout (in seconds) of addresses learned in the forwarding
  * table.
  */
 #ifndef VXLAN_FTABLE_MAX_TIMEOUT
 #define VXLAN_FTABLE_MAX_TIMEOUT	(60 * 60 * 24)
 #endif
 
 /* Number of seconds between pruning attempts of the forwarding table. */
 #ifndef VXLAN_FTABLE_PRUNE
 #define VXLAN_FTABLE_PRUNE	(5 * 60)
 #endif
 
 static int vxlan_ftable_prune_period = VXLAN_FTABLE_PRUNE;
 
 struct vxlan_control {
 	int	(*vxlc_func)(struct vxlan_softc *, void *);
 	int	vxlc_argsize;
 	int	vxlc_flags;
 #define VXLAN_CTRL_FLAG_COPYIN	0x01
 #define VXLAN_CTRL_FLAG_COPYOUT	0x02
 #define VXLAN_CTRL_FLAG_SUSER	0x04
 };
 
 static const struct vxlan_control vxlan_control_table[] = {
 	[VXLAN_CMD_GET_CONFIG] =
 	    {	vxlan_ctrl_get_config, sizeof(struct ifvxlancfg),
 		VXLAN_CTRL_FLAG_COPYOUT
 	    },
 
 	[VXLAN_CMD_SET_VNI] =
 	    {   vxlan_ctrl_set_vni, sizeof(struct ifvxlancmd),
 		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	    },
 
 	[VXLAN_CMD_SET_LOCAL_ADDR] =
 	    {   vxlan_ctrl_set_local_addr, sizeof(struct ifvxlancmd),
 		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	    },
 
 	[VXLAN_CMD_SET_REMOTE_ADDR] =
 	    {   vxlan_ctrl_set_remote_addr, sizeof(struct ifvxlancmd),
 		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	    },
 
 	[VXLAN_CMD_SET_LOCAL_PORT] =
 	    {   vxlan_ctrl_set_local_port, sizeof(struct ifvxlancmd),
 		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	    },
 
 	[VXLAN_CMD_SET_REMOTE_PORT] =
 	    {   vxlan_ctrl_set_remote_port, sizeof(struct ifvxlancmd),
 		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	    },
 
 	[VXLAN_CMD_SET_PORT_RANGE] =
 	    {   vxlan_ctrl_set_port_range, sizeof(struct ifvxlancmd),
 		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	    },
 
 	[VXLAN_CMD_SET_FTABLE_TIMEOUT] =
 	    {	vxlan_ctrl_set_ftable_timeout, sizeof(struct ifvxlancmd),
 		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	    },
 
 	[VXLAN_CMD_SET_FTABLE_MAX] =
 	    {	vxlan_ctrl_set_ftable_max, sizeof(struct ifvxlancmd),
 		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	    },
 
 	[VXLAN_CMD_SET_MULTICAST_IF] =
 	    {	vxlan_ctrl_set_multicast_if, sizeof(struct ifvxlancmd),
 		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	    },
 
 	[VXLAN_CMD_SET_TTL] =
 	    {	vxlan_ctrl_set_ttl, sizeof(struct ifvxlancmd),
 		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	    },
 
 	[VXLAN_CMD_SET_LEARN] =
 	    {	vxlan_ctrl_set_learn, sizeof(struct ifvxlancmd),
 		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	    },
 
 	[VXLAN_CMD_FTABLE_ENTRY_ADD] =
 	    {	vxlan_ctrl_ftable_entry_add, sizeof(struct ifvxlancmd),
 		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	    },
 
 	[VXLAN_CMD_FTABLE_ENTRY_REM] =
 	    {	vxlan_ctrl_ftable_entry_rem, sizeof(struct ifvxlancmd),
 		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	    },
 
 	[VXLAN_CMD_FLUSH] =
 	    {   vxlan_ctrl_flush, sizeof(struct ifvxlancmd),
 		VXLAN_CTRL_FLAG_COPYIN | VXLAN_CTRL_FLAG_SUSER,
 	    },
 };
 
 static const int vxlan_control_table_size = nitems(vxlan_control_table);
 
 static int
 vxlan_ftable_addr_cmp(const uint8_t *a, const uint8_t *b)
 {
 	int i, d;
 
 	for (i = 0, d = 0; i < ETHER_ADDR_LEN && d == 0; i++)
 		d = ((int)a[i]) - ((int)b[i]);
 
 	return (d);
 }
 
 static void
 vxlan_ftable_init(struct vxlan_softc *sc)
 {
 	int i;
 
 	sc->vxl_ftable = malloc(sizeof(struct vxlan_ftable_head) *
 	    VXLAN_SC_FTABLE_SIZE, M_VXLAN, M_ZERO | M_WAITOK);
 
 	for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++)
 		LIST_INIT(&sc->vxl_ftable[i]);
 	sc->vxl_ftable_hash_key = arc4random();
 }
 
 static void
 vxlan_ftable_fini(struct vxlan_softc *sc)
 {
 	int i;
 
 	for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) {
 		KASSERT(LIST_EMPTY(&sc->vxl_ftable[i]),
 		    ("%s: vxlan %p ftable[%d] not empty", __func__, sc, i));
 	}
 	MPASS(sc->vxl_ftable_cnt == 0);
 
 	free(sc->vxl_ftable, M_VXLAN);
 	sc->vxl_ftable = NULL;
 }
 
 static void
 vxlan_ftable_flush(struct vxlan_softc *sc, int all)
 {
 	struct vxlan_ftable_entry *fe, *tfe;
 	int i;
 
 	for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) {
 		LIST_FOREACH_SAFE(fe, &sc->vxl_ftable[i], vxlfe_hash, tfe) {
 			if (all || VXLAN_FE_IS_DYNAMIC(fe))
 				vxlan_ftable_entry_destroy(sc, fe);
 		}
 	}
 }
 
 static void
 vxlan_ftable_expire(struct vxlan_softc *sc)
 {
 	struct vxlan_ftable_entry *fe, *tfe;
 	int i;
 
 	VXLAN_LOCK_WASSERT(sc);
 
 	for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) {
 		LIST_FOREACH_SAFE(fe, &sc->vxl_ftable[i], vxlfe_hash, tfe) {
 			if (VXLAN_FE_IS_DYNAMIC(fe) &&
 			    time_uptime >= fe->vxlfe_expire)
 				vxlan_ftable_entry_destroy(sc, fe);
 		}
 	}
 }
 
 static int
 vxlan_ftable_update_locked(struct vxlan_softc *sc,
     const union vxlan_sockaddr *vxlsa, const uint8_t *mac,
     struct rm_priotracker *tracker)
 {
 	struct vxlan_ftable_entry *fe;
 	int error __unused;
 
 	VXLAN_LOCK_ASSERT(sc);
 
 again:
 	/*
 	 * A forwarding entry for this MAC address might already exist. If
 	 * so, update it, otherwise create a new one. We may have to upgrade
 	 * the lock if we have to change or create an entry.
 	 */
 	fe = vxlan_ftable_entry_lookup(sc, mac);
 	if (fe != NULL) {
 		fe->vxlfe_expire = time_uptime + sc->vxl_ftable_timeout;
 
 		if (!VXLAN_FE_IS_DYNAMIC(fe) ||
 		    vxlan_sockaddr_in_equal(&fe->vxlfe_raddr, &vxlsa->sa))
 			return (0);
 		if (!VXLAN_LOCK_WOWNED(sc)) {
 			VXLAN_RUNLOCK(sc, tracker);
 			VXLAN_WLOCK(sc);
 			sc->vxl_stats.ftable_lock_upgrade_failed++;
 			goto again;
 		}
 		vxlan_sockaddr_in_copy(&fe->vxlfe_raddr, &vxlsa->sa);
 		return (0);
 	}
 
 	if (!VXLAN_LOCK_WOWNED(sc)) {
 		VXLAN_RUNLOCK(sc, tracker);
 		VXLAN_WLOCK(sc);
 		sc->vxl_stats.ftable_lock_upgrade_failed++;
 		goto again;
 	}
 
 	if (sc->vxl_ftable_cnt >= sc->vxl_ftable_max) {
 		sc->vxl_stats.ftable_nospace++;
 		return (ENOSPC);
 	}
 
 	fe = vxlan_ftable_entry_alloc();
 	if (fe == NULL)
 		return (ENOMEM);
 
 	vxlan_ftable_entry_init(sc, fe, mac, &vxlsa->sa, VXLAN_FE_FLAG_DYNAMIC);
 
 	/* The prior lookup failed, so the insert should not. */
 	error = vxlan_ftable_entry_insert(sc, fe);
 	MPASS(error == 0);
 
 	return (0);
 }
 
 static int
 vxlan_ftable_learn(struct vxlan_softc *sc, const struct sockaddr *sa,
     const uint8_t *mac)
 {
 	struct rm_priotracker tracker;
 	union vxlan_sockaddr vxlsa;
 	int error;
 
 	/*
 	 * The source port may be randomly selected by the remote host, so
 	 * use the port of the default destination address.
 	 */
 	vxlan_sockaddr_copy(&vxlsa, sa);
 	vxlsa.in4.sin_port = sc->vxl_dst_addr.in4.sin_port;
 
 	if (VXLAN_SOCKADDR_IS_IPV6(&vxlsa)) {
 		error = vxlan_sockaddr_in6_embedscope(&vxlsa);
 		if (error)
 			return (error);
 	}
 
 	VXLAN_RLOCK(sc, &tracker);
 	error = vxlan_ftable_update_locked(sc, &vxlsa, mac, &tracker);
 	VXLAN_UNLOCK(sc, &tracker);
 
 	return (error);
 }
 
 static int
 vxlan_ftable_sysctl_dump(SYSCTL_HANDLER_ARGS)
 {
 	struct rm_priotracker tracker;
 	struct sbuf sb;
 	struct vxlan_softc *sc;
 	struct vxlan_ftable_entry *fe;
 	size_t size;
 	int i, error;
 
 	/*
 	 * This is mostly intended for debugging during development. It is
 	 * not practical to dump an entire large table this way.
 	 */
 
 	sc = arg1;
 	size = PAGE_SIZE;	/* Calculate later. */
 
 	sbuf_new(&sb, NULL, size, SBUF_FIXEDLEN);
 	sbuf_putc(&sb, '\n');
 
 	VXLAN_RLOCK(sc, &tracker);
 	for (i = 0; i < VXLAN_SC_FTABLE_SIZE; i++) {
 		LIST_FOREACH(fe, &sc->vxl_ftable[i], vxlfe_hash) {
 			if (sbuf_error(&sb) != 0)
 				break;
 			vxlan_ftable_entry_dump(fe, &sb);
 		}
 	}
 	VXLAN_RUNLOCK(sc, &tracker);
 
 	if (sbuf_len(&sb) == 1)
 		sbuf_setpos(&sb, 0);
 
 	sbuf_finish(&sb);
 	error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
 	sbuf_delete(&sb);
 
 	return (error);
 }
 
 static struct vxlan_ftable_entry *
 vxlan_ftable_entry_alloc(void)
 {
 	struct vxlan_ftable_entry *fe;
 
 	fe = malloc(sizeof(*fe), M_VXLAN, M_ZERO | M_NOWAIT);
 
 	return (fe);
 }
 
 static void
 vxlan_ftable_entry_free(struct vxlan_ftable_entry *fe)
 {
 
 	free(fe, M_VXLAN);
 }
 
 static void
 vxlan_ftable_entry_init(struct vxlan_softc *sc, struct vxlan_ftable_entry *fe,
     const uint8_t *mac, const struct sockaddr *sa, uint32_t flags)
 {
 
 	fe->vxlfe_flags = flags;
 	fe->vxlfe_expire = time_uptime + sc->vxl_ftable_timeout;
 	memcpy(fe->vxlfe_mac, mac, ETHER_ADDR_LEN);
 	vxlan_sockaddr_copy(&fe->vxlfe_raddr, sa);
 }
 
 static void
 vxlan_ftable_entry_destroy(struct vxlan_softc *sc,
     struct vxlan_ftable_entry *fe)
 {
 
 	sc->vxl_ftable_cnt--;
 	LIST_REMOVE(fe, vxlfe_hash);
 	vxlan_ftable_entry_free(fe);
 }
 
 static int
 vxlan_ftable_entry_insert(struct vxlan_softc *sc,
     struct vxlan_ftable_entry *fe)
 {
 	struct vxlan_ftable_entry *lfe;
 	uint32_t hash;
 	int dir;
 
 	VXLAN_LOCK_WASSERT(sc);
 	hash = VXLAN_SC_FTABLE_HASH(sc, fe->vxlfe_mac);
 
 	lfe = LIST_FIRST(&sc->vxl_ftable[hash]);
 	if (lfe == NULL) {
 		LIST_INSERT_HEAD(&sc->vxl_ftable[hash], fe, vxlfe_hash);
 		goto out;
 	}
 
 	do {
 		dir = vxlan_ftable_addr_cmp(fe->vxlfe_mac, lfe->vxlfe_mac);
 		if (dir == 0)
 			return (EEXIST);
 		if (dir > 0) {
 			LIST_INSERT_BEFORE(lfe, fe, vxlfe_hash);
 			goto out;
 		} else if (LIST_NEXT(lfe, vxlfe_hash) == NULL) {
 			LIST_INSERT_AFTER(lfe, fe, vxlfe_hash);
 			goto out;
 		} else
 			lfe = LIST_NEXT(lfe, vxlfe_hash);
 	} while (lfe != NULL);
 
 out:
 	sc->vxl_ftable_cnt++;
 
 	return (0);
 }
 
 static struct vxlan_ftable_entry *
 vxlan_ftable_entry_lookup(struct vxlan_softc *sc, const uint8_t *mac)
 {
 	struct vxlan_ftable_entry *fe;
 	uint32_t hash;
 	int dir;
 
 	VXLAN_LOCK_ASSERT(sc);
 	hash = VXLAN_SC_FTABLE_HASH(sc, mac);
 
 	LIST_FOREACH(fe, &sc->vxl_ftable[hash], vxlfe_hash) {
 		dir = vxlan_ftable_addr_cmp(mac, fe->vxlfe_mac);
 		if (dir == 0)
 			return (fe);
 		if (dir > 0)
 			break;
 	}
 
 	return (NULL);
 }
 
 static void
 vxlan_ftable_entry_dump(struct vxlan_ftable_entry *fe, struct sbuf *sb)
 {
 	char buf[64];
 	const union vxlan_sockaddr *sa;
 	const void *addr;
 	int i, len, af, width;
 
 	sa = &fe->vxlfe_raddr;
 	af = sa->sa.sa_family;
 	len = sbuf_len(sb);
 
 	sbuf_printf(sb, "%c 0x%02X ", VXLAN_FE_IS_DYNAMIC(fe) ? 'D' : 'S',
 	    fe->vxlfe_flags);
 
 	for (i = 0; i < ETHER_ADDR_LEN - 1; i++)
 		sbuf_printf(sb, "%02X:", fe->vxlfe_mac[i]);
 	sbuf_printf(sb, "%02X ", fe->vxlfe_mac[i]);
 
 	if (af == AF_INET) {
 		addr = &sa->in4.sin_addr;
 		width = INET_ADDRSTRLEN - 1;
 	} else {
 		addr = &sa->in6.sin6_addr;
 		width = INET6_ADDRSTRLEN - 1;
 	}
 	inet_ntop(af, addr, buf, sizeof(buf));
 	sbuf_printf(sb, "%*s ", width, buf);
 
 	sbuf_printf(sb, "%08jd", (intmax_t)fe->vxlfe_expire);
 
 	sbuf_putc(sb, '\n');
 
 	/* Truncate a partial line. */
 	if (sbuf_error(sb) != 0)
 		sbuf_setpos(sb, len);
 }
 
 static struct vxlan_socket *
 vxlan_socket_alloc(const union vxlan_sockaddr *sa)
 {
 	struct vxlan_socket *vso;
 	int i;
 
 	vso = malloc(sizeof(*vso), M_VXLAN, M_WAITOK | M_ZERO);
 	rm_init(&vso->vxlso_lock, "vxlansorm");
 	refcount_init(&vso->vxlso_refcnt, 0);
 	for (i = 0; i < VXLAN_SO_VNI_HASH_SIZE; i++)
 		LIST_INIT(&vso->vxlso_vni_hash[i]);
 	vso->vxlso_laddr = *sa;
 
 	return (vso);
 }
 
 static void
 vxlan_socket_destroy(struct vxlan_socket *vso)
 {
 	struct socket *so;
 #ifdef INVARIANTS
 	int i;
 	struct vxlan_socket_mc_info *mc;
 
 	for (i = 0; i < VXLAN_SO_MC_MAX_GROUPS; i++) {
 		mc = &vso->vxlso_mc[i];
 		KASSERT(mc->vxlsomc_gaddr.sa.sa_family == AF_UNSPEC,
 		    ("%s: socket %p mc[%d] still has address",
 		     __func__, vso, i));
 	}
 
 	for (i = 0; i < VXLAN_SO_VNI_HASH_SIZE; i++) {
 		KASSERT(LIST_EMPTY(&vso->vxlso_vni_hash[i]),
 		    ("%s: socket %p vni_hash[%d] not empty",
 		     __func__, vso, i));
 	}
 #endif
 	so = vso->vxlso_sock;
 	if (so != NULL) {
 		vso->vxlso_sock = NULL;
 		soclose(so);
 	}
 
 	rm_destroy(&vso->vxlso_lock);
 	free(vso, M_VXLAN);
 }
 
 static void
 vxlan_socket_release(struct vxlan_socket *vso)
 {
 	int destroy;
 
 	VXLAN_LIST_LOCK();
 	destroy = VXLAN_SO_RELEASE(vso);
 	if (destroy != 0)
 		LIST_REMOVE(vso, vxlso_entry);
 	VXLAN_LIST_UNLOCK();
 
 	if (destroy != 0)
 		vxlan_socket_destroy(vso);
 }
 
 static struct vxlan_socket *
 vxlan_socket_lookup(union vxlan_sockaddr *vxlsa)
 {
 	struct vxlan_socket *vso;
 
 	VXLAN_LIST_LOCK();
 	LIST_FOREACH(vso, &vxlan_socket_list, vxlso_entry) {
 		if (vxlan_sockaddr_cmp(&vso->vxlso_laddr, &vxlsa->sa) == 0) {
 			VXLAN_SO_ACQUIRE(vso);
 			break;
 		}
 	}
 	VXLAN_LIST_UNLOCK();
 
 	return (vso);
 }
 
 static void
 vxlan_socket_insert(struct vxlan_socket *vso)
 {
 
 	VXLAN_LIST_LOCK();
 	VXLAN_SO_ACQUIRE(vso);
 	LIST_INSERT_HEAD(&vxlan_socket_list, vso, vxlso_entry);
 	VXLAN_LIST_UNLOCK();
 }
 
 static int
 vxlan_socket_init(struct vxlan_socket *vso, struct ifnet *ifp)
 {
 	struct thread *td;
 	int error;
 
 	td = curthread;
 
 	error = socreate(vso->vxlso_laddr.sa.sa_family, &vso->vxlso_sock,
 	    SOCK_DGRAM, IPPROTO_UDP, td->td_ucred, td);
 	if (error) {
 		if_printf(ifp, "cannot create socket: %d\n", error);
 		return (error);
 	}
 
 	error = udp_set_kernel_tunneling(vso->vxlso_sock,
 	    vxlan_rcv_udp_packet, NULL, vso);
 	if (error) {
 		if_printf(ifp, "cannot set tunneling function: %d\n", error);
 		return (error);
 	}
 
 	if (vxlan_reuse_port != 0) {
 		struct sockopt sopt;
 		int val = 1;
 
 		bzero(&sopt, sizeof(sopt));
 		sopt.sopt_dir = SOPT_SET;
 		sopt.sopt_level = IPPROTO_IP;
 		sopt.sopt_name = SO_REUSEPORT;
 		sopt.sopt_val = &val;
 		sopt.sopt_valsize = sizeof(val);
 		error = sosetopt(vso->vxlso_sock, &sopt);
 		if (error) {
 			if_printf(ifp,
 			    "cannot set REUSEADDR socket opt: %d\n", error);
 			return (error);
 		}
 	}
 
 	return (0);
 }
 
 static int
 vxlan_socket_bind(struct vxlan_socket *vso, struct ifnet *ifp)
 {
 	union vxlan_sockaddr laddr;
 	struct thread *td;
 	int error;
 
 	td = curthread;
 	laddr = vso->vxlso_laddr;
 
 	error = sobind(vso->vxlso_sock, &laddr.sa, td);
 	if (error) {
 		if (error != EADDRINUSE)
 			if_printf(ifp, "cannot bind socket: %d\n", error);
 		return (error);
 	}
 
 	return (0);
 }
 
 static int
 vxlan_socket_create(struct ifnet *ifp, int multicast,
     const union vxlan_sockaddr *saddr, struct vxlan_socket **vsop)
 {
 	union vxlan_sockaddr laddr;
 	struct vxlan_socket *vso;
 	int error;
 
 	laddr = *saddr;
 
 	/*
 	 * If this socket will be multicast, then only the local port
 	 * must be specified when binding.
 	 */
 	if (multicast != 0) {
 		if (VXLAN_SOCKADDR_IS_IPV4(&laddr))
 			laddr.in4.sin_addr.s_addr = INADDR_ANY;
 #ifdef INET6
 		else
 			laddr.in6.sin6_addr = in6addr_any;
 #endif
 	}
 
 	vso = vxlan_socket_alloc(&laddr);
 	if (vso == NULL)
 		return (ENOMEM);
 
 	error = vxlan_socket_init(vso, ifp);
 	if (error)
 		goto fail;
 
 	error = vxlan_socket_bind(vso, ifp);
 	if (error)
 		goto fail;
 
 	/*
 	 * There is a small window between the bind completing and
 	 * inserting the socket, so that a concurrent create may fail.
 	 * Let's not worry about that for now.
 	 */
 	vxlan_socket_insert(vso);
 	*vsop = vso;
 
 	return (0);
 
 fail:
 	vxlan_socket_destroy(vso);
 
 	return (error);
 }
 
 static void
 vxlan_socket_ifdetach(struct vxlan_socket *vso, struct ifnet *ifp,
     struct vxlan_softc_head *list)
 {
 	struct rm_priotracker tracker;
 	struct vxlan_softc *sc;
 	int i;
 
 	VXLAN_SO_RLOCK(vso, &tracker);
 	for (i = 0; i < VXLAN_SO_VNI_HASH_SIZE; i++) {
 		LIST_FOREACH(sc, &vso->vxlso_vni_hash[i], vxl_entry)
 			vxlan_ifdetach(sc, ifp, list);
 	}
 	VXLAN_SO_RUNLOCK(vso, &tracker);
 }
 
 static struct vxlan_socket *
 vxlan_socket_mc_lookup(const union vxlan_sockaddr *vxlsa)
 {
 	union vxlan_sockaddr laddr;
 	struct vxlan_socket *vso;
 
 	laddr = *vxlsa;
 
 	if (VXLAN_SOCKADDR_IS_IPV4(&laddr))
 		laddr.in4.sin_addr.s_addr = INADDR_ANY;
 #ifdef INET6
 	else
 		laddr.in6.sin6_addr = in6addr_any;
 #endif
 
 	vso = vxlan_socket_lookup(&laddr);
 
 	return (vso);
 }
 
 static int
 vxlan_sockaddr_mc_info_match(const struct vxlan_socket_mc_info *mc,
     const union vxlan_sockaddr *group, const union vxlan_sockaddr *local,
     int ifidx)
 {
 
 	if (!vxlan_sockaddr_in_any(local) &&
 	    !vxlan_sockaddr_in_equal(&mc->vxlsomc_saddr, &local->sa))
 		return (0);
 	if (!vxlan_sockaddr_in_equal(&mc->vxlsomc_gaddr, &group->sa))
 		return (0);
 	if (ifidx != 0 && ifidx != mc->vxlsomc_ifidx)
 		return (0);
 
 	return (1);
 }
 
 static int
 vxlan_socket_mc_join_group(struct vxlan_socket *vso,
     const union vxlan_sockaddr *group, const union vxlan_sockaddr *local,
     int *ifidx, union vxlan_sockaddr *source)
 {
 	struct sockopt sopt;
 	int error;
 
 	*source = *local;
 
 	if (VXLAN_SOCKADDR_IS_IPV4(group)) {
 		struct ip_mreq mreq;
 
 		mreq.imr_multiaddr = group->in4.sin_addr;
 		mreq.imr_interface = local->in4.sin_addr;
 
 		bzero(&sopt, sizeof(sopt));
 		sopt.sopt_dir = SOPT_SET;
 		sopt.sopt_level = IPPROTO_IP;
 		sopt.sopt_name = IP_ADD_MEMBERSHIP;
 		sopt.sopt_val = &mreq;
 		sopt.sopt_valsize = sizeof(mreq);
 		error = sosetopt(vso->vxlso_sock, &sopt);
 		if (error)
 			return (error);
 
 		/*
 		 * BMV: Ideally, there would be a formal way for us to get
 		 * the local interface that was selected based on the
 		 * imr_interface address. We could then update *ifidx so
 		 * vxlan_sockaddr_mc_info_match() would return a match for
 		 * later creates that explicitly set the multicast interface.
 		 *
 		 * If we really need to, we can of course look in the INP's
 		 * membership list:
 		 *     sotoinpcb(vso->vxlso_sock)->inp_moptions->
-		 *         imo_membership[]->inm_ifp
+		 *         imo_head[]->imf_inm->inm_ifp
 		 * similarly to imo_match_group().
 		 */
 		source->in4.sin_addr = local->in4.sin_addr;
 
 	} else if (VXLAN_SOCKADDR_IS_IPV6(group)) {
 		struct ipv6_mreq mreq;
 
 		mreq.ipv6mr_multiaddr = group->in6.sin6_addr;
 		mreq.ipv6mr_interface = *ifidx;
 
 		bzero(&sopt, sizeof(sopt));
 		sopt.sopt_dir = SOPT_SET;
 		sopt.sopt_level = IPPROTO_IPV6;
 		sopt.sopt_name = IPV6_JOIN_GROUP;
 		sopt.sopt_val = &mreq;
 		sopt.sopt_valsize = sizeof(mreq);
 		error = sosetopt(vso->vxlso_sock, &sopt);
 		if (error)
 			return (error);
 
 		/*
 		 * BMV: As with IPv4, we would really like to know what
 		 * interface in6p_lookup_mcast_ifp() selected.
 		 */
 	} else
 		error = EAFNOSUPPORT;
 
 	return (error);
 }
 
 static int
 vxlan_socket_mc_leave_group(struct vxlan_socket *vso,
     const union vxlan_sockaddr *group, const union vxlan_sockaddr *source,
     int ifidx)
 {
 	struct sockopt sopt;
 	int error;
 
 	bzero(&sopt, sizeof(sopt));
 	sopt.sopt_dir = SOPT_SET;
 
 	if (VXLAN_SOCKADDR_IS_IPV4(group)) {
 		struct ip_mreq mreq;
 
 		mreq.imr_multiaddr = group->in4.sin_addr;
 		mreq.imr_interface = source->in4.sin_addr;
 
 		sopt.sopt_level = IPPROTO_IP;
 		sopt.sopt_name = IP_DROP_MEMBERSHIP;
 		sopt.sopt_val = &mreq;
 		sopt.sopt_valsize = sizeof(mreq);
 		error = sosetopt(vso->vxlso_sock, &sopt);
 
 	} else if (VXLAN_SOCKADDR_IS_IPV6(group)) {
 		struct ipv6_mreq mreq;
 
 		mreq.ipv6mr_multiaddr = group->in6.sin6_addr;
 		mreq.ipv6mr_interface = ifidx;
 
 		sopt.sopt_level = IPPROTO_IPV6;
 		sopt.sopt_name = IPV6_LEAVE_GROUP;
 		sopt.sopt_val = &mreq;
 		sopt.sopt_valsize = sizeof(mreq);
 		error = sosetopt(vso->vxlso_sock, &sopt);
 
 	} else
 		error = EAFNOSUPPORT;
 
 	return (error);
 }
 
 static int
 vxlan_socket_mc_add_group(struct vxlan_socket *vso,
     const union vxlan_sockaddr *group, const union vxlan_sockaddr *local,
     int ifidx, int *idx)
 {
 	union vxlan_sockaddr source;
 	struct vxlan_socket_mc_info *mc;
 	int i, empty, error;
 
 	/*
 	 * Within a socket, the same multicast group may be used by multiple
 	 * interfaces, each with a different network identifier. But a socket
 	 * may only join a multicast group once, so keep track of the users
 	 * here.
 	 */
 
 	VXLAN_SO_WLOCK(vso);
 	for (empty = 0, i = 0; i < VXLAN_SO_MC_MAX_GROUPS; i++) {
 		mc = &vso->vxlso_mc[i];
 
 		if (mc->vxlsomc_gaddr.sa.sa_family == AF_UNSPEC) {
 			empty++;
 			continue;
 		}
 
 		if (vxlan_sockaddr_mc_info_match(mc, group, local, ifidx))
 			goto out;
 	}
 	VXLAN_SO_WUNLOCK(vso);
 
 	if (empty == 0)
 		return (ENOSPC);
 
 	error = vxlan_socket_mc_join_group(vso, group, local, &ifidx, &source);
 	if (error)
 		return (error);
 
 	VXLAN_SO_WLOCK(vso);
 	for (i = 0; i < VXLAN_SO_MC_MAX_GROUPS; i++) {
 		mc = &vso->vxlso_mc[i];
 
 		if (mc->vxlsomc_gaddr.sa.sa_family == AF_UNSPEC) {
 			vxlan_sockaddr_copy(&mc->vxlsomc_gaddr, &group->sa);
 			vxlan_sockaddr_copy(&mc->vxlsomc_saddr, &source.sa);
 			mc->vxlsomc_ifidx = ifidx;
 			goto out;
 		}
 	}
 	VXLAN_SO_WUNLOCK(vso);
 
 	error = vxlan_socket_mc_leave_group(vso, group, &source, ifidx);
 	MPASS(error == 0);
 
 	return (ENOSPC);
 
 out:
 	mc->vxlsomc_users++;
 	VXLAN_SO_WUNLOCK(vso);
 
 	*idx = i;
 
 	return (0);
 }
 
 static void
 vxlan_socket_mc_release_group_by_idx(struct vxlan_socket *vso, int idx)
 {
 	union vxlan_sockaddr group, source;
 	struct vxlan_socket_mc_info *mc;
 	int ifidx, leave;
 
 	KASSERT(idx >= 0 && idx < VXLAN_SO_MC_MAX_GROUPS,
 	    ("%s: vso %p idx %d out of bounds", __func__, vso, idx));
 
 	leave = 0;
 	mc = &vso->vxlso_mc[idx];
 
 	VXLAN_SO_WLOCK(vso);
 	mc->vxlsomc_users--;
 	if (mc->vxlsomc_users == 0) {
 		group = mc->vxlsomc_gaddr;
 		source = mc->vxlsomc_saddr;
 		ifidx = mc->vxlsomc_ifidx;
 		bzero(mc, sizeof(*mc));
 		leave = 1;
 	}
 	VXLAN_SO_WUNLOCK(vso);
 
 	if (leave != 0) {
 		/*
 		 * Our socket's membership in this group may have already
 		 * been removed if we joined through an interface that's
 		 * been detached.
 		 */
 		vxlan_socket_mc_leave_group(vso, &group, &source, ifidx);
 	}
 }
 
 static struct vxlan_softc *
 vxlan_socket_lookup_softc_locked(struct vxlan_socket *vso, uint32_t vni)
 {
 	struct vxlan_softc *sc;
 	uint32_t hash;
 
 	VXLAN_SO_LOCK_ASSERT(vso);
 	hash = VXLAN_SO_VNI_HASH(vni);
 
 	LIST_FOREACH(sc, &vso->vxlso_vni_hash[hash], vxl_entry) {
 		if (sc->vxl_vni == vni) {
 			VXLAN_ACQUIRE(sc);
 			break;
 		}
 	}
 
 	return (sc);
 }
 
 static struct vxlan_softc *
 vxlan_socket_lookup_softc(struct vxlan_socket *vso, uint32_t vni)
 {
 	struct rm_priotracker tracker;
 	struct vxlan_softc *sc;
 
 	VXLAN_SO_RLOCK(vso, &tracker);
 	sc = vxlan_socket_lookup_softc_locked(vso, vni);
 	VXLAN_SO_RUNLOCK(vso, &tracker);
 
 	return (sc);
 }
 
 static int
 vxlan_socket_insert_softc(struct vxlan_socket *vso, struct vxlan_softc *sc)
 {
 	struct vxlan_softc *tsc;
 	uint32_t vni, hash;
 
 	vni = sc->vxl_vni;
 	hash = VXLAN_SO_VNI_HASH(vni);
 
 	VXLAN_SO_WLOCK(vso);
 	tsc = vxlan_socket_lookup_softc_locked(vso, vni);
 	if (tsc != NULL) {
 		VXLAN_SO_WUNLOCK(vso);
 		vxlan_release(tsc);
 		return (EEXIST);
 	}
 
 	VXLAN_ACQUIRE(sc);
 	LIST_INSERT_HEAD(&vso->vxlso_vni_hash[hash], sc, vxl_entry);
 	VXLAN_SO_WUNLOCK(vso);
 
 	return (0);
 }
 
 static void
 vxlan_socket_remove_softc(struct vxlan_socket *vso, struct vxlan_softc *sc)
 {
 
 	VXLAN_SO_WLOCK(vso);
 	LIST_REMOVE(sc, vxl_entry);
 	VXLAN_SO_WUNLOCK(vso);
 
 	vxlan_release(sc);
 }
 
 static struct ifnet *
 vxlan_multicast_if_ref(struct vxlan_softc *sc, int ipv4)
 {
 	struct ifnet *ifp;
 
 	VXLAN_LOCK_ASSERT(sc);
 
 	if (ipv4 && sc->vxl_im4o != NULL)
 		ifp = sc->vxl_im4o->imo_multicast_ifp;
 	else if (!ipv4 && sc->vxl_im6o != NULL)
 		ifp = sc->vxl_im6o->im6o_multicast_ifp;
 	else
 		ifp = NULL;
 
 	if (ifp != NULL)
 		if_ref(ifp);
 
 	return (ifp);
 }
 
 static void
 vxlan_free_multicast(struct vxlan_softc *sc)
 {
 
 	if (sc->vxl_mc_ifp != NULL) {
 		if_rele(sc->vxl_mc_ifp);
 		sc->vxl_mc_ifp = NULL;
 		sc->vxl_mc_ifindex = 0;
 	}
 
 	if (sc->vxl_im4o != NULL) {
 		free(sc->vxl_im4o, M_VXLAN);
 		sc->vxl_im4o = NULL;
 	}
 
 	if (sc->vxl_im6o != NULL) {
 		free(sc->vxl_im6o, M_VXLAN);
 		sc->vxl_im6o = NULL;
 	}
 }
 
 static int
 vxlan_setup_multicast_interface(struct vxlan_softc *sc)
 {
 	struct ifnet *ifp;
 
 	ifp = ifunit_ref(sc->vxl_mc_ifname);
 	if (ifp == NULL) {
 		if_printf(sc->vxl_ifp, "multicast interface %s does "
 		    "not exist\n", sc->vxl_mc_ifname);
 		return (ENOENT);
 	}
 
 	if ((ifp->if_flags & IFF_MULTICAST) == 0) {
 		if_printf(sc->vxl_ifp, "interface %s does not support "
 		     "multicast\n", sc->vxl_mc_ifname);
 		if_rele(ifp);
 		return (ENOTSUP);
 	}
 
 	sc->vxl_mc_ifp = ifp;
 	sc->vxl_mc_ifindex = ifp->if_index;
 
 	return (0);
 }
 
 static int
 vxlan_setup_multicast(struct vxlan_softc *sc)
 {
 	const union vxlan_sockaddr *group;
 	int error;
 
 	group = &sc->vxl_dst_addr;
 	error = 0;
 
 	if (sc->vxl_mc_ifname[0] != '\0') {
 		error = vxlan_setup_multicast_interface(sc);
 		if (error)
 			return (error);
 	}
 
 	/*
 	 * Initialize an multicast options structure that is sufficiently
 	 * populated for use in the respective IP output routine. This
 	 * structure is typically stored in the socket, but our sockets
 	 * may be shared among multiple interfaces.
 	 */
 	if (VXLAN_SOCKADDR_IS_IPV4(group)) {
 		sc->vxl_im4o = malloc(sizeof(struct ip_moptions), M_VXLAN,
 		    M_ZERO | M_WAITOK);
 		sc->vxl_im4o->imo_multicast_ifp = sc->vxl_mc_ifp;
 		sc->vxl_im4o->imo_multicast_ttl = sc->vxl_ttl;
 		sc->vxl_im4o->imo_multicast_vif = -1;
 	} else if (VXLAN_SOCKADDR_IS_IPV6(group)) {
 		sc->vxl_im6o = malloc(sizeof(struct ip6_moptions), M_VXLAN,
 		    M_ZERO | M_WAITOK);
 		sc->vxl_im6o->im6o_multicast_ifp = sc->vxl_mc_ifp;
 		sc->vxl_im6o->im6o_multicast_hlim = sc->vxl_ttl;
 	}
 
 	return (error);
 }
 
 static int
 vxlan_setup_socket(struct vxlan_softc *sc)
 {
 	struct vxlan_socket *vso;
 	struct ifnet *ifp;
 	union vxlan_sockaddr *saddr, *daddr;
 	int multicast, error;
 
 	vso = NULL;
 	ifp = sc->vxl_ifp;
 	saddr = &sc->vxl_src_addr;
 	daddr = &sc->vxl_dst_addr;
 
 	multicast = vxlan_sockaddr_in_multicast(daddr);
 	MPASS(multicast != -1);
 	sc->vxl_vso_mc_index = -1;
 
 	/*
 	 * Try to create the socket. If that fails, attempt to use an
 	 * existing socket.
 	 */
 	error = vxlan_socket_create(ifp, multicast, saddr, &vso);
 	if (error) {
 		if (multicast != 0)
 			vso = vxlan_socket_mc_lookup(saddr);
 		else
 			vso = vxlan_socket_lookup(saddr);
 
 		if (vso == NULL) {
 			if_printf(ifp, "cannot create socket (error: %d), "
 			    "and no existing socket found\n", error);
 			goto out;
 		}
 	}
 
 	if (multicast != 0) {
 		error = vxlan_setup_multicast(sc);
 		if (error)
 			goto out;
 
 		error = vxlan_socket_mc_add_group(vso, daddr, saddr,
 		    sc->vxl_mc_ifindex, &sc->vxl_vso_mc_index);
 		if (error)
 			goto out;
 	}
 
 	sc->vxl_sock = vso;
 	error = vxlan_socket_insert_softc(vso, sc);
 	if (error) {
 		sc->vxl_sock = NULL;
 		if_printf(ifp, "network identifier %d already exists in "
 		    "this socket\n", sc->vxl_vni);
 		goto out;
 	}
 
 	return (0);
 
 out:
 	if (vso != NULL) {
 		if (sc->vxl_vso_mc_index != -1) {
 			vxlan_socket_mc_release_group_by_idx(vso,
 			    sc->vxl_vso_mc_index);
 			sc->vxl_vso_mc_index = -1;
 		}
 		if (multicast != 0)
 			vxlan_free_multicast(sc);
 		vxlan_socket_release(vso);
 	}
 
 	return (error);
 }
 
 static void
 vxlan_setup_interface(struct vxlan_softc *sc)
 {
 	struct ifnet *ifp;
 
 	ifp = sc->vxl_ifp;
 	ifp->if_hdrlen = ETHER_HDR_LEN + sizeof(struct vxlanudphdr);
 
 	if (VXLAN_SOCKADDR_IS_IPV4(&sc->vxl_dst_addr) != 0)
 		ifp->if_hdrlen += sizeof(struct ip);
 	else if (VXLAN_SOCKADDR_IS_IPV6(&sc->vxl_dst_addr) != 0)
 		ifp->if_hdrlen += sizeof(struct ip6_hdr);
 }
 
 static int
 vxlan_valid_init_config(struct vxlan_softc *sc)
 {
 	const char *reason;
 
 	if (vxlan_check_vni(sc->vxl_vni) != 0) {
 		reason = "invalid virtual network identifier specified";
 		goto fail;
 	}
 
 	if (vxlan_sockaddr_supported(&sc->vxl_src_addr, 1) == 0) {
 		reason = "source address type is not supported";
 		goto fail;
 	}
 
 	if (vxlan_sockaddr_supported(&sc->vxl_dst_addr, 0) == 0) {
 		reason = "destination address type is not supported";
 		goto fail;
 	}
 
 	if (vxlan_sockaddr_in_any(&sc->vxl_dst_addr) != 0) {
 		reason = "no valid destination address specified";
 		goto fail;
 	}
 
 	if (vxlan_sockaddr_in_multicast(&sc->vxl_dst_addr) == 0 &&
 	    sc->vxl_mc_ifname[0] != '\0') {
 		reason = "can only specify interface with a group address";
 		goto fail;
 	}
 
 	if (vxlan_sockaddr_in_any(&sc->vxl_src_addr) == 0) {
 		if (VXLAN_SOCKADDR_IS_IPV4(&sc->vxl_src_addr) ^
 		    VXLAN_SOCKADDR_IS_IPV4(&sc->vxl_dst_addr)) {
 			reason = "source and destination address must both "
 			    "be either IPv4 or IPv6";
 			goto fail;
 		}
 	}
 
 	if (sc->vxl_src_addr.in4.sin_port == 0) {
 		reason = "local port not specified";
 		goto fail;
 	}
 
 	if (sc->vxl_dst_addr.in4.sin_port == 0) {
 		reason = "remote port not specified";
 		goto fail;
 	}
 
 	return (0);
 
 fail:
 	if_printf(sc->vxl_ifp, "cannot initialize interface: %s\n", reason);
 	return (EINVAL);
 }
 
 static void
 vxlan_init_wait(struct vxlan_softc *sc)
 {
 
 	VXLAN_LOCK_WASSERT(sc);
 	while (sc->vxl_flags & VXLAN_FLAG_INIT)
 		rm_sleep(sc, &sc->vxl_lock, 0, "vxlint", hz);
 }
 
 static void
 vxlan_init_complete(struct vxlan_softc *sc)
 {
 
 	VXLAN_WLOCK(sc);
 	sc->vxl_flags &= ~VXLAN_FLAG_INIT;
 	wakeup(sc);
 	VXLAN_WUNLOCK(sc);
 }
 
 static void
 vxlan_init(void *xsc)
 {
 	static const uint8_t empty_mac[ETHER_ADDR_LEN];
 	struct vxlan_softc *sc;
 	struct ifnet *ifp;
 
 	sc = xsc;
 	ifp = sc->vxl_ifp;
 
 	VXLAN_WLOCK(sc);
 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 		VXLAN_WUNLOCK(sc);
 		return;
 	}
 	sc->vxl_flags |= VXLAN_FLAG_INIT;
 	VXLAN_WUNLOCK(sc);
 
 	if (vxlan_valid_init_config(sc) != 0)
 		goto out;
 
 	vxlan_setup_interface(sc);
 
 	if (vxlan_setup_socket(sc) != 0)
 		goto out;
 
 	/* Initialize the default forwarding entry. */
 	vxlan_ftable_entry_init(sc, &sc->vxl_default_fe, empty_mac,
 	    &sc->vxl_dst_addr.sa, VXLAN_FE_FLAG_STATIC);
 
 	VXLAN_WLOCK(sc);
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	callout_reset(&sc->vxl_callout, vxlan_ftable_prune_period * hz,
 	    vxlan_timer, sc);
 	VXLAN_WUNLOCK(sc);
 
 	if_link_state_change(ifp, LINK_STATE_UP);
 out:
 	vxlan_init_complete(sc);
 }
 
 static void
 vxlan_release(struct vxlan_softc *sc)
 {
 
 	/*
 	 * The softc may be destroyed as soon as we release our reference,
 	 * so we cannot serialize the wakeup with the softc lock. We use a
 	 * timeout in our sleeps so a missed wakeup is unfortunate but not
 	 * fatal.
 	 */
 	if (VXLAN_RELEASE(sc) != 0)
 		wakeup(sc);
 }
 
 static void
 vxlan_teardown_wait(struct vxlan_softc *sc)
 {
 
 	VXLAN_LOCK_WASSERT(sc);
 	while (sc->vxl_flags & VXLAN_FLAG_TEARDOWN)
 		rm_sleep(sc, &sc->vxl_lock, 0, "vxltrn", hz);
 }
 
 static void
 vxlan_teardown_complete(struct vxlan_softc *sc)
 {
 
 	VXLAN_WLOCK(sc);
 	sc->vxl_flags &= ~VXLAN_FLAG_TEARDOWN;
 	wakeup(sc);
 	VXLAN_WUNLOCK(sc);
 }
 
 static void
 vxlan_teardown_locked(struct vxlan_softc *sc)
 {
 	struct ifnet *ifp;
 	struct vxlan_socket *vso;
 
 	ifp = sc->vxl_ifp;
 
 	VXLAN_LOCK_WASSERT(sc);
 	MPASS(sc->vxl_flags & VXLAN_FLAG_TEARDOWN);
 
 	ifp->if_flags &= ~IFF_UP;
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 	callout_stop(&sc->vxl_callout);
 	vso = sc->vxl_sock;
 	sc->vxl_sock = NULL;
 
 	VXLAN_WUNLOCK(sc);
 	if_link_state_change(ifp, LINK_STATE_DOWN);
 
 	if (vso != NULL) {
 		vxlan_socket_remove_softc(vso, sc);
 
 		if (sc->vxl_vso_mc_index != -1) {
 			vxlan_socket_mc_release_group_by_idx(vso,
 			    sc->vxl_vso_mc_index);
 			sc->vxl_vso_mc_index = -1;
 		}
 	}
 
 	VXLAN_WLOCK(sc);
 	while (sc->vxl_refcnt != 0)
 		rm_sleep(sc, &sc->vxl_lock, 0, "vxldrn", hz);
 	VXLAN_WUNLOCK(sc);
 
 	callout_drain(&sc->vxl_callout);
 
 	vxlan_free_multicast(sc);
 	if (vso != NULL)
 		vxlan_socket_release(vso);
 
 	vxlan_teardown_complete(sc);
 }
 
 static void
 vxlan_teardown(struct vxlan_softc *sc)
 {
 
 	VXLAN_WLOCK(sc);
 	if (sc->vxl_flags & VXLAN_FLAG_TEARDOWN) {
 		vxlan_teardown_wait(sc);
 		VXLAN_WUNLOCK(sc);
 		return;
 	}
 
 	sc->vxl_flags |= VXLAN_FLAG_TEARDOWN;
 	vxlan_teardown_locked(sc);
 }
 
 static void
 vxlan_ifdetach(struct vxlan_softc *sc, struct ifnet *ifp,
     struct vxlan_softc_head *list)
 {
 
 	VXLAN_WLOCK(sc);
 
 	if (sc->vxl_mc_ifp != ifp)
 		goto out;
 	if (sc->vxl_flags & VXLAN_FLAG_TEARDOWN)
 		goto out;
 
 	sc->vxl_flags |= VXLAN_FLAG_TEARDOWN;
 	LIST_INSERT_HEAD(list, sc, vxl_ifdetach_list);
 
 out:
 	VXLAN_WUNLOCK(sc);
 }
 
 static void
 vxlan_timer(void *xsc)
 {
 	struct vxlan_softc *sc;
 
 	sc = xsc;
 	VXLAN_LOCK_WASSERT(sc);
 
 	vxlan_ftable_expire(sc);
 	callout_schedule(&sc->vxl_callout, vxlan_ftable_prune_period * hz);
 }
 
 static int
 vxlan_ioctl_ifflags(struct vxlan_softc *sc)
 {
 	struct ifnet *ifp;
 
 	ifp = sc->vxl_ifp;
 
 	if (ifp->if_flags & IFF_UP) {
 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 			vxlan_init(sc);
 	} else {
 		if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 			vxlan_teardown(sc);
 	}
 
 	return (0);
 }
 
 static int
 vxlan_ctrl_get_config(struct vxlan_softc *sc, void *arg)
 {
 	struct rm_priotracker tracker;
 	struct ifvxlancfg *cfg;
 
 	cfg = arg;
 	bzero(cfg, sizeof(*cfg));
 
 	VXLAN_RLOCK(sc, &tracker);
 	cfg->vxlc_vni = sc->vxl_vni;
 	memcpy(&cfg->vxlc_local_sa, &sc->vxl_src_addr,
 	    sizeof(union vxlan_sockaddr));
 	memcpy(&cfg->vxlc_remote_sa, &sc->vxl_dst_addr,
 	    sizeof(union vxlan_sockaddr));
 	cfg->vxlc_mc_ifindex = sc->vxl_mc_ifindex;
 	cfg->vxlc_ftable_cnt = sc->vxl_ftable_cnt;
 	cfg->vxlc_ftable_max = sc->vxl_ftable_max;
 	cfg->vxlc_ftable_timeout = sc->vxl_ftable_timeout;
 	cfg->vxlc_port_min = sc->vxl_min_port;
 	cfg->vxlc_port_max = sc->vxl_max_port;
 	cfg->vxlc_learn = (sc->vxl_flags & VXLAN_FLAG_LEARN) != 0;
 	cfg->vxlc_ttl = sc->vxl_ttl;
 	VXLAN_RUNLOCK(sc, &tracker);
 
 #ifdef INET6
 	if (VXLAN_SOCKADDR_IS_IPV6(&cfg->vxlc_local_sa))
 		sa6_recoverscope(&cfg->vxlc_local_sa.in6);
 	if (VXLAN_SOCKADDR_IS_IPV6(&cfg->vxlc_remote_sa))
 		sa6_recoverscope(&cfg->vxlc_remote_sa.in6);
 #endif
 
 	return (0);
 }
 
 static int
 vxlan_ctrl_set_vni(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	int error;
 
 	cmd = arg;
 
 	if (vxlan_check_vni(cmd->vxlcmd_vni) != 0)
 		return (EINVAL);
 
 	VXLAN_WLOCK(sc);
 	if (vxlan_can_change_config(sc)) {
 		sc->vxl_vni = cmd->vxlcmd_vni;
 		error = 0;
 	} else
 		error = EBUSY;
 	VXLAN_WUNLOCK(sc);
 
 	return (error);
 }
 
 static int
 vxlan_ctrl_set_local_addr(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	union vxlan_sockaddr *vxlsa;
 	int error;
 
 	cmd = arg;
 	vxlsa = &cmd->vxlcmd_sa;
 
 	if (!VXLAN_SOCKADDR_IS_IPV46(vxlsa))
 		return (EINVAL);
 	if (vxlan_sockaddr_in_multicast(vxlsa) != 0)
 		return (EINVAL);
 	if (VXLAN_SOCKADDR_IS_IPV6(vxlsa)) {
 		error = vxlan_sockaddr_in6_embedscope(vxlsa);
 		if (error)
 			return (error);
 	}
 
 	VXLAN_WLOCK(sc);
 	if (vxlan_can_change_config(sc)) {
 		vxlan_sockaddr_in_copy(&sc->vxl_src_addr, &vxlsa->sa);
 		error = 0;
 	} else
 		error = EBUSY;
 	VXLAN_WUNLOCK(sc);
 
 	return (error);
 }
 
 static int
 vxlan_ctrl_set_remote_addr(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	union vxlan_sockaddr *vxlsa;
 	int error;
 
 	cmd = arg;
 	vxlsa = &cmd->vxlcmd_sa;
 
 	if (!VXLAN_SOCKADDR_IS_IPV46(vxlsa))
 		return (EINVAL);
 	if (VXLAN_SOCKADDR_IS_IPV6(vxlsa)) {
 		error = vxlan_sockaddr_in6_embedscope(vxlsa);
 		if (error)
 			return (error);
 	}
 
 	VXLAN_WLOCK(sc);
 	if (vxlan_can_change_config(sc)) {
 		vxlan_sockaddr_in_copy(&sc->vxl_dst_addr, &vxlsa->sa);
 		error = 0;
 	} else
 		error = EBUSY;
 	VXLAN_WUNLOCK(sc);
 
 	return (error);
 }
 
 static int
 vxlan_ctrl_set_local_port(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	int error;
 
 	cmd = arg;
 
 	if (cmd->vxlcmd_port == 0)
 		return (EINVAL);
 
 	VXLAN_WLOCK(sc);
 	if (vxlan_can_change_config(sc)) {
 		sc->vxl_src_addr.in4.sin_port = htons(cmd->vxlcmd_port);
 		error = 0;
 	} else
 		error = EBUSY;
 	VXLAN_WUNLOCK(sc);
 
 	return (error);
 }
 
 static int
 vxlan_ctrl_set_remote_port(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	int error;
 
 	cmd = arg;
 
 	if (cmd->vxlcmd_port == 0)
 		return (EINVAL);
 
 	VXLAN_WLOCK(sc);
 	if (vxlan_can_change_config(sc)) {
 		sc->vxl_dst_addr.in4.sin_port = htons(cmd->vxlcmd_port);
 		error = 0;
 	} else
 		error = EBUSY;
 	VXLAN_WUNLOCK(sc);
 
 	return (error);
 }
 
 static int
 vxlan_ctrl_set_port_range(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	uint16_t min, max;
 	int error;
 
 	cmd = arg;
 	min = cmd->vxlcmd_port_min;
 	max = cmd->vxlcmd_port_max;
 
 	if (max < min)
 		return (EINVAL);
 
 	VXLAN_WLOCK(sc);
 	if (vxlan_can_change_config(sc)) {
 		sc->vxl_min_port = min;
 		sc->vxl_max_port = max;
 		error = 0;
 	} else
 		error = EBUSY;
 	VXLAN_WUNLOCK(sc);
 
 	return (error);
 }
 
 static int
 vxlan_ctrl_set_ftable_timeout(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	int error;
 
 	cmd = arg;
 
 	VXLAN_WLOCK(sc);
 	if (vxlan_check_ftable_timeout(cmd->vxlcmd_ftable_timeout) == 0) {
 		sc->vxl_ftable_timeout = cmd->vxlcmd_ftable_timeout;
 		error = 0;
 	} else
 		error = EINVAL;
 	VXLAN_WUNLOCK(sc);
 
 	return (error);
 }
 
 static int
 vxlan_ctrl_set_ftable_max(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	int error;
 
 	cmd = arg;
 
 	VXLAN_WLOCK(sc);
 	if (vxlan_check_ftable_max(cmd->vxlcmd_ftable_max) == 0) {
 		sc->vxl_ftable_max = cmd->vxlcmd_ftable_max;
 		error = 0;
 	} else
 		error = EINVAL;
 	VXLAN_WUNLOCK(sc);
 
 	return (error);
 }
 
 static int
 vxlan_ctrl_set_multicast_if(struct vxlan_softc * sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	int error;
 
 	cmd = arg;
 
 	VXLAN_WLOCK(sc);
 	if (vxlan_can_change_config(sc)) {
 		strlcpy(sc->vxl_mc_ifname, cmd->vxlcmd_ifname, IFNAMSIZ);
 		error = 0;
 	} else
 		error = EBUSY;
 	VXLAN_WUNLOCK(sc);
 
 	return (error);
 }
 
 static int
 vxlan_ctrl_set_ttl(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	int error;
 
 	cmd = arg;
 
 	VXLAN_WLOCK(sc);
 	if (vxlan_check_ttl(cmd->vxlcmd_ttl) == 0) {
 		sc->vxl_ttl = cmd->vxlcmd_ttl;
 		if (sc->vxl_im4o != NULL)
 			sc->vxl_im4o->imo_multicast_ttl = sc->vxl_ttl;
 		if (sc->vxl_im6o != NULL)
 			sc->vxl_im6o->im6o_multicast_hlim = sc->vxl_ttl;
 		error = 0;
 	} else
 		error = EINVAL;
 	VXLAN_WUNLOCK(sc);
 
 	return (error);
 }
 
 static int
 vxlan_ctrl_set_learn(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 
 	cmd = arg;
 
 	VXLAN_WLOCK(sc);
 	if (cmd->vxlcmd_flags & VXLAN_CMD_FLAG_LEARN)
 		sc->vxl_flags |= VXLAN_FLAG_LEARN;
 	else
 		sc->vxl_flags &= ~VXLAN_FLAG_LEARN;
 	VXLAN_WUNLOCK(sc);
 
 	return (0);
 }
 
 static int
 vxlan_ctrl_ftable_entry_add(struct vxlan_softc *sc, void *arg)
 {
 	union vxlan_sockaddr vxlsa;
 	struct ifvxlancmd *cmd;
 	struct vxlan_ftable_entry *fe;
 	int error;
 
 	cmd = arg;
 	vxlsa = cmd->vxlcmd_sa;
 
 	if (!VXLAN_SOCKADDR_IS_IPV46(&vxlsa))
 		return (EINVAL);
 	if (vxlan_sockaddr_in_any(&vxlsa) != 0)
 		return (EINVAL);
 	if (vxlan_sockaddr_in_multicast(&vxlsa) != 0)
 		return (EINVAL);
 	/* BMV: We could support both IPv4 and IPv6 later. */
 	if (vxlsa.sa.sa_family != sc->vxl_dst_addr.sa.sa_family)
 		return (EAFNOSUPPORT);
 
 	if (VXLAN_SOCKADDR_IS_IPV6(&vxlsa)) {
 		error = vxlan_sockaddr_in6_embedscope(&vxlsa);
 		if (error)
 			return (error);
 	}
 
 	fe = vxlan_ftable_entry_alloc();
 	if (fe == NULL)
 		return (ENOMEM);
 
 	if (vxlsa.in4.sin_port == 0)
 		vxlsa.in4.sin_port = sc->vxl_dst_addr.in4.sin_port;
 
 	vxlan_ftable_entry_init(sc, fe, cmd->vxlcmd_mac, &vxlsa.sa,
 	    VXLAN_FE_FLAG_STATIC);
 
 	VXLAN_WLOCK(sc);
 	error = vxlan_ftable_entry_insert(sc, fe);
 	VXLAN_WUNLOCK(sc);
 
 	if (error)
 		vxlan_ftable_entry_free(fe);
 
 	return (error);
 }
 
 static int
 vxlan_ctrl_ftable_entry_rem(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	struct vxlan_ftable_entry *fe;
 	int error;
 
 	cmd = arg;
 
 	VXLAN_WLOCK(sc);
 	fe = vxlan_ftable_entry_lookup(sc, cmd->vxlcmd_mac);
 	if (fe != NULL) {
 		vxlan_ftable_entry_destroy(sc, fe);
 		error = 0;
 	} else
 		error = ENOENT;
 	VXLAN_WUNLOCK(sc);
 
 	return (error);
 }
 
 static int
 vxlan_ctrl_flush(struct vxlan_softc *sc, void *arg)
 {
 	struct ifvxlancmd *cmd;
 	int all;
 
 	cmd = arg;
 	all = cmd->vxlcmd_flags & VXLAN_CMD_FLAG_FLUSH_ALL;
 
 	VXLAN_WLOCK(sc);
 	vxlan_ftable_flush(sc, all);
 	VXLAN_WUNLOCK(sc);
 
 	return (0);
 }
 
 static int
 vxlan_ioctl_drvspec(struct vxlan_softc *sc, struct ifdrv *ifd, int get)
 {
 	const struct vxlan_control *vc;
 	union {
 		struct ifvxlancfg	cfg;
 		struct ifvxlancmd	cmd;
 	} args;
 	int out, error;
 
 	if (ifd->ifd_cmd >= vxlan_control_table_size)
 		return (EINVAL);
 
 	bzero(&args, sizeof(args));
 	vc = &vxlan_control_table[ifd->ifd_cmd];
 	out = (vc->vxlc_flags & VXLAN_CTRL_FLAG_COPYOUT) != 0;
 
 	if ((get != 0 && out == 0) || (get == 0 && out != 0))
 		return (EINVAL);
 
 	if (vc->vxlc_flags & VXLAN_CTRL_FLAG_SUSER) {
 		error = priv_check(curthread, PRIV_NET_VXLAN);
 		if (error)
 			return (error);
 	}
 
 	if (ifd->ifd_len != vc->vxlc_argsize ||
 	    ifd->ifd_len > sizeof(args))
 		return (EINVAL);
 
 	if (vc->vxlc_flags & VXLAN_CTRL_FLAG_COPYIN) {
 		error = copyin(ifd->ifd_data, &args, ifd->ifd_len);
 		if (error)
 			return (error);
 	}
 
 	error = vc->vxlc_func(sc, &args);
 	if (error)
 		return (error);
 
 	if (vc->vxlc_flags & VXLAN_CTRL_FLAG_COPYOUT) {
 		error = copyout(&args, ifd->ifd_data, ifd->ifd_len);
 		if (error)
 			return (error);
 	}
 
 	return (0);
 }
 
 static int
 vxlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct vxlan_softc *sc;
 	struct ifreq *ifr;
 	struct ifdrv *ifd;
 	int error;
 
 	sc = ifp->if_softc;
 	ifr = (struct ifreq *) data;
 	ifd = (struct ifdrv *) data;
 
 	switch (cmd) {
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		error = 0;
 		break;
 
 	case SIOCGDRVSPEC:
 	case SIOCSDRVSPEC:
 		error = vxlan_ioctl_drvspec(sc, ifd, cmd == SIOCGDRVSPEC);
 		break;
 
 	case SIOCSIFFLAGS:
 		error = vxlan_ioctl_ifflags(sc);
 		break;
 
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 		error = ifmedia_ioctl(ifp, ifr, &sc->vxl_media, cmd);
 		break;
 
 	default:
 		error = ether_ioctl(ifp, cmd, data);
 		break;
 	}
 
 	return (error);
 }
 
 #if defined(INET) || defined(INET6)
 static uint16_t
 vxlan_pick_source_port(struct vxlan_softc *sc, struct mbuf *m)
 {
 	int range;
 	uint32_t hash;
 
 	range = sc->vxl_max_port - sc->vxl_min_port + 1;
 
 	if (M_HASHTYPE_ISHASH(m))
 		hash = m->m_pkthdr.flowid;
 	else
 		hash = jenkins_hash(m->m_data, ETHER_HDR_LEN,
 		    sc->vxl_port_hash_key);
 
 	return (sc->vxl_min_port + (hash % range));
 }
 
 static void
 vxlan_encap_header(struct vxlan_softc *sc, struct mbuf *m, int ipoff,
     uint16_t srcport, uint16_t dstport)
 {
 	struct vxlanudphdr *hdr;
 	struct udphdr *udph;
 	struct vxlan_header *vxh;
 	int len;
 
 	len = m->m_pkthdr.len - ipoff;
 	MPASS(len >= sizeof(struct vxlanudphdr));
 	hdr = mtodo(m, ipoff);
 
 	udph = &hdr->vxlh_udp;
 	udph->uh_sport = srcport;
 	udph->uh_dport = dstport;
 	udph->uh_ulen = htons(len);
 	udph->uh_sum = 0;
 
 	vxh = &hdr->vxlh_hdr;
 	vxh->vxlh_flags = htonl(VXLAN_HDR_FLAGS_VALID_VNI);
 	vxh->vxlh_vni = htonl(sc->vxl_vni << VXLAN_HDR_VNI_SHIFT);
 }
 #endif
 
 static int
 vxlan_encap4(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa,
     struct mbuf *m)
 {
 #ifdef INET
 	struct ifnet *ifp;
 	struct ip *ip;
 	struct in_addr srcaddr, dstaddr;
 	uint16_t srcport, dstport;
 	int len, mcast, error;
 
 	ifp = sc->vxl_ifp;
 	srcaddr = sc->vxl_src_addr.in4.sin_addr;
 	srcport = vxlan_pick_source_port(sc, m);
 	dstaddr = fvxlsa->in4.sin_addr;
 	dstport = fvxlsa->in4.sin_port;
 
 	M_PREPEND(m, sizeof(struct ip) + sizeof(struct vxlanudphdr),
 	    M_NOWAIT);
 	if (m == NULL) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		return (ENOBUFS);
 	}
 
 	len = m->m_pkthdr.len;
 
 	ip = mtod(m, struct ip *);
 	ip->ip_tos = 0;
 	ip->ip_len = htons(len);
 	ip->ip_off = 0;
 	ip->ip_ttl = sc->vxl_ttl;
 	ip->ip_p = IPPROTO_UDP;
 	ip->ip_sum = 0;
 	ip->ip_src = srcaddr;
 	ip->ip_dst = dstaddr;
 
 	vxlan_encap_header(sc, m, sizeof(struct ip), srcport, dstport);
 
 	mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
 	m->m_flags &= ~(M_MCAST | M_BCAST);
 
 	error = ip_output(m, NULL, NULL, 0, sc->vxl_im4o, NULL);
 	if (error == 0) {
 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 		if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
 		if (mcast != 0)
 			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
 	} else
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 
 	return (error);
 #else
 	m_freem(m);
 	return (ENOTSUP);
 #endif
 }
 
 static int
 vxlan_encap6(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa,
     struct mbuf *m)
 {
 #ifdef INET6
 	struct ifnet *ifp;
 	struct ip6_hdr *ip6;
 	const struct in6_addr *srcaddr, *dstaddr;
 	uint16_t srcport, dstport;
 	int len, mcast, error;
 
 	ifp = sc->vxl_ifp;
 	srcaddr = &sc->vxl_src_addr.in6.sin6_addr;
 	srcport = vxlan_pick_source_port(sc, m);
 	dstaddr = &fvxlsa->in6.sin6_addr;
 	dstport = fvxlsa->in6.sin6_port;
 
 	M_PREPEND(m, sizeof(struct ip6_hdr) + sizeof(struct vxlanudphdr),
 	    M_NOWAIT);
 	if (m == NULL) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		return (ENOBUFS);
 	}
 
 	len = m->m_pkthdr.len;
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	ip6->ip6_flow = 0;		/* BMV: Keep in forwarding entry? */
 	ip6->ip6_vfc = IPV6_VERSION;
 	ip6->ip6_plen = 0;
 	ip6->ip6_nxt = IPPROTO_UDP;
 	ip6->ip6_hlim = sc->vxl_ttl;
 	ip6->ip6_src = *srcaddr;
 	ip6->ip6_dst = *dstaddr;
 
 	vxlan_encap_header(sc, m, sizeof(struct ip6_hdr), srcport, dstport);
 
 	/*
 	 * XXX BMV We need support for RFC6935 before we can send and
 	 * receive IPv6 UDP packets with a zero checksum.
 	 */
 	{
 		struct udphdr *hdr = mtodo(m, sizeof(struct ip6_hdr));
 		hdr->uh_sum = in6_cksum_pseudo(ip6,
 		    m->m_pkthdr.len - sizeof(struct ip6_hdr), IPPROTO_UDP, 0);
 		m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
 		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 	}
 
 	mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
 	m->m_flags &= ~(M_MCAST | M_BCAST);
 
 	error = ip6_output(m, NULL, NULL, 0, sc->vxl_im6o, NULL, NULL);
 	if (error == 0) {
 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 		if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
 		if (mcast != 0)
 			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
 	} else
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 
 	return (error);
 #else
 	m_freem(m);
 	return (ENOTSUP);
 #endif
 }
 
 static int
 vxlan_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	struct rm_priotracker tracker;
 	union vxlan_sockaddr vxlsa;
 	struct vxlan_softc *sc;
 	struct vxlan_ftable_entry *fe;
 	struct ifnet *mcifp;
 	struct ether_header *eh;
 	int ipv4, error;
 
 	sc = ifp->if_softc;
 	eh = mtod(m, struct ether_header *);
 	fe = NULL;
 	mcifp = NULL;
 
 	ETHER_BPF_MTAP(ifp, m);
 
 	VXLAN_RLOCK(sc, &tracker);
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		VXLAN_RUNLOCK(sc, &tracker);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	if ((m->m_flags & (M_BCAST | M_MCAST)) == 0)
 		fe = vxlan_ftable_entry_lookup(sc, eh->ether_dhost);
 	if (fe == NULL)
 		fe = &sc->vxl_default_fe;
 	vxlan_sockaddr_copy(&vxlsa, &fe->vxlfe_raddr.sa);
 
 	ipv4 = VXLAN_SOCKADDR_IS_IPV4(&vxlsa) != 0;
 	if (vxlan_sockaddr_in_multicast(&vxlsa) != 0)
 		mcifp = vxlan_multicast_if_ref(sc, ipv4);
 
 	VXLAN_ACQUIRE(sc);
 	VXLAN_RUNLOCK(sc, &tracker);
 
 	if (ipv4 != 0)
 		error = vxlan_encap4(sc, &vxlsa, m);
 	else
 		error = vxlan_encap6(sc, &vxlsa, m);
 
 	vxlan_release(sc);
 	if (mcifp != NULL)
 		if_rele(mcifp);
 
 	return (error);
 }
 
 static void
 vxlan_qflush(struct ifnet *ifp __unused)
 {
 }
 
 static void
 vxlan_rcv_udp_packet(struct mbuf *m, int offset, struct inpcb *inpcb,
     const struct sockaddr *srcsa, void *xvso)
 {
 	struct vxlan_socket *vso;
 	struct vxlan_header *vxh, vxlanhdr;
 	uint32_t vni;
 	int error __unused;
 
 	M_ASSERTPKTHDR(m);
 	vso = xvso;
 	offset += sizeof(struct udphdr);
 
 	if (m->m_pkthdr.len < offset + sizeof(struct vxlan_header))
 		goto out;
 
 	if (__predict_false(m->m_len < offset + sizeof(struct vxlan_header))) {
 		m_copydata(m, offset, sizeof(struct vxlan_header),
 		    (caddr_t) &vxlanhdr);
 		vxh = &vxlanhdr;
 	} else
 		vxh = mtodo(m, offset);
 
 	/*
 	 * Drop if there is a reserved bit set in either the flags or VNI
 	 * fields of the header. This goes against the specification, but
 	 * a bit set may indicate an unsupported new feature. This matches
 	 * the behavior of the Linux implementation.
 	 */
 	if (vxh->vxlh_flags != htonl(VXLAN_HDR_FLAGS_VALID_VNI) ||
 	    vxh->vxlh_vni & ~htonl(VXLAN_VNI_MASK))
 		goto out;
 
 	vni = ntohl(vxh->vxlh_vni) >> VXLAN_HDR_VNI_SHIFT;
 	/* Adjust to the start of the inner Ethernet frame. */
 	m_adj(m, offset + sizeof(struct vxlan_header));
 
 	error = vxlan_input(vso, vni, &m, srcsa);
 	MPASS(error != 0 || m == NULL);
 
 out:
 	if (m != NULL)
 		m_freem(m);
 }
 
 static int
 vxlan_input(struct vxlan_socket *vso, uint32_t vni, struct mbuf **m0,
     const struct sockaddr *sa)
 {
 	struct vxlan_softc *sc;
 	struct ifnet *ifp;
 	struct mbuf *m;
 	struct ether_header *eh;
 	int error;
 
 	sc = vxlan_socket_lookup_softc(vso, vni);
 	if (sc == NULL)
 		return (ENOENT);
 
 	ifp = sc->vxl_ifp;
 	m = *m0;
 	eh = mtod(m, struct ether_header *);
 
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		error = ENETDOWN;
 		goto out;
 	} else if (ifp == m->m_pkthdr.rcvif) {
 		/* XXX Does not catch more complex loops. */
 		error = EDEADLK;
 		goto out;
 	}
 
 	if (sc->vxl_flags & VXLAN_FLAG_LEARN)
 		vxlan_ftable_learn(sc, sa, eh->ether_shost);
 
 	m_clrprotoflags(m);
 	m->m_pkthdr.rcvif = ifp;
 	M_SETFIB(m, ifp->if_fib);
 
 	error = netisr_queue_src(NETISR_ETHER, 0, m);
 	*m0 = NULL;
 
 out:
 	vxlan_release(sc);
 	return (error);
 }
 
 static void
 vxlan_set_default_config(struct vxlan_softc *sc)
 {
 
 	sc->vxl_flags |= VXLAN_FLAG_LEARN;
 
 	sc->vxl_vni = VXLAN_VNI_MAX;
 	sc->vxl_ttl = IPDEFTTL;
 
 	if (!vxlan_tunable_int(sc, "legacy_port", vxlan_legacy_port)) {
 		sc->vxl_src_addr.in4.sin_port = htons(VXLAN_PORT);
 		sc->vxl_dst_addr.in4.sin_port = htons(VXLAN_PORT);
 	} else {
 		sc->vxl_src_addr.in4.sin_port = htons(VXLAN_LEGACY_PORT);
 		sc->vxl_dst_addr.in4.sin_port = htons(VXLAN_LEGACY_PORT);
 	}
 
 	sc->vxl_min_port = V_ipport_firstauto;
 	sc->vxl_max_port = V_ipport_lastauto;
 
 	sc->vxl_ftable_max = VXLAN_FTABLE_MAX;
 	sc->vxl_ftable_timeout = VXLAN_FTABLE_TIMEOUT;
 }
 
 static int
 vxlan_set_user_config(struct vxlan_softc *sc, struct ifvxlanparam *vxlp)
 {
 
 #ifndef INET
 	if (vxlp->vxlp_with & (VXLAN_PARAM_WITH_LOCAL_ADDR4 |
 	    VXLAN_PARAM_WITH_REMOTE_ADDR4))
 		return (EAFNOSUPPORT);
 #endif
 
 #ifndef INET6
 	if (vxlp->vxlp_with & (VXLAN_PARAM_WITH_LOCAL_ADDR6 |
 	    VXLAN_PARAM_WITH_REMOTE_ADDR6))
 		return (EAFNOSUPPORT);
 #else
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_ADDR6) {
 		int error = vxlan_sockaddr_in6_embedscope(&vxlp->vxlp_local_sa);
 		if (error)
 			return (error);
 	}
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_ADDR6) {
 		int error = vxlan_sockaddr_in6_embedscope(
 		   &vxlp->vxlp_remote_sa);
 		if (error)
 			return (error);
 	}
 #endif
 
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_VNI) {
 		if (vxlan_check_vni(vxlp->vxlp_vni) == 0)
 			sc->vxl_vni = vxlp->vxlp_vni;
 	}
 
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_ADDR4) {
 		sc->vxl_src_addr.in4.sin_len = sizeof(struct sockaddr_in);
 		sc->vxl_src_addr.in4.sin_family = AF_INET;
 		sc->vxl_src_addr.in4.sin_addr =
 		    vxlp->vxlp_local_sa.in4.sin_addr;
 	} else if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_ADDR6) {
 		sc->vxl_src_addr.in6.sin6_len = sizeof(struct sockaddr_in6);
 		sc->vxl_src_addr.in6.sin6_family = AF_INET6;
 		sc->vxl_src_addr.in6.sin6_addr =
 		    vxlp->vxlp_local_sa.in6.sin6_addr;
 	}
 
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_ADDR4) {
 		sc->vxl_dst_addr.in4.sin_len = sizeof(struct sockaddr_in);
 		sc->vxl_dst_addr.in4.sin_family = AF_INET;
 		sc->vxl_dst_addr.in4.sin_addr =
 		    vxlp->vxlp_remote_sa.in4.sin_addr;
 	} else if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_ADDR6) {
 		sc->vxl_dst_addr.in6.sin6_len = sizeof(struct sockaddr_in6);
 		sc->vxl_dst_addr.in6.sin6_family = AF_INET6;
 		sc->vxl_dst_addr.in6.sin6_addr =
 		    vxlp->vxlp_remote_sa.in6.sin6_addr;
 	}
 
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LOCAL_PORT)
 		sc->vxl_src_addr.in4.sin_port = htons(vxlp->vxlp_local_port);
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_REMOTE_PORT)
 		sc->vxl_dst_addr.in4.sin_port = htons(vxlp->vxlp_remote_port);
 
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_PORT_RANGE) {
 		if (vxlp->vxlp_min_port <= vxlp->vxlp_max_port) {
 			sc->vxl_min_port = vxlp->vxlp_min_port;
 			sc->vxl_max_port = vxlp->vxlp_max_port;
 		}
 	}
 
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_MULTICAST_IF)
 		strlcpy(sc->vxl_mc_ifname, vxlp->vxlp_mc_ifname, IFNAMSIZ);
 
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_FTABLE_TIMEOUT) {
 		if (vxlan_check_ftable_timeout(vxlp->vxlp_ftable_timeout) == 0)
 			sc->vxl_ftable_timeout = vxlp->vxlp_ftable_timeout;
 	}
 
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_FTABLE_MAX) {
 		if (vxlan_check_ftable_max(vxlp->vxlp_ftable_max) == 0)
 			sc->vxl_ftable_max = vxlp->vxlp_ftable_max;
 	}
 
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_TTL) {
 		if (vxlan_check_ttl(vxlp->vxlp_ttl) == 0)
 			sc->vxl_ttl = vxlp->vxlp_ttl;
 	}
 
 	if (vxlp->vxlp_with & VXLAN_PARAM_WITH_LEARN) {
 		if (vxlp->vxlp_learn == 0)
 			sc->vxl_flags &= ~VXLAN_FLAG_LEARN;
 	}
 
 	return (0);
 }
 
 static int
 vxlan_clone_create(struct if_clone *ifc, int unit, caddr_t params)
 {
 	struct vxlan_softc *sc;
 	struct ifnet *ifp;
 	struct ifvxlanparam vxlp;
 	int error;
 
 	sc = malloc(sizeof(struct vxlan_softc), M_VXLAN, M_WAITOK | M_ZERO);
 	sc->vxl_unit = unit;
 	vxlan_set_default_config(sc);
 
 	if (params != 0) {
 		error = copyin(params, &vxlp, sizeof(vxlp));
 		if (error)
 			goto fail;
 
 		error = vxlan_set_user_config(sc, &vxlp);
 		if (error)
 			goto fail;
 	}
 
 	ifp = if_alloc(IFT_ETHER);
 	if (ifp == NULL) {
 		error = ENOSPC;
 		goto fail;
 	}
 
 	sc->vxl_ifp = ifp;
 	rm_init(&sc->vxl_lock, "vxlanrm");
 	callout_init_rw(&sc->vxl_callout, &sc->vxl_lock, 0);
 	sc->vxl_port_hash_key = arc4random();
 	vxlan_ftable_init(sc);
 
 	vxlan_sysctl_setup(sc);
 
 	ifp->if_softc = sc;
 	if_initname(ifp, vxlan_name, unit);
 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
 	ifp->if_init = vxlan_init;
 	ifp->if_ioctl = vxlan_ioctl;
 	ifp->if_transmit = vxlan_transmit;
 	ifp->if_qflush = vxlan_qflush;
 	ifp->if_capabilities |= IFCAP_LINKSTATE;
 	ifp->if_capenable |= IFCAP_LINKSTATE;
 
 	ifmedia_init(&sc->vxl_media, 0, vxlan_media_change, vxlan_media_status);
 	ifmedia_add(&sc->vxl_media, IFM_ETHER | IFM_AUTO, 0, NULL);
 	ifmedia_set(&sc->vxl_media, IFM_ETHER | IFM_AUTO);
 
 	ether_gen_addr(ifp, &sc->vxl_hwaddr);
 	ether_ifattach(ifp, sc->vxl_hwaddr.octet);
 
 	ifp->if_baudrate = 0;
 	ifp->if_hdrlen = 0;
 
 	return (0);
 
 fail:
 	free(sc, M_VXLAN);
 	return (error);
 }
 
 static void
 vxlan_clone_destroy(struct ifnet *ifp)
 {
 	struct vxlan_softc *sc;
 
 	sc = ifp->if_softc;
 
 	vxlan_teardown(sc);
 
 	vxlan_ftable_flush(sc, 1);
 
 	ether_ifdetach(ifp);
 	if_free(ifp);
 	ifmedia_removeall(&sc->vxl_media);
 
 	vxlan_ftable_fini(sc);
 
 	vxlan_sysctl_destroy(sc);
 	rm_destroy(&sc->vxl_lock);
 	free(sc, M_VXLAN);
 }
 
 /* BMV: Taken from if_bridge. */
 static uint32_t
 vxlan_mac_hash(struct vxlan_softc *sc, const uint8_t *addr)
 {
 	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = sc->vxl_ftable_hash_key;
 
 	b += addr[5] << 8;
 	b += addr[4];
 	a += addr[3] << 24;
 	a += addr[2] << 16;
 	a += addr[1] << 8;
 	a += addr[0];
 
 /*
  * The following hash function is adapted from "Hash Functions" by Bob Jenkins
  * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
  */
 #define	mix(a, b, c)							\
 do {									\
 	a -= b; a -= c; a ^= (c >> 13);					\
 	b -= c; b -= a; b ^= (a << 8);					\
 	c -= a; c -= b; c ^= (b >> 13);					\
 	a -= b; a -= c; a ^= (c >> 12);					\
 	b -= c; b -= a; b ^= (a << 16);					\
 	c -= a; c -= b; c ^= (b >> 5);					\
 	a -= b; a -= c; a ^= (c >> 3);					\
 	b -= c; b -= a; b ^= (a << 10);					\
 	c -= a; c -= b; c ^= (b >> 15);					\
 } while (0)
 
 	mix(a, b, c);
 
 #undef mix
 
 	return (c);
 }
 
 static int
 vxlan_media_change(struct ifnet *ifp)
 {
 
 	/* Ignore. */
 	return (0);
 }
 
 static void
 vxlan_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
 {
 
 	ifmr->ifm_status = IFM_ACTIVE | IFM_AVALID;
 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
 }
 
 static int
 vxlan_sockaddr_cmp(const union vxlan_sockaddr *vxladdr,
     const struct sockaddr *sa)
 {
 
 	return (bcmp(&vxladdr->sa, sa, vxladdr->sa.sa_len));
 }
 
 static void
 vxlan_sockaddr_copy(union vxlan_sockaddr *vxladdr,
     const struct sockaddr *sa)
 {
 
 	MPASS(sa->sa_family == AF_INET || sa->sa_family == AF_INET6);
 	bzero(vxladdr, sizeof(*vxladdr));
 
 	if (sa->sa_family == AF_INET) {
 		vxladdr->in4 = *satoconstsin(sa);
 		vxladdr->in4.sin_len = sizeof(struct sockaddr_in);
 	} else if (sa->sa_family == AF_INET6) {
 		vxladdr->in6 = *satoconstsin6(sa);
 		vxladdr->in6.sin6_len = sizeof(struct sockaddr_in6);
 	}
 }
 
 static int
 vxlan_sockaddr_in_equal(const union vxlan_sockaddr *vxladdr,
     const struct sockaddr *sa)
 {
 	int equal;
 
 	if (sa->sa_family == AF_INET) {
 		const struct in_addr *in4 = &satoconstsin(sa)->sin_addr;
 		equal = in4->s_addr == vxladdr->in4.sin_addr.s_addr;
 	} else if (sa->sa_family == AF_INET6) {
 		const struct in6_addr *in6 = &satoconstsin6(sa)->sin6_addr;
 		equal = IN6_ARE_ADDR_EQUAL(in6, &vxladdr->in6.sin6_addr);
 	} else
 		equal = 0;
 
 	return (equal);
 }
 
 static void
 vxlan_sockaddr_in_copy(union vxlan_sockaddr *vxladdr,
     const struct sockaddr *sa)
 {
 
 	MPASS(sa->sa_family == AF_INET || sa->sa_family == AF_INET6);
 
 	if (sa->sa_family == AF_INET) {
 		const struct in_addr *in4 = &satoconstsin(sa)->sin_addr;
 		vxladdr->in4.sin_family = AF_INET;
 		vxladdr->in4.sin_len = sizeof(struct sockaddr_in);
 		vxladdr->in4.sin_addr = *in4;
 	} else if (sa->sa_family == AF_INET6) {
 		const struct in6_addr *in6 = &satoconstsin6(sa)->sin6_addr;
 		vxladdr->in6.sin6_family = AF_INET6;
 		vxladdr->in6.sin6_len = sizeof(struct sockaddr_in6);
 		vxladdr->in6.sin6_addr = *in6;
 	}
 }
 
 static int
 vxlan_sockaddr_supported(const union vxlan_sockaddr *vxladdr, int unspec)
 {
 	const struct sockaddr *sa;
 	int supported;
 
 	sa = &vxladdr->sa;
 	supported = 0;
 
 	if (sa->sa_family == AF_UNSPEC && unspec != 0) {
 		supported = 1;
 	} else if (sa->sa_family == AF_INET) {
 #ifdef INET
 		supported = 1;
 #endif
 	} else if (sa->sa_family == AF_INET6) {
 #ifdef INET6
 		supported = 1;
 #endif
 	}
 
 	return (supported);
 }
 
 static int
 vxlan_sockaddr_in_any(const union vxlan_sockaddr *vxladdr)
 {
 	const struct sockaddr *sa;
 	int any;
 
 	sa = &vxladdr->sa;
 
 	if (sa->sa_family == AF_INET) {
 		const struct in_addr *in4 = &satoconstsin(sa)->sin_addr;
 		any = in4->s_addr == INADDR_ANY;
 	} else if (sa->sa_family == AF_INET6) {
 		const struct in6_addr *in6 = &satoconstsin6(sa)->sin6_addr;
 		any = IN6_IS_ADDR_UNSPECIFIED(in6);
 	} else
 		any = -1;
 
 	return (any);
 }
 
 static int
 vxlan_sockaddr_in_multicast(const union vxlan_sockaddr *vxladdr)
 {
 	const struct sockaddr *sa;
 	int mc;
 
 	sa = &vxladdr->sa;
 
 	if (sa->sa_family == AF_INET) {
 		const struct in_addr *in4 = &satoconstsin(sa)->sin_addr;
 		mc = IN_MULTICAST(ntohl(in4->s_addr));
 	} else if (sa->sa_family == AF_INET6) {
 		const struct in6_addr *in6 = &satoconstsin6(sa)->sin6_addr;
 		mc = IN6_IS_ADDR_MULTICAST(in6);
 	} else
 		mc = -1;
 
 	return (mc);
 }
 
 static int
 vxlan_sockaddr_in6_embedscope(union vxlan_sockaddr *vxladdr)
 {
 	int error;
 
 	MPASS(VXLAN_SOCKADDR_IS_IPV6(vxladdr));
 #ifdef INET6
 	error = sa6_embedscope(&vxladdr->in6, V_ip6_use_defzone);
 #else
 	error = EAFNOSUPPORT;
 #endif
 
 	return (error);
 }
 
 static int
 vxlan_can_change_config(struct vxlan_softc *sc)
 {
 	struct ifnet *ifp;
 
 	ifp = sc->vxl_ifp;
 	VXLAN_LOCK_ASSERT(sc);
 
 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 		return (0);
 	if (sc->vxl_flags & (VXLAN_FLAG_INIT | VXLAN_FLAG_TEARDOWN))
 		return (0);
 
 	return (1);
 }
 
 static int
 vxlan_check_vni(uint32_t vni)
 {
 
 	return (vni >= VXLAN_VNI_MAX);
 }
 
 static int
 vxlan_check_ttl(int ttl)
 {
 
 	return (ttl > MAXTTL);
 }
 
 static int
 vxlan_check_ftable_timeout(uint32_t timeout)
 {
 
 	return (timeout > VXLAN_FTABLE_MAX_TIMEOUT);
 }
 
 static int
 vxlan_check_ftable_max(uint32_t max)
 {
 
 	return (max > VXLAN_FTABLE_MAX);
 }
 
 static void
 vxlan_sysctl_setup(struct vxlan_softc *sc)
 {
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid *node;
 	struct vxlan_statistics *stats;
 	char namebuf[8];
 
 	ctx = &sc->vxl_sysctl_ctx;
 	stats = &sc->vxl_stats;
 	snprintf(namebuf, sizeof(namebuf), "%d", sc->vxl_unit);
 
 	sysctl_ctx_init(ctx);
 	sc->vxl_sysctl_node = SYSCTL_ADD_NODE(ctx,
 	    SYSCTL_STATIC_CHILDREN(_net_link_vxlan), OID_AUTO, namebuf,
 	    CTLFLAG_RD, NULL, "");
 
 	node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->vxl_sysctl_node),
 	    OID_AUTO, "ftable", CTLFLAG_RD, NULL, "");
 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "count",
 	    CTLFLAG_RD, &sc->vxl_ftable_cnt, 0,
 	    "Number of entries in fowarding table");
 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "max",
 	     CTLFLAG_RD, &sc->vxl_ftable_max, 0,
 	    "Maximum number of entries allowed in fowarding table");
 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "timeout",
 	    CTLFLAG_RD, &sc->vxl_ftable_timeout, 0,
 	    "Number of seconds between prunes of the forwarding table");
 	SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "dump",
 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
 	    sc, 0, vxlan_ftable_sysctl_dump, "A",
 	    "Dump the forwarding table entries");
 
 	node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->vxl_sysctl_node),
 	    OID_AUTO, "stats", CTLFLAG_RD, NULL, "");
 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO,
 	    "ftable_nospace", CTLFLAG_RD, &stats->ftable_nospace, 0,
 	    "Fowarding table reached maximum entries");
 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(node), OID_AUTO,
 	    "ftable_lock_upgrade_failed", CTLFLAG_RD,
 	    &stats->ftable_lock_upgrade_failed, 0,
 	    "Forwarding table update required lock upgrade");
 }
 
 static void
 vxlan_sysctl_destroy(struct vxlan_softc *sc)
 {
 
 	sysctl_ctx_free(&sc->vxl_sysctl_ctx);
 	sc->vxl_sysctl_node = NULL;
 }
 
 static int
 vxlan_tunable_int(struct vxlan_softc *sc, const char *knob, int def)
 {
 	char path[64];
 
 	snprintf(path, sizeof(path), "net.link.vxlan.%d.%s",
 	    sc->vxl_unit, knob);
 	TUNABLE_INT_FETCH(path, &def);
 
 	return (def);
 }
 
 static void
 vxlan_ifdetach_event(void *arg __unused, struct ifnet *ifp)
 {
 	struct vxlan_softc_head list;
 	struct vxlan_socket *vso;
 	struct vxlan_softc *sc, *tsc;
 
 	LIST_INIT(&list);
 
 	if (ifp->if_flags & IFF_RENAMING)
 		return;
 	if ((ifp->if_flags & IFF_MULTICAST) == 0)
 		return;
 
 	VXLAN_LIST_LOCK();
 	LIST_FOREACH(vso, &vxlan_socket_list, vxlso_entry)
 		vxlan_socket_ifdetach(vso, ifp, &list);
 	VXLAN_LIST_UNLOCK();
 
 	LIST_FOREACH_SAFE(sc, &list, vxl_ifdetach_list, tsc) {
 		LIST_REMOVE(sc, vxl_ifdetach_list);
 
 		VXLAN_WLOCK(sc);
 		if (sc->vxl_flags & VXLAN_FLAG_INIT)
 			vxlan_init_wait(sc);
 		vxlan_teardown_locked(sc);
 	}
 }
 
 static void
 vxlan_load(void)
 {
 
 	mtx_init(&vxlan_list_mtx, "vxlan list", NULL, MTX_DEF);
 	LIST_INIT(&vxlan_socket_list);
 	vxlan_ifdetach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event,
 	    vxlan_ifdetach_event, NULL, EVENTHANDLER_PRI_ANY);
 	vxlan_cloner = if_clone_simple(vxlan_name, vxlan_clone_create,
 	    vxlan_clone_destroy, 0);
 }
 
 static void
 vxlan_unload(void)
 {
 
 	EVENTHANDLER_DEREGISTER(ifnet_departure_event,
 	    vxlan_ifdetach_event_tag);
 	if_clone_detach(vxlan_cloner);
 	mtx_destroy(&vxlan_list_mtx);
 	MPASS(LIST_EMPTY(&vxlan_socket_list));
 }
 
 static int
 vxlan_modevent(module_t mod, int type, void *unused)
 {
 	int error;
 
 	error = 0;
 
 	switch (type) {
 	case MOD_LOAD:
 		vxlan_load();
 		break;
 	case MOD_UNLOAD:
 		vxlan_unload();
 		break;
 	default:
 		error = ENOTSUP;
 		break;
 	}
 
 	return (error);
 }
 
 static moduledata_t vxlan_mod = {
 	"if_vxlan",
 	vxlan_modevent,
 	0
 };
 
 DECLARE_MODULE(if_vxlan, vxlan_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_vxlan, 1);
Index: head/sys/netinet/in.h
===================================================================
--- head/sys/netinet/in.h	(revision 349368)
+++ head/sys/netinet/in.h	(revision 349369)
@@ -1,676 +1,672 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in.h	8.3 (Berkeley) 1/3/94
  * $FreeBSD$
  */
 
 #ifndef _NETINET_IN_H_
 #define	_NETINET_IN_H_
 
 #include <sys/cdefs.h>
 #include <sys/_types.h>
 #include <machine/endian.h>
 
 /* Protocols common to RFC 1700, POSIX, and X/Open. */
 #define	IPPROTO_IP		0		/* dummy for IP */
 #define	IPPROTO_ICMP		1		/* control message protocol */
 #define	IPPROTO_TCP		6		/* tcp */
 #define	IPPROTO_UDP		17		/* user datagram protocol */
 
 #define	INADDR_ANY		((in_addr_t)0x00000000)
 #define	INADDR_BROADCAST	((in_addr_t)0xffffffff)	/* must be masked */
 
 #ifndef _UINT8_T_DECLARED
 typedef	__uint8_t		uint8_t;
 #define	_UINT8_T_DECLARED
 #endif
 
 #ifndef _UINT16_T_DECLARED
 typedef	__uint16_t		uint16_t;
 #define	_UINT16_T_DECLARED
 #endif
 
 #ifndef _UINT32_T_DECLARED
 typedef	__uint32_t		uint32_t;
 #define	_UINT32_T_DECLARED
 #endif
 
 #ifndef _IN_ADDR_T_DECLARED
 typedef	uint32_t		in_addr_t;
 #define	_IN_ADDR_T_DECLARED
 #endif
 
 #ifndef _IN_PORT_T_DECLARED
 typedef	uint16_t		in_port_t;
 #define	_IN_PORT_T_DECLARED
 #endif
 
 #ifndef _SA_FAMILY_T_DECLARED
 typedef	__sa_family_t		sa_family_t;
 #define	_SA_FAMILY_T_DECLARED
 #endif
 
 /* Internet address (a structure for historical reasons). */
 #ifndef	_STRUCT_IN_ADDR_DECLARED
 struct in_addr {
 	in_addr_t s_addr;
 };
 #define	_STRUCT_IN_ADDR_DECLARED
 #endif
 
 #ifndef	_SOCKLEN_T_DECLARED
 typedef	__socklen_t	socklen_t;
 #define	_SOCKLEN_T_DECLARED
 #endif
 
 #include <sys/_sockaddr_storage.h>
 
 /* Socket address, internet style. */
 struct sockaddr_in {
 	uint8_t	sin_len;
 	sa_family_t	sin_family;
 	in_port_t	sin_port;
 	struct	in_addr sin_addr;
 	char	sin_zero[8];
 };
 
 #if !defined(_KERNEL) && __POSIX_VISIBLE >= 200112
 
 #ifndef _BYTEORDER_PROTOTYPED
 #define	_BYTEORDER_PROTOTYPED
 __BEGIN_DECLS
 uint32_t	htonl(uint32_t);
 uint16_t	htons(uint16_t);
 uint32_t	ntohl(uint32_t);
 uint16_t	ntohs(uint16_t);
 __END_DECLS
 #endif
 
 #ifndef _BYTEORDER_FUNC_DEFINED
 #define	_BYTEORDER_FUNC_DEFINED
 #define	htonl(x)	__htonl(x)
 #define	htons(x)	__htons(x)
 #define	ntohl(x)	__ntohl(x)
 #define	ntohs(x)	__ntohs(x)
 #endif
 
 #endif /* !_KERNEL && __POSIX_VISIBLE >= 200112 */
 
 #if __POSIX_VISIBLE >= 200112
 #define	IPPROTO_IPV6		41		/* IP6 header */
 #define	IPPROTO_RAW		255		/* raw IP packet */
 #define	INET_ADDRSTRLEN		16
 #endif
 
 #if __BSD_VISIBLE
 /*
  * Constants and structures defined by the internet system,
  * Per RFC 790, September 1981, and numerous additions.
  */
 
 /*
  * Protocols (RFC 1700)
  */
 #define	IPPROTO_HOPOPTS		0		/* IP6 hop-by-hop options */
 #define	IPPROTO_IGMP		2		/* group mgmt protocol */
 #define	IPPROTO_GGP		3		/* gateway^2 (deprecated) */
 #define	IPPROTO_IPV4		4		/* IPv4 encapsulation */
 #define	IPPROTO_IPIP		IPPROTO_IPV4	/* for compatibility */
 #define	IPPROTO_ST		7		/* Stream protocol II */
 #define	IPPROTO_EGP		8		/* exterior gateway protocol */
 #define	IPPROTO_PIGP		9		/* private interior gateway */
 #define	IPPROTO_RCCMON		10		/* BBN RCC Monitoring */
 #define	IPPROTO_NVPII		11		/* network voice protocol*/
 #define	IPPROTO_PUP		12		/* pup */
 #define	IPPROTO_ARGUS		13		/* Argus */
 #define	IPPROTO_EMCON		14		/* EMCON */
 #define	IPPROTO_XNET		15		/* Cross Net Debugger */
 #define	IPPROTO_CHAOS		16		/* Chaos*/
 #define	IPPROTO_MUX		18		/* Multiplexing */
 #define	IPPROTO_MEAS		19		/* DCN Measurement Subsystems */
 #define	IPPROTO_HMP		20		/* Host Monitoring */
 #define	IPPROTO_PRM		21		/* Packet Radio Measurement */
 #define	IPPROTO_IDP		22		/* xns idp */
 #define	IPPROTO_TRUNK1		23		/* Trunk-1 */
 #define	IPPROTO_TRUNK2		24		/* Trunk-2 */
 #define	IPPROTO_LEAF1		25		/* Leaf-1 */
 #define	IPPROTO_LEAF2		26		/* Leaf-2 */
 #define	IPPROTO_RDP		27		/* Reliable Data */
 #define	IPPROTO_IRTP		28		/* Reliable Transaction */
 #define	IPPROTO_TP		29		/* tp-4 w/ class negotiation */
 #define	IPPROTO_BLT		30		/* Bulk Data Transfer */
 #define	IPPROTO_NSP		31		/* Network Services */
 #define	IPPROTO_INP		32		/* Merit Internodal */
 #define	IPPROTO_SEP		33		/* Sequential Exchange */
 #define	IPPROTO_3PC		34		/* Third Party Connect */
 #define	IPPROTO_IDPR		35		/* InterDomain Policy Routing */
 #define	IPPROTO_XTP		36		/* XTP */
 #define	IPPROTO_DDP		37		/* Datagram Delivery */
 #define	IPPROTO_CMTP		38		/* Control Message Transport */
 #define	IPPROTO_TPXX		39		/* TP++ Transport */
 #define	IPPROTO_IL		40		/* IL transport protocol */
 #define	IPPROTO_SDRP		42		/* Source Demand Routing */
 #define	IPPROTO_ROUTING		43		/* IP6 routing header */
 #define	IPPROTO_FRAGMENT	44		/* IP6 fragmentation header */
 #define	IPPROTO_IDRP		45		/* InterDomain Routing*/
 #define	IPPROTO_RSVP		46		/* resource reservation */
 #define	IPPROTO_GRE		47		/* General Routing Encap. */
 #define	IPPROTO_MHRP		48		/* Mobile Host Routing */
 #define	IPPROTO_BHA		49		/* BHA */
 #define	IPPROTO_ESP		50		/* IP6 Encap Sec. Payload */
 #define	IPPROTO_AH		51		/* IP6 Auth Header */
 #define	IPPROTO_INLSP		52		/* Integ. Net Layer Security */
 #define	IPPROTO_SWIPE		53		/* IP with encryption */
 #define	IPPROTO_NHRP		54		/* Next Hop Resolution */
 #define	IPPROTO_MOBILE		55		/* IP Mobility */
 #define	IPPROTO_TLSP		56		/* Transport Layer Security */
 #define	IPPROTO_SKIP		57		/* SKIP */
 #define	IPPROTO_ICMPV6		58		/* ICMP6 */
 #define	IPPROTO_NONE		59		/* IP6 no next header */
 #define	IPPROTO_DSTOPTS		60		/* IP6 destination option */
 #define	IPPROTO_AHIP		61		/* any host internal protocol */
 #define	IPPROTO_CFTP		62		/* CFTP */
 #define	IPPROTO_HELLO		63		/* "hello" routing protocol */
 #define	IPPROTO_SATEXPAK	64		/* SATNET/Backroom EXPAK */
 #define	IPPROTO_KRYPTOLAN	65		/* Kryptolan */
 #define	IPPROTO_RVD		66		/* Remote Virtual Disk */
 #define	IPPROTO_IPPC		67		/* Pluribus Packet Core */
 #define	IPPROTO_ADFS		68		/* Any distributed FS */
 #define	IPPROTO_SATMON		69		/* Satnet Monitoring */
 #define	IPPROTO_VISA		70		/* VISA Protocol */
 #define	IPPROTO_IPCV		71		/* Packet Core Utility */
 #define	IPPROTO_CPNX		72		/* Comp. Prot. Net. Executive */
 #define	IPPROTO_CPHB		73		/* Comp. Prot. HeartBeat */
 #define	IPPROTO_WSN		74		/* Wang Span Network */
 #define	IPPROTO_PVP		75		/* Packet Video Protocol */
 #define	IPPROTO_BRSATMON	76		/* BackRoom SATNET Monitoring */
 #define	IPPROTO_ND		77		/* Sun net disk proto (temp.) */
 #define	IPPROTO_WBMON		78		/* WIDEBAND Monitoring */
 #define	IPPROTO_WBEXPAK		79		/* WIDEBAND EXPAK */
 #define	IPPROTO_EON		80		/* ISO cnlp */
 #define	IPPROTO_VMTP		81		/* VMTP */
 #define	IPPROTO_SVMTP		82		/* Secure VMTP */
 #define	IPPROTO_VINES		83		/* Banyon VINES */
 #define	IPPROTO_TTP		84		/* TTP */
 #define	IPPROTO_IGP		85		/* NSFNET-IGP */
 #define	IPPROTO_DGP		86		/* dissimilar gateway prot. */
 #define	IPPROTO_TCF		87		/* TCF */
 #define	IPPROTO_IGRP		88		/* Cisco/GXS IGRP */
 #define	IPPROTO_OSPFIGP		89		/* OSPFIGP */
 #define	IPPROTO_SRPC		90		/* Strite RPC protocol */
 #define	IPPROTO_LARP		91		/* Locus Address Resoloution */
 #define	IPPROTO_MTP		92		/* Multicast Transport */
 #define	IPPROTO_AX25		93		/* AX.25 Frames */
 #define	IPPROTO_IPEIP		94		/* IP encapsulated in IP */
 #define	IPPROTO_MICP		95		/* Mobile Int.ing control */
 #define	IPPROTO_SCCSP		96		/* Semaphore Comm. security */
 #define	IPPROTO_ETHERIP		97		/* Ethernet IP encapsulation */
 #define	IPPROTO_ENCAP		98		/* encapsulation header */
 #define	IPPROTO_APES		99		/* any private encr. scheme */
 #define	IPPROTO_GMTP		100		/* GMTP*/
 #define	IPPROTO_IPCOMP		108		/* payload compression (IPComp) */
 #define	IPPROTO_SCTP		132		/* SCTP */
 #define	IPPROTO_MH		135		/* IPv6 Mobility Header */
 #define	IPPROTO_UDPLITE		136		/* UDP-Lite */
 #define	IPPROTO_HIP		139		/* IP6 Host Identity Protocol */
 #define	IPPROTO_SHIM6		140		/* IP6 Shim6 Protocol */
 /* 101-254: Partly Unassigned */
 #define	IPPROTO_PIM		103		/* Protocol Independent Mcast */
 #define	IPPROTO_CARP		112		/* CARP */
 #define	IPPROTO_PGM		113		/* PGM */
 #define	IPPROTO_MPLS		137		/* MPLS-in-IP */
 #define	IPPROTO_PFSYNC		240		/* PFSYNC */
 #define	IPPROTO_RESERVED_253	253		/* Reserved */
 #define	IPPROTO_RESERVED_254	254		/* Reserved */
 /* 255: Reserved */
 /* BSD Private, local use, namespace incursion, no longer used */
 #define	IPPROTO_OLD_DIVERT	254		/* OLD divert pseudo-proto */
 #define	IPPROTO_MAX		256
 
 /* last return value of *_input(), meaning "all job for this pkt is done".  */
 #define	IPPROTO_DONE		257
 
 /* Only used internally, so can be outside the range of valid IP protocols. */
 #define	IPPROTO_DIVERT		258		/* divert pseudo-protocol */
 #define	IPPROTO_SEND		259		/* SeND pseudo-protocol */
 
 /*
  * Defined to avoid confusion.  The master value is defined by
  * PROTO_SPACER in sys/protosw.h.
  */
 #define	IPPROTO_SPACER		32767		/* spacer for loadable protos */
 
 /*
  * Local port number conventions:
  *
  * When a user does a bind(2) or connect(2) with a port number of zero,
  * a non-conflicting local port address is chosen.
  * The default range is IPPORT_HIFIRSTAUTO through
  * IPPORT_HILASTAUTO, although that is settable by sysctl.
  *
  * A user may set the IPPROTO_IP option IP_PORTRANGE to change this
  * default assignment range.
  *
  * The value IP_PORTRANGE_DEFAULT causes the default behavior.
  *
  * The value IP_PORTRANGE_HIGH changes the range of candidate port numbers
  * into the "high" range.  These are reserved for client outbound connections
  * which do not want to be filtered by any firewalls.
  *
  * The value IP_PORTRANGE_LOW changes the range to the "low" are
  * that is (by convention) restricted to privileged processes.  This
  * convention is based on "vouchsafe" principles only.  It is only secure
  * if you trust the remote host to restrict these ports.
  *
  * The default range of ports and the high range can be changed by
  * sysctl(3).  (net.inet.ip.portrange.{hi,low,}{first,last})
  *
  * Changing those values has bad security implications if you are
  * using a stateless firewall that is allowing packets outside of that
  * range in order to allow transparent outgoing connections.
  *
  * Such a firewall configuration will generally depend on the use of these
  * default values.  If you change them, you may find your Security
  * Administrator looking for you with a heavy object.
  *
  * For a slightly more orthodox text view on this:
  *
  *            ftp://ftp.isi.edu/in-notes/iana/assignments/port-numbers
  *
  *    port numbers are divided into three ranges:
  *
  *                0 -  1023 Well Known Ports
  *             1024 - 49151 Registered Ports
  *            49152 - 65535 Dynamic and/or Private Ports
  *
  */
 
 /*
  * Ports < IPPORT_RESERVED are reserved for
  * privileged processes (e.g. root).         (IP_PORTRANGE_LOW)
  */
 #define	IPPORT_RESERVED		1024
 
 /*
  * Default local port range, used by IP_PORTRANGE_DEFAULT
  */
 #define IPPORT_EPHEMERALFIRST	10000
 #define IPPORT_EPHEMERALLAST	65535 
  
 /*
  * Dynamic port range, used by IP_PORTRANGE_HIGH.
  */
 #define	IPPORT_HIFIRSTAUTO	49152
 #define	IPPORT_HILASTAUTO	65535
 
 /*
  * Scanning for a free reserved port return a value below IPPORT_RESERVED,
  * but higher than IPPORT_RESERVEDSTART.  Traditionally the start value was
  * 512, but that conflicts with some well-known-services that firewalls may
  * have a fit if we use.
  */
 #define	IPPORT_RESERVEDSTART	600
 
 #define	IPPORT_MAX		65535
 
 /*
  * Definitions of bits in internet address integers.
  * On subnets, the decomposition of addresses to host and net parts
  * is done according to subnet mask, not the masks here.
  */
 #define	IN_CLASSA(i)		(((in_addr_t)(i) & 0x80000000) == 0)
 #define	IN_CLASSA_NET		0xff000000
 #define	IN_CLASSA_NSHIFT	24
 #define	IN_CLASSA_HOST		0x00ffffff
 #define	IN_CLASSA_MAX		128
 
 #define	IN_CLASSB(i)		(((in_addr_t)(i) & 0xc0000000) == 0x80000000)
 #define	IN_CLASSB_NET		0xffff0000
 #define	IN_CLASSB_NSHIFT	16
 #define	IN_CLASSB_HOST		0x0000ffff
 #define	IN_CLASSB_MAX		65536
 
 #define	IN_CLASSC(i)		(((in_addr_t)(i) & 0xe0000000) == 0xc0000000)
 #define	IN_CLASSC_NET		0xffffff00
 #define	IN_CLASSC_NSHIFT	8
 #define	IN_CLASSC_HOST		0x000000ff
 
 #define	IN_CLASSD(i)		(((in_addr_t)(i) & 0xf0000000) == 0xe0000000)
 #define	IN_CLASSD_NET		0xf0000000	/* These ones aren't really */
 #define	IN_CLASSD_NSHIFT	28		/* net and host fields, but */
 #define	IN_CLASSD_HOST		0x0fffffff	/* routing needn't know.    */
 #define	IN_MULTICAST(i)		IN_CLASSD(i)
 
 #define	IN_EXPERIMENTAL(i)	(((in_addr_t)(i) & 0xf0000000) == 0xf0000000)
 #define	IN_BADCLASS(i)		(((in_addr_t)(i) & 0xf0000000) == 0xf0000000)
 
 #define IN_LINKLOCAL(i)		(((in_addr_t)(i) & 0xffff0000) == 0xa9fe0000)
 #define IN_LOOPBACK(i)		(((in_addr_t)(i) & 0xff000000) == 0x7f000000)
 #define IN_ZERONET(i)		(((in_addr_t)(i) & 0xff000000) == 0)
 
 #define	IN_PRIVATE(i)	((((in_addr_t)(i) & 0xff000000) == 0x0a000000) || \
 			 (((in_addr_t)(i) & 0xfff00000) == 0xac100000) || \
 			 (((in_addr_t)(i) & 0xffff0000) == 0xc0a80000))
 
 #define	IN_LOCAL_GROUP(i)	(((in_addr_t)(i) & 0xffffff00) == 0xe0000000)
  
 #define	IN_ANY_LOCAL(i)		(IN_LINKLOCAL(i) || IN_LOCAL_GROUP(i))
 
 #define	INADDR_LOOPBACK		((in_addr_t)0x7f000001)
 #ifndef _KERNEL
 #define	INADDR_NONE		((in_addr_t)0xffffffff)	/* -1 return */
 #endif
 
 #define	INADDR_UNSPEC_GROUP	((in_addr_t)0xe0000000)	/* 224.0.0.0 */
 #define	INADDR_ALLHOSTS_GROUP	((in_addr_t)0xe0000001)	/* 224.0.0.1 */
 #define	INADDR_ALLRTRS_GROUP	((in_addr_t)0xe0000002)	/* 224.0.0.2 */
 #define	INADDR_ALLRPTS_GROUP	((in_addr_t)0xe0000016)	/* 224.0.0.22, IGMPv3 */
 #define	INADDR_CARP_GROUP	((in_addr_t)0xe0000012)	/* 224.0.0.18 */
 #define	INADDR_PFSYNC_GROUP	((in_addr_t)0xe00000f0)	/* 224.0.0.240 */
 #define	INADDR_ALLMDNS_GROUP	((in_addr_t)0xe00000fb)	/* 224.0.0.251 */
 #define	INADDR_MAX_LOCAL_GROUP	((in_addr_t)0xe00000ff)	/* 224.0.0.255 */
 
 #define	IN_LOOPBACKNET		127			/* official! */
 
 #define	IN_RFC3021_MASK		((in_addr_t)0xfffffffe)
 
 /*
  * Options for use with [gs]etsockopt at the IP level.
  * First word of comment is data type; bool is stored in int.
  */
 #define	IP_OPTIONS		1    /* buf/ip_opts; set/get IP options */
 #define	IP_HDRINCL		2    /* int; header is included with data */
 #define	IP_TOS			3    /* int; IP type of service and preced. */
 #define	IP_TTL			4    /* int; IP time to live */
 #define	IP_RECVOPTS		5    /* bool; receive all IP opts w/dgram */
 #define	IP_RECVRETOPTS		6    /* bool; receive IP opts for response */
 #define	IP_RECVDSTADDR		7    /* bool; receive IP dst addr w/dgram */
 #define	IP_SENDSRCADDR		IP_RECVDSTADDR /* cmsg_type to set src addr */
 #define	IP_RETOPTS		8    /* ip_opts; set/get IP options */
 #define	IP_MULTICAST_IF		9    /* struct in_addr *or* struct ip_mreqn;
 				      * set/get IP multicast i/f  */
 #define	IP_MULTICAST_TTL	10   /* u_char; set/get IP multicast ttl */
 #define	IP_MULTICAST_LOOP	11   /* u_char; set/get IP multicast loopback */
 #define	IP_ADD_MEMBERSHIP	12   /* ip_mreq; add an IP group membership */
 #define	IP_DROP_MEMBERSHIP	13   /* ip_mreq; drop an IP group membership */
 #define	IP_MULTICAST_VIF	14   /* set/get IP mcast virt. iface */
 #define	IP_RSVP_ON		15   /* enable RSVP in kernel */
 #define	IP_RSVP_OFF		16   /* disable RSVP in kernel */
 #define	IP_RSVP_VIF_ON		17   /* set RSVP per-vif socket */
 #define	IP_RSVP_VIF_OFF		18   /* unset RSVP per-vif socket */
 #define	IP_PORTRANGE		19   /* int; range to choose for unspec port */
 #define	IP_RECVIF		20   /* bool; receive reception if w/dgram */
 /* for IPSEC */
 #define	IP_IPSEC_POLICY		21   /* int; set/get security policy */
 				     /* unused; was IP_FAITH */
 #define	IP_ONESBCAST		23   /* bool: send all-ones broadcast */
 #define	IP_BINDANY		24   /* bool: allow bind to any address */
 #define	IP_BINDMULTI		25   /* bool: allow multiple listeners on a tuple */
 #define	IP_RSS_LISTEN_BUCKET	26   /* int; set RSS listen bucket */
 #define	IP_ORIGDSTADDR		27   /* bool: receive IP dst addr/port w/dgram */
 #define	IP_RECVORIGDSTADDR      IP_ORIGDSTADDR
 
 /*
  * Options for controlling the firewall and dummynet.
  * Historical options (from 40 to 64) will eventually be
  * replaced by only two options, IP_FW3 and IP_DUMMYNET3.
  */
 #define	IP_FW_TABLE_ADD		40   /* add entry */
 #define	IP_FW_TABLE_DEL		41   /* delete entry */
 #define	IP_FW_TABLE_FLUSH	42   /* flush table */
 #define	IP_FW_TABLE_GETSIZE	43   /* get table size */
 #define	IP_FW_TABLE_LIST	44   /* list table contents */
 
 #define	IP_FW3			48   /* generic ipfw v.3 sockopts */
 #define	IP_DUMMYNET3		49   /* generic dummynet v.3 sockopts */
 
 #define	IP_FW_ADD		50   /* add a firewall rule to chain */
 #define	IP_FW_DEL		51   /* delete a firewall rule from chain */
 #define	IP_FW_FLUSH		52   /* flush firewall rule chain */
 #define	IP_FW_ZERO		53   /* clear single/all firewall counter(s) */
 #define	IP_FW_GET		54   /* get entire firewall rule chain */
 #define	IP_FW_RESETLOG		55   /* reset logging counters */
 
 #define IP_FW_NAT_CFG           56   /* add/config a nat rule */
 #define IP_FW_NAT_DEL           57   /* delete a nat rule */
 #define IP_FW_NAT_GET_CONFIG    58   /* get configuration of a nat rule */
 #define IP_FW_NAT_GET_LOG       59   /* get log of a nat rule */
 
 #define	IP_DUMMYNET_CONFIGURE	60   /* add/configure a dummynet pipe */
 #define	IP_DUMMYNET_DEL		61   /* delete a dummynet pipe from chain */
 #define	IP_DUMMYNET_FLUSH	62   /* flush dummynet */
 #define	IP_DUMMYNET_GET		64   /* get entire dummynet pipes */
 
 #define	IP_RECVTTL		65   /* bool; receive IP TTL w/dgram */
 #define	IP_MINTTL		66   /* minimum TTL for packet or drop */
 #define	IP_DONTFRAG		67   /* don't fragment packet */
 #define	IP_RECVTOS		68   /* bool; receive IP TOS w/dgram */
 
 /* IPv4 Source Filter Multicast API [RFC3678] */
 #define	IP_ADD_SOURCE_MEMBERSHIP	70   /* join a source-specific group */
 #define	IP_DROP_SOURCE_MEMBERSHIP	71   /* drop a single source */
 #define	IP_BLOCK_SOURCE			72   /* block a source */
 #define	IP_UNBLOCK_SOURCE		73   /* unblock a source */
 
 /* The following option is private; do not use it from user applications. */
 #define	IP_MSFILTER			74   /* set/get filter list */
 
 /* Protocol Independent Multicast API [RFC3678] */
 #define	MCAST_JOIN_GROUP		80   /* join an any-source group */
 #define	MCAST_LEAVE_GROUP		81   /* leave all sources for group */
 #define	MCAST_JOIN_SOURCE_GROUP		82   /* join a source-specific group */
 #define	MCAST_LEAVE_SOURCE_GROUP	83   /* leave a single source */
 #define	MCAST_BLOCK_SOURCE		84   /* block a source */
 #define	MCAST_UNBLOCK_SOURCE		85   /* unblock a source */
 
 /* Flow and RSS definitions */
 #define	IP_FLOWID		90   /* get flow id for the given socket/inp */
 #define	IP_FLOWTYPE		91   /* get flow type (M_HASHTYPE) */
 #define	IP_RSSBUCKETID		92   /* get RSS flowid -> bucket mapping */
 #define	IP_RECVFLOWID		93   /* bool; receive IP flowid/flowtype w/ datagram */
 #define	IP_RECVRSSBUCKETID	94   /* bool; receive IP RSS bucket id w/ datagram */
 
 /*
  * Defaults and limits for options
  */
 #define	IP_DEFAULT_MULTICAST_TTL  1	/* normally limit m'casts to 1 hop  */
 #define	IP_DEFAULT_MULTICAST_LOOP 1	/* normally hear sends if a member  */
 
 /*
- * The imo_membership vector for each socket is now dynamically allocated at
- * run-time, bounded by USHRT_MAX, and is reallocated when needed, sized
- * according to a power-of-two increment.
+ * Limit for IPv4 multicast memberships
  */
-#define	IP_MIN_MEMBERSHIPS	31
 #define	IP_MAX_MEMBERSHIPS	4095
-#define	IP_MAX_SOURCE_FILTER	1024	/* XXX to be unused */
 
 /*
  * Default resource limits for IPv4 multicast source filtering.
  * These may be modified by sysctl.
  */
 #define	IP_MAX_GROUP_SRC_FILTER		512	/* sources per group */
 #define	IP_MAX_SOCK_SRC_FILTER		128	/* sources per socket/group */
 #define	IP_MAX_SOCK_MUTE_FILTER		128	/* XXX no longer used */
 
 /*
  * Argument structure for IP_ADD_MEMBERSHIP and IP_DROP_MEMBERSHIP.
  */
 struct ip_mreq {
 	struct	in_addr imr_multiaddr;	/* IP multicast address of group */
 	struct	in_addr imr_interface;	/* local IP address of interface */
 };
 
 /*
  * Modified argument structure for IP_MULTICAST_IF, obtained from Linux.
  * This is used to specify an interface index for multicast sends, as
  * the IPv4 legacy APIs do not support this (unless IP_SENDIF is available).
  */
 struct ip_mreqn {
 	struct	in_addr imr_multiaddr;	/* IP multicast address of group */
 	struct	in_addr imr_address;	/* local IP address of interface */
 	int		imr_ifindex;	/* Interface index; cast to uint32_t */
 };
 
 /*
  * Argument structure for IPv4 Multicast Source Filter APIs. [RFC3678]
  */
 struct ip_mreq_source {
 	struct	in_addr imr_multiaddr;	/* IP multicast address of group */
 	struct	in_addr imr_sourceaddr;	/* IP address of source */
 	struct	in_addr imr_interface;	/* local IP address of interface */
 };
 
 /*
  * Argument structures for Protocol-Independent Multicast Source
  * Filter APIs. [RFC3678]
  */
 struct group_req {
 	uint32_t		gr_interface;	/* interface index */
 	struct sockaddr_storage	gr_group;	/* group address */
 };
 
 struct group_source_req {
 	uint32_t		gsr_interface;	/* interface index */
 	struct sockaddr_storage	gsr_group;	/* group address */
 	struct sockaddr_storage	gsr_source;	/* source address */
 };
 
 #ifndef __MSFILTERREQ_DEFINED
 #define __MSFILTERREQ_DEFINED
 /*
  * The following structure is private; do not use it from user applications.
  * It is used to communicate IP_MSFILTER/IPV6_MSFILTER information between
  * the RFC 3678 libc functions and the kernel.
  */
 struct __msfilterreq {
 	uint32_t		 msfr_ifindex;	/* interface index */
 	uint32_t		 msfr_fmode;	/* filter mode for group */
 	uint32_t		 msfr_nsrcs;	/* # of sources in msfr_srcs */
 	struct sockaddr_storage	 msfr_group;	/* group address */
 	struct sockaddr_storage	*msfr_srcs;	/* pointer to the first member
 						 * of a contiguous array of
 						 * sources to filter in full.
 						 */
 };
 #endif
 
 struct sockaddr;
 
 /*
  * Advanced (Full-state) APIs [RFC3678]
  * The RFC specifies uint_t for the 6th argument to [sg]etsourcefilter().
  * We use uint32_t here to be consistent.
  */
 int	setipv4sourcefilter(int, struct in_addr, struct in_addr, uint32_t,
 	    uint32_t, struct in_addr *);
 int	getipv4sourcefilter(int, struct in_addr, struct in_addr, uint32_t *,
 	    uint32_t *, struct in_addr *);
 int	setsourcefilter(int, uint32_t, struct sockaddr *, socklen_t,
 	    uint32_t, uint32_t, struct sockaddr_storage *);
 int	getsourcefilter(int, uint32_t, struct sockaddr *, socklen_t,
 	    uint32_t *, uint32_t *, struct sockaddr_storage *);
 
 /*
  * Filter modes; also used to represent per-socket filter mode internally.
  */
 #define	MCAST_UNDEFINED	0	/* fmode: not yet defined */
 #define	MCAST_INCLUDE	1	/* fmode: include these source(s) */
 #define	MCAST_EXCLUDE	2	/* fmode: exclude these source(s) */
 
 /*
  * Argument for IP_PORTRANGE:
  * - which range to search when port is unspecified at bind() or connect()
  */
 #define	IP_PORTRANGE_DEFAULT	0	/* default range */
 #define	IP_PORTRANGE_HIGH	1	/* "high" - request firewall bypass */
 #define	IP_PORTRANGE_LOW	2	/* "low" - vouchsafe security */
 
 /*
  * Identifiers for IP sysctl nodes
  */
 #define	IPCTL_FORWARDING	1	/* act as router */
 #define	IPCTL_SENDREDIRECTS	2	/* may send redirects when forwarding */
 #define	IPCTL_DEFTTL		3	/* default TTL */
 #ifdef notyet
 #define	IPCTL_DEFMTU		4	/* default MTU */
 #endif
 /*	IPCTL_RTEXPIRE		5	deprecated */
 /*	IPCTL_RTMINEXPIRE	6	deprecated */
 /*	IPCTL_RTMAXCACHE	7	deprecated */
 #define	IPCTL_SOURCEROUTE	8	/* may perform source routes */
 #define	IPCTL_DIRECTEDBROADCAST	9	/* may re-broadcast received packets */
 #define	IPCTL_INTRQMAXLEN	10	/* max length of netisr queue */
 #define	IPCTL_INTRQDROPS	11	/* number of netisr q drops */
 #define	IPCTL_STATS		12	/* ipstat structure */
 #define	IPCTL_ACCEPTSOURCEROUTE	13	/* may accept source routed packets */
 #define	IPCTL_FASTFORWARDING	14	/* use fast IP forwarding code */
 					/* 15, unused, was: IPCTL_KEEPFAITH  */
 #define	IPCTL_GIF_TTL		16	/* default TTL for gif encap packet */
 #define	IPCTL_INTRDQMAXLEN	17	/* max length of direct netisr queue */
 #define	IPCTL_INTRDQDROPS	18	/* number of direct netisr q drops */
 
 #endif /* __BSD_VISIBLE */
 
 #ifdef _KERNEL
 
 struct ifnet; struct mbuf;	/* forward declarations for Standard C */
 struct in_ifaddr;
 
 int	 in_broadcast(struct in_addr, struct ifnet *);
 int	 in_ifaddr_broadcast(struct in_addr, struct in_ifaddr *);
 int	 in_canforward(struct in_addr);
 int	 in_localaddr(struct in_addr);
 int	 in_localip(struct in_addr);
 int	 in_ifhasaddr(struct ifnet *, struct in_addr);
 int	 inet_aton(const char *, struct in_addr *); /* in libkern */
 char	*inet_ntoa_r(struct in_addr ina, char *buf); /* in libkern */
 char	*inet_ntop(int, const void *, char *, socklen_t); /* in libkern */
 int	 inet_pton(int af, const char *, void *); /* in libkern */
 void	 in_ifdetach(struct ifnet *);
 
 #define	in_hosteq(s, t)	((s).s_addr == (t).s_addr)
 #define	in_nullhost(x)	((x).s_addr == INADDR_ANY)
 #define	in_allhosts(x)	((x).s_addr == htonl(INADDR_ALLHOSTS_GROUP))
 
 #define	satosin(sa)	((struct sockaddr_in *)(sa))
 #define	sintosa(sin)	((struct sockaddr *)(sin))
 #define	ifatoia(ifa)	((struct in_ifaddr *)(ifa))
 #endif /* _KERNEL */
 
 /* INET6 stuff */
 #if __POSIX_VISIBLE >= 200112
 #define	__KAME_NETINET_IN_H_INCLUDED_
 #include <netinet6/in6.h>
 #undef __KAME_NETINET_IN_H_INCLUDED_
 #endif
 
 #endif /* !_NETINET_IN_H_*/
Index: head/sys/netinet/in_mcast.c
===================================================================
--- head/sys/netinet/in_mcast.c	(revision 349368)
+++ head/sys/netinet/in_mcast.c	(revision 349369)
@@ -1,3162 +1,3055 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2007-2009 Bruce Simpson.
  * Copyright (c) 2005 Robert N. M. Watson.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * IPv4 multicast socket, group, and socket option processing module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/sysctl.h>
 #include <sys/ktr.h>
 #include <sys/taskqueue.h>
 #include <sys/gtaskqueue.h>
 #include <sys/tree.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <net/ethernet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/igmp_var.h>
 
 #ifndef KTR_IGMPV3
 #define KTR_IGMPV3 KTR_INET
 #endif
 
 #ifndef __SOCKUNION_DECLARED
 union sockunion {
 	struct sockaddr_storage	ss;
 	struct sockaddr		sa;
 	struct sockaddr_dl	sdl;
 	struct sockaddr_in	sin;
 };
 typedef union sockunion sockunion_t;
 #define __SOCKUNION_DECLARED
 #endif /* __SOCKUNION_DECLARED */
 
 static MALLOC_DEFINE(M_INMFILTER, "in_mfilter",
     "IPv4 multicast PCB-layer source filter");
 static MALLOC_DEFINE(M_IPMADDR, "in_multi", "IPv4 multicast group");
 static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "IPv4 multicast options");
 static MALLOC_DEFINE(M_IPMSOURCE, "ip_msource",
     "IPv4 multicast IGMP-layer source filter");
 
 /*
  * Locking:
- * - Lock order is: Giant, INP_WLOCK, IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK.
+ *
+ * - Lock order is: Giant, IN_MULTI_LOCK, INP_WLOCK,
+ *   IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK.
  * - The IF_ADDR_LOCK is implicitly taken by inm_lookup() earlier, however
  *   it can be taken by code in net/if.c also.
  * - ip_moptions and in_mfilter are covered by the INP_WLOCK.
  *
  * struct in_multi is covered by IN_MULTI_LIST_LOCK. There isn't strictly
  * any need for in_multi itself to be virtualized -- it is bound to an ifp
  * anyway no matter what happens.
  */
 struct mtx in_multi_list_mtx;
 MTX_SYSINIT(in_multi_mtx, &in_multi_list_mtx, "in_multi_list_mtx", MTX_DEF);
 
 struct mtx in_multi_free_mtx;
 MTX_SYSINIT(in_multi_free_mtx, &in_multi_free_mtx, "in_multi_free_mtx", MTX_DEF);
 
 struct sx in_multi_sx;
 SX_SYSINIT(in_multi_sx, &in_multi_sx, "in_multi_sx");
 
 int ifma_restart;
 
 /*
  * Functions with non-static linkage defined in this file should be
  * declared in in_var.h:
  *  imo_multi_filter()
  *  in_addmulti()
  *  in_delmulti()
  *  in_joingroup()
  *  in_joingroup_locked()
  *  in_leavegroup()
  *  in_leavegroup_locked()
  * and ip_var.h:
  *  inp_freemoptions()
  *  inp_getmoptions()
  *  inp_setmoptions()
  *
  * XXX: Both carp and pf need to use the legacy (*,G) KPIs in_addmulti()
  * and in_delmulti().
  */
 static void	imf_commit(struct in_mfilter *);
 static int	imf_get_source(struct in_mfilter *imf,
 		    const struct sockaddr_in *psin,
 		    struct in_msource **);
 static struct in_msource *
 		imf_graft(struct in_mfilter *, const uint8_t,
 		    const struct sockaddr_in *);
 static void	imf_leave(struct in_mfilter *);
 static int	imf_prune(struct in_mfilter *, const struct sockaddr_in *);
 static void	imf_purge(struct in_mfilter *);
 static void	imf_rollback(struct in_mfilter *);
 static void	imf_reap(struct in_mfilter *);
-static int	imo_grow(struct ip_moptions *);
-static size_t	imo_match_group(const struct ip_moptions *,
+static struct in_mfilter *
+		imo_match_group(const struct ip_moptions *,
 		    const struct ifnet *, const struct sockaddr *);
 static struct in_msource *
-		imo_match_source(const struct ip_moptions *, const size_t,
-		    const struct sockaddr *);
+		imo_match_source(struct in_mfilter *, const struct sockaddr *);
 static void	ims_merge(struct ip_msource *ims,
 		    const struct in_msource *lims, const int rollback);
 static int	in_getmulti(struct ifnet *, const struct in_addr *,
 		    struct in_multi **);
 static int	inm_get_source(struct in_multi *inm, const in_addr_t haddr,
 		    const int noalloc, struct ip_msource **pims);
 #ifdef KTR
 static int	inm_is_ifp_detached(const struct in_multi *);
 #endif
 static int	inm_merge(struct in_multi *, /*const*/ struct in_mfilter *);
 static void	inm_purge(struct in_multi *);
 static void	inm_reap(struct in_multi *);
 static void inm_release(struct in_multi *);
 static struct ip_moptions *
 		inp_findmoptions(struct inpcb *);
 static int	inp_get_source_filters(struct inpcb *, struct sockopt *);
 static int	inp_join_group(struct inpcb *, struct sockopt *);
 static int	inp_leave_group(struct inpcb *, struct sockopt *);
 static struct ifnet *
 		inp_lookup_mcast_ifp(const struct inpcb *,
 		    const struct sockaddr_in *, const struct in_addr);
 static int	inp_block_unblock_source(struct inpcb *, struct sockopt *);
 static int	inp_set_multicast_if(struct inpcb *, struct sockopt *);
 static int	inp_set_source_filters(struct inpcb *, struct sockopt *);
 static int	sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS);
 
 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW, 0,
     "IPv4 multicast");
 
 static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER;
 SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc,
     CTLFLAG_RWTUN, &in_mcast_maxgrpsrc, 0,
     "Max source filters per group");
 
 static u_long in_mcast_maxsocksrc = IP_MAX_SOCK_SRC_FILTER;
 SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxsocksrc,
     CTLFLAG_RWTUN, &in_mcast_maxsocksrc, 0,
     "Max source filters per socket");
 
 int in_mcast_loop = IP_DEFAULT_MULTICAST_LOOP;
 SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RWTUN,
     &in_mcast_loop, 0, "Loopback multicast datagrams by default");
 
 static SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters,
     CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters,
     "Per-interface stack-wide source filters");
 
 #ifdef KTR
 /*
  * Inline function which wraps assertions for a valid ifp.
  * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp
  * is detached.
  */
 static int __inline
 inm_is_ifp_detached(const struct in_multi *inm)
 {
 	struct ifnet *ifp;
 
 	KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__));
 	ifp = inm->inm_ifma->ifma_ifp;
 	if (ifp != NULL) {
 		/*
 		 * Sanity check that netinet's notion of ifp is the
 		 * same as net's.
 		 */
 		KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__));
 	}
 
 	return (ifp == NULL);
 }
 #endif
 
 static struct grouptask free_gtask;
 static struct in_multi_head inm_free_list;
 static void inm_release_task(void *arg __unused);
 static void inm_init(void)
 {
 	SLIST_INIT(&inm_free_list);
 	taskqgroup_config_gtask_init(NULL, &free_gtask, inm_release_task, "inm release task");
 }
 
 #ifdef EARLY_AP_STARTUP
 SYSINIT(inm_init, SI_SUB_SMP + 1, SI_ORDER_FIRST,
 	inm_init, NULL);
 #else
 SYSINIT(inm_init, SI_SUB_ROOT_CONF - 1, SI_ORDER_FIRST,
 	inm_init, NULL);
 #endif
 
 
 void
 inm_release_list_deferred(struct in_multi_head *inmh)
 {
 
 	if (SLIST_EMPTY(inmh))
 		return;
 	mtx_lock(&in_multi_free_mtx);
 	SLIST_CONCAT(&inm_free_list, inmh, in_multi, inm_nrele);
 	mtx_unlock(&in_multi_free_mtx);
 	GROUPTASK_ENQUEUE(&free_gtask);
 }
 
 void
 inm_disconnect(struct in_multi *inm)
 {
 	struct ifnet *ifp;
 	struct ifmultiaddr *ifma, *ll_ifma;
 
 	ifp = inm->inm_ifp;
 	IF_ADDR_WLOCK_ASSERT(ifp);
 	ifma = inm->inm_ifma;
 
 	if_ref(ifp);
 	if (ifma->ifma_flags & IFMA_F_ENQUEUED) {
 		CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link);
 		ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
 	}
 	MCDPRINTF("removed ifma: %p from %s\n", ifma, ifp->if_xname);
 	if ((ll_ifma = ifma->ifma_llifma) != NULL) {
 		MPASS(ifma != ll_ifma);
 		ifma->ifma_llifma = NULL;
 		MPASS(ll_ifma->ifma_llifma == NULL);
 		MPASS(ll_ifma->ifma_ifp == ifp);
 		if (--ll_ifma->ifma_refcount == 0) {
 			if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) {
 				CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link);
 				ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
 			}
 			MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname);
 			if_freemulti(ll_ifma);
 			ifma_restart = true;
 		}
 	}
 }
 
 void
 inm_release_deferred(struct in_multi *inm)
 {
 	struct in_multi_head tmp;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 	MPASS(inm->inm_refcount > 0);
 	if (--inm->inm_refcount == 0) {
 		SLIST_INIT(&tmp);
 		inm_disconnect(inm);
 		inm->inm_ifma->ifma_protospec = NULL;
 		SLIST_INSERT_HEAD(&tmp, inm, inm_nrele);
 		inm_release_list_deferred(&tmp);
 	}
 }
 
 static void
 inm_release_task(void *arg __unused)
 {
 	struct in_multi_head inm_free_tmp;
 	struct in_multi *inm, *tinm;
 
 	SLIST_INIT(&inm_free_tmp);
 	mtx_lock(&in_multi_free_mtx);
 	SLIST_CONCAT(&inm_free_tmp, &inm_free_list, in_multi, inm_nrele);
 	mtx_unlock(&in_multi_free_mtx);
 	IN_MULTI_LOCK();
 	SLIST_FOREACH_SAFE(inm, &inm_free_tmp, inm_nrele, tinm) {
 		SLIST_REMOVE_HEAD(&inm_free_tmp, inm_nrele);
 		MPASS(inm);
 		inm_release(inm);
 	}
 	IN_MULTI_UNLOCK();
 }
 
 /*
  * Initialize an in_mfilter structure to a known state at t0, t1
  * with an empty source filter list.
  */
 static __inline void
 imf_init(struct in_mfilter *imf, const int st0, const int st1)
 {
 	memset(imf, 0, sizeof(struct in_mfilter));
 	RB_INIT(&imf->imf_sources);
 	imf->imf_st[0] = st0;
 	imf->imf_st[1] = st1;
 }
 
+struct in_mfilter *
+ip_mfilter_alloc(const int mflags, const int st0, const int st1)
+{
+	struct in_mfilter *imf;
+
+	imf = malloc(sizeof(*imf), M_INMFILTER, mflags);
+	if (imf != NULL)
+		imf_init(imf, st0, st1);
+
+	return (imf);
+}
+
+void
+ip_mfilter_free(struct in_mfilter *imf)
+{
+
+	imf_purge(imf);
+	free(imf, M_INMFILTER);
+}
+
 /*
  * Function for looking up an in_multi record for an IPv4 multicast address
  * on a given interface. ifp must be valid. If no record found, return NULL.
  * The IN_MULTI_LIST_LOCK and IF_ADDR_LOCK on ifp must be held.
  */
 struct in_multi *
 inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina)
 {
 	struct ifmultiaddr *ifma;
 	struct in_multi *inm;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 	IF_ADDR_LOCK_ASSERT(ifp);
 
 	inm = NULL;
 	CK_STAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_INET ||
 			ifma->ifma_protospec == NULL)
 			continue;
 		inm = (struct in_multi *)ifma->ifma_protospec;
 		if (inm->inm_addr.s_addr == ina.s_addr)
 			break;
 		inm = NULL;
 	}
 	return (inm);
 }
 
 /*
  * Wrapper for inm_lookup_locked().
  * The IF_ADDR_LOCK will be taken on ifp and released on return.
  */
 struct in_multi *
 inm_lookup(struct ifnet *ifp, const struct in_addr ina)
 {
 	struct epoch_tracker et;
 	struct in_multi *inm;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 	NET_EPOCH_ENTER(et);
 	inm = inm_lookup_locked(ifp, ina);
 	NET_EPOCH_EXIT(et);
 
 	return (inm);
 }
 
 /*
- * Resize the ip_moptions vector to the next power-of-two minus 1.
- * May be called with locks held; do not sleep.
- */
-static int
-imo_grow(struct ip_moptions *imo)
-{
-	struct in_multi		**nmships;
-	struct in_multi		**omships;
-	struct in_mfilter	 *nmfilters;
-	struct in_mfilter	 *omfilters;
-	size_t			  idx;
-	size_t			  newmax;
-	size_t			  oldmax;
-
-	nmships = NULL;
-	nmfilters = NULL;
-	omships = imo->imo_membership;
-	omfilters = imo->imo_mfilters;
-	oldmax = imo->imo_max_memberships;
-	newmax = ((oldmax + 1) * 2) - 1;
-
-	if (newmax <= IP_MAX_MEMBERSHIPS) {
-		nmships = (struct in_multi **)realloc(omships,
-		    sizeof(struct in_multi *) * newmax, M_IPMOPTS, M_NOWAIT);
-		nmfilters = (struct in_mfilter *)realloc(omfilters,
-		    sizeof(struct in_mfilter) * newmax, M_INMFILTER, M_NOWAIT);
-		if (nmships != NULL && nmfilters != NULL) {
-			/* Initialize newly allocated source filter heads. */
-			for (idx = oldmax; idx < newmax; idx++) {
-				imf_init(&nmfilters[idx], MCAST_UNDEFINED,
-				    MCAST_EXCLUDE);
-			}
-			imo->imo_max_memberships = newmax;
-			imo->imo_membership = nmships;
-			imo->imo_mfilters = nmfilters;
-		}
-	}
-
-	if (nmships == NULL || nmfilters == NULL) {
-		if (nmships != NULL)
-			free(nmships, M_IPMOPTS);
-		if (nmfilters != NULL)
-			free(nmfilters, M_INMFILTER);
-		return (ETOOMANYREFS);
-	}
-
-	return (0);
-}
-
-/*
  * Find an IPv4 multicast group entry for this ip_moptions instance
  * which matches the specified group, and optionally an interface.
  * Return its index into the array, or -1 if not found.
  */
-static size_t
+static struct in_mfilter *
 imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp,
     const struct sockaddr *group)
 {
 	const struct sockaddr_in *gsin;
-	struct in_multi	**pinm;
-	int		  idx;
-	int		  nmships;
+	struct in_mfilter *imf;
+	struct in_multi	*inm;
 
 	gsin = (const struct sockaddr_in *)group;
 
-	/* The imo_membership array may be lazy allocated. */
-	if (imo->imo_membership == NULL || imo->imo_num_memberships == 0)
-		return (-1);
-
-	nmships = imo->imo_num_memberships;
-	pinm = &imo->imo_membership[0];
-	for (idx = 0; idx < nmships; idx++, pinm++) {
-		if (*pinm == NULL)
+	IP_MFILTER_FOREACH(imf, &imo->imo_head) {
+		inm = imf->imf_inm;
+		if (inm == NULL)
 			continue;
-		if ((ifp == NULL || ((*pinm)->inm_ifp == ifp)) &&
-		    in_hosteq((*pinm)->inm_addr, gsin->sin_addr)) {
+		if ((ifp == NULL || (inm->inm_ifp == ifp)) &&
+		    in_hosteq(inm->inm_addr, gsin->sin_addr)) {
 			break;
 		}
 	}
-	if (idx >= nmships)
-		idx = -1;
-
-	return (idx);
+	return (imf);
 }
 
 /*
  * Find an IPv4 multicast source entry for this imo which matches
  * the given group index for this socket, and source address.
  *
  * NOTE: This does not check if the entry is in-mode, merely if
  * it exists, which may not be the desired behaviour.
  */
 static struct in_msource *
-imo_match_source(const struct ip_moptions *imo, const size_t gidx,
-    const struct sockaddr *src)
+imo_match_source(struct in_mfilter *imf, const struct sockaddr *src)
 {
 	struct ip_msource	 find;
-	struct in_mfilter	*imf;
 	struct ip_msource	*ims;
 	const sockunion_t	*psa;
 
 	KASSERT(src->sa_family == AF_INET, ("%s: !AF_INET", __func__));
-	KASSERT(gidx != -1 && gidx < imo->imo_num_memberships,
-	    ("%s: invalid index %d\n", __func__, (int)gidx));
 
-	/* The imo_mfilters array may be lazy allocated. */
-	if (imo->imo_mfilters == NULL)
-		return (NULL);
-	imf = &imo->imo_mfilters[gidx];
-
 	/* Source trees are keyed in host byte order. */
 	psa = (const sockunion_t *)src;
 	find.ims_haddr = ntohl(psa->sin.sin_addr.s_addr);
 	ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find);
 
 	return ((struct in_msource *)ims);
 }
 
 /*
  * Perform filtering for multicast datagrams on a socket by group and source.
  *
  * Returns 0 if a datagram should be allowed through, or various error codes
  * if the socket was not a member of the group, or the source was muted, etc.
  */
 int
 imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp,
     const struct sockaddr *group, const struct sockaddr *src)
 {
-	size_t gidx;
+	struct in_mfilter *imf;
 	struct in_msource *ims;
 	int mode;
 
 	KASSERT(ifp != NULL, ("%s: null ifp", __func__));
 
-	gidx = imo_match_group(imo, ifp, group);
-	if (gidx == -1)
+	imf = imo_match_group(imo, ifp, group);
+	if (imf == NULL)
 		return (MCAST_NOTGMEMBER);
 
 	/*
 	 * Check if the source was included in an (S,G) join.
 	 * Allow reception on exclusive memberships by default,
 	 * reject reception on inclusive memberships by default.
 	 * Exclude source only if an in-mode exclude filter exists.
 	 * Include source only if an in-mode include filter exists.
 	 * NOTE: We are comparing group state here at IGMP t1 (now)
 	 * with socket-layer t0 (since last downcall).
 	 */
-	mode = imo->imo_mfilters[gidx].imf_st[1];
-	ims = imo_match_source(imo, gidx, src);
+	mode = imf->imf_st[1];
+	ims = imo_match_source(imf, src);
 
 	if ((ims == NULL && mode == MCAST_INCLUDE) ||
 	    (ims != NULL && ims->imsl_st[0] != mode))
 		return (MCAST_NOTSMEMBER);
 
 	return (MCAST_PASS);
 }
 
 /*
  * Find and return a reference to an in_multi record for (ifp, group),
  * and bump its reference count.
  * If one does not exist, try to allocate it, and update link-layer multicast
  * filters on ifp to listen for group.
  * Assumes the IN_MULTI lock is held across the call.
  * Return 0 if successful, otherwise return an appropriate error code.
  */
 static int
 in_getmulti(struct ifnet *ifp, const struct in_addr *group,
     struct in_multi **pinm)
 {
 	struct sockaddr_in	 gsin;
 	struct ifmultiaddr	*ifma;
 	struct in_ifinfo	*ii;
 	struct in_multi		*inm;
 	int error;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	ii = (struct in_ifinfo *)ifp->if_afdata[AF_INET];
 	IN_MULTI_LIST_LOCK();
 	inm = inm_lookup(ifp, *group);
 	if (inm != NULL) {
 		/*
 		 * If we already joined this group, just bump the
 		 * refcount and return it.
 		 */
 		KASSERT(inm->inm_refcount >= 1,
 		    ("%s: bad refcount %d", __func__, inm->inm_refcount));
 		inm_acquire_locked(inm);
 		*pinm = inm;
 	}
 	IN_MULTI_LIST_UNLOCK();
 	if (inm != NULL)
 		return (0);
 	
 	memset(&gsin, 0, sizeof(gsin));
 	gsin.sin_family = AF_INET;
 	gsin.sin_len = sizeof(struct sockaddr_in);
 	gsin.sin_addr = *group;
 
 	/*
 	 * Check if a link-layer group is already associated
 	 * with this network-layer group on the given ifnet.
 	 */
 	error = if_addmulti(ifp, (struct sockaddr *)&gsin, &ifma);
 	if (error != 0)
 		return (error);
 
 	/* XXX ifma_protospec must be covered by IF_ADDR_LOCK */
 	IN_MULTI_LIST_LOCK();
 	IF_ADDR_WLOCK(ifp);
 
 	/*
 	 * If something other than netinet is occupying the link-layer
 	 * group, print a meaningful error message and back out of
 	 * the allocation.
 	 * Otherwise, bump the refcount on the existing network-layer
 	 * group association and return it.
 	 */
 	if (ifma->ifma_protospec != NULL) {
 		inm = (struct in_multi *)ifma->ifma_protospec;
 #ifdef INVARIANTS
 		KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr",
 		    __func__));
 		KASSERT(ifma->ifma_addr->sa_family == AF_INET,
 		    ("%s: ifma not AF_INET", __func__));
 		KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__));
 		if (inm->inm_ifma != ifma || inm->inm_ifp != ifp ||
 		    !in_hosteq(inm->inm_addr, *group)) {
 			char addrbuf[INET_ADDRSTRLEN];
 
 			panic("%s: ifma %p is inconsistent with %p (%s)",
 			    __func__, ifma, inm, inet_ntoa_r(*group, addrbuf));
 		}
 #endif
 		inm_acquire_locked(inm);
 		*pinm = inm;
 		goto out_locked;
 	}
 
 	IF_ADDR_WLOCK_ASSERT(ifp);
 
 	/*
 	 * A new in_multi record is needed; allocate and initialize it.
 	 * We DO NOT perform an IGMP join as the in_ layer may need to
 	 * push an initial source list down to IGMP to support SSM.
 	 *
 	 * The initial source filter state is INCLUDE, {} as per the RFC.
 	 */
 	inm = malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT | M_ZERO);
 	if (inm == NULL) {
 		IF_ADDR_WUNLOCK(ifp);
 		IN_MULTI_LIST_UNLOCK();
 		if_delmulti_ifma(ifma);
 		return (ENOMEM);
 	}
 	inm->inm_addr = *group;
 	inm->inm_ifp = ifp;
 	inm->inm_igi = ii->ii_igmp;
 	inm->inm_ifma = ifma;
 	inm->inm_refcount = 1;
 	inm->inm_state = IGMP_NOT_MEMBER;
 	mbufq_init(&inm->inm_scq, IGMP_MAX_STATE_CHANGES);
 	inm->inm_st[0].iss_fmode = MCAST_UNDEFINED;
 	inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
 	RB_INIT(&inm->inm_srcs);
 
 	ifma->ifma_protospec = inm;
 
 	*pinm = inm;
  out_locked:
 	IF_ADDR_WUNLOCK(ifp);
 	IN_MULTI_LIST_UNLOCK();
 	return (0);
 }
 
 /*
  * Drop a reference to an in_multi record.
  *
  * If the refcount drops to 0, free the in_multi record and
  * delete the underlying link-layer membership.
  */
 static void
 inm_release(struct in_multi *inm)
 {
 	struct ifmultiaddr *ifma;
 	struct ifnet *ifp;
 
 	CTR2(KTR_IGMPV3, "%s: refcount is %d", __func__, inm->inm_refcount);
 	MPASS(inm->inm_refcount == 0);
 	CTR2(KTR_IGMPV3, "%s: freeing inm %p", __func__, inm);
 
 	ifma = inm->inm_ifma;
 	ifp = inm->inm_ifp;
 
 	/* XXX this access is not covered by IF_ADDR_LOCK */
 	CTR2(KTR_IGMPV3, "%s: purging ifma %p", __func__, ifma);
 	if (ifp != NULL) {
 		CURVNET_SET(ifp->if_vnet);
 		inm_purge(inm);
 		free(inm, M_IPMADDR);
 		if_delmulti_ifma_flags(ifma, 1);
 		CURVNET_RESTORE();
 		if_rele(ifp);
 	} else {
 		inm_purge(inm);
 		free(inm, M_IPMADDR);
 		if_delmulti_ifma_flags(ifma, 1);
 	}
 }
 
 /*
  * Clear recorded source entries for a group.
  * Used by the IGMP code. Caller must hold the IN_MULTI lock.
  * FIXME: Should reap.
  */
 void
 inm_clear_recorded(struct in_multi *inm)
 {
 	struct ip_msource	*ims;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 
 	RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) {
 		if (ims->ims_stp) {
 			ims->ims_stp = 0;
 			--inm->inm_st[1].iss_rec;
 		}
 	}
 	KASSERT(inm->inm_st[1].iss_rec == 0,
 	    ("%s: iss_rec %d not 0", __func__, inm->inm_st[1].iss_rec));
 }
 
 /*
  * Record a source as pending for a Source-Group IGMPv3 query.
  * This lives here as it modifies the shared tree.
  *
  * inm is the group descriptor.
  * naddr is the address of the source to record in network-byte order.
  *
  * If the net.inet.igmp.sgalloc sysctl is non-zero, we will
  * lazy-allocate a source node in response to an SG query.
  * Otherwise, no allocation is performed. This saves some memory
  * with the trade-off that the source will not be reported to the
  * router if joined in the window between the query response and
  * the group actually being joined on the local host.
  *
  * VIMAGE: XXX: Currently the igmp_sgalloc feature has been removed.
  * This turns off the allocation of a recorded source entry if
  * the group has not been joined.
  *
  * Return 0 if the source didn't exist or was already marked as recorded.
  * Return 1 if the source was marked as recorded by this function.
  * Return <0 if any error occurred (negated errno code).
  */
 int
 inm_record_source(struct in_multi *inm, const in_addr_t naddr)
 {
 	struct ip_msource	 find;
 	struct ip_msource	*ims, *nims;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 
 	find.ims_haddr = ntohl(naddr);
 	ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find);
 	if (ims && ims->ims_stp)
 		return (0);
 	if (ims == NULL) {
 		if (inm->inm_nsrc == in_mcast_maxgrpsrc)
 			return (-ENOSPC);
 		nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (-ENOMEM);
 		nims->ims_haddr = find.ims_haddr;
 		RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims);
 		++inm->inm_nsrc;
 		ims = nims;
 	}
 
 	/*
 	 * Mark the source as recorded and update the recorded
 	 * source count.
 	 */
 	++ims->ims_stp;
 	++inm->inm_st[1].iss_rec;
 
 	return (1);
 }
 
 /*
  * Return a pointer to an in_msource owned by an in_mfilter,
  * given its source address.
  * Lazy-allocate if needed. If this is a new entry its filter state is
  * undefined at t0.
  *
  * imf is the filter set being modified.
  * haddr is the source address in *host* byte-order.
  *
  * SMPng: May be called with locks held; malloc must not block.
  */
 static int
 imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin,
     struct in_msource **plims)
 {
 	struct ip_msource	 find;
 	struct ip_msource	*ims, *nims;
 	struct in_msource	*lims;
 	int			 error;
 
 	error = 0;
 	ims = NULL;
 	lims = NULL;
 
 	/* key is host byte order */
 	find.ims_haddr = ntohl(psin->sin_addr.s_addr);
 	ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find);
 	lims = (struct in_msource *)ims;
 	if (lims == NULL) {
 		if (imf->imf_nsrc == in_mcast_maxsocksrc)
 			return (ENOSPC);
 		nims = malloc(sizeof(struct in_msource), M_INMFILTER,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (ENOMEM);
 		lims = (struct in_msource *)nims;
 		lims->ims_haddr = find.ims_haddr;
 		lims->imsl_st[0] = MCAST_UNDEFINED;
 		RB_INSERT(ip_msource_tree, &imf->imf_sources, nims);
 		++imf->imf_nsrc;
 	}
 
 	*plims = lims;
 
 	return (error);
 }
 
 /*
  * Graft a source entry into an existing socket-layer filter set,
  * maintaining any required invariants and checking allocations.
  *
  * The source is marked as being in the new filter mode at t1.
  *
  * Return the pointer to the new node, otherwise return NULL.
  */
 static struct in_msource *
 imf_graft(struct in_mfilter *imf, const uint8_t st1,
     const struct sockaddr_in *psin)
 {
 	struct ip_msource	*nims;
 	struct in_msource	*lims;
 
 	nims = malloc(sizeof(struct in_msource), M_INMFILTER,
 	    M_NOWAIT | M_ZERO);
 	if (nims == NULL)
 		return (NULL);
 	lims = (struct in_msource *)nims;
 	lims->ims_haddr = ntohl(psin->sin_addr.s_addr);
 	lims->imsl_st[0] = MCAST_UNDEFINED;
 	lims->imsl_st[1] = st1;
 	RB_INSERT(ip_msource_tree, &imf->imf_sources, nims);
 	++imf->imf_nsrc;
 
 	return (lims);
 }
 
 /*
  * Prune a source entry from an existing socket-layer filter set,
  * maintaining any required invariants and checking allocations.
  *
  * The source is marked as being left at t1, it is not freed.
  *
  * Return 0 if no error occurred, otherwise return an errno value.
  */
 static int
 imf_prune(struct in_mfilter *imf, const struct sockaddr_in *psin)
 {
 	struct ip_msource	 find;
 	struct ip_msource	*ims;
 	struct in_msource	*lims;
 
 	/* key is host byte order */
 	find.ims_haddr = ntohl(psin->sin_addr.s_addr);
 	ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find);
 	if (ims == NULL)
 		return (ENOENT);
 	lims = (struct in_msource *)ims;
 	lims->imsl_st[1] = MCAST_UNDEFINED;
 	return (0);
 }
 
 /*
  * Revert socket-layer filter set deltas at t1 to t0 state.
  */
 static void
 imf_rollback(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims, *tims;
 	struct in_msource	*lims;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) {
 		lims = (struct in_msource *)ims;
 		if (lims->imsl_st[0] == lims->imsl_st[1]) {
 			/* no change at t1 */
 			continue;
 		} else if (lims->imsl_st[0] != MCAST_UNDEFINED) {
 			/* revert change to existing source at t1 */
 			lims->imsl_st[1] = lims->imsl_st[0];
 		} else {
 			/* revert source added t1 */
 			CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
 			RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims);
 			free(ims, M_INMFILTER);
 			imf->imf_nsrc--;
 		}
 	}
 	imf->imf_st[1] = imf->imf_st[0];
 }
 
 /*
  * Mark socket-layer filter set as INCLUDE {} at t1.
  */
 static void
 imf_leave(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims;
 	struct in_msource	*lims;
 
 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
 		lims = (struct in_msource *)ims;
 		lims->imsl_st[1] = MCAST_UNDEFINED;
 	}
 	imf->imf_st[1] = MCAST_INCLUDE;
 }
 
 /*
  * Mark socket-layer filter set deltas as committed.
  */
 static void
 imf_commit(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims;
 	struct in_msource	*lims;
 
 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
 		lims = (struct in_msource *)ims;
 		lims->imsl_st[0] = lims->imsl_st[1];
 	}
 	imf->imf_st[0] = imf->imf_st[1];
 }
 
 /*
  * Reap unreferenced sources from socket-layer filter set.
  */
 static void
 imf_reap(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims, *tims;
 	struct in_msource	*lims;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) {
 		lims = (struct in_msource *)ims;
 		if ((lims->imsl_st[0] == MCAST_UNDEFINED) &&
 		    (lims->imsl_st[1] == MCAST_UNDEFINED)) {
 			CTR2(KTR_IGMPV3, "%s: free lims %p", __func__, ims);
 			RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims);
 			free(ims, M_INMFILTER);
 			imf->imf_nsrc--;
 		}
 	}
 }
 
 /*
  * Purge socket-layer filter set.
  */
 static void
 imf_purge(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) {
 		CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims);
 		free(ims, M_INMFILTER);
 		imf->imf_nsrc--;
 	}
 	imf->imf_st[0] = imf->imf_st[1] = MCAST_UNDEFINED;
 	KASSERT(RB_EMPTY(&imf->imf_sources),
 	    ("%s: imf_sources not empty", __func__));
 }
 
 /*
  * Look up a source filter entry for a multicast group.
  *
  * inm is the group descriptor to work with.
  * haddr is the host-byte-order IPv4 address to look up.
  * noalloc may be non-zero to suppress allocation of sources.
  * *pims will be set to the address of the retrieved or allocated source.
  *
  * SMPng: NOTE: may be called with locks held.
  * Return 0 if successful, otherwise return a non-zero error code.
  */
 static int
 inm_get_source(struct in_multi *inm, const in_addr_t haddr,
     const int noalloc, struct ip_msource **pims)
 {
 	struct ip_msource	 find;
 	struct ip_msource	*ims, *nims;
 
 	find.ims_haddr = haddr;
 	ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find);
 	if (ims == NULL && !noalloc) {
 		if (inm->inm_nsrc == in_mcast_maxgrpsrc)
 			return (ENOSPC);
 		nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (ENOMEM);
 		nims->ims_haddr = haddr;
 		RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims);
 		++inm->inm_nsrc;
 		ims = nims;
 #ifdef KTR
 		CTR3(KTR_IGMPV3, "%s: allocated 0x%08x as %p", __func__,
 		    haddr, ims);
 #endif
 	}
 
 	*pims = ims;
 	return (0);
 }
 
 /*
  * Merge socket-layer source into IGMP-layer source.
  * If rollback is non-zero, perform the inverse of the merge.
  */
 static void
 ims_merge(struct ip_msource *ims, const struct in_msource *lims,
     const int rollback)
 {
 	int n = rollback ? -1 : 1;
 
 	if (lims->imsl_st[0] == MCAST_EXCLUDE) {
 		CTR3(KTR_IGMPV3, "%s: t1 ex -= %d on 0x%08x",
 		    __func__, n, ims->ims_haddr);
 		ims->ims_st[1].ex -= n;
 	} else if (lims->imsl_st[0] == MCAST_INCLUDE) {
 		CTR3(KTR_IGMPV3, "%s: t1 in -= %d on 0x%08x",
 		    __func__, n, ims->ims_haddr);
 		ims->ims_st[1].in -= n;
 	}
 
 	if (lims->imsl_st[1] == MCAST_EXCLUDE) {
 		CTR3(KTR_IGMPV3, "%s: t1 ex += %d on 0x%08x",
 		    __func__, n, ims->ims_haddr);
 		ims->ims_st[1].ex += n;
 	} else if (lims->imsl_st[1] == MCAST_INCLUDE) {
 		CTR3(KTR_IGMPV3, "%s: t1 in += %d on 0x%08x",
 		    __func__, n, ims->ims_haddr);
 		ims->ims_st[1].in += n;
 	}
 }
 
 /*
  * Atomically update the global in_multi state, when a membership's
  * filter list is being updated in any way.
  *
  * imf is the per-inpcb-membership group filter pointer.
  * A fake imf may be passed for in-kernel consumers.
  *
  * XXX This is a candidate for a set-symmetric-difference style loop
  * which would eliminate the repeated lookup from root of ims nodes,
  * as they share the same key space.
  *
  * If any error occurred this function will back out of refcounts
  * and return a non-zero value.
  */
 static int
 inm_merge(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
 {
 	struct ip_msource	*ims, *nims;
 	struct in_msource	*lims;
 	int			 schanged, error;
 	int			 nsrc0, nsrc1;
 
 	schanged = 0;
 	error = 0;
 	nsrc1 = nsrc0 = 0;
 	IN_MULTI_LIST_LOCK_ASSERT();
 
 	/*
 	 * Update the source filters first, as this may fail.
 	 * Maintain count of in-mode filters at t0, t1. These are
 	 * used to work out if we transition into ASM mode or not.
 	 * Maintain a count of source filters whose state was
 	 * actually modified by this operation.
 	 */
 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
 		lims = (struct in_msource *)ims;
 		if (lims->imsl_st[0] == imf->imf_st[0]) nsrc0++;
 		if (lims->imsl_st[1] == imf->imf_st[1]) nsrc1++;
 		if (lims->imsl_st[0] == lims->imsl_st[1]) continue;
 		error = inm_get_source(inm, lims->ims_haddr, 0, &nims);
 		++schanged;
 		if (error)
 			break;
 		ims_merge(nims, lims, 0);
 	}
 	if (error) {
 		struct ip_msource *bims;
 
 		RB_FOREACH_REVERSE_FROM(ims, ip_msource_tree, nims) {
 			lims = (struct in_msource *)ims;
 			if (lims->imsl_st[0] == lims->imsl_st[1])
 				continue;
 			(void)inm_get_source(inm, lims->ims_haddr, 1, &bims);
 			if (bims == NULL)
 				continue;
 			ims_merge(bims, lims, 1);
 		}
 		goto out_reap;
 	}
 
 	CTR3(KTR_IGMPV3, "%s: imf filters in-mode: %d at t0, %d at t1",
 	    __func__, nsrc0, nsrc1);
 
 	/* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */
 	if (imf->imf_st[0] == imf->imf_st[1] &&
 	    imf->imf_st[1] == MCAST_INCLUDE) {
 		if (nsrc1 == 0) {
 			CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__);
 			--inm->inm_st[1].iss_in;
 		}
 	}
 
 	/* Handle filter mode transition on socket. */
 	if (imf->imf_st[0] != imf->imf_st[1]) {
 		CTR3(KTR_IGMPV3, "%s: imf transition %d to %d",
 		    __func__, imf->imf_st[0], imf->imf_st[1]);
 
 		if (imf->imf_st[0] == MCAST_EXCLUDE) {
 			CTR1(KTR_IGMPV3, "%s: --ex on inm at t1", __func__);
 			--inm->inm_st[1].iss_ex;
 		} else if (imf->imf_st[0] == MCAST_INCLUDE) {
 			CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__);
 			--inm->inm_st[1].iss_in;
 		}
 
 		if (imf->imf_st[1] == MCAST_EXCLUDE) {
 			CTR1(KTR_IGMPV3, "%s: ex++ on inm at t1", __func__);
 			inm->inm_st[1].iss_ex++;
 		} else if (imf->imf_st[1] == MCAST_INCLUDE && nsrc1 > 0) {
 			CTR1(KTR_IGMPV3, "%s: in++ on inm at t1", __func__);
 			inm->inm_st[1].iss_in++;
 		}
 	}
 
 	/*
 	 * Track inm filter state in terms of listener counts.
 	 * If there are any exclusive listeners, stack-wide
 	 * membership is exclusive.
 	 * Otherwise, if only inclusive listeners, stack-wide is inclusive.
 	 * If no listeners remain, state is undefined at t1,
 	 * and the IGMP lifecycle for this group should finish.
 	 */
 	if (inm->inm_st[1].iss_ex > 0) {
 		CTR1(KTR_IGMPV3, "%s: transition to EX", __func__);
 		inm->inm_st[1].iss_fmode = MCAST_EXCLUDE;
 	} else if (inm->inm_st[1].iss_in > 0) {
 		CTR1(KTR_IGMPV3, "%s: transition to IN", __func__);
 		inm->inm_st[1].iss_fmode = MCAST_INCLUDE;
 	} else {
 		CTR1(KTR_IGMPV3, "%s: transition to UNDEF", __func__);
 		inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
 	}
 
 	/* Decrement ASM listener count on transition out of ASM mode. */
 	if (imf->imf_st[0] == MCAST_EXCLUDE && nsrc0 == 0) {
 		if ((imf->imf_st[1] != MCAST_EXCLUDE) ||
 		    (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) {
 			CTR1(KTR_IGMPV3, "%s: --asm on inm at t1", __func__);
 			--inm->inm_st[1].iss_asm;
 		}
 	}
 
 	/* Increment ASM listener count on transition to ASM mode. */
 	if (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 == 0) {
 		CTR1(KTR_IGMPV3, "%s: asm++ on inm at t1", __func__);
 		inm->inm_st[1].iss_asm++;
 	}
 
 	CTR3(KTR_IGMPV3, "%s: merged imf %p to inm %p", __func__, imf, inm);
 	inm_print(inm);
 
 out_reap:
 	if (schanged > 0) {
 		CTR1(KTR_IGMPV3, "%s: sources changed; reaping", __func__);
 		inm_reap(inm);
 	}
 	return (error);
 }
 
 /*
  * Mark an in_multi's filter set deltas as committed.
  * Called by IGMP after a state change has been enqueued.
  */
 void
 inm_commit(struct in_multi *inm)
 {
 	struct ip_msource	*ims;
 
 	CTR2(KTR_IGMPV3, "%s: commit inm %p", __func__, inm);
 	CTR1(KTR_IGMPV3, "%s: pre commit:", __func__);
 	inm_print(inm);
 
 	RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) {
 		ims->ims_st[0] = ims->ims_st[1];
 	}
 	inm->inm_st[0] = inm->inm_st[1];
 }
 
 /*
  * Reap unreferenced nodes from an in_multi's filter set.
  */
 static void
 inm_reap(struct in_multi *inm)
 {
 	struct ip_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) {
 		if (ims->ims_st[0].ex > 0 || ims->ims_st[0].in > 0 ||
 		    ims->ims_st[1].ex > 0 || ims->ims_st[1].in > 0 ||
 		    ims->ims_stp != 0)
 			continue;
 		CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims);
 		free(ims, M_IPMSOURCE);
 		inm->inm_nsrc--;
 	}
 }
 
 /*
  * Purge all source nodes from an in_multi's filter set.
  */
 static void
 inm_purge(struct in_multi *inm)
 {
 	struct ip_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) {
 		CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims);
 		free(ims, M_IPMSOURCE);
 		inm->inm_nsrc--;
 	}
 }
 
 /*
  * Join a multicast group; unlocked entry point.
  *
  * SMPng: XXX: in_joingroup() is called from in_control() when Giant
  * is not held. Fortunately, ifp is unlikely to have been detached
  * at this point, so we assume it's OK to recurse.
  */
 int
 in_joingroup(struct ifnet *ifp, const struct in_addr *gina,
     /*const*/ struct in_mfilter *imf, struct in_multi **pinm)
 {
 	int error;
 
 	IN_MULTI_LOCK();
 	error = in_joingroup_locked(ifp, gina, imf, pinm);
 	IN_MULTI_UNLOCK();
 
 	return (error);
 }
 
 /*
  * Join a multicast group; real entry point.
  *
  * Only preserves atomicity at inm level.
  * NOTE: imf argument cannot be const due to sys/tree.h limitations.
  *
  * If the IGMP downcall fails, the group is not joined, and an error
  * code is returned.
  */
 int
 in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina,
     /*const*/ struct in_mfilter *imf, struct in_multi **pinm)
 {
 	struct in_mfilter	 timf;
 	struct in_multi		*inm;
 	int			 error;
 
 	IN_MULTI_LOCK_ASSERT();
 	IN_MULTI_LIST_UNLOCK_ASSERT();
 
 	CTR4(KTR_IGMPV3, "%s: join 0x%08x on %p(%s))", __func__,
 	    ntohl(gina->s_addr), ifp, ifp->if_xname);
 
 	error = 0;
 	inm = NULL;
 
 	/*
 	 * If no imf was specified (i.e. kernel consumer),
 	 * fake one up and assume it is an ASM join.
 	 */
 	if (imf == NULL) {
 		imf_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE);
 		imf = &timf;
 	}
 
 	error = in_getmulti(ifp, gina, &inm);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: in_getmulti() failure", __func__);
 		return (error);
 	}
 	IN_MULTI_LIST_LOCK();
 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 	error = inm_merge(inm, imf);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__);
 		goto out_inm_release;
 	}
 
 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 	error = igmp_change_state(inm);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: failed to update source", __func__);
 		goto out_inm_release;
 	}
 
  out_inm_release:
 	if (error) {
 
 		CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm);
 		inm_release_deferred(inm);
 	} else {
 		*pinm = inm;
 	}
 	IN_MULTI_LIST_UNLOCK();
 
 	return (error);
 }
 
 /*
  * Leave a multicast group; unlocked entry point.
  */
 int
 in_leavegroup(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
 {
 	int error;
 
 	IN_MULTI_LOCK();
 	error = in_leavegroup_locked(inm, imf);
 	IN_MULTI_UNLOCK();
 
 	return (error);
 }
 
 /*
  * Leave a multicast group; real entry point.
  * All source filters will be expunged.
  *
  * Only preserves atomicity at inm level.
  *
  * Holding the write lock for the INP which contains imf
  * is highly advisable. We can't assert for it as imf does not
  * contain a back-pointer to the owning inp.
  *
  * Note: This is not the same as inm_release(*) as this function also
  * makes a state change downcall into IGMP.
  */
 int
 in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
 {
 	struct in_mfilter	 timf;
 	int			 error;
 
 	error = 0;
 
 	IN_MULTI_LOCK_ASSERT();
 	IN_MULTI_LIST_UNLOCK_ASSERT();
 
 	CTR5(KTR_IGMPV3, "%s: leave inm %p, 0x%08x/%s, imf %p", __func__,
 	    inm, ntohl(inm->inm_addr.s_addr),
 	    (inm_is_ifp_detached(inm) ? "null" : inm->inm_ifp->if_xname),
 	    imf);
 
 	/*
 	 * If no imf was specified (i.e. kernel consumer),
 	 * fake one up and assume it is an ASM join.
 	 */
 	if (imf == NULL) {
 		imf_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED);
 		imf = &timf;
 	}
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 *
 	 * As this particular invocation should not cause any memory
 	 * to be allocated, and there is no opportunity to roll back
 	 * the transaction, it MUST NOT fail.
 	 */
 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 	IN_MULTI_LIST_LOCK();
 	error = inm_merge(inm, imf);
 	KASSERT(error == 0, ("%s: failed to merge inm state", __func__));
 
 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 	CURVNET_SET(inm->inm_ifp->if_vnet);
 	error = igmp_change_state(inm);
 	IF_ADDR_WLOCK(inm->inm_ifp);
 	inm_release_deferred(inm);
 	IF_ADDR_WUNLOCK(inm->inm_ifp);
 	IN_MULTI_LIST_UNLOCK();
 	CURVNET_RESTORE();
 	if (error)
 		CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
 
 	CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm);
 
 	return (error);
 }
 
 /*#ifndef BURN_BRIDGES*/
 /*
  * Join an IPv4 multicast group in (*,G) exclusive mode.
  * The group must be a 224.0.0.0/24 link-scope group.
  * This KPI is for legacy kernel consumers only.
  */
 struct in_multi *
 in_addmulti(struct in_addr *ap, struct ifnet *ifp)
 {
 	struct in_multi *pinm;
 	int error;
 #ifdef INVARIANTS
 	char addrbuf[INET_ADDRSTRLEN];
 #endif
 
 	KASSERT(IN_LOCAL_GROUP(ntohl(ap->s_addr)),
 	    ("%s: %s not in 224.0.0.0/24", __func__,
 	    inet_ntoa_r(*ap, addrbuf)));
 
 	error = in_joingroup(ifp, ap, NULL, &pinm);
 	if (error != 0)
 		pinm = NULL;
 
 	return (pinm);
 }
 
 /*
  * Block or unblock an ASM multicast source on an inpcb.
  * This implements the delta-based API described in RFC 3678.
  *
  * The delta-based API applies only to exclusive-mode memberships.
  * An IGMP downcall will be performed.
  *
  * SMPng: NOTE: Must take Giant as a join may create a new ifma.
  *
  * Return 0 if successful, otherwise return an appropriate error code.
  */
 static int
 inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct group_source_req		 gsr;
 	struct rm_priotracker		 in_ifa_tracker;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in_mfilter		*imf;
 	struct ip_moptions		*imo;
 	struct in_msource		*ims;
 	struct in_multi			*inm;
-	size_t				 idx;
 	uint16_t			 fmode;
 	int				 error, doblock;
 
 	ifp = NULL;
 	error = 0;
 	doblock = 0;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 
 	switch (sopt->sopt_name) {
 	case IP_BLOCK_SOURCE:
 	case IP_UNBLOCK_SOURCE: {
 		struct ip_mreq_source	 mreqs;
 
 		error = sooptcopyin(sopt, &mreqs,
 		    sizeof(struct ip_mreq_source),
 		    sizeof(struct ip_mreq_source));
 		if (error)
 			return (error);
 
 		gsa->sin.sin_family = AF_INET;
 		gsa->sin.sin_len = sizeof(struct sockaddr_in);
 		gsa->sin.sin_addr = mreqs.imr_multiaddr;
 
 		ssa->sin.sin_family = AF_INET;
 		ssa->sin.sin_len = sizeof(struct sockaddr_in);
 		ssa->sin.sin_addr = mreqs.imr_sourceaddr;
 
 		if (!in_nullhost(mreqs.imr_interface)) {
 			IN_IFADDR_RLOCK(&in_ifa_tracker);
 			INADDR_TO_IFP(mreqs.imr_interface, ifp);
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 		}
 		if (sopt->sopt_name == IP_BLOCK_SOURCE)
 			doblock = 1;
 
 		CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p",
 		    __func__, ntohl(mreqs.imr_interface.s_addr), ifp);
 		break;
 	    }
 
 	case MCAST_BLOCK_SOURCE:
 	case MCAST_UNBLOCK_SOURCE:
 		error = sooptcopyin(sopt, &gsr,
 		    sizeof(struct group_source_req),
 		    sizeof(struct group_source_req));
 		if (error)
 			return (error);
 
 		if (gsa->sin.sin_family != AF_INET ||
 		    gsa->sin.sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 
 		if (ssa->sin.sin_family != AF_INET ||
 		    ssa->sin.sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 
 		if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
 			return (EADDRNOTAVAIL);
 
 		ifp = ifnet_byindex(gsr.gsr_interface);
 
 		if (sopt->sopt_name == MCAST_BLOCK_SOURCE)
 			doblock = 1;
 		break;
 
 	default:
 		CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 		return (EINVAL);
 
+	IN_MULTI_LOCK();
+
 	/*
 	 * Check if we are actually a member of this group.
 	 */
 	imo = inp_findmoptions(inp);
-	idx = imo_match_group(imo, ifp, &gsa->sa);
-	if (idx == -1 || imo->imo_mfilters == NULL) {
+	imf = imo_match_group(imo, ifp, &gsa->sa);
+	if (imf == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_inp_locked;
 	}
+	inm = imf->imf_inm;
 
-	KASSERT(imo->imo_mfilters != NULL,
-	    ("%s: imo_mfilters not allocated", __func__));
-	imf = &imo->imo_mfilters[idx];
-	inm = imo->imo_membership[idx];
-
 	/*
 	 * Attempting to use the delta-based API on an
 	 * non exclusive-mode membership is an error.
 	 */
 	fmode = imf->imf_st[0];
 	if (fmode != MCAST_EXCLUDE) {
 		error = EINVAL;
 		goto out_inp_locked;
 	}
 
 	/*
 	 * Deal with error cases up-front:
 	 *  Asked to block, but already blocked; or
 	 *  Asked to unblock, but nothing to unblock.
 	 * If adding a new block entry, allocate it.
 	 */
-	ims = imo_match_source(imo, idx, &ssa->sa);
+	ims = imo_match_source(imf, &ssa->sa);
 	if ((ims != NULL && doblock) || (ims == NULL && !doblock)) {
 		CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent", __func__,
 		    ntohl(ssa->sin.sin_addr.s_addr), doblock ? "" : "not ");
 		error = EADDRNOTAVAIL;
 		goto out_inp_locked;
 	}
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	if (doblock) {
 		CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block");
 		ims = imf_graft(imf, fmode, &ssa->sin);
 		if (ims == NULL)
 			error = ENOMEM;
 	} else {
 		CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow");
 		error = imf_prune(imf, &ssa->sin);
 	}
 
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__);
 		goto out_imf_rollback;
 	}
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 */
-	IN_MULTI_LOCK();
 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 	IN_MULTI_LIST_LOCK();
 	error = inm_merge(inm, imf);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__);
 		IN_MULTI_LIST_UNLOCK();
-		goto out_in_multi_locked;
+		goto out_imf_rollback;
 	}
 
 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 	error = igmp_change_state(inm);
 	IN_MULTI_LIST_UNLOCK();
 	if (error)
 		CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
 
-out_in_multi_locked:
-
-	IN_MULTI_UNLOCK();
 out_imf_rollback:
 	if (error)
 		imf_rollback(imf);
 	else
 		imf_commit(imf);
 
 	imf_reap(imf);
 
 out_inp_locked:
 	INP_WUNLOCK(inp);
+	IN_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Given an inpcb, return its multicast options structure pointer.  Accepts
  * an unlocked inpcb pointer, but will return it locked.  May sleep.
  *
  * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
  * SMPng: NOTE: Returns with the INP write lock held.
  */
 static struct ip_moptions *
 inp_findmoptions(struct inpcb *inp)
 {
 	struct ip_moptions	 *imo;
-	struct in_multi		**immp;
-	struct in_mfilter	 *imfp;
-	size_t			  idx;
 
 	INP_WLOCK(inp);
 	if (inp->inp_moptions != NULL)
 		return (inp->inp_moptions);
 
 	INP_WUNLOCK(inp);
 
 	imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK);
-	immp = malloc(sizeof(*immp) * IP_MIN_MEMBERSHIPS, M_IPMOPTS,
-	    M_WAITOK | M_ZERO);
-	imfp = malloc(sizeof(struct in_mfilter) * IP_MIN_MEMBERSHIPS,
-	    M_INMFILTER, M_WAITOK);
 
 	imo->imo_multicast_ifp = NULL;
 	imo->imo_multicast_addr.s_addr = INADDR_ANY;
 	imo->imo_multicast_vif = -1;
 	imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
 	imo->imo_multicast_loop = in_mcast_loop;
-	imo->imo_num_memberships = 0;
-	imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
-	imo->imo_membership = immp;
+	STAILQ_INIT(&imo->imo_head);
 
-	/* Initialize per-group source filters. */
-	for (idx = 0; idx < IP_MIN_MEMBERSHIPS; idx++)
-		imf_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE);
-	imo->imo_mfilters = imfp;
-
 	INP_WLOCK(inp);
 	if (inp->inp_moptions != NULL) {
-		free(imfp, M_INMFILTER);
-		free(immp, M_IPMOPTS);
 		free(imo, M_IPMOPTS);
 		return (inp->inp_moptions);
 	}
 	inp->inp_moptions = imo;
 	return (imo);
 }
 
 static void
 inp_gcmoptions(struct ip_moptions *imo)
 {
-	struct in_mfilter	*imf;
+	struct in_mfilter *imf;
 	struct in_multi *inm;
 	struct ifnet *ifp;
-	size_t			 idx, nmships;
 
-	nmships = imo->imo_num_memberships;
-	for (idx = 0; idx < nmships; ++idx) {
-		imf = imo->imo_mfilters ? &imo->imo_mfilters[idx] : NULL;
-		if (imf)
-			imf_leave(imf);
-		inm = imo->imo_membership[idx];
-		ifp = inm->inm_ifp;
-		if (ifp != NULL) {
-			CURVNET_SET(ifp->if_vnet);
-			(void)in_leavegroup(inm, imf);
-			CURVNET_RESTORE();
-		} else {
-			(void)in_leavegroup(inm, imf);
+	while ((imf = ip_mfilter_first(&imo->imo_head)) != NULL) {
+		ip_mfilter_remove(&imo->imo_head, imf);
+
+		imf_leave(imf);
+		if ((inm = imf->imf_inm) != NULL) {
+			if ((ifp = inm->inm_ifp) != NULL) {
+				CURVNET_SET(ifp->if_vnet);
+				(void)in_leavegroup(inm, imf);
+				CURVNET_RESTORE();
+			} else {
+				(void)in_leavegroup(inm, imf);
+			}
 		}
-		if (imf)
-			imf_purge(imf);
+		ip_mfilter_free(imf);
 	}
-
-	if (imo->imo_mfilters)
-		free(imo->imo_mfilters, M_INMFILTER);
-	free(imo->imo_membership, M_IPMOPTS);
 	free(imo, M_IPMOPTS);
 }
 
 /*
  * Discard the IP multicast options (and source filters).  To minimize
  * the amount of work done while holding locks such as the INP's
  * pcbinfo lock (which is used in the receive path), the free
  * operation is deferred to the epoch callback task.
  */
 void
 inp_freemoptions(struct ip_moptions *imo)
 {
 	if (imo == NULL)
 		return;
 	inp_gcmoptions(imo);
 }
 
 /*
  * Atomically get source filters on a socket for an IPv4 multicast group.
  * Called with INP lock held; returns with lock released.
  */
 static int
 inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct __msfilterreq	 msfr;
 	sockunion_t		*gsa;
 	struct ifnet		*ifp;
 	struct ip_moptions	*imo;
 	struct in_mfilter	*imf;
 	struct ip_msource	*ims;
 	struct in_msource	*lims;
 	struct sockaddr_in	*psin;
 	struct sockaddr_storage	*ptss;
 	struct sockaddr_storage	*tss;
 	int			 error;
-	size_t			 idx, nsrcs, ncsrcs;
+	size_t			 nsrcs, ncsrcs;
 
 	INP_WLOCK_ASSERT(inp);
 
 	imo = inp->inp_moptions;
 	KASSERT(imo != NULL, ("%s: null ip_moptions", __func__));
 
 	INP_WUNLOCK(inp);
 
 	error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
 	    sizeof(struct __msfilterreq));
 	if (error)
 		return (error);
 
 	if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
 		return (EINVAL);
 
 	ifp = ifnet_byindex(msfr.msfr_ifindex);
 	if (ifp == NULL)
 		return (EINVAL);
 
 	INP_WLOCK(inp);
 
 	/*
 	 * Lookup group on the socket.
 	 */
 	gsa = (sockunion_t *)&msfr.msfr_group;
-	idx = imo_match_group(imo, ifp, &gsa->sa);
-	if (idx == -1 || imo->imo_mfilters == NULL) {
+	imf = imo_match_group(imo, ifp, &gsa->sa);
+	if (imf == NULL) {
 		INP_WUNLOCK(inp);
 		return (EADDRNOTAVAIL);
 	}
-	imf = &imo->imo_mfilters[idx];
 
 	/*
 	 * Ignore memberships which are in limbo.
 	 */
 	if (imf->imf_st[1] == MCAST_UNDEFINED) {
 		INP_WUNLOCK(inp);
 		return (EAGAIN);
 	}
 	msfr.msfr_fmode = imf->imf_st[1];
 
 	/*
 	 * If the user specified a buffer, copy out the source filter
 	 * entries to userland gracefully.
 	 * We only copy out the number of entries which userland
 	 * has asked for, but we always tell userland how big the
 	 * buffer really needs to be.
 	 */
 	if (msfr.msfr_nsrcs > in_mcast_maxsocksrc)
 		msfr.msfr_nsrcs = in_mcast_maxsocksrc;
 	tss = NULL;
 	if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) {
 		tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
 		    M_TEMP, M_NOWAIT | M_ZERO);
 		if (tss == NULL) {
 			INP_WUNLOCK(inp);
 			return (ENOBUFS);
 		}
 	}
 
 	/*
 	 * Count number of sources in-mode at t0.
 	 * If buffer space exists and remains, copy out source entries.
 	 */
 	nsrcs = msfr.msfr_nsrcs;
 	ncsrcs = 0;
 	ptss = tss;
 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
 		lims = (struct in_msource *)ims;
 		if (lims->imsl_st[0] == MCAST_UNDEFINED ||
 		    lims->imsl_st[0] != imf->imf_st[0])
 			continue;
 		++ncsrcs;
 		if (tss != NULL && nsrcs > 0) {
 			psin = (struct sockaddr_in *)ptss;
 			psin->sin_family = AF_INET;
 			psin->sin_len = sizeof(struct sockaddr_in);
 			psin->sin_addr.s_addr = htonl(lims->ims_haddr);
 			psin->sin_port = 0;
 			++ptss;
 			--nsrcs;
 		}
 	}
 
 	INP_WUNLOCK(inp);
 
 	if (tss != NULL) {
 		error = copyout(tss, msfr.msfr_srcs,
 		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
 		free(tss, M_TEMP);
 		if (error)
 			return (error);
 	}
 
 	msfr.msfr_nsrcs = ncsrcs;
 	error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq));
 
 	return (error);
 }
 
 /*
  * Return the IP multicast options in response to user getsockopt().
  */
 int
 inp_getmoptions(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct rm_priotracker	 in_ifa_tracker;
 	struct ip_mreqn		 mreqn;
 	struct ip_moptions	*imo;
 	struct ifnet		*ifp;
 	struct in_ifaddr	*ia;
 	int			 error, optval;
 	u_char			 coptval;
 
 	INP_WLOCK(inp);
 	imo = inp->inp_moptions;
 	/*
 	 * If socket is neither of type SOCK_RAW or SOCK_DGRAM,
 	 * or is a divert socket, reject it.
 	 */
 	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
 	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
 	    inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) {
 		INP_WUNLOCK(inp);
 		return (EOPNOTSUPP);
 	}
 
 	error = 0;
 	switch (sopt->sopt_name) {
 	case IP_MULTICAST_VIF:
 		if (imo != NULL)
 			optval = imo->imo_multicast_vif;
 		else
 			optval = -1;
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &optval, sizeof(int));
 		break;
 
 	case IP_MULTICAST_IF:
 		memset(&mreqn, 0, sizeof(struct ip_mreqn));
 		if (imo != NULL) {
 			ifp = imo->imo_multicast_ifp;
 			if (!in_nullhost(imo->imo_multicast_addr)) {
 				mreqn.imr_address = imo->imo_multicast_addr;
 			} else if (ifp != NULL) {
 				struct epoch_tracker et;
 
 				mreqn.imr_ifindex = ifp->if_index;
 				NET_EPOCH_ENTER(et);
 				IFP_TO_IA(ifp, ia, &in_ifa_tracker);
 				if (ia != NULL)
 					mreqn.imr_address =
 					    IA_SIN(ia)->sin_addr;
 				NET_EPOCH_EXIT(et);
 			}
 		}
 		INP_WUNLOCK(inp);
 		if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) {
 			error = sooptcopyout(sopt, &mreqn,
 			    sizeof(struct ip_mreqn));
 		} else {
 			error = sooptcopyout(sopt, &mreqn.imr_address,
 			    sizeof(struct in_addr));
 		}
 		break;
 
 	case IP_MULTICAST_TTL:
 		if (imo == NULL)
 			optval = coptval = IP_DEFAULT_MULTICAST_TTL;
 		else
 			optval = coptval = imo->imo_multicast_ttl;
 		INP_WUNLOCK(inp);
 		if (sopt->sopt_valsize == sizeof(u_char))
 			error = sooptcopyout(sopt, &coptval, sizeof(u_char));
 		else
 			error = sooptcopyout(sopt, &optval, sizeof(int));
 		break;
 
 	case IP_MULTICAST_LOOP:
 		if (imo == NULL)
 			optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
 		else
 			optval = coptval = imo->imo_multicast_loop;
 		INP_WUNLOCK(inp);
 		if (sopt->sopt_valsize == sizeof(u_char))
 			error = sooptcopyout(sopt, &coptval, sizeof(u_char));
 		else
 			error = sooptcopyout(sopt, &optval, sizeof(int));
 		break;
 
 	case IP_MSFILTER:
 		if (imo == NULL) {
 			error = EADDRNOTAVAIL;
 			INP_WUNLOCK(inp);
 		} else {
 			error = inp_get_source_filters(inp, sopt);
 		}
 		break;
 
 	default:
 		INP_WUNLOCK(inp);
 		error = ENOPROTOOPT;
 		break;
 	}
 
 	INP_UNLOCK_ASSERT(inp);
 
 	return (error);
 }
 
 /*
  * Look up the ifnet to use for a multicast group membership,
  * given the IPv4 address of an interface, and the IPv4 group address.
  *
  * This routine exists to support legacy multicast applications
  * which do not understand that multicast memberships are scoped to
  * specific physical links in the networking stack, or which need
  * to join link-scope groups before IPv4 addresses are configured.
  *
  * If inp is non-NULL, use this socket's current FIB number for any
  * required FIB lookup.
  * If ina is INADDR_ANY, look up the group address in the unicast FIB,
  * and use its ifp; usually, this points to the default next-hop.
  *
  * If the FIB lookup fails, attempt to use the first non-loopback
  * interface with multicast capability in the system as a
  * last resort. The legacy IPv4 ASM API requires that we do
  * this in order to allow groups to be joined when the routing
  * table has not yet been populated during boot.
  *
  * Returns NULL if no ifp could be found.
  *
  * FUTURE: Implement IPv4 source-address selection.
  */
 static struct ifnet *
 inp_lookup_mcast_ifp(const struct inpcb *inp,
     const struct sockaddr_in *gsin, const struct in_addr ina)
 {
 	struct rm_priotracker in_ifa_tracker;
 	struct ifnet *ifp;
 	struct nhop4_basic nh4;
 	uint32_t fibnum;
 
 	KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__));
 	KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)),
 	    ("%s: not multicast", __func__));
 
 	ifp = NULL;
 	if (!in_nullhost(ina)) {
 		IN_IFADDR_RLOCK(&in_ifa_tracker);
 		INADDR_TO_IFP(ina, ifp);
 		IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 	} else {
 		fibnum = inp ? inp->inp_inc.inc_fibnum : 0;
 		if (fib4_lookup_nh_basic(fibnum, gsin->sin_addr, 0, 0, &nh4)==0)
 			ifp = nh4.nh_ifp;
 		else {
 			struct in_ifaddr *ia;
 			struct ifnet *mifp;
 
 			mifp = NULL;
 			IN_IFADDR_RLOCK(&in_ifa_tracker);
 			CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
 				mifp = ia->ia_ifp;
 				if (!(mifp->if_flags & IFF_LOOPBACK) &&
 				     (mifp->if_flags & IFF_MULTICAST)) {
 					ifp = mifp;
 					break;
 				}
 			}
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 		}
 	}
 
 	return (ifp);
 }
 
 /*
  * Join an IPv4 multicast group, possibly with a source.
  */
 static int
 inp_join_group(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct group_source_req		 gsr;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in_mfilter		*imf;
 	struct ip_moptions		*imo;
 	struct in_multi			*inm;
 	struct in_msource		*lims;
-	size_t				 idx;
 	int				 error, is_new;
 
 	ifp = NULL;
-	imf = NULL;
 	lims = NULL;
 	error = 0;
-	is_new = 0;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	gsa->ss.ss_family = AF_UNSPEC;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 	ssa->ss.ss_family = AF_UNSPEC;
 
 	switch (sopt->sopt_name) {
 	case IP_ADD_MEMBERSHIP: {
 		struct ip_mreqn mreqn;
 
 		if (sopt->sopt_valsize == sizeof(struct ip_mreqn))
 			error = sooptcopyin(sopt, &mreqn,
 			    sizeof(struct ip_mreqn), sizeof(struct ip_mreqn));
 		else
 			error = sooptcopyin(sopt, &mreqn,
 			    sizeof(struct ip_mreq), sizeof(struct ip_mreq));
 		if (error)
 			return (error);
 
 		gsa->sin.sin_family = AF_INET;
 		gsa->sin.sin_len = sizeof(struct sockaddr_in);
 		gsa->sin.sin_addr = mreqn.imr_multiaddr;
 		if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 			return (EINVAL);
 
 		if (sopt->sopt_valsize == sizeof(struct ip_mreqn) &&
 		    mreqn.imr_ifindex != 0)
 			ifp = ifnet_byindex(mreqn.imr_ifindex);
 		else
 			ifp = inp_lookup_mcast_ifp(inp, &gsa->sin,
 			    mreqn.imr_address);
 		break;
 	}
 	case IP_ADD_SOURCE_MEMBERSHIP: {
 		struct ip_mreq_source	 mreqs;
 
 		error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source),
 			    sizeof(struct ip_mreq_source));
 		if (error)
 			return (error);
 
 		gsa->sin.sin_family = ssa->sin.sin_family = AF_INET;
 		gsa->sin.sin_len = ssa->sin.sin_len =
 		    sizeof(struct sockaddr_in);
 
 		gsa->sin.sin_addr = mreqs.imr_multiaddr;
 		if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 			return (EINVAL);
 
 		ssa->sin.sin_addr = mreqs.imr_sourceaddr;
 
 		ifp = inp_lookup_mcast_ifp(inp, &gsa->sin,
 		    mreqs.imr_interface);
 		CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p",
 		    __func__, ntohl(mreqs.imr_interface.s_addr), ifp);
 		break;
 	}
 
 	case MCAST_JOIN_GROUP:
 	case MCAST_JOIN_SOURCE_GROUP:
 		if (sopt->sopt_name == MCAST_JOIN_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_req),
 			    sizeof(struct group_req));
 		} else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_source_req),
 			    sizeof(struct group_source_req));
 		}
 		if (error)
 			return (error);
 
 		if (gsa->sin.sin_family != AF_INET ||
 		    gsa->sin.sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 
 		/*
 		 * Overwrite the port field if present, as the sockaddr
 		 * being copied in may be matched with a binary comparison.
 		 */
 		gsa->sin.sin_port = 0;
 		if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
 			if (ssa->sin.sin_family != AF_INET ||
 			    ssa->sin.sin_len != sizeof(struct sockaddr_in))
 				return (EINVAL);
 			ssa->sin.sin_port = 0;
 		}
 
 		if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 			return (EINVAL);
 
 		if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
 			return (EADDRNOTAVAIL);
 		ifp = ifnet_byindex(gsr.gsr_interface);
 		break;
 
 	default:
 		CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0)
 		return (EADDRNOTAVAIL);
 
+	IN_MULTI_LOCK();
+
+	/*
+	 * Find the membership in the membership list.
+	 */
 	imo = inp_findmoptions(inp);
-	idx = imo_match_group(imo, ifp, &gsa->sa);
-	if (idx == -1) {
+	imf = imo_match_group(imo, ifp, &gsa->sa);
+	if (imf == NULL) {
 		is_new = 1;
+		inm = NULL;
+
+		if (ip_mfilter_count(&imo->imo_head) >= IP_MAX_MEMBERSHIPS) {
+			error = ENOMEM;
+			goto out_inp_locked;
+		}
 	} else {
-		inm = imo->imo_membership[idx];
-		imf = &imo->imo_mfilters[idx];
+		is_new = 0;
+		inm = imf->imf_inm;
+
 		if (ssa->ss.ss_family != AF_UNSPEC) {
 			/*
 			 * MCAST_JOIN_SOURCE_GROUP on an exclusive membership
 			 * is an error. On an existing inclusive membership,
 			 * it just adds the source to the filter list.
 			 */
 			if (imf->imf_st[1] != MCAST_INCLUDE) {
 				error = EINVAL;
 				goto out_inp_locked;
 			}
 			/*
 			 * Throw out duplicates.
 			 *
 			 * XXX FIXME: This makes a naive assumption that
 			 * even if entries exist for *ssa in this imf,
 			 * they will be rejected as dupes, even if they
 			 * are not valid in the current mode (in-mode).
 			 *
 			 * in_msource is transactioned just as for anything
 			 * else in SSM -- but note naive use of inm_graft()
 			 * below for allocating new filter entries.
 			 *
 			 * This is only an issue if someone mixes the
 			 * full-state SSM API with the delta-based API,
 			 * which is discouraged in the relevant RFCs.
 			 */
-			lims = imo_match_source(imo, idx, &ssa->sa);
+			lims = imo_match_source(imf, &ssa->sa);
 			if (lims != NULL /*&&
 			    lims->imsl_st[1] == MCAST_INCLUDE*/) {
 				error = EADDRNOTAVAIL;
 				goto out_inp_locked;
 			}
 		} else {
 			/*
 			 * MCAST_JOIN_GROUP on an existing exclusive
 			 * membership is an error; return EADDRINUSE
 			 * to preserve 4.4BSD API idempotence, and
 			 * avoid tedious detour to code below.
 			 * NOTE: This is bending RFC 3678 a bit.
 			 *
 			 * On an existing inclusive membership, this is also
 			 * an error; if you want to change filter mode,
 			 * you must use the userland API setsourcefilter().
 			 * XXX We don't reject this for imf in UNDEFINED
 			 * state at t1, because allocation of a filter
 			 * is atomic with allocation of a membership.
 			 */
 			error = EINVAL;
 			if (imf->imf_st[1] == MCAST_EXCLUDE)
 				error = EADDRINUSE;
 			goto out_inp_locked;
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
-	if (is_new) {
-		if (imo->imo_num_memberships == imo->imo_max_memberships) {
-			error = imo_grow(imo);
-			if (error)
-				goto out_inp_locked;
-		}
-		/*
-		 * Allocate the new slot upfront so we can deal with
-		 * grafting the new source filter in same code path
-		 * as for join-source on existing membership.
-		 */
-		idx = imo->imo_num_memberships;
-		imo->imo_membership[idx] = NULL;
-		imo->imo_num_memberships++;
-		KASSERT(imo->imo_mfilters != NULL,
-		    ("%s: imf_mfilters vector was not allocated", __func__));
-		imf = &imo->imo_mfilters[idx];
-		KASSERT(RB_EMPTY(&imf->imf_sources),
-		    ("%s: imf_sources not empty", __func__));
-	}
-
 	/*
 	 * Graft new source into filter list for this inpcb's
 	 * membership of the group. The in_multi may not have
 	 * been allocated yet if this is a new membership, however,
 	 * the in_mfilter slot will be allocated and must be initialized.
 	 *
 	 * Note: Grafting of exclusive mode filters doesn't happen
 	 * in this path.
 	 * XXX: Should check for non-NULL lims (node exists but may
 	 * not be in-mode) for interop with full-state API.
 	 */
 	if (ssa->ss.ss_family != AF_UNSPEC) {
 		/* Membership starts in IN mode */
 		if (is_new) {
 			CTR1(KTR_IGMPV3, "%s: new join w/source", __func__);
-			imf_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE);
+			imf = ip_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_INCLUDE);
+			if (imf == NULL) {
+				error = ENOMEM;
+				goto out_inp_locked;
+			}
 		} else {
 			CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow");
 		}
 		lims = imf_graft(imf, MCAST_INCLUDE, &ssa->sin);
 		if (lims == NULL) {
 			CTR1(KTR_IGMPV3, "%s: merge imf state failed",
 			    __func__);
 			error = ENOMEM;
-			goto out_imo_free;
+			goto out_inp_locked;
 		}
 	} else {
 		/* No address specified; Membership starts in EX mode */
 		if (is_new) {
 			CTR1(KTR_IGMPV3, "%s: new join w/o source", __func__);
-			imf_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE);
+			imf = ip_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_EXCLUDE);
+			if (imf == NULL) {
+				error = ENOMEM;
+				goto out_inp_locked;
+			}
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 */
-	in_pcbref(inp);
-	INP_WUNLOCK(inp);
-	IN_MULTI_LOCK();
-
 	if (is_new) {
+		in_pcbref(inp);
+		INP_WUNLOCK(inp);
+
 		error = in_joingroup_locked(ifp, &gsa->sin.sin_addr, imf,
-		    &inm);
+		    &imf->imf_inm);
+
+		INP_WLOCK(inp);
+		if (in_pcbrele_wlocked(inp)) {
+			error = ENXIO;
+			goto out_inp_unlocked;
+		}
 		if (error) {
                         CTR1(KTR_IGMPV3, "%s: in_joingroup_locked failed", 
                             __func__);
-                        IN_MULTI_LIST_UNLOCK();
-			goto out_imo_free;
+			goto out_inp_locked;
 		}
-		inm_acquire(inm);
-		imo->imo_membership[idx] = inm;
+		inm_acquire(imf->imf_inm);
 	} else {
 		CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 		IN_MULTI_LIST_LOCK();
 		error = inm_merge(inm, imf);
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: failed to merge inm state",
 				 __func__);
 			IN_MULTI_LIST_UNLOCK();
-			goto out_in_multi_locked;
+			imf_rollback(imf);
+			imf_reap(imf);
+			goto out_inp_locked;
 		}
 		CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 		error = igmp_change_state(inm);
 		IN_MULTI_LIST_UNLOCK();
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: failed igmp downcall",
 			    __func__);
-			goto out_in_multi_locked;
+			imf_rollback(imf);
+			imf_reap(imf);
+			goto out_inp_locked;
 		}
 	}
+	if (is_new)
+		ip_mfilter_insert(&imo->imo_head, imf);
 
-out_in_multi_locked:
+	imf_commit(imf);
+	imf = NULL;
 
+out_inp_locked:
+	INP_WUNLOCK(inp);
+out_inp_unlocked:
 	IN_MULTI_UNLOCK();
-	INP_WLOCK(inp);
-	if (in_pcbrele_wlocked(inp))
-		return (ENXIO);
-	if (error) {
-		imf_rollback(imf);
-		if (is_new)
-			imf_purge(imf);
-		else
-			imf_reap(imf);
-	} else {
-		imf_commit(imf);
-	}
 
-out_imo_free:
-	if (error && is_new) {
-		inm = imo->imo_membership[idx];
-		if (inm != NULL) {
+	if (is_new && imf) {
+		if (imf->imf_inm != NULL) {
 			IN_MULTI_LIST_LOCK();
-			inm_release_deferred(inm);
+			inm_release_deferred(imf->imf_inm);
 			IN_MULTI_LIST_UNLOCK();
 		}
-		imo->imo_membership[idx] = NULL;
-		--imo->imo_num_memberships;
+		ip_mfilter_free(imf);
 	}
-
-out_inp_locked:
-	INP_WUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Leave an IPv4 multicast group on an inpcb, possibly with a source.
  */
 static int
 inp_leave_group(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct group_source_req		 gsr;
 	struct ip_mreq_source		 mreqs;
 	struct rm_priotracker		 in_ifa_tracker;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in_mfilter		*imf;
 	struct ip_moptions		*imo;
 	struct in_msource		*ims;
 	struct in_multi			*inm;
-	size_t				 idx;
-	int				 error, is_final;
+	int				 error;
+	bool				 is_final;
 
 	ifp = NULL;
 	error = 0;
-	is_final = 1;
+	is_final = true;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	gsa->ss.ss_family = AF_UNSPEC;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 	ssa->ss.ss_family = AF_UNSPEC;
 
 	switch (sopt->sopt_name) {
 	case IP_DROP_MEMBERSHIP:
 	case IP_DROP_SOURCE_MEMBERSHIP:
 		if (sopt->sopt_name == IP_DROP_MEMBERSHIP) {
 			error = sooptcopyin(sopt, &mreqs,
 			    sizeof(struct ip_mreq),
 			    sizeof(struct ip_mreq));
 			/*
 			 * Swap interface and sourceaddr arguments,
 			 * as ip_mreq and ip_mreq_source are laid
 			 * out differently.
 			 */
 			mreqs.imr_interface = mreqs.imr_sourceaddr;
 			mreqs.imr_sourceaddr.s_addr = INADDR_ANY;
 		} else if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) {
 			error = sooptcopyin(sopt, &mreqs,
 			    sizeof(struct ip_mreq_source),
 			    sizeof(struct ip_mreq_source));
 		}
 		if (error)
 			return (error);
 
 		gsa->sin.sin_family = AF_INET;
 		gsa->sin.sin_len = sizeof(struct sockaddr_in);
 		gsa->sin.sin_addr = mreqs.imr_multiaddr;
 
 		if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) {
 			ssa->sin.sin_family = AF_INET;
 			ssa->sin.sin_len = sizeof(struct sockaddr_in);
 			ssa->sin.sin_addr = mreqs.imr_sourceaddr;
 		}
 
 		/*
 		 * Attempt to look up hinted ifp from interface address.
 		 * Fallthrough with null ifp iff lookup fails, to
 		 * preserve 4.4BSD mcast API idempotence.
 		 * XXX NOTE WELL: The RFC 3678 API is preferred because
 		 * using an IPv4 address as a key is racy.
 		 */
 		if (!in_nullhost(mreqs.imr_interface)) {
 			IN_IFADDR_RLOCK(&in_ifa_tracker);
 			INADDR_TO_IFP(mreqs.imr_interface, ifp);
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 		}
 		CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p",
 		    __func__, ntohl(mreqs.imr_interface.s_addr), ifp);
 
 		break;
 
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 		if (sopt->sopt_name == MCAST_LEAVE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_req),
 			    sizeof(struct group_req));
 		} else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_source_req),
 			    sizeof(struct group_source_req));
 		}
 		if (error)
 			return (error);
 
 		if (gsa->sin.sin_family != AF_INET ||
 		    gsa->sin.sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 
 		if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
 			if (ssa->sin.sin_family != AF_INET ||
 			    ssa->sin.sin_len != sizeof(struct sockaddr_in))
 				return (EINVAL);
 		}
 
 		if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
 			return (EADDRNOTAVAIL);
 
 		ifp = ifnet_byindex(gsr.gsr_interface);
 
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 		break;
 
 	default:
 		CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 		return (EINVAL);
 
+	IN_MULTI_LOCK();
+
 	/*
-	 * Find the membership in the membership array.
+	 * Find the membership in the membership list.
 	 */
 	imo = inp_findmoptions(inp);
-	idx = imo_match_group(imo, ifp, &gsa->sa);
-	if (idx == -1) {
+	imf = imo_match_group(imo, ifp, &gsa->sa);
+	if (imf == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_inp_locked;
 	}
-	inm = imo->imo_membership[idx];
-	imf = &imo->imo_mfilters[idx];
+	inm = imf->imf_inm;
 
 	if (ssa->ss.ss_family != AF_UNSPEC)
-		is_final = 0;
+		is_final = false;
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * If we were instructed only to leave a given source, do so.
 	 * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships.
 	 */
 	if (is_final) {
+		ip_mfilter_remove(&imo->imo_head, imf);
 		imf_leave(imf);
 	} else {
 		if (imf->imf_st[0] == MCAST_EXCLUDE) {
 			error = EADDRNOTAVAIL;
 			goto out_inp_locked;
 		}
-		ims = imo_match_source(imo, idx, &ssa->sa);
+		ims = imo_match_source(imf, &ssa->sa);
 		if (ims == NULL) {
 			CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent",
 			    __func__, ntohl(ssa->sin.sin_addr.s_addr), "not ");
 			error = EADDRNOTAVAIL;
 			goto out_inp_locked;
 		}
 		CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block");
 		error = imf_prune(imf, &ssa->sin);
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: merge imf state failed",
 			    __func__);
 			goto out_inp_locked;
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 */
-	in_pcbref(inp);
-	INP_WUNLOCK(inp);
-	IN_MULTI_LOCK();
-
-	if (is_final) {
-		/*
-		 * Give up the multicast address record to which
-		 * the membership points.
-		 */
-		(void)in_leavegroup_locked(inm, imf);
-	} else {
+	if (!is_final) {
 		CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 		IN_MULTI_LIST_LOCK();
 		error = inm_merge(inm, imf);
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: failed to merge inm state",
 			    __func__);
 			IN_MULTI_LIST_UNLOCK();
-			goto out_in_multi_locked;
+			imf_rollback(imf);
+			imf_reap(imf);
+			goto out_inp_locked;
 		}
 
 		CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 		error = igmp_change_state(inm);
 		IN_MULTI_LIST_UNLOCK();
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: failed igmp downcall",
 			    __func__);
+			imf_rollback(imf);
+			imf_reap(imf);
+			goto out_inp_locked;
 		}
 	}
-
-out_in_multi_locked:
-
-	IN_MULTI_UNLOCK();
-	INP_WLOCK(inp);
-	if (in_pcbrele_wlocked(inp))
-		return (ENXIO);
-
-	if (error)
-		imf_rollback(imf);
-	else
-		imf_commit(imf);
-
+	imf_commit(imf);
 	imf_reap(imf);
 
-	if (is_final) {
-		/* Remove the gap in the membership and filter array. */
-		KASSERT(RB_EMPTY(&imf->imf_sources),
-		    ("%s: imf_sources not empty", __func__));
-		for (++idx; idx < imo->imo_num_memberships; ++idx) {
-			imo->imo_membership[idx - 1] = imo->imo_membership[idx];
-			imo->imo_mfilters[idx - 1] = imo->imo_mfilters[idx];
-		}
-		imf_init(&imo->imo_mfilters[idx - 1], MCAST_UNDEFINED,
-		    MCAST_EXCLUDE);
-		imo->imo_num_memberships--;
-	}
-
 out_inp_locked:
 	INP_WUNLOCK(inp);
+
+	if (is_final && imf) {
+		/*
+		 * Give up the multicast address record to which
+		 * the membership points.
+		 */
+		(void) in_leavegroup_locked(imf->imf_inm, imf);
+		ip_mfilter_free(imf);
+	}
+
+	IN_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Select the interface for transmitting IPv4 multicast datagrams.
  *
  * Either an instance of struct in_addr or an instance of struct ip_mreqn
  * may be passed to this socket option. An address of INADDR_ANY or an
  * interface index of 0 is used to remove a previous selection.
  * When no interface is selected, one is chosen for every send.
  */
 static int
 inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct rm_priotracker	 in_ifa_tracker;
 	struct in_addr		 addr;
 	struct ip_mreqn		 mreqn;
 	struct ifnet		*ifp;
 	struct ip_moptions	*imo;
 	int			 error;
 
 	if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) {
 		/*
 		 * An interface index was specified using the
 		 * Linux-derived ip_mreqn structure.
 		 */
 		error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn),
 		    sizeof(struct ip_mreqn));
 		if (error)
 			return (error);
 
 		if (mreqn.imr_ifindex < 0 || V_if_index < mreqn.imr_ifindex)
 			return (EINVAL);
 
 		if (mreqn.imr_ifindex == 0) {
 			ifp = NULL;
 		} else {
 			ifp = ifnet_byindex(mreqn.imr_ifindex);
 			if (ifp == NULL)
 				return (EADDRNOTAVAIL);
 		}
 	} else {
 		/*
 		 * An interface was specified by IPv4 address.
 		 * This is the traditional BSD usage.
 		 */
 		error = sooptcopyin(sopt, &addr, sizeof(struct in_addr),
 		    sizeof(struct in_addr));
 		if (error)
 			return (error);
 		if (in_nullhost(addr)) {
 			ifp = NULL;
 		} else {
 			IN_IFADDR_RLOCK(&in_ifa_tracker);
 			INADDR_TO_IFP(addr, ifp);
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 			if (ifp == NULL)
 				return (EADDRNOTAVAIL);
 		}
 		CTR3(KTR_IGMPV3, "%s: ifp = %p, addr = 0x%08x", __func__, ifp,
 		    ntohl(addr.s_addr));
 	}
 
 	/* Reject interfaces which do not support multicast. */
 	if (ifp != NULL && (ifp->if_flags & IFF_MULTICAST) == 0)
 		return (EOPNOTSUPP);
 
 	imo = inp_findmoptions(inp);
 	imo->imo_multicast_ifp = ifp;
 	imo->imo_multicast_addr.s_addr = INADDR_ANY;
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 /*
  * Atomically set source filters on a socket for an IPv4 multicast group.
  *
  * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
  */
 static int
 inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct __msfilterreq	 msfr;
 	sockunion_t		*gsa;
 	struct ifnet		*ifp;
 	struct in_mfilter	*imf;
 	struct ip_moptions	*imo;
 	struct in_multi		*inm;
-	size_t			 idx;
 	int			 error;
 
 	error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
 	    sizeof(struct __msfilterreq));
 	if (error)
 		return (error);
 
 	if (msfr.msfr_nsrcs > in_mcast_maxsocksrc)
 		return (ENOBUFS);
 
 	if ((msfr.msfr_fmode != MCAST_EXCLUDE &&
 	     msfr.msfr_fmode != MCAST_INCLUDE))
 		return (EINVAL);
 
 	if (msfr.msfr_group.ss_family != AF_INET ||
 	    msfr.msfr_group.ss_len != sizeof(struct sockaddr_in))
 		return (EINVAL);
 
 	gsa = (sockunion_t *)&msfr.msfr_group;
 	if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 		return (EINVAL);
 
 	gsa->sin.sin_port = 0;	/* ignore port */
 
 	if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
 		return (EADDRNOTAVAIL);
 
 	ifp = ifnet_byindex(msfr.msfr_ifindex);
 	if (ifp == NULL)
 		return (EADDRNOTAVAIL);
 
+	IN_MULTI_LOCK();
+
 	/*
 	 * Take the INP write lock.
 	 * Check if this socket is a member of this group.
 	 */
 	imo = inp_findmoptions(inp);
-	idx = imo_match_group(imo, ifp, &gsa->sa);
-	if (idx == -1 || imo->imo_mfilters == NULL) {
+	imf = imo_match_group(imo, ifp, &gsa->sa);
+	if (imf == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_inp_locked;
 	}
-	inm = imo->imo_membership[idx];
-	imf = &imo->imo_mfilters[idx];
+	inm = imf->imf_inm;
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	imf->imf_st[1] = msfr.msfr_fmode;
 
 	/*
 	 * Apply any new source filters, if present.
 	 * Make a copy of the user-space source vector so
 	 * that we may copy them with a single copyin. This
 	 * allows us to deal with page faults up-front.
 	 */
 	if (msfr.msfr_nsrcs > 0) {
 		struct in_msource	*lims;
 		struct sockaddr_in	*psin;
 		struct sockaddr_storage	*kss, *pkss;
 		int			 i;
 
 		INP_WUNLOCK(inp);
  
 		CTR2(KTR_IGMPV3, "%s: loading %lu source list entries",
 		    __func__, (unsigned long)msfr.msfr_nsrcs);
 		kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
 		    M_TEMP, M_WAITOK);
 		error = copyin(msfr.msfr_srcs, kss,
 		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
 		if (error) {
 			free(kss, M_TEMP);
 			return (error);
 		}
 
 		INP_WLOCK(inp);
 
 		/*
 		 * Mark all source filters as UNDEFINED at t1.
 		 * Restore new group filter mode, as imf_leave()
 		 * will set it to INCLUDE.
 		 */
 		imf_leave(imf);
 		imf->imf_st[1] = msfr.msfr_fmode;
 
 		/*
 		 * Update socket layer filters at t1, lazy-allocating
 		 * new entries. This saves a bunch of memory at the
 		 * cost of one RB_FIND() per source entry; duplicate
 		 * entries in the msfr_nsrcs vector are ignored.
 		 * If we encounter an error, rollback transaction.
 		 *
 		 * XXX This too could be replaced with a set-symmetric
 		 * difference like loop to avoid walking from root
 		 * every time, as the key space is common.
 		 */
 		for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) {
 			psin = (struct sockaddr_in *)pkss;
 			if (psin->sin_family != AF_INET) {
 				error = EAFNOSUPPORT;
 				break;
 			}
 			if (psin->sin_len != sizeof(struct sockaddr_in)) {
 				error = EINVAL;
 				break;
 			}
 			error = imf_get_source(imf, psin, &lims);
 			if (error)
 				break;
 			lims->imsl_st[1] = imf->imf_st[1];
 		}
 		free(kss, M_TEMP);
 	}
 
 	if (error)
 		goto out_imf_rollback;
 
 	INP_WLOCK_ASSERT(inp);
-	IN_MULTI_LOCK();
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 */
 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 	IN_MULTI_LIST_LOCK();
 	error = inm_merge(inm, imf);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__);
 		IN_MULTI_LIST_UNLOCK();
-		goto out_in_multi_locked;
+		goto out_imf_rollback;
 	}
 
 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 	error = igmp_change_state(inm);
 	IN_MULTI_LIST_UNLOCK();
 	if (error)
 		CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
 
-out_in_multi_locked:
-
-	IN_MULTI_UNLOCK();
-
 out_imf_rollback:
 	if (error)
 		imf_rollback(imf);
 	else
 		imf_commit(imf);
 
 	imf_reap(imf);
 
 out_inp_locked:
 	INP_WUNLOCK(inp);
+	IN_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Set the IP multicast options in response to user setsockopt().
  *
  * Many of the socket options handled in this function duplicate the
  * functionality of socket options in the regular unicast API. However,
  * it is not possible to merge the duplicate code, because the idempotence
  * of the IPv4 multicast part of the BSD Sockets API must be preserved;
  * the effects of these options must be treated as separate and distinct.
  *
  * SMPng: XXX: Unlocked read of inp_socket believed OK.
  * FUTURE: The IP_MULTICAST_VIF option may be eliminated if MROUTING
  * is refactored to no longer use vifs.
  */
 int
 inp_setmoptions(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ip_moptions	*imo;
 	int			 error;
 
 	error = 0;
 
 	/*
 	 * If socket is neither of type SOCK_RAW or SOCK_DGRAM,
 	 * or is a divert socket, reject it.
 	 */
 	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
 	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
 	     inp->inp_socket->so_proto->pr_type != SOCK_DGRAM))
 		return (EOPNOTSUPP);
 
 	switch (sopt->sopt_name) {
 	case IP_MULTICAST_VIF: {
 		int vifi;
 		/*
 		 * Select a multicast VIF for transmission.
 		 * Only useful if multicast forwarding is active.
 		 */
 		if (legal_vif_num == NULL) {
 			error = EOPNOTSUPP;
 			break;
 		}
 		error = sooptcopyin(sopt, &vifi, sizeof(int), sizeof(int));
 		if (error)
 			break;
 		if (!legal_vif_num(vifi) && (vifi != -1)) {
 			error = EINVAL;
 			break;
 		}
 		imo = inp_findmoptions(inp);
 		imo->imo_multicast_vif = vifi;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IP_MULTICAST_IF:
 		error = inp_set_multicast_if(inp, sopt);
 		break;
 
 	case IP_MULTICAST_TTL: {
 		u_char ttl;
 
 		/*
 		 * Set the IP time-to-live for outgoing multicast packets.
 		 * The original multicast API required a char argument,
 		 * which is inconsistent with the rest of the socket API.
 		 * We allow either a char or an int.
 		 */
 		if (sopt->sopt_valsize == sizeof(u_char)) {
 			error = sooptcopyin(sopt, &ttl, sizeof(u_char),
 			    sizeof(u_char));
 			if (error)
 				break;
 		} else {
 			u_int ittl;
 
 			error = sooptcopyin(sopt, &ittl, sizeof(u_int),
 			    sizeof(u_int));
 			if (error)
 				break;
 			if (ittl > 255) {
 				error = EINVAL;
 				break;
 			}
 			ttl = (u_char)ittl;
 		}
 		imo = inp_findmoptions(inp);
 		imo->imo_multicast_ttl = ttl;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IP_MULTICAST_LOOP: {
 		u_char loop;
 
 		/*
 		 * Set the loopback flag for outgoing multicast packets.
 		 * Must be zero or one.  The original multicast API required a
 		 * char argument, which is inconsistent with the rest
 		 * of the socket API.  We allow either a char or an int.
 		 */
 		if (sopt->sopt_valsize == sizeof(u_char)) {
 			error = sooptcopyin(sopt, &loop, sizeof(u_char),
 			    sizeof(u_char));
 			if (error)
 				break;
 		} else {
 			u_int iloop;
 
 			error = sooptcopyin(sopt, &iloop, sizeof(u_int),
 					    sizeof(u_int));
 			if (error)
 				break;
 			loop = (u_char)iloop;
 		}
 		imo = inp_findmoptions(inp);
 		imo->imo_multicast_loop = !!loop;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IP_ADD_MEMBERSHIP:
 	case IP_ADD_SOURCE_MEMBERSHIP:
 	case MCAST_JOIN_GROUP:
 	case MCAST_JOIN_SOURCE_GROUP:
 		error = inp_join_group(inp, sopt);
 		break;
 
 	case IP_DROP_MEMBERSHIP:
 	case IP_DROP_SOURCE_MEMBERSHIP:
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 		error = inp_leave_group(inp, sopt);
 		break;
 
 	case IP_BLOCK_SOURCE:
 	case IP_UNBLOCK_SOURCE:
 	case MCAST_BLOCK_SOURCE:
 	case MCAST_UNBLOCK_SOURCE:
 		error = inp_block_unblock_source(inp, sopt);
 		break;
 
 	case IP_MSFILTER:
 		error = inp_set_source_filters(inp, sopt);
 		break;
 
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 
 	INP_UNLOCK_ASSERT(inp);
 
 	return (error);
 }
 
 /*
  * Expose IGMP's multicast filter mode and source list(s) to userland,
  * keyed by (ifindex, group).
  * The filter mode is written out as a uint32_t, followed by
  * 0..n of struct in_addr.
  * For use by ifmcstat(8).
  * SMPng: NOTE: unlocked read of ifindex space.
  */
 static int
 sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS)
 {
 	struct in_addr			 src, group;
 	struct epoch_tracker		 et;
 	struct ifnet			*ifp;
 	struct ifmultiaddr		*ifma;
 	struct in_multi			*inm;
 	struct ip_msource		*ims;
 	int				*name;
 	int				 retval;
 	u_int				 namelen;
 	uint32_t			 fmode, ifindex;
 
 	name = (int *)arg1;
 	namelen = arg2;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	if (namelen != 2)
 		return (EINVAL);
 
 	ifindex = name[0];
 	if (ifindex <= 0 || ifindex > V_if_index) {
 		CTR2(KTR_IGMPV3, "%s: ifindex %u out of range",
 		    __func__, ifindex);
 		return (ENOENT);
 	}
 
 	group.s_addr = name[1];
 	if (!IN_MULTICAST(ntohl(group.s_addr))) {
 		CTR2(KTR_IGMPV3, "%s: group 0x%08x is not multicast",
 		    __func__, ntohl(group.s_addr));
 		return (EINVAL);
 	}
 
 	ifp = ifnet_byindex(ifindex);
 	if (ifp == NULL) {
 		CTR2(KTR_IGMPV3, "%s: no ifp for ifindex %u",
 		    __func__, ifindex);
 		return (ENOENT);
 	}
 
 	retval = sysctl_wire_old_buffer(req,
 	    sizeof(uint32_t) + (in_mcast_maxgrpsrc * sizeof(struct in_addr)));
 	if (retval)
 		return (retval);
 
 	IN_MULTI_LIST_LOCK();
 
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_INET ||
 		    ifma->ifma_protospec == NULL)
 			continue;
 		inm = (struct in_multi *)ifma->ifma_protospec;
 		if (!in_hosteq(inm->inm_addr, group))
 			continue;
 		fmode = inm->inm_st[1].iss_fmode;
 		retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t));
 		if (retval != 0)
 			break;
 		RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) {
 			CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__,
 			    ims->ims_haddr);
 			/*
 			 * Only copy-out sources which are in-mode.
 			 */
 			if (fmode != ims_get_mode(inm, ims, 1)) {
 				CTR1(KTR_IGMPV3, "%s: skip non-in-mode",
 				    __func__);
 				continue;
 			}
 			src.s_addr = htonl(ims->ims_haddr);
 			retval = SYSCTL_OUT(req, &src, sizeof(struct in_addr));
 			if (retval != 0)
 				break;
 		}
 	}
 	NET_EPOCH_EXIT(et);
 
 	IN_MULTI_LIST_UNLOCK();
 
 	return (retval);
 }
 
 #if defined(KTR) && (KTR_COMPILE & KTR_IGMPV3)
 
 static const char *inm_modestrs[] = {
 	[MCAST_UNDEFINED] = "un",
 	[MCAST_INCLUDE] = "in",
 	[MCAST_EXCLUDE] = "ex",
 };
 _Static_assert(MCAST_UNDEFINED == 0 &&
 	       MCAST_EXCLUDE + 1 == nitems(inm_modestrs),
 	       "inm_modestrs: no longer matches #defines");
 
 static const char *
 inm_mode_str(const int mode)
 {
 
 	if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE)
 		return (inm_modestrs[mode]);
 	return ("??");
 }
 
 static const char *inm_statestrs[] = {
 	[IGMP_NOT_MEMBER] = "not-member",
 	[IGMP_SILENT_MEMBER] = "silent",
 	[IGMP_REPORTING_MEMBER] = "reporting",
 	[IGMP_IDLE_MEMBER] = "idle",
 	[IGMP_LAZY_MEMBER] = "lazy",
 	[IGMP_SLEEPING_MEMBER] = "sleeping",
 	[IGMP_AWAKENING_MEMBER] = "awakening",
 	[IGMP_G_QUERY_PENDING_MEMBER] = "query-pending",
 	[IGMP_SG_QUERY_PENDING_MEMBER] = "sg-query-pending",
 	[IGMP_LEAVING_MEMBER] = "leaving",
 };
 _Static_assert(IGMP_NOT_MEMBER == 0 &&
 	       IGMP_LEAVING_MEMBER + 1 == nitems(inm_statestrs),
 	       "inm_statetrs: no longer matches #defines");
 
 static const char *
 inm_state_str(const int state)
 {
 
 	if (state >= IGMP_NOT_MEMBER && state <= IGMP_LEAVING_MEMBER)
 		return (inm_statestrs[state]);
 	return ("??");
 }
 
 /*
  * Dump an in_multi structure to the console.
  */
 void
 inm_print(const struct in_multi *inm)
 {
 	int t;
 	char addrbuf[INET_ADDRSTRLEN];
 
 	if ((ktr_mask & KTR_IGMPV3) == 0)
 		return;
 
 	printf("%s: --- begin inm %p ---\n", __func__, inm);
 	printf("addr %s ifp %p(%s) ifma %p\n",
 	    inet_ntoa_r(inm->inm_addr, addrbuf),
 	    inm->inm_ifp,
 	    inm->inm_ifp->if_xname,
 	    inm->inm_ifma);
 	printf("timer %u state %s refcount %u scq.len %u\n",
 	    inm->inm_timer,
 	    inm_state_str(inm->inm_state),
 	    inm->inm_refcount,
 	    inm->inm_scq.mq_len);
 	printf("igi %p nsrc %lu sctimer %u scrv %u\n",
 	    inm->inm_igi,
 	    inm->inm_nsrc,
 	    inm->inm_sctimer,
 	    inm->inm_scrv);
 	for (t = 0; t < 2; t++) {
 		printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t,
 		    inm_mode_str(inm->inm_st[t].iss_fmode),
 		    inm->inm_st[t].iss_asm,
 		    inm->inm_st[t].iss_ex,
 		    inm->inm_st[t].iss_in,
 		    inm->inm_st[t].iss_rec);
 	}
 	printf("%s: --- end inm %p ---\n", __func__, inm);
 }
 
 #else /* !KTR || !(KTR_COMPILE & KTR_IGMPV3) */
 
 void
 inm_print(const struct in_multi *inm)
 {
 
 }
 
 #endif /* KTR && (KTR_COMPILE & KTR_IGMPV3) */
 
 RB_GENERATE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp);
Index: head/sys/netinet/in_pcb.c
===================================================================
--- head/sys/netinet/in_pcb.c	(revision 349368)
+++ head/sys/netinet/in_pcb.c	(revision 349369)
@@ -1,3425 +1,3427 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1991, 1993, 1995
  *	The Regents of the University of California.
  * Copyright (c) 2007-2009 Robert N. M. Watson
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_pcb.c	8.4 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_ipsec.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ratelimit.h"
 #include "opt_pcbgroup.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/callout.h>
 #include <sys/eventhandler.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/rmlock.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/refcount.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/if_llatbl.h>
 #include <net/route.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
+#ifdef INET
+#include <netinet/in_var.h>
+#endif
 #include <netinet/ip_var.h>
 #include <netinet/tcp_var.h>
 #ifdef TCPHPTS
 #include <netinet/tcp_hpts.h>
 #endif
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
-#endif
-#ifdef INET
-#include <netinet/in_var.h>
-#endif
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #endif /* INET6 */
+#endif
 
 #include <netipsec/ipsec_support.h>
 
 #include <security/mac/mac_framework.h>
 
 #define	INPCBLBGROUP_SIZMIN	8
 #define	INPCBLBGROUP_SIZMAX	256
 
 static struct callout	ipport_tick_callout;
 
 /*
  * These configure the range of local port addresses assigned to
  * "unspecified" outgoing connections/packets/whatever.
  */
 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;	/* 1023 */
 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;	/* 600 */
 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;	/* 10000 */
 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;	/* 65535 */
 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;	/* 49152 */
 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;	/* 65535 */
 
 /*
  * Reserved ports accessible only to root. There are significant
  * security considerations that must be accounted for when changing these,
  * but the security benefits can be great. Please be careful.
  */
 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;	/* 1023 */
 VNET_DEFINE(int, ipport_reservedlow);
 
 /* Variables dealing with random ephemeral port allocation. */
 VNET_DEFINE(int, ipport_randomized) = 1;	/* user controlled via sysctl */
 VNET_DEFINE(int, ipport_randomcps) = 10;	/* user controlled via sysctl */
 VNET_DEFINE(int, ipport_randomtime) = 45;	/* user controlled via sysctl */
 VNET_DEFINE(int, ipport_stoprandom);		/* toggled by ipport_tick */
 VNET_DEFINE(int, ipport_tcpallocs);
 VNET_DEFINE_STATIC(int, ipport_tcplastcount);
 
 #define	V_ipport_tcplastcount		VNET(ipport_tcplastcount)
 
 static void	in_pcbremlists(struct inpcb *inp);
 #ifdef INET
 static struct inpcb	*in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
 			    struct in_addr faddr, u_int fport_arg,
 			    struct in_addr laddr, u_int lport_arg,
 			    int lookupflags, struct ifnet *ifp);
 
 #define RANGECHK(var, min, max) \
 	if ((var) < (min)) { (var) = (min); } \
 	else if ((var) > (max)) { (var) = (max); }
 
 static int
 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 
 	error = sysctl_handle_int(oidp, arg1, arg2, req);
 	if (error == 0) {
 		RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
 		RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
 		RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
 		RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
 		RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
 		RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
 	}
 	return (error);
 }
 
 #undef RANGECHK
 
 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0,
     "IP Ports");
 
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
 	&VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
 	&VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
 	&VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
 	&VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
 	&VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
 	&VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
 	CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
 	&VNET_NAME(ipport_reservedhigh), 0, "");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
 	CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
 	CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps,
 	CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
 	"allocations before switching to a sequental one");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
 	CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ipport_randomtime), 0,
 	"Minimum time to keep sequental port "
 	"allocation before switching to a random one");
 #endif /* INET */
 
 /*
  * in_pcb.c: manage the Protocol Control Blocks.
  *
  * NOTE: It is assumed that most of these functions will be called with
  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
  * functions often modify hash chains or addresses in pcbs.
  */
 
 static struct inpcblbgroup *
 in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
     uint16_t port, const union in_dependaddr *addr, int size)
 {
 	struct inpcblbgroup *grp;
 	size_t bytes;
 
 	bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
 	grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
 	if (!grp)
 		return (NULL);
 	grp->il_vflag = vflag;
 	grp->il_lport = port;
 	grp->il_dependladdr = *addr;
 	grp->il_inpsiz = size;
 	CK_LIST_INSERT_HEAD(hdr, grp, il_list);
 	return (grp);
 }
 
 static void
 in_pcblbgroup_free_deferred(epoch_context_t ctx)
 {
 	struct inpcblbgroup *grp;
 
 	grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
 	free(grp, M_PCB);
 }
 
 static void
 in_pcblbgroup_free(struct inpcblbgroup *grp)
 {
 
 	CK_LIST_REMOVE(grp, il_list);
 	epoch_call(net_epoch_preempt, &grp->il_epoch_ctx,
 	    in_pcblbgroup_free_deferred);
 }
 
 static struct inpcblbgroup *
 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
     struct inpcblbgroup *old_grp, int size)
 {
 	struct inpcblbgroup *grp;
 	int i;
 
 	grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
 	    old_grp->il_lport, &old_grp->il_dependladdr, size);
 	if (grp == NULL)
 		return (NULL);
 
 	KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
 	    ("invalid new local group size %d and old local group count %d",
 	     grp->il_inpsiz, old_grp->il_inpcnt));
 
 	for (i = 0; i < old_grp->il_inpcnt; ++i)
 		grp->il_inp[i] = old_grp->il_inp[i];
 	grp->il_inpcnt = old_grp->il_inpcnt;
 	in_pcblbgroup_free(old_grp);
 	return (grp);
 }
 
 /*
  * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i]
  * and shrink group if possible.
  */
 static void
 in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp,
     int i)
 {
 	struct inpcblbgroup *grp, *new_grp;
 
 	grp = *grpp;
 	for (; i + 1 < grp->il_inpcnt; ++i)
 		grp->il_inp[i] = grp->il_inp[i + 1];
 	grp->il_inpcnt--;
 
 	if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
 	    grp->il_inpcnt <= grp->il_inpsiz / 4) {
 		/* Shrink this group. */
 		new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2);
 		if (new_grp != NULL)
 			*grpp = new_grp;
 	}
 }
 
 /*
  * Add PCB to load balance group for SO_REUSEPORT_LB option.
  */
 static int
 in_pcbinslbgrouphash(struct inpcb *inp)
 {
 	const static struct timeval interval = { 60, 0 };
 	static struct timeval lastprint;
 	struct inpcbinfo *pcbinfo;
 	struct inpcblbgrouphead *hdr;
 	struct inpcblbgroup *grp;
 	uint32_t idx;
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	/*
 	 * Don't allow jailed socket to join local group.
 	 */
 	if (inp->inp_socket != NULL && jailed(inp->inp_socket->so_cred))
 		return (0);
 
 #ifdef INET6
 	/*
 	 * Don't allow IPv4 mapped INET6 wild socket.
 	 */
 	if ((inp->inp_vflag & INP_IPV4) &&
 	    inp->inp_laddr.s_addr == INADDR_ANY &&
 	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
 		return (0);
 	}
 #endif
 
 	idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
 	hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
 	CK_LIST_FOREACH(grp, hdr, il_list) {
 		if (grp->il_vflag == inp->inp_vflag &&
 		    grp->il_lport == inp->inp_lport &&
 		    memcmp(&grp->il_dependladdr,
 		    &inp->inp_inc.inc_ie.ie_dependladdr,
 		    sizeof(grp->il_dependladdr)) == 0)
 			break;
 	}
 	if (grp == NULL) {
 		/* Create new load balance group. */
 		grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
 		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
 		    INPCBLBGROUP_SIZMIN);
 		if (grp == NULL)
 			return (ENOBUFS);
 	} else if (grp->il_inpcnt == grp->il_inpsiz) {
 		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
 			if (ratecheck(&lastprint, &interval))
 				printf("lb group port %d, limit reached\n",
 				    ntohs(grp->il_lport));
 			return (0);
 		}
 
 		/* Expand this local group. */
 		grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
 		if (grp == NULL)
 			return (ENOBUFS);
 	}
 
 	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
 	    ("invalid local group size %d and count %d", grp->il_inpsiz,
 	    grp->il_inpcnt));
 
 	grp->il_inp[grp->il_inpcnt] = inp;
 	grp->il_inpcnt++;
 	return (0);
 }
 
 /*
  * Remove PCB from load balance group.
  */
 static void
 in_pcbremlbgrouphash(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo;
 	struct inpcblbgrouphead *hdr;
 	struct inpcblbgroup *grp;
 	int i;
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	hdr = &pcbinfo->ipi_lbgrouphashbase[
 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
 	CK_LIST_FOREACH(grp, hdr, il_list) {
 		for (i = 0; i < grp->il_inpcnt; ++i) {
 			if (grp->il_inp[i] != inp)
 				continue;
 
 			if (grp->il_inpcnt == 1) {
 				/* We are the last, free this local group. */
 				in_pcblbgroup_free(grp);
 			} else {
 				/* Pull up inpcbs, shrink group if possible. */
 				in_pcblbgroup_reorder(hdr, &grp, i);
 			}
 			return;
 		}
 	}
 }
 
 /*
  * Different protocols initialize their inpcbs differently - giving
  * different name to the lock.  But they all are disposed the same.
  */
 static void
 inpcb_fini(void *mem, int size)
 {
 	struct inpcb *inp = mem;
 
 	INP_LOCK_DESTROY(inp);
 }
 
 /*
  * Initialize an inpcbinfo -- we should be able to reduce the number of
  * arguments in time.
  */
 void
 in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
     struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
     char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields)
 {
 
 	porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
 
 	INP_INFO_LOCK_INIT(pcbinfo, name);
 	INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash");	/* XXXRW: argument? */
 	INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist");
 #ifdef VIMAGE
 	pcbinfo->ipi_vnet = curvnet;
 #endif
 	pcbinfo->ipi_listhead = listhead;
 	CK_LIST_INIT(pcbinfo->ipi_listhead);
 	pcbinfo->ipi_count = 0;
 	pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
 	    &pcbinfo->ipi_hashmask);
 	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
 	    &pcbinfo->ipi_porthashmask);
 	pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
 	    &pcbinfo->ipi_lbgrouphashmask);
 #ifdef PCBGROUP
 	in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
 #endif
 	pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
 	    NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0);
 	uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
 	uma_zone_set_warning(pcbinfo->ipi_zone,
 	    "kern.ipc.maxsockets limit reached");
 }
 
 /*
  * Destroy an inpcbinfo.
  */
 void
 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
 {
 
 	KASSERT(pcbinfo->ipi_count == 0,
 	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));
 
 	hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
 	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
 	    pcbinfo->ipi_porthashmask);
 	hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
 	    pcbinfo->ipi_lbgrouphashmask);
 #ifdef PCBGROUP
 	in_pcbgroup_destroy(pcbinfo);
 #endif
 	uma_zdestroy(pcbinfo->ipi_zone);
 	INP_LIST_LOCK_DESTROY(pcbinfo);
 	INP_HASH_LOCK_DESTROY(pcbinfo);
 	INP_INFO_LOCK_DESTROY(pcbinfo);
 }
 
 /*
  * Allocate a PCB and associate it with the socket.
  * On success return with the PCB locked.
  */
 int
 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
 {
 	struct inpcb *inp;
 	int error;
 
 #ifdef INVARIANTS
 	if (pcbinfo == &V_tcbinfo) {
 		INP_INFO_RLOCK_ASSERT(pcbinfo);
 	} else {
 		INP_INFO_WLOCK_ASSERT(pcbinfo);
 	}
 #endif
 
 	error = 0;
 	inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
 	if (inp == NULL)
 		return (ENOBUFS);
 	bzero(&inp->inp_start_zero, inp_zero_size);
 #ifdef NUMA
 	inp->inp_numa_domain = M_NODOM;
 #endif
 	inp->inp_pcbinfo = pcbinfo;
 	inp->inp_socket = so;
 	inp->inp_cred = crhold(so->so_cred);
 	inp->inp_inc.inc_fibnum = so->so_fibnum;
 #ifdef MAC
 	error = mac_inpcb_init(inp, M_NOWAIT);
 	if (error != 0)
 		goto out;
 	mac_inpcb_create(so, inp);
 #endif
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	error = ipsec_init_pcbpolicy(inp);
 	if (error != 0) {
 #ifdef MAC
 		mac_inpcb_destroy(inp);
 #endif
 		goto out;
 	}
 #endif /*IPSEC*/
 #ifdef INET6
 	if (INP_SOCKAF(so) == AF_INET6) {
 		inp->inp_vflag |= INP_IPV6PROTO;
 		if (V_ip6_v6only)
 			inp->inp_flags |= IN6P_IPV6_V6ONLY;
 	}
 #endif
 	INP_WLOCK(inp);
 	INP_LIST_WLOCK(pcbinfo);
 	CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
 	pcbinfo->ipi_count++;
 	so->so_pcb = (caddr_t)inp;
 #ifdef INET6
 	if (V_ip6_auto_flowlabel)
 		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
 #endif
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 	refcount_init(&inp->inp_refcount, 1);	/* Reference from inpcbinfo */
 
 	/*
 	 * Routes in inpcb's can cache L2 as well; they are guaranteed
 	 * to be cleaned up.
 	 */
 	inp->inp_route.ro_flags = RT_LLE_CACHE;
 	INP_LIST_WUNLOCK(pcbinfo);
 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
 out:
 	if (error != 0) {
 		crfree(inp->inp_cred);
 		uma_zfree(pcbinfo->ipi_zone, inp);
 	}
 #endif
 	return (error);
 }
 
 #ifdef INET
 int
 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
 {
 	int anonport, error;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
 		return (EINVAL);
 	anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0;
 	error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
 	    &inp->inp_lport, cred);
 	if (error)
 		return (error);
 	if (in_pcbinshash(inp) != 0) {
 		inp->inp_laddr.s_addr = INADDR_ANY;
 		inp->inp_lport = 0;
 		return (EAGAIN);
 	}
 	if (anonport)
 		inp->inp_flags |= INP_ANONPORT;
 	return (0);
 }
 #endif
 
 /*
  * Select a local port (number) to use.
  */
 #if defined(INET) || defined(INET6)
 int
 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
     struct ucred *cred, int lookupflags)
 {
 	struct inpcbinfo *pcbinfo;
 	struct inpcb *tmpinp;
 	unsigned short *lastport;
 	int count, dorandom, error;
 	u_short aux, first, last, lport;
 #ifdef INET
 	struct in_addr laddr;
 #endif
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	/*
 	 * Because no actual state changes occur here, a global write lock on
 	 * the pcbinfo isn't required.
 	 */
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if (inp->inp_flags & INP_HIGHPORT) {
 		first = V_ipport_hifirstauto;	/* sysctl */
 		last  = V_ipport_hilastauto;
 		lastport = &pcbinfo->ipi_lasthi;
 	} else if (inp->inp_flags & INP_LOWPORT) {
 		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
 		if (error)
 			return (error);
 		first = V_ipport_lowfirstauto;	/* 1023 */
 		last  = V_ipport_lowlastauto;	/* 600 */
 		lastport = &pcbinfo->ipi_lastlow;
 	} else {
 		first = V_ipport_firstauto;	/* sysctl */
 		last  = V_ipport_lastauto;
 		lastport = &pcbinfo->ipi_lastport;
 	}
 	/*
 	 * For UDP(-Lite), use random port allocation as long as the user
 	 * allows it.  For TCP (and as of yet unknown) connections,
 	 * use random port allocation only if the user allows it AND
 	 * ipport_tick() allows it.
 	 */
 	if (V_ipport_randomized &&
 		(!V_ipport_stoprandom || pcbinfo == &V_udbinfo ||
 		pcbinfo == &V_ulitecbinfo))
 		dorandom = 1;
 	else
 		dorandom = 0;
 	/*
 	 * It makes no sense to do random port allocation if
 	 * we have the only port available.
 	 */
 	if (first == last)
 		dorandom = 0;
 	/* Make sure to not include UDP(-Lite) packets in the count. */
 	if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo)
 		V_ipport_tcpallocs++;
 	/*
 	 * Instead of having two loops further down counting up or down
 	 * make sure that first is always <= last and go with only one
 	 * code path implementing all logic.
 	 */
 	if (first > last) {
 		aux = first;
 		first = last;
 		last = aux;
 	}
 
 #ifdef INET
 	/* Make the compiler happy. */
 	laddr.s_addr = 0;
 	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
 		KASSERT(laddrp != NULL, ("%s: laddrp NULL for v4 inp %p",
 		    __func__, inp));
 		laddr = *laddrp;
 	}
 #endif
 	tmpinp = NULL;	/* Make compiler happy. */
 	lport = *lportp;
 
 	if (dorandom)
 		*lastport = first + (arc4random() % (last - first));
 
 	count = last - first;
 
 	do {
 		if (count-- < 0)	/* completely used? */
 			return (EADDRNOTAVAIL);
 		++*lastport;
 		if (*lastport < first || *lastport > last)
 			*lastport = first;
 		lport = htons(*lastport);
 
 #ifdef INET6
 		if ((inp->inp_vflag & INP_IPV6) != 0)
 			tmpinp = in6_pcblookup_local(pcbinfo,
 			    &inp->in6p_laddr, lport, lookupflags, cred);
 #endif
 #if defined(INET) && defined(INET6)
 		else
 #endif
 #ifdef INET
 			tmpinp = in_pcblookup_local(pcbinfo, laddr,
 			    lport, lookupflags, cred);
 #endif
 	} while (tmpinp != NULL);
 
 #ifdef INET
 	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4)
 		laddrp->s_addr = laddr.s_addr;
 #endif
 	*lportp = lport;
 
 	return (0);
 }
 
 /*
  * Return cached socket options.
  */
 int
 inp_so_options(const struct inpcb *inp)
 {
 	int so_options;
 
 	so_options = 0;
 
 	if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
 		so_options |= SO_REUSEPORT_LB;
 	if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
 		so_options |= SO_REUSEPORT;
 	if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
 		so_options |= SO_REUSEADDR;
 	return (so_options);
 }
 #endif /* INET || INET6 */
 
 /*
  * Check if a new BINDMULTI socket is allowed to be created.
  *
  * ni points to the new inp.
  * oi points to the exisitng inp.
  *
  * This checks whether the existing inp also has BINDMULTI and
  * whether the credentials match.
  */
 int
 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
 {
 	/* Check permissions match */
 	if ((ni->inp_flags2 & INP_BINDMULTI) &&
 	    (ni->inp_cred->cr_uid !=
 	    oi->inp_cred->cr_uid))
 		return (0);
 
 	/* Check the existing inp has BINDMULTI set */
 	if ((ni->inp_flags2 & INP_BINDMULTI) &&
 	    ((oi->inp_flags2 & INP_BINDMULTI) == 0))
 		return (0);
 
 	/*
 	 * We're okay - either INP_BINDMULTI isn't set on ni, or
 	 * it is and it matches the checks.
 	 */
 	return (1);
 }
 
 #ifdef INET
 /*
  * Set up a bind operation on a PCB, performing port allocation
  * as required, but do not actually modify the PCB. Callers can
  * either complete the bind by setting inp_laddr/inp_lport and
  * calling in_pcbinshash(), or they can just use the resulting
  * port and address to authorise the sending of a once-off packet.
  *
  * On error, the values of *laddrp and *lportp are not changed.
  */
 int
 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
     u_short *lportp, struct ucred *cred)
 {
 	struct socket *so = inp->inp_socket;
 	struct sockaddr_in *sin;
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct in_addr laddr;
 	u_short lport = 0;
 	int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
 	int error;
 
 	/*
 	 * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
 	 * so that we don't have to add to the (already messy) code below.
 	 */
 	int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
 
 	/*
 	 * No state changes, so read locks are sufficient here.
 	 */
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if (CK_STAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
 		return (EADDRNOTAVAIL);
 	laddr.s_addr = *laddrp;
 	if (nam != NULL && laddr.s_addr != INADDR_ANY)
 		return (EINVAL);
 	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
 		lookupflags = INPLOOKUP_WILDCARD;
 	if (nam == NULL) {
 		if ((error = prison_local_ip4(cred, &laddr)) != 0)
 			return (error);
 	} else {
 		sin = (struct sockaddr_in *)nam;
 		if (nam->sa_len != sizeof (*sin))
 			return (EINVAL);
 #ifdef notdef
 		/*
 		 * We should check the family, but old programs
 		 * incorrectly fail to initialize it.
 		 */
 		if (sin->sin_family != AF_INET)
 			return (EAFNOSUPPORT);
 #endif
 		error = prison_local_ip4(cred, &sin->sin_addr);
 		if (error)
 			return (error);
 		if (sin->sin_port != *lportp) {
 			/* Don't allow the port to change. */
 			if (*lportp != 0)
 				return (EINVAL);
 			lport = sin->sin_port;
 		}
 		/* NB: lport is left as 0 if the port isn't being changed. */
 		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
 			/*
 			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
 			 * allow complete duplication of binding if
 			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
 			 * and a multicast address is bound on both
 			 * new and duplicated sockets.
 			 */
 			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
 				reuseport = SO_REUSEADDR|SO_REUSEPORT;
 			/*
 			 * XXX: How to deal with SO_REUSEPORT_LB here?
 			 * Treat same as SO_REUSEPORT for now.
 			 */
 			if ((so->so_options &
 			    (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
 				reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
 		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
 			sin->sin_port = 0;		/* yech... */
 			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
 			/*
 			 * Is the address a local IP address?
 			 * If INP_BINDANY is set, then the socket may be bound
 			 * to any endpoint address, local or not.
 			 */
 			if ((inp->inp_flags & INP_BINDANY) == 0 &&
 			    ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
 				return (EADDRNOTAVAIL);
 		}
 		laddr = sin->sin_addr;
 		if (lport) {
 			struct inpcb *t;
 			struct tcptw *tw;
 
 			/* GROSS */
 			if (ntohs(lport) <= V_ipport_reservedhigh &&
 			    ntohs(lport) >= V_ipport_reservedlow &&
 			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
 				return (EACCES);
 			if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
 			    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
 				t = in_pcblookup_local(pcbinfo, sin->sin_addr,
 				    lport, INPLOOKUP_WILDCARD, cred);
 	/*
 	 * XXX
 	 * This entire block sorely needs a rewrite.
 	 */
 				if (t &&
 				    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
 				    ((t->inp_flags & INP_TIMEWAIT) == 0) &&
 				    (so->so_type != SOCK_STREAM ||
 				     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
 				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
 				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
 				     (t->inp_flags2 & INP_REUSEPORT) ||
 				     (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
 				    (inp->inp_cred->cr_uid !=
 				     t->inp_cred->cr_uid))
 					return (EADDRINUSE);
 
 				/*
 				 * If the socket is a BINDMULTI socket, then
 				 * the credentials need to match and the
 				 * original socket also has to have been bound
 				 * with BINDMULTI.
 				 */
 				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 					return (EADDRINUSE);
 			}
 			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
 			    lport, lookupflags, cred);
 			if (t && (t->inp_flags & INP_TIMEWAIT)) {
 				/*
 				 * XXXRW: If an incpb has had its timewait
 				 * state recycled, we treat the address as
 				 * being in use (for now).  This is better
 				 * than a panic, but not desirable.
 				 */
 				tw = intotw(t);
 				if (tw == NULL ||
 				    ((reuseport & tw->tw_so_options) == 0 &&
 					(reuseport_lb &
 				            tw->tw_so_options) == 0)) {
 					return (EADDRINUSE);
 				}
 			} else if (t &&
 				   ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
 				   (reuseport & inp_so_options(t)) == 0 &&
 				   (reuseport_lb & inp_so_options(t)) == 0) {
 #ifdef INET6
 				if (ntohl(sin->sin_addr.s_addr) !=
 				    INADDR_ANY ||
 				    ntohl(t->inp_laddr.s_addr) !=
 				    INADDR_ANY ||
 				    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
 				    (t->inp_vflag & INP_IPV6PROTO) == 0)
 #endif
 						return (EADDRINUSE);
 				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 					return (EADDRINUSE);
 			}
 		}
 	}
 	if (*lportp != 0)
 		lport = *lportp;
 	if (lport == 0) {
 		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
 		if (error != 0)
 			return (error);
 
 	}
 	*laddrp = laddr.s_addr;
 	*lportp = lport;
 	return (0);
 }
 
 /*
  * Connect from a socket to a specified address.
  * Both address and port must be specified in argument sin.
  * If don't have a local address for this socket yet,
  * then pick one.
  */
 int
 in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam,
     struct ucred *cred, struct mbuf *m)
 {
 	u_short lport, fport;
 	in_addr_t laddr, faddr;
 	int anonport, error;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	lport = inp->inp_lport;
 	laddr = inp->inp_laddr.s_addr;
 	anonport = (lport == 0);
 	error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
 	    NULL, cred);
 	if (error)
 		return (error);
 
 	/* Do the initial binding of the local address if required. */
 	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
 		inp->inp_lport = lport;
 		inp->inp_laddr.s_addr = laddr;
 		if (in_pcbinshash(inp) != 0) {
 			inp->inp_laddr.s_addr = INADDR_ANY;
 			inp->inp_lport = 0;
 			return (EAGAIN);
 		}
 	}
 
 	/* Commit the remaining changes. */
 	inp->inp_lport = lport;
 	inp->inp_laddr.s_addr = laddr;
 	inp->inp_faddr.s_addr = faddr;
 	inp->inp_fport = fport;
 	in_pcbrehash_mbuf(inp, m);
 
 	if (anonport)
 		inp->inp_flags |= INP_ANONPORT;
 	return (0);
 }
 
 int
 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
 {
 
 	return (in_pcbconnect_mbuf(inp, nam, cred, NULL));
 }
 
 /*
  * Do proper source address selection on an unbound socket in case
  * of connect. Take jails into account as well.
  */
 int
 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
     struct ucred *cred)
 {
 	struct ifaddr *ifa;
 	struct sockaddr *sa;
 	struct sockaddr_in *sin;
 	struct route sro;
 	struct epoch_tracker et;
 	int error;
 
 	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
 	/*
 	 * Bypass source address selection and use the primary jail IP
 	 * if requested.
 	 */
 	if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
 		return (0);
 
 	error = 0;
 	bzero(&sro, sizeof(sro));
 
 	sin = (struct sockaddr_in *)&sro.ro_dst;
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(struct sockaddr_in);
 	sin->sin_addr.s_addr = faddr->s_addr;
 
 	/*
 	 * If route is known our src addr is taken from the i/f,
 	 * else punt.
 	 *
 	 * Find out route to destination.
 	 */
 	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
 		in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum);
 
 	/*
 	 * If we found a route, use the address corresponding to
 	 * the outgoing interface.
 	 * 
 	 * Otherwise assume faddr is reachable on a directly connected
 	 * network and try to find a corresponding interface to take
 	 * the source address from.
 	 */
 	NET_EPOCH_ENTER(et);
 	if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) {
 		struct in_ifaddr *ia;
 		struct ifnet *ifp;
 
 		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
 					inp->inp_socket->so_fibnum));
 		if (ia == NULL) {
 			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
 						inp->inp_socket->so_fibnum));
 
 		}
 		if (ia == NULL) {
 			error = ENETUNREACH;
 			goto done;
 		}
 
 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		ifp = ia->ia_ifp;
 		ia = NULL;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			sin = (struct sockaddr_in *)sa;
 			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 				ia = (struct in_ifaddr *)ifa;
 				break;
 			}
 		}
 		if (ia != NULL) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 	/*
 	 * If the outgoing interface on the route found is not
 	 * a loopback interface, use the address from that interface.
 	 * In case of jails do those three steps:
 	 * 1. check if the interface address belongs to the jail. If so use it.
 	 * 2. check if we have any address on the outgoing interface
 	 *    belonging to this jail. If so use it.
 	 * 3. as a last resort return the 'default' jail address.
 	 */
 	if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) {
 		struct in_ifaddr *ia;
 		struct ifnet *ifp;
 
 		/* If not jailed, use the default returned. */
 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 			ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* Jailed. */
 		/* 1. Check if the iface address belongs to the jail. */
 		sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr;
 		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 			ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/*
 		 * 2. Check if we have any address on the outgoing interface
 		 *    belonging to this jail.
 		 */
 		ia = NULL;
 		ifp = sro.ro_rt->rt_ifp;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			sin = (struct sockaddr_in *)sa;
 			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 				ia = (struct in_ifaddr *)ifa;
 				break;
 			}
 		}
 		if (ia != NULL) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 	/*
 	 * The outgoing interface is marked with 'loopback net', so a route
 	 * to ourselves is here.
 	 * Try to find the interface of the destination address and then
 	 * take the address from there. That interface is not necessarily
 	 * a loopback interface.
 	 * In case of jails, check that it is an address of the jail
 	 * and if we cannot find, fall back to the 'default' jail address.
 	 */
 	if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
 		struct sockaddr_in sain;
 		struct in_ifaddr *ia;
 
 		bzero(&sain, sizeof(struct sockaddr_in));
 		sain.sin_family = AF_INET;
 		sain.sin_len = sizeof(struct sockaddr_in);
 		sain.sin_addr.s_addr = faddr->s_addr;
 
 		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain),
 					inp->inp_socket->so_fibnum));
 		if (ia == NULL)
 			ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0,
 						inp->inp_socket->so_fibnum));
 		if (ia == NULL)
 			ia = ifatoia(ifa_ifwithaddr(sintosa(&sain)));
 
 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 			if (ia == NULL) {
 				error = ENETUNREACH;
 				goto done;
 			}
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* Jailed. */
 		if (ia != NULL) {
 			struct ifnet *ifp;
 
 			ifp = ia->ia_ifp;
 			ia = NULL;
 			CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 				sa = ifa->ifa_addr;
 				if (sa->sa_family != AF_INET)
 					continue;
 				sin = (struct sockaddr_in *)sa;
 				if (prison_check_ip4(cred,
 				    &sin->sin_addr) == 0) {
 					ia = (struct in_ifaddr *)ifa;
 					break;
 				}
 			}
 			if (ia != NULL) {
 				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 				goto done;
 			}
 		}
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 done:
 	NET_EPOCH_EXIT(et);
 	if (sro.ro_rt != NULL)
 		RTFREE(sro.ro_rt);
 	return (error);
 }
 
 /*
  * Set up for a connect from a socket to the specified address.
  * On entry, *laddrp and *lportp should contain the current local
  * address and port for the PCB; these are updated to the values
  * that should be placed in inp_laddr and inp_lport to complete
  * the connect.
  *
  * On success, *faddrp and *fportp will be set to the remote address
  * and port. These are not updated in the error case.
  *
  * If the operation fails because the connection already exists,
  * *oinpp will be set to the PCB of that connection so that the
  * caller can decide to override it. In all other cases, *oinpp
  * is set to NULL.
  */
 int
 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
     in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
     struct inpcb **oinpp, struct ucred *cred)
 {
 	struct rm_priotracker in_ifa_tracker;
 	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
 	struct in_ifaddr *ia;
 	struct inpcb *oinp;
 	struct in_addr laddr, faddr;
 	u_short lport, fport;
 	int error;
 
 	/*
 	 * Because a global state change doesn't actually occur here, a read
 	 * lock is sufficient.
 	 */
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
 
 	if (oinpp != NULL)
 		*oinpp = NULL;
 	if (nam->sa_len != sizeof (*sin))
 		return (EINVAL);
 	if (sin->sin_family != AF_INET)
 		return (EAFNOSUPPORT);
 	if (sin->sin_port == 0)
 		return (EADDRNOTAVAIL);
 	laddr.s_addr = *laddrp;
 	lport = *lportp;
 	faddr = sin->sin_addr;
 	fport = sin->sin_port;
 
 	if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
 		/*
 		 * If the destination address is INADDR_ANY,
 		 * use the primary local address.
 		 * If the supplied address is INADDR_BROADCAST,
 		 * and the primary interface supports broadcast,
 		 * choose the broadcast address for that interface.
 		 */
 		if (faddr.s_addr == INADDR_ANY) {
 			IN_IFADDR_RLOCK(&in_ifa_tracker);
 			faddr =
 			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 			if (cred != NULL &&
 			    (error = prison_get_ip4(cred, &faddr)) != 0)
 				return (error);
 		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
 			IN_IFADDR_RLOCK(&in_ifa_tracker);
 			if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
 			    IFF_BROADCAST)
 				faddr = satosin(&CK_STAILQ_FIRST(
 				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 		}
 	}
 	if (laddr.s_addr == INADDR_ANY) {
 		error = in_pcbladdr(inp, &faddr, &laddr, cred);
 		/*
 		 * If the destination address is multicast and an outgoing
 		 * interface has been set as a multicast option, prefer the
 		 * address of that interface as our source address.
 		 */
 		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
 		    inp->inp_moptions != NULL) {
 			struct ip_moptions *imo;
 			struct ifnet *ifp;
 
 			imo = inp->inp_moptions;
 			if (imo->imo_multicast_ifp != NULL) {
 				ifp = imo->imo_multicast_ifp;
 				IN_IFADDR_RLOCK(&in_ifa_tracker);
 				CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
 					if ((ia->ia_ifp == ifp) &&
 					    (cred == NULL ||
 					    prison_check_ip4(cred,
 					    &ia->ia_addr.sin_addr) == 0))
 						break;
 				}
 				if (ia == NULL)
 					error = EADDRNOTAVAIL;
 				else {
 					laddr = ia->ia_addr.sin_addr;
 					error = 0;
 				}
 				IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 			}
 		}
 		if (error)
 			return (error);
 	}
 	oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport,
 	    laddr, lport, 0, NULL);
 	if (oinp != NULL) {
 		if (oinpp != NULL)
 			*oinpp = oinp;
 		return (EADDRINUSE);
 	}
 	if (lport == 0) {
 		error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
 		    cred);
 		if (error)
 			return (error);
 	}
 	*laddrp = laddr.s_addr;
 	*lportp = lport;
 	*faddrp = faddr.s_addr;
 	*fportp = fport;
 	return (0);
 }
 
 void
 in_pcbdisconnect(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	inp->inp_faddr.s_addr = INADDR_ANY;
 	inp->inp_fport = 0;
 	in_pcbrehash(inp);
 }
 #endif /* INET */
 
 /*
  * in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
  * For most protocols, this will be invoked immediately prior to calling
  * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
  * socket, in which case in_pcbfree() is deferred.
  */
 void
 in_pcbdetach(struct inpcb *inp)
 {
 
 	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
 
 #ifdef RATELIMIT
 	if (inp->inp_snd_tag != NULL)
 		in_pcbdetach_txrtlmt(inp);
 #endif
 	inp->inp_socket->so_pcb = NULL;
 	inp->inp_socket = NULL;
 }
 
 /*
  * in_pcbref() bumps the reference count on an inpcb in order to maintain
  * stability of an inpcb pointer despite the inpcb lock being released.  This
  * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
  * but where the inpcb lock may already held, or when acquiring a reference
  * via a pcbgroup.
  *
  * in_pcbref() should be used only to provide brief memory stability, and
  * must always be followed by a call to INP_WLOCK() and in_pcbrele() to
  * garbage collect the inpcb if it has been in_pcbfree()'d from another
  * context.  Until in_pcbrele() has returned that the inpcb is still valid,
  * lock and rele are the *only* safe operations that may be performed on the
  * inpcb.
  *
  * While the inpcb will not be freed, releasing the inpcb lock means that the
  * connection's state may change, so the caller should be careful to
  * revalidate any cached state on reacquiring the lock.  Drop the reference
  * using in_pcbrele().
  */
 void
 in_pcbref(struct inpcb *inp)
 {
 
 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 
 	refcount_acquire(&inp->inp_refcount);
 }
 
 /*
  * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
  * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
  * return a flag indicating whether or not the inpcb remains valid.  If it is
  * valid, we return with the inpcb lock held.
  *
  * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a
  * reference on an inpcb.  Historically more work was done here (actually, in
  * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the
  * need for the pcbinfo lock in in_pcbrele().  Deferring the free is entirely
  * about memory stability (and continued use of the write lock).
  */
 int
 in_pcbrele_rlocked(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo;
 
 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 
 	INP_RLOCK_ASSERT(inp);
 
 	if (refcount_release(&inp->inp_refcount) == 0) {
 		/*
 		 * If the inpcb has been freed, let the caller know, even if
 		 * this isn't the last reference.
 		 */
 		if (inp->inp_flags2 & INP_FREED) {
 			INP_RUNLOCK(inp);
 			return (1);
 		}
 		return (0);
 	}
 	
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 #ifdef TCPHPTS
 	if (inp->inp_in_hpts || inp->inp_in_input) {
 		struct tcp_hpts_entry *hpts;
 		/*
 		 * We should not be on the hpts at 
 		 * this point in any form. we must
 		 * get the lock to be sure.
 		 */
 		hpts = tcp_hpts_lock(inp);
 		if (inp->inp_in_hpts)
 			panic("Hpts:%p inp:%p at free still on hpts",
 			      hpts, inp);
 		mtx_unlock(&hpts->p_mtx);
 		hpts = tcp_input_lock(inp);
 		if (inp->inp_in_input) 
 			panic("Hpts:%p inp:%p at free still on input hpts",
 			      hpts, inp);
 		mtx_unlock(&hpts->p_mtx);
 	}
 #endif
 	INP_RUNLOCK(inp);
 	pcbinfo = inp->inp_pcbinfo;
 	uma_zfree(pcbinfo->ipi_zone, inp);
 	return (1);
 }
 
 int
 in_pcbrele_wlocked(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo;
 
 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 
 	INP_WLOCK_ASSERT(inp);
 
 	if (refcount_release(&inp->inp_refcount) == 0) {
 		/*
 		 * If the inpcb has been freed, let the caller know, even if
 		 * this isn't the last reference.
 		 */
 		if (inp->inp_flags2 & INP_FREED) {
 			INP_WUNLOCK(inp);
 			return (1);
 		}
 		return (0);
 	}
 
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 #ifdef TCPHPTS
 	if (inp->inp_in_hpts || inp->inp_in_input) {
 		struct tcp_hpts_entry *hpts;
 		/*
 		 * We should not be on the hpts at 
 		 * this point in any form. we must
 		 * get the lock to be sure.
 		 */
 		hpts = tcp_hpts_lock(inp);
 		if (inp->inp_in_hpts)
 			panic("Hpts:%p inp:%p at free still on hpts",
 			      hpts, inp);
 		mtx_unlock(&hpts->p_mtx);
 		hpts = tcp_input_lock(inp);
 		if (inp->inp_in_input) 
 			panic("Hpts:%p inp:%p at free still on input hpts",
 			      hpts, inp);
 		mtx_unlock(&hpts->p_mtx);
 	}
 #endif
 	INP_WUNLOCK(inp);
 	pcbinfo = inp->inp_pcbinfo;
 	uma_zfree(pcbinfo->ipi_zone, inp);
 	return (1);
 }
 
 /*
  * Temporary wrapper.
  */
 int
 in_pcbrele(struct inpcb *inp)
 {
 
 	return (in_pcbrele_wlocked(inp));
 }
 
 void
 in_pcblist_rele_rlocked(epoch_context_t ctx)
 {
 	struct in_pcblist *il;
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	int i, n;
 
 	il = __containerof(ctx, struct in_pcblist, il_epoch_ctx);
 	pcbinfo = il->il_pcbinfo;
 	n = il->il_count;
 	INP_INFO_WLOCK(pcbinfo);
 	for (i = 0; i < n; i++) {
 		inp = il->il_inp_list[i];
 		INP_RLOCK(inp);
 		if (!in_pcbrele_rlocked(inp))
 			INP_RUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(pcbinfo);
 	free(il, M_TEMP);
 }
 
 static void
 inpcbport_free(epoch_context_t ctx)
 {
 	struct inpcbport *phd;
 
 	phd = __containerof(ctx, struct inpcbport, phd_epoch_ctx);
 	free(phd, M_PCB);
 }
 
 static void
 in_pcbfree_deferred(epoch_context_t ctx)
 {
 	struct inpcb *inp;
 	int released __unused;
 
 	inp = __containerof(ctx, struct inpcb, inp_epoch_ctx);
 
 	INP_WLOCK(inp);
 	CURVNET_SET(inp->inp_vnet);
 #ifdef INET
 	struct ip_moptions *imo = inp->inp_moptions;
 	inp->inp_moptions = NULL;
 #endif
 	/* XXXRW: Do as much as possible here. */
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	if (inp->inp_sp != NULL)
 		ipsec_delete_pcbpolicy(inp);
 #endif
 #ifdef INET6
 	struct ip6_moptions *im6o = NULL;
 	if (inp->inp_vflag & INP_IPV6PROTO) {
 		ip6_freepcbopts(inp->in6p_outputopts);
 		im6o = inp->in6p_moptions;
 		inp->in6p_moptions = NULL;
 	}
 #endif
 	if (inp->inp_options)
 		(void)m_free(inp->inp_options);
 	inp->inp_vflag = 0;
 	crfree(inp->inp_cred);
 #ifdef MAC
 	mac_inpcb_destroy(inp);
 #endif
 	released = in_pcbrele_wlocked(inp);
 	MPASS(released);
 #ifdef INET6
 	ip6_freemoptions(im6o);
 #endif
 #ifdef INET
 	inp_freemoptions(imo);
 #endif	
 	CURVNET_RESTORE();
 }
 
 /*
  * Unconditionally schedule an inpcb to be freed by decrementing its
  * reference count, which should occur only after the inpcb has been detached
  * from its socket.  If another thread holds a temporary reference (acquired
  * using in_pcbref()) then the free is deferred until that reference is
  * released using in_pcbrele(), but the inpcb is still unlocked.  Almost all
  * work, including removal from global lists, is done in this context, where
  * the pcbinfo lock is held.
  */
 void
 in_pcbfree(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 	KASSERT((inp->inp_flags2 & INP_FREED) == 0,
 	    ("%s: called twice for pcb %p", __func__, inp));
 	if (inp->inp_flags2 & INP_FREED) {
 		INP_WUNLOCK(inp);
 		return;
 	}
 
 #ifdef INVARIANTS
 	if (pcbinfo == &V_tcbinfo) {
 		INP_INFO_LOCK_ASSERT(pcbinfo);
 	} else {
 		INP_INFO_WLOCK_ASSERT(pcbinfo);
 	}
 #endif
 	INP_WLOCK_ASSERT(inp);
 	INP_LIST_WLOCK(pcbinfo);
 	in_pcbremlists(inp);
 	INP_LIST_WUNLOCK(pcbinfo);
 	RO_INVALIDATE_CACHE(&inp->inp_route);
 	/* mark as destruction in progress */
 	inp->inp_flags2 |= INP_FREED;
 	INP_WUNLOCK(inp);
 	epoch_call(net_epoch_preempt, &inp->inp_epoch_ctx, in_pcbfree_deferred);
 }
 
 /*
  * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
  * port reservation, and preventing it from being returned by inpcb lookups.
  *
  * It is used by TCP to mark an inpcb as unused and avoid future packet
  * delivery or event notification when a socket remains open but TCP has
  * closed.  This might occur as a result of a shutdown()-initiated TCP close
  * or a RST on the wire, and allows the port binding to be reused while still
  * maintaining the invariant that so_pcb always points to a valid inpcb until
  * in_pcbdetach().
  *
  * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
  * in_pcbnotifyall() and in_pcbpurgeif0()?
  */
 void
 in_pcbdrop(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 #ifdef INVARIANTS
 	if (inp->inp_socket != NULL && inp->inp_ppcb != NULL)
 		MPASS(inp->inp_refcount > 1);
 #endif
 
 	/*
 	 * XXXRW: Possibly we should protect the setting of INP_DROPPED with
 	 * the hash lock...?
 	 */
 	inp->inp_flags |= INP_DROPPED;
 	if (inp->inp_flags & INP_INHASHLIST) {
 		struct inpcbport *phd = inp->inp_phd;
 
 		INP_HASH_WLOCK(inp->inp_pcbinfo);
 		in_pcbremlbgrouphash(inp);
 		CK_LIST_REMOVE(inp, inp_hash);
 		CK_LIST_REMOVE(inp, inp_portlist);
 		if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
 			CK_LIST_REMOVE(phd, phd_hash);
 			epoch_call(net_epoch_preempt, &phd->phd_epoch_ctx, inpcbport_free);
 		}
 		INP_HASH_WUNLOCK(inp->inp_pcbinfo);
 		inp->inp_flags &= ~INP_INHASHLIST;
 #ifdef PCBGROUP
 		in_pcbgroup_remove(inp);
 #endif
 	}
 }
 
 #ifdef INET
 /*
  * Common routines to return the socket addresses associated with inpcbs.
  */
 struct sockaddr *
 in_sockaddr(in_port_t port, struct in_addr *addr_p)
 {
 	struct sockaddr_in *sin;
 
 	sin = malloc(sizeof *sin, M_SONAME,
 		M_WAITOK | M_ZERO);
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(*sin);
 	sin->sin_addr = *addr_p;
 	sin->sin_port = port;
 
 	return (struct sockaddr *)sin;
 }
 
 int
 in_getsockaddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_lport;
 	addr = inp->inp_laddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in_sockaddr(port, &addr);
 	return 0;
 }
 
 int
 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_fport;
 	addr = inp->inp_faddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in_sockaddr(port, &addr);
 	return 0;
 }
 
 void
 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
     struct inpcb *(*notify)(struct inpcb *, int))
 {
 	struct inpcb *inp, *inp_temp;
 
 	INP_INFO_WLOCK(pcbinfo);
 	CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
 		INP_WLOCK(inp);
 #ifdef INET6
 		if ((inp->inp_vflag & INP_IPV4) == 0) {
 			INP_WUNLOCK(inp);
 			continue;
 		}
 #endif
 		if (inp->inp_faddr.s_addr != faddr.s_addr ||
 		    inp->inp_socket == NULL) {
 			INP_WUNLOCK(inp);
 			continue;
 		}
 		if ((*notify)(inp, errno))
 			INP_WUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(pcbinfo);
 }
 
 void
 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
 {
 	struct inpcb *inp;
+	struct in_multi *inm;
+	struct in_mfilter *imf;
 	struct ip_moptions *imo;
-	int i, gap;
 
 	INP_INFO_WLOCK(pcbinfo);
 	CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
 		INP_WLOCK(inp);
 		imo = inp->inp_moptions;
 		if ((inp->inp_vflag & INP_IPV4) &&
 		    imo != NULL) {
 			/*
 			 * Unselect the outgoing interface if it is being
 			 * detached.
 			 */
 			if (imo->imo_multicast_ifp == ifp)
 				imo->imo_multicast_ifp = NULL;
 
 			/*
 			 * Drop multicast group membership if we joined
 			 * through the interface being detached.
 			 *
 			 * XXX This can all be deferred to an epoch_call
 			 */
-			for (i = 0, gap = 0; i < imo->imo_num_memberships;
-			    i++) {
-				if (imo->imo_membership[i]->inm_ifp == ifp) {
-					IN_MULTI_LOCK_ASSERT();
-					in_leavegroup_locked(imo->imo_membership[i], NULL);
-					gap++;
-				} else if (gap != 0)
-					imo->imo_membership[i - gap] =
-					    imo->imo_membership[i];
+restart:
+			IP_MFILTER_FOREACH(imf, &imo->imo_head) {
+				if ((inm = imf->imf_inm) == NULL)
+					continue;
+				if (inm->inm_ifp != ifp)
+					continue;
+				ip_mfilter_remove(&imo->imo_head, imf);
+				IN_MULTI_LOCK_ASSERT();
+				in_leavegroup_locked(inm, NULL);
+				ip_mfilter_free(imf);
+				goto restart;
 			}
-			imo->imo_num_memberships -= gap;
 		}
 		INP_WUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(pcbinfo);
 }
 
 /*
  * Lookup a PCB based on the local address and port.  Caller must hold the
  * hash lock.  No inpcb locks or references are acquired.
  */
 #define INP_LOOKUP_MAPPED_PCB_COST	3
 struct inpcb *
 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
     u_short lport, int lookupflags, struct ucred *cred)
 {
 	struct inpcb *inp;
 #ifdef INET6
 	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
 #else
 	int matchwild = 3;
 #endif
 	int wildcard;
 
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
 		struct inpcbhead *head;
 		/*
 		 * Look for an unconnected (wildcard foreign addr) PCB that
 		 * matches the local address and port we're looking for.
 		 */
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
 		    0, pcbinfo->ipi_hashmask)];
 		CK_LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr == INADDR_ANY &&
 			    inp->inp_laddr.s_addr == laddr.s_addr &&
 			    inp->inp_lport == lport) {
 				/*
 				 * Found?
 				 */
 				if (cred == NULL ||
 				    prison_equal_ip4(cred->cr_prison,
 					inp->inp_cred->cr_prison))
 					return (inp);
 			}
 		}
 		/*
 		 * Not found.
 		 */
 		return (NULL);
 	} else {
 		struct inpcbporthead *porthash;
 		struct inpcbport *phd;
 		struct inpcb *match = NULL;
 		/*
 		 * Best fit PCB lookup.
 		 *
 		 * First see if this local port is in use by looking on the
 		 * port hash list.
 		 */
 		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
 		    pcbinfo->ipi_porthashmask)];
 		CK_LIST_FOREACH(phd, porthash, phd_hash) {
 			if (phd->phd_port == lport)
 				break;
 		}
 		if (phd != NULL) {
 			/*
 			 * Port is in use by one or more PCBs. Look for best
 			 * fit.
 			 */
 			CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
 				wildcard = 0;
 				if (cred != NULL &&
 				    !prison_equal_ip4(inp->inp_cred->cr_prison,
 					cred->cr_prison))
 					continue;
 #ifdef INET6
 				/* XXX inp locking */
 				if ((inp->inp_vflag & INP_IPV4) == 0)
 					continue;
 				/*
 				 * We never select the PCB that has
 				 * INP_IPV6 flag and is bound to :: if
 				 * we have another PCB which is bound
 				 * to 0.0.0.0.  If a PCB has the
 				 * INP_IPV6 flag, then we set its cost
 				 * higher than IPv4 only PCBs.
 				 *
 				 * Note that the case only happens
 				 * when a socket is bound to ::, under
 				 * the condition that the use of the
 				 * mapped address is allowed.
 				 */
 				if ((inp->inp_vflag & INP_IPV6) != 0)
 					wildcard += INP_LOOKUP_MAPPED_PCB_COST;
 #endif
 				if (inp->inp_faddr.s_addr != INADDR_ANY)
 					wildcard++;
 				if (inp->inp_laddr.s_addr != INADDR_ANY) {
 					if (laddr.s_addr == INADDR_ANY)
 						wildcard++;
 					else if (inp->inp_laddr.s_addr != laddr.s_addr)
 						continue;
 				} else {
 					if (laddr.s_addr != INADDR_ANY)
 						wildcard++;
 				}
 				if (wildcard < matchwild) {
 					match = inp;
 					matchwild = wildcard;
 					if (matchwild == 0)
 						break;
 				}
 			}
 		}
 		return (match);
 	}
 }
 #undef INP_LOOKUP_MAPPED_PCB_COST
 
 static struct inpcb *
 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
     const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
     uint16_t fport, int lookupflags)
 {
 	struct inpcb *local_wild;
 	const struct inpcblbgrouphead *hdr;
 	struct inpcblbgroup *grp;
 	uint32_t idx;
 
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	hdr = &pcbinfo->ipi_lbgrouphashbase[
 	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
 
 	/*
 	 * Order of socket selection:
 	 * 1. non-wild.
 	 * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
 	 *
 	 * NOTE:
 	 * - Load balanced group does not contain jailed sockets
 	 * - Load balanced group does not contain IPv4 mapped INET6 wild sockets
 	 */
 	local_wild = NULL;
 	CK_LIST_FOREACH(grp, hdr, il_list) {
 #ifdef INET6
 		if (!(grp->il_vflag & INP_IPV4))
 			continue;
 #endif
 		if (grp->il_lport != lport)
 			continue;
 
 		idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) %
 		    grp->il_inpcnt;
 		if (grp->il_laddr.s_addr == laddr->s_addr)
 			return (grp->il_inp[idx]);
 		if (grp->il_laddr.s_addr == INADDR_ANY &&
 		    (lookupflags & INPLOOKUP_WILDCARD) != 0)
 			local_wild = grp->il_inp[idx];
 	}
 	return (local_wild);
 }
 
 #ifdef PCBGROUP
 /*
  * Lookup PCB in hash list, using pcbgroup tables.
  */
 static struct inpcb *
 in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
     struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
     u_int lport_arg, int lookupflags, struct ifnet *ifp)
 {
 	struct inpcbhead *head;
 	struct inpcb *inp, *tmpinp;
 	u_short fport = fport_arg, lport = lport_arg;
 	bool locked;
 
 	/*
 	 * First look for an exact match.
 	 */
 	tmpinp = NULL;
 	INP_GROUP_LOCK(pcbgroup);
 	head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
 	    pcbgroup->ipg_hashmask)];
 	CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) {
 #ifdef INET6
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV4) == 0)
 			continue;
 #endif
 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
 		    inp->inp_laddr.s_addr == laddr.s_addr &&
 		    inp->inp_fport == fport &&
 		    inp->inp_lport == lport) {
 			/*
 			 * XXX We should be able to directly return
 			 * the inp here, without any checks.
 			 * Well unless both bound with SO_REUSEPORT?
 			 */
 			if (prison_flag(inp->inp_cred, PR_IP4))
 				goto found;
 			if (tmpinp == NULL)
 				tmpinp = inp;
 		}
 	}
 	if (tmpinp != NULL) {
 		inp = tmpinp;
 		goto found;
 	}
 
 #ifdef	RSS
 	/*
 	 * For incoming connections, we may wish to do a wildcard
 	 * match for an RSS-local socket.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 #ifdef INET6
 		struct inpcb *local_wild_mapped = NULL;
 #endif
 		struct inpcb *jail_wild = NULL;
 		struct inpcbhead *head;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 
 		head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY,
 		    lport, 0, pcbgroup->ipg_hashmask)];
 		CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
 			    inp->inp_lport != lport)
 				continue;
 
 			injail = prison_flag(inp->inp_cred, PR_IP4);
 			if (injail) {
 				if (prison_check_ip4(inp->inp_cred,
 				    &laddr) != 0)
 					continue;
 			} else {
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
 				if (injail)
 					goto found;
 				else
 					local_exact = inp;
 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 #ifdef INET6
 				/* XXX inp locking, NULL check */
 				if (inp->inp_vflag & INP_IPV6PROTO)
 					local_wild_mapped = inp;
 				else
 #endif
 					if (injail)
 						jail_wild = inp;
 					else
 						local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 
 		inp = jail_wild;
 		if (inp == NULL)
 			inp = local_exact;
 		if (inp == NULL)
 			inp = local_wild;
 #ifdef INET6
 		if (inp == NULL)
 			inp = local_wild_mapped;
 #endif
 		if (inp != NULL)
 			goto found;
 	}
 #endif
 
 	/*
 	 * Then look for a wildcard match, if requested.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 #ifdef INET6
 		struct inpcb *local_wild_mapped = NULL;
 #endif
 		struct inpcb *jail_wild = NULL;
 		struct inpcbhead *head;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 		head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport,
 		    0, pcbinfo->ipi_wildmask)];
 		CK_LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
 			    inp->inp_lport != lport)
 				continue;
 
 			injail = prison_flag(inp->inp_cred, PR_IP4);
 			if (injail) {
 				if (prison_check_ip4(inp->inp_cred,
 				    &laddr) != 0)
 					continue;
 			} else {
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
 				if (injail)
 					goto found;
 				else
 					local_exact = inp;
 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 #ifdef INET6
 				/* XXX inp locking, NULL check */
 				if (inp->inp_vflag & INP_IPV6PROTO)
 					local_wild_mapped = inp;
 				else
 #endif
 					if (injail)
 						jail_wild = inp;
 					else
 						local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 		inp = jail_wild;
 		if (inp == NULL)
 			inp = local_exact;
 		if (inp == NULL)
 			inp = local_wild;
 #ifdef INET6
 		if (inp == NULL)
 			inp = local_wild_mapped;
 #endif
 		if (inp != NULL)
 			goto found;
 	} /* if (lookupflags & INPLOOKUP_WILDCARD) */
 	INP_GROUP_UNLOCK(pcbgroup);
 	return (NULL);
 
 found:
 	if (lookupflags & INPLOOKUP_WLOCKPCB)
 		locked = INP_TRY_WLOCK(inp);
 	else if (lookupflags & INPLOOKUP_RLOCKPCB)
 		locked = INP_TRY_RLOCK(inp);
 	else
 		panic("%s: locking bug", __func__);
 	if (__predict_false(locked && (inp->inp_flags2 & INP_FREED))) {
 		if (lookupflags & INPLOOKUP_WLOCKPCB)
 			INP_WUNLOCK(inp);
 		else
 			INP_RUNLOCK(inp);
 		return (NULL);
 	} else if (!locked)
 		in_pcbref(inp);
 	INP_GROUP_UNLOCK(pcbgroup);
 	if (!locked) {
 		if (lookupflags & INPLOOKUP_WLOCKPCB) {
 			INP_WLOCK(inp);
 			if (in_pcbrele_wlocked(inp))
 				return (NULL);
 		} else {
 			INP_RLOCK(inp);
 			if (in_pcbrele_rlocked(inp))
 				return (NULL);
 		}
 	}
 #ifdef INVARIANTS
 	if (lookupflags & INPLOOKUP_WLOCKPCB)
 		INP_WLOCK_ASSERT(inp);
 	else
 		INP_RLOCK_ASSERT(inp);
 #endif
 	return (inp);
 }
 #endif /* PCBGROUP */
 
 /*
  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
  * that the caller has locked the hash list, and will not perform any further
  * locking or reference operations on either the hash list or the connection.
  */
 static struct inpcb *
 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
     struct ifnet *ifp)
 {
 	struct inpcbhead *head;
 	struct inpcb *inp, *tmpinp;
 	u_short fport = fport_arg, lport = lport_arg;
 
 #ifdef INVARIANTS
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	if (!mtx_owned(&pcbinfo->ipi_hash_lock))
 		MPASS(in_epoch_verbose(net_epoch_preempt, 1));
 #endif
 	/*
 	 * First look for an exact match.
 	 */
 	tmpinp = NULL;
 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
 	    pcbinfo->ipi_hashmask)];
 	CK_LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV4) == 0)
 			continue;
 #endif
 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
 		    inp->inp_laddr.s_addr == laddr.s_addr &&
 		    inp->inp_fport == fport &&
 		    inp->inp_lport == lport) {
 			/*
 			 * XXX We should be able to directly return
 			 * the inp here, without any checks.
 			 * Well unless both bound with SO_REUSEPORT?
 			 */
 			if (prison_flag(inp->inp_cred, PR_IP4))
 				return (inp);
 			if (tmpinp == NULL)
 				tmpinp = inp;
 		}
 	}
 	if (tmpinp != NULL)
 		return (tmpinp);
 
 	/*
 	 * Then look in lb group (for wildcard match).
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
 		    fport, lookupflags);
 		if (inp != NULL)
 			return (inp);
 	}
 
 	/*
 	 * Then look for a wildcard match, if requested.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 #ifdef INET6
 		struct inpcb *local_wild_mapped = NULL;
 #endif
 		struct inpcb *jail_wild = NULL;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
 		    0, pcbinfo->ipi_hashmask)];
 		CK_LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
 			    inp->inp_lport != lport)
 				continue;
 
 			injail = prison_flag(inp->inp_cred, PR_IP4);
 			if (injail) {
 				if (prison_check_ip4(inp->inp_cred,
 				    &laddr) != 0)
 					continue;
 			} else {
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
 				if (injail)
 					return (inp);
 				else
 					local_exact = inp;
 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 #ifdef INET6
 				/* XXX inp locking, NULL check */
 				if (inp->inp_vflag & INP_IPV6PROTO)
 					local_wild_mapped = inp;
 				else
 #endif
 					if (injail)
 						jail_wild = inp;
 					else
 						local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 		if (jail_wild != NULL)
 			return (jail_wild);
 		if (local_exact != NULL)
 			return (local_exact);
 		if (local_wild != NULL)
 			return (local_wild);
 #ifdef INET6
 		if (local_wild_mapped != NULL)
 			return (local_wild_mapped);
 #endif
 	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
 
 	return (NULL);
 }
 
 /*
  * Lookup PCB in hash list, using pcbinfo tables.  This variation locks the
  * hash list lock, and will return the inpcb locked (i.e., requires
  * INPLOOKUP_LOCKPCB).
  */
 static struct inpcb *
 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
     struct ifnet *ifp)
 {
 	struct inpcb *inp;
 
 	INP_HASH_RLOCK(pcbinfo);
 	inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
 	    (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp);
 	if (inp != NULL) {
 		if (lookupflags & INPLOOKUP_WLOCKPCB) {
 			INP_WLOCK(inp);
 			if (__predict_false(inp->inp_flags2 & INP_FREED)) {
 				INP_WUNLOCK(inp);
 				inp = NULL;
 			}
 		} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
 			INP_RLOCK(inp);
 			if (__predict_false(inp->inp_flags2 & INP_FREED)) {
 				INP_RUNLOCK(inp);
 				inp = NULL;
 			}
 		} else
 			panic("%s: locking bug", __func__);
 #ifdef INVARIANTS
 		if (inp != NULL) {
 			if (lookupflags & INPLOOKUP_WLOCKPCB)
 				INP_WLOCK_ASSERT(inp);
 			else
 				INP_RLOCK_ASSERT(inp);
 		}
 #endif
 	}
 	INP_HASH_RUNLOCK(pcbinfo);
 	return (inp);
 }
 
 /*
  * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
  * from which a pre-calculated hash value may be extracted.
  *
  * Possibly more of this logic should be in in_pcbgroup.c.
  */
 struct inpcb *
 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
     struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
 {
 #if defined(PCBGROUP) && !defined(RSS)
 	struct inpcbgroup *pcbgroup;
 #endif
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 	/*
 	 * When not using RSS, use connection groups in preference to the
 	 * reservation table when looking up 4-tuples.  When using RSS, just
 	 * use the reservation table, due to the cost of the Toeplitz hash
 	 * in software.
 	 *
 	 * XXXRW: This policy belongs in the pcbgroup code, as in principle
 	 * we could be doing RSS with a non-Toeplitz hash that is affordable
 	 * in software.
 	 */
 #if defined(PCBGROUP) && !defined(RSS)
 	if (in_pcbgroup_enabled(pcbinfo)) {
 		pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
 		    fport);
 		return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
 		    laddr, lport, lookupflags, ifp));
 	}
 #endif
 	return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp));
 }
 
 struct inpcb *
 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
     struct ifnet *ifp, struct mbuf *m)
 {
 #ifdef PCBGROUP
 	struct inpcbgroup *pcbgroup;
 #endif
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 #ifdef PCBGROUP
 	/*
 	 * If we can use a hardware-generated hash to look up the connection
 	 * group, use that connection group to find the inpcb.  Otherwise
 	 * fall back on a software hash -- or the reservation table if we're
 	 * using RSS.
 	 *
 	 * XXXRW: As above, that policy belongs in the pcbgroup code.
 	 */
 	if (in_pcbgroup_enabled(pcbinfo) &&
 	    !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) {
 		pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
 		    m->m_pkthdr.flowid);
 		if (pcbgroup != NULL)
 			return (in_pcblookup_group(pcbinfo, pcbgroup, faddr,
 			    fport, laddr, lport, lookupflags, ifp));
 #ifndef RSS
 		pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
 		    fport);
 		return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
 		    laddr, lport, lookupflags, ifp));
 #endif
 	}
 #endif
 	return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp));
 }
 #endif /* INET */
 
 /*
  * Insert PCB onto various hash lists.
  */
 static int
 in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update)
 {
 	struct inpcbhead *pcbhash;
 	struct inpcbporthead *pcbporthash;
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct inpcbport *phd;
 	u_int32_t hashkey_faddr;
 	int so_options;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
 	    ("in_pcbinshash: INP_INHASHLIST"));
 
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6)
 		hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
 	else
 #endif
 	hashkey_faddr = inp->inp_faddr.s_addr;
 
 	pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
 		 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 
 	pcbporthash = &pcbinfo->ipi_porthashbase[
 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
 
 	/*
 	 * Add entry to load balance group.
 	 * Only do this if SO_REUSEPORT_LB is set.
 	 */
 	so_options = inp_so_options(inp);
 	if (so_options & SO_REUSEPORT_LB) {
 		int ret = in_pcbinslbgrouphash(inp);
 		if (ret) {
 			/* pcb lb group malloc fail (ret=ENOBUFS). */
 			return (ret);
 		}
 	}
 
 	/*
 	 * Go through port list and look for a head for this lport.
 	 */
 	CK_LIST_FOREACH(phd, pcbporthash, phd_hash) {
 		if (phd->phd_port == inp->inp_lport)
 			break;
 	}
 	/*
 	 * If none exists, malloc one and tack it on.
 	 */
 	if (phd == NULL) {
 		phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT);
 		if (phd == NULL) {
 			return (ENOBUFS); /* XXX */
 		}
 		bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context));
 		phd->phd_port = inp->inp_lport;
 		CK_LIST_INIT(&phd->phd_pcblist);
 		CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
 	}
 	inp->inp_phd = phd;
 	CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
 	CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
 	inp->inp_flags |= INP_INHASHLIST;
 #ifdef PCBGROUP
 	if (do_pcbgroup_update)
 		in_pcbgroup_update(inp);
 #endif
 	return (0);
 }
 
 /*
  * For now, there are two public interfaces to insert an inpcb into the hash
  * lists -- one that does update pcbgroups, and one that doesn't.  The latter
  * is used only in the TCP syncache, where in_pcbinshash is called before the
  * full 4-tuple is set for the inpcb, and we don't want to install in the
  * pcbgroup until later.
  *
  * XXXRW: This seems like a misfeature.  in_pcbinshash should always update
  * connection groups, and partially initialised inpcbs should not be exposed
  * to either reservation hash tables or pcbgroups.
  */
 int
 in_pcbinshash(struct inpcb *inp)
 {
 
 	return (in_pcbinshash_internal(inp, 1));
 }
 
 int
 in_pcbinshash_nopcbgroup(struct inpcb *inp)
 {
 
 	return (in_pcbinshash_internal(inp, 0));
 }
 
 /*
  * Move PCB to the proper hash bucket when { faddr, fport } have  been
  * changed. NOTE: This does not handle the case of the lport changing (the
  * hashed port list would have to be updated as well), so the lport must
  * not change after in_pcbinshash() has been called.
  */
 void
 in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct inpcbhead *head;
 	u_int32_t hashkey_faddr;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	KASSERT(inp->inp_flags & INP_INHASHLIST,
 	    ("in_pcbrehash: !INP_INHASHLIST"));
 
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6)
 		hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
 	else
 #endif
 	hashkey_faddr = inp->inp_faddr.s_addr;
 
 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
 		inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 
 	CK_LIST_REMOVE(inp, inp_hash);
 	CK_LIST_INSERT_HEAD(head, inp, inp_hash);
 
 #ifdef PCBGROUP
 	if (m != NULL)
 		in_pcbgroup_update_mbuf(inp, m);
 	else
 		in_pcbgroup_update(inp);
 #endif
 }
 
 void
 in_pcbrehash(struct inpcb *inp)
 {
 
 	in_pcbrehash_mbuf(inp, NULL);
 }
 
 /*
  * Remove PCB from various lists.
  */
 static void
 in_pcbremlists(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 
 #ifdef INVARIANTS
 	if (pcbinfo == &V_tcbinfo) {
 		INP_INFO_RLOCK_ASSERT(pcbinfo);
 	} else {
 		INP_INFO_WLOCK_ASSERT(pcbinfo);
 	}
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 	INP_LIST_WLOCK_ASSERT(pcbinfo);
 
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 	if (inp->inp_flags & INP_INHASHLIST) {
 		struct inpcbport *phd = inp->inp_phd;
 
 		INP_HASH_WLOCK(pcbinfo);
 
 		/* XXX: Only do if SO_REUSEPORT_LB set? */
 		in_pcbremlbgrouphash(inp);
 
 		CK_LIST_REMOVE(inp, inp_hash);
 		CK_LIST_REMOVE(inp, inp_portlist);
 		if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
 			CK_LIST_REMOVE(phd, phd_hash);
 			epoch_call(net_epoch_preempt, &phd->phd_epoch_ctx, inpcbport_free);
 		}
 		INP_HASH_WUNLOCK(pcbinfo);
 		inp->inp_flags &= ~INP_INHASHLIST;
 	}
 	CK_LIST_REMOVE(inp, inp_list);
 	pcbinfo->ipi_count--;
 #ifdef PCBGROUP
 	in_pcbgroup_remove(inp);
 #endif
 }
 
 /*
  * Check for alternatives when higher level complains
  * about service problems.  For now, invalidate cached
  * routing information.  If the route was created dynamically
  * (by a redirect), time to try a default gateway again.
  */
 void
 in_losing(struct inpcb *inp)
 {
 
 	RO_INVALIDATE_CACHE(&inp->inp_route);
 	return;
 }
 
 /*
  * A set label operation has occurred at the socket layer, propagate the
  * label change into the in_pcb for the socket.
  */
 void
 in_pcbsosetlabel(struct socket *so)
 {
 #ifdef MAC
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));
 
 	INP_WLOCK(inp);
 	SOCK_LOCK(so);
 	mac_inpcb_sosetlabel(so, inp);
 	SOCK_UNLOCK(so);
 	INP_WUNLOCK(inp);
 #endif
 }
 
 /*
  * ipport_tick runs once per second, determining if random port allocation
  * should be continued.  If more than ipport_randomcps ports have been
  * allocated in the last second, then we return to sequential port
  * allocation. We return to random allocation only once we drop below
  * ipport_randomcps for at least ipport_randomtime seconds.
  */
 static void
 ipport_tick(void *xtp)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);	/* XXX appease INVARIANTS here */
 		if (V_ipport_tcpallocs <=
 		    V_ipport_tcplastcount + V_ipport_randomcps) {
 			if (V_ipport_stoprandom > 0)
 				V_ipport_stoprandom--;
 		} else
 			V_ipport_stoprandom = V_ipport_randomtime;
 		V_ipport_tcplastcount = V_ipport_tcpallocs;
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 	callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
 }
 
 static void
 ip_fini(void *xtp)
 {
 
 	callout_stop(&ipport_tick_callout);
 }
 
 /* 
  * The ipport_callout should start running at about the time we attach the
  * inet or inet6 domains.
  */
 static void
 ipport_tick_init(const void *unused __unused)
 {
 
 	/* Start ipport_tick. */
 	callout_init(&ipport_tick_callout, 1);
 	callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
 		SHUTDOWN_PRI_DEFAULT);
 }
 SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, 
     ipport_tick_init, NULL);
 
 void
 inp_wlock(struct inpcb *inp)
 {
 
 	INP_WLOCK(inp);
 }
 
 void
 inp_wunlock(struct inpcb *inp)
 {
 
 	INP_WUNLOCK(inp);
 }
 
 void
 inp_rlock(struct inpcb *inp)
 {
 
 	INP_RLOCK(inp);
 }
 
 void
 inp_runlock(struct inpcb *inp)
 {
 
 	INP_RUNLOCK(inp);
 }
 
 #ifdef INVARIANT_SUPPORT
 void
 inp_lock_assert(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 }
 
 void
 inp_unlock_assert(struct inpcb *inp)
 {
 
 	INP_UNLOCK_ASSERT(inp);
 }
 #endif
 
 void
 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
 {
 	struct inpcb *inp;
 
 	INP_INFO_WLOCK(&V_tcbinfo);
 	CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
 		INP_WLOCK(inp);
 		func(inp, arg);
 		INP_WUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 }
 
 struct socket *
 inp_inpcbtosocket(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	return (inp->inp_socket);
 }
 
 struct tcpcb *
 inp_inpcbtotcpcb(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	return ((struct tcpcb *)inp->inp_ppcb);
 }
 
 int
 inp_ip_tos_get(const struct inpcb *inp)
 {
 
 	return (inp->inp_ip_tos);
 }
 
 void
 inp_ip_tos_set(struct inpcb *inp, int val)
 {
 
 	inp->inp_ip_tos = val;
 }
 
 void
 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
     uint32_t *faddr, uint16_t *fp)
 {
 
 	INP_LOCK_ASSERT(inp);
 	*laddr = inp->inp_laddr.s_addr;
 	*faddr = inp->inp_faddr.s_addr;
 	*lp = inp->inp_lport;
 	*fp = inp->inp_fport;
 }
 
 struct inpcb *
 so_sotoinpcb(struct socket *so)
 {
 
 	return (sotoinpcb(so));
 }
 
 struct tcpcb *
 so_sototcpcb(struct socket *so)
 {
 
 	return (sototcpcb(so));
 }
 
 /*
  * Create an external-format (``xinpcb'') structure using the information in
  * the kernel-format in_pcb structure pointed to by inp.  This is done to
  * reduce the spew of irrelevant information over this interface, to isolate
  * user code from changes in the kernel structure, and potentially to provide
  * information-hiding if we decide that some of this information should be
  * hidden from users.
  */
 void
 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
 {
 
 	bzero(xi, sizeof(*xi));
 	xi->xi_len = sizeof(struct xinpcb);
 	if (inp->inp_socket)
 		sotoxsocket(inp->inp_socket, &xi->xi_socket);
 	bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
 	xi->inp_gencnt = inp->inp_gencnt;
 	xi->inp_ppcb = (uintptr_t)inp->inp_ppcb;
 	xi->inp_flow = inp->inp_flow;
 	xi->inp_flowid = inp->inp_flowid;
 	xi->inp_flowtype = inp->inp_flowtype;
 	xi->inp_flags = inp->inp_flags;
 	xi->inp_flags2 = inp->inp_flags2;
 	xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket;
 	xi->in6p_cksum = inp->in6p_cksum;
 	xi->in6p_hops = inp->in6p_hops;
 	xi->inp_ip_tos = inp->inp_ip_tos;
 	xi->inp_vflag = inp->inp_vflag;
 	xi->inp_ip_ttl = inp->inp_ip_ttl;
 	xi->inp_ip_p = inp->inp_ip_p;
 	xi->inp_ip_minttl = inp->inp_ip_minttl;
 }
 
 #ifdef DDB
 static void
 db_print_indent(int indent)
 {
 	int i;
 
 	for (i = 0; i < indent; i++)
 		db_printf(" ");
 }
 
 static void
 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
 {
 	char faddr_str[48], laddr_str[48];
 
 	db_print_indent(indent);
 	db_printf("%s at %p\n", name, inc);
 
 	indent += 2;
 
 #ifdef INET6
 	if (inc->inc_flags & INC_ISIPV6) {
 		/* IPv6. */
 		ip6_sprintf(laddr_str, &inc->inc6_laddr);
 		ip6_sprintf(faddr_str, &inc->inc6_faddr);
 	} else
 #endif
 	{
 		/* IPv4. */
 		inet_ntoa_r(inc->inc_laddr, laddr_str);
 		inet_ntoa_r(inc->inc_faddr, faddr_str);
 	}
 	db_print_indent(indent);
 	db_printf("inc_laddr %s   inc_lport %u\n", laddr_str,
 	    ntohs(inc->inc_lport));
 	db_print_indent(indent);
 	db_printf("inc_faddr %s   inc_fport %u\n", faddr_str,
 	    ntohs(inc->inc_fport));
 }
 
 static void
 db_print_inpflags(int inp_flags)
 {
 	int comma;
 
 	comma = 0;
 	if (inp_flags & INP_RECVOPTS) {
 		db_printf("%sINP_RECVOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVRETOPTS) {
 		db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVDSTADDR) {
 		db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_ORIGDSTADDR) {
 		db_printf("%sINP_ORIGDSTADDR", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_HDRINCL) {
 		db_printf("%sINP_HDRINCL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_HIGHPORT) {
 		db_printf("%sINP_HIGHPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_LOWPORT) {
 		db_printf("%sINP_LOWPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_ANONPORT) {
 		db_printf("%sINP_ANONPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVIF) {
 		db_printf("%sINP_RECVIF", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_MTUDISC) {
 		db_printf("%sINP_MTUDISC", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVTTL) {
 		db_printf("%sINP_RECVTTL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_DONTFRAG) {
 		db_printf("%sINP_DONTFRAG", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVTOS) {
 		db_printf("%sINP_RECVTOS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_IPV6_V6ONLY) {
 		db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_PKTINFO) {
 		db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_HOPLIMIT) {
 		db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_HOPOPTS) {
 		db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_DSTOPTS) {
 		db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_RTHDR) {
 		db_printf("%sIN6P_RTHDR", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_RTHDRDSTOPTS) {
 		db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_TCLASS) {
 		db_printf("%sIN6P_TCLASS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_AUTOFLOWLABEL) {
 		db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_TIMEWAIT) {
 		db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & INP_ONESBCAST) {
 		db_printf("%sINP_ONESBCAST", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & INP_DROPPED) {
 		db_printf("%sINP_DROPPED", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & INP_SOCKREF) {
 		db_printf("%sINP_SOCKREF", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & IN6P_RFC2292) {
 		db_printf("%sIN6P_RFC2292", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_MTU) {
 		db_printf("IN6P_MTU%s", comma ? ", " : "");
 		comma = 1;
 	}
 }
 
 static void
 db_print_inpvflag(u_char inp_vflag)
 {
 	int comma;
 
 	comma = 0;
 	if (inp_vflag & INP_IPV4) {
 		db_printf("%sINP_IPV4", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_vflag & INP_IPV6) {
 		db_printf("%sINP_IPV6", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_vflag & INP_IPV6PROTO) {
 		db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
 		comma  = 1;
 	}
 }
 
 static void
 db_print_inpcb(struct inpcb *inp, const char *name, int indent)
 {
 
 	db_print_indent(indent);
 	db_printf("%s at %p\n", name, inp);
 
 	indent += 2;
 
 	db_print_indent(indent);
 	db_printf("inp_flow: 0x%x\n", inp->inp_flow);
 
 	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
 
 	db_print_indent(indent);
 	db_printf("inp_ppcb: %p   inp_pcbinfo: %p   inp_socket: %p\n",
 	    inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
 
 	db_print_indent(indent);
 	db_printf("inp_label: %p   inp_flags: 0x%x (",
 	   inp->inp_label, inp->inp_flags);
 	db_print_inpflags(inp->inp_flags);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("inp_sp: %p   inp_vflag: 0x%x (", inp->inp_sp,
 	    inp->inp_vflag);
 	db_print_inpvflag(inp->inp_vflag);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
 	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
 
 	db_print_indent(indent);
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6) {
 		db_printf("in6p_options: %p   in6p_outputopts: %p   "
 		    "in6p_moptions: %p\n", inp->in6p_options,
 		    inp->in6p_outputopts, inp->in6p_moptions);
 		db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
 		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
 		    inp->in6p_hops);
 	} else
 #endif
 	{
 		db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
 		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
 		    inp->inp_options, inp->inp_moptions);
 	}
 
 	db_print_indent(indent);
 	db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
 	    (uintmax_t)inp->inp_gencnt);
 }
 
 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
 {
 	struct inpcb *inp;
 
 	if (!have_addr) {
 		db_printf("usage: show inpcb <addr>\n");
 		return;
 	}
 	inp = (struct inpcb *)addr;
 
 	db_print_inpcb(inp, "inpcb", 0);
 }
 #endif /* DDB */
 
 #ifdef RATELIMIT
 /*
  * Modify TX rate limit based on the existing "inp->inp_snd_tag",
  * if any.
  */
 int
 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
 {
 	union if_snd_tag_modify_params params = {
 		.rate_limit.max_rate = max_pacing_rate,
 	};
 	struct m_snd_tag *mst;
 	struct ifnet *ifp;
 	int error;
 
 	mst = inp->inp_snd_tag;
 	if (mst == NULL)
 		return (EINVAL);
 
 	ifp = mst->ifp;
 	if (ifp == NULL)
 		return (EINVAL);
 
 	if (ifp->if_snd_tag_modify == NULL) {
 		error = EOPNOTSUPP;
 	} else {
 		error = ifp->if_snd_tag_modify(mst, &params);
 	}
 	return (error);
 }
 
 /*
  * Query existing TX rate limit based on the existing
  * "inp->inp_snd_tag", if any.
  */
 int
 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
 {
 	union if_snd_tag_query_params params = { };
 	struct m_snd_tag *mst;
 	struct ifnet *ifp;
 	int error;
 
 	mst = inp->inp_snd_tag;
 	if (mst == NULL)
 		return (EINVAL);
 
 	ifp = mst->ifp;
 	if (ifp == NULL)
 		return (EINVAL);
 
 	if (ifp->if_snd_tag_query == NULL) {
 		error = EOPNOTSUPP;
 	} else {
 		error = ifp->if_snd_tag_query(mst, &params);
 		if (error == 0 &&  p_max_pacing_rate != NULL)
 			*p_max_pacing_rate = params.rate_limit.max_rate;
 	}
 	return (error);
 }
 
 /*
  * Query existing TX queue level based on the existing
  * "inp->inp_snd_tag", if any.
  */
 int
 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
 {
 	union if_snd_tag_query_params params = { };
 	struct m_snd_tag *mst;
 	struct ifnet *ifp;
 	int error;
 
 	mst = inp->inp_snd_tag;
 	if (mst == NULL)
 		return (EINVAL);
 
 	ifp = mst->ifp;
 	if (ifp == NULL)
 		return (EINVAL);
 
 	if (ifp->if_snd_tag_query == NULL)
 		return (EOPNOTSUPP);
 
 	error = ifp->if_snd_tag_query(mst, &params);
 	if (error == 0 &&  p_txqueue_level != NULL)
 		*p_txqueue_level = params.rate_limit.queue_level;
 	return (error);
 }
 
 /*
  * Allocate a new TX rate limit send tag from the network interface
  * given by the "ifp" argument and save it in "inp->inp_snd_tag":
  */
 int
 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
     uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate)
 {
 	union if_snd_tag_alloc_params params = {
 		.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
 		    IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
 		.rate_limit.hdr.flowid = flowid,
 		.rate_limit.hdr.flowtype = flowtype,
 		.rate_limit.max_rate = max_pacing_rate,
 	};
 	int error;
 
 	INP_WLOCK_ASSERT(inp);
 
 	if (inp->inp_snd_tag != NULL)
 		return (EINVAL);
 
 	if (ifp->if_snd_tag_alloc == NULL) {
 		error = EOPNOTSUPP;
 	} else {
 		error = ifp->if_snd_tag_alloc(ifp, &params, &inp->inp_snd_tag);
 	}
 	return (error);
 }
 
 /*
  * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
  * if any:
  */
 void
 in_pcbdetach_txrtlmt(struct inpcb *inp)
 {
 	struct m_snd_tag *mst;
 
 	INP_WLOCK_ASSERT(inp);
 
 	mst = inp->inp_snd_tag;
 	inp->inp_snd_tag = NULL;
 
 	if (mst == NULL)
 		return;
 
 	m_snd_tag_rele(mst);
 }
 
 /*
  * This function should be called when the INP_RATE_LIMIT_CHANGED flag
  * is set in the fast path and will attach/detach/modify the TX rate
  * limit send tag based on the socket's so_max_pacing_rate value.
  */
 void
 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
 {
 	struct socket *socket;
 	uint32_t max_pacing_rate;
 	bool did_upgrade;
 	int error;
 
 	if (inp == NULL)
 		return;
 
 	socket = inp->inp_socket;
 	if (socket == NULL)
 		return;
 
 	if (!INP_WLOCKED(inp)) {
 		/*
 		 * NOTE: If the write locking fails, we need to bail
 		 * out and use the non-ratelimited ring for the
 		 * transmit until there is a new chance to get the
 		 * write lock.
 		 */
 		if (!INP_TRY_UPGRADE(inp))
 			return;
 		did_upgrade = 1;
 	} else {
 		did_upgrade = 0;
 	}
 
 	/*
 	 * NOTE: The so_max_pacing_rate value is read unlocked,
 	 * because atomic updates are not required since the variable
 	 * is checked at every mbuf we send. It is assumed that the
 	 * variable read itself will be atomic.
 	 */
 	max_pacing_rate = socket->so_max_pacing_rate;
 
 	/*
 	 * If the existing send tag is for the wrong interface due to
 	 * a route change, first drop the existing tag.  Set the
 	 * CHANGED flag so that we will keep trying to allocate a new
 	 * tag if we fail to allocate one this time.
 	 */
 	if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
 		in_pcbdetach_txrtlmt(inp);
 		inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
 	}
 
 	/*
 	 * NOTE: When attaching to a network interface a reference is
 	 * made to ensure the network interface doesn't go away until
 	 * all ratelimit connections are gone. The network interface
 	 * pointers compared below represent valid network interfaces,
 	 * except when comparing towards NULL.
 	 */
 	if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
 		error = 0;
 	} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
 		if (inp->inp_snd_tag != NULL)
 			in_pcbdetach_txrtlmt(inp);
 		error = 0;
 	} else if (inp->inp_snd_tag == NULL) {
 		/*
 		 * In order to utilize packet pacing with RSS, we need
 		 * to wait until there is a valid RSS hash before we
 		 * can proceed:
 		 */
 		if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
 			error = EAGAIN;
 		} else {
 			error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
 			    mb->m_pkthdr.flowid, max_pacing_rate);
 		}
 	} else {
 		error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
 	}
 	if (error == 0 || error == EOPNOTSUPP)
 		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
 	if (did_upgrade)
 		INP_DOWNGRADE(inp);
 }
 
 /*
  * Track route changes for TX rate limiting.
  */
 void
 in_pcboutput_eagain(struct inpcb *inp)
 {
 	bool did_upgrade;
 
 	if (inp == NULL)
 		return;
 
 	if (inp->inp_snd_tag == NULL)
 		return;
 
 	if (!INP_WLOCKED(inp)) {
 		/*
 		 * NOTE: If the write locking fails, we need to bail
 		 * out and use the non-ratelimited ring for the
 		 * transmit until there is a new chance to get the
 		 * write lock.
 		 */
 		if (!INP_TRY_UPGRADE(inp))
 			return;
 		did_upgrade = 1;
 	} else {
 		did_upgrade = 0;
 	}
 
 	/* detach rate limiting */
 	in_pcbdetach_txrtlmt(inp);
 
 	/* make sure new mbuf send tag allocation is made */
 	inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
 
 	if (did_upgrade)
 		INP_DOWNGRADE(inp);
 }
 #endif /* RATELIMIT */
Index: head/sys/netinet/in_var.h
===================================================================
--- head/sys/netinet/in_var.h	(revision 349368)
+++ head/sys/netinet/in_var.h	(revision 349369)
@@ -1,431 +1,483 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1985, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_var.h	8.2 (Berkeley) 1/9/95
  * $FreeBSD$
  */
 
 #ifndef _NETINET_IN_VAR_H_
 #define _NETINET_IN_VAR_H_
 
 /*
  * Argument structure for SIOCAIFADDR.
  */
 struct	in_aliasreq {
 	char	ifra_name[IFNAMSIZ];		/* if name, e.g. "en0" */
 	struct	sockaddr_in ifra_addr;
 	struct	sockaddr_in ifra_broadaddr;
 #define ifra_dstaddr ifra_broadaddr
 	struct	sockaddr_in ifra_mask;
 	int	ifra_vhid;
 };
 
 #ifdef _KERNEL
 #include <sys/queue.h>
 #include <sys/fnv_hash.h>
 #include <sys/tree.h>
 
 struct igmp_ifsoftc;
 struct in_multi;
 struct lltable;
 SLIST_HEAD(in_multi_head, in_multi);
 
 /*
  * IPv4 per-interface state.
  */
 struct in_ifinfo {
 	struct lltable		*ii_llt;	/* ARP state */
 	struct igmp_ifsoftc	*ii_igmp;	/* IGMP state */
 	struct in_multi		*ii_allhosts;	/* 224.0.0.1 membership */
 };
 
 /*
  * Interface address, Internet version.  One of these structures
  * is allocated for each Internet address on an interface.
  * The ifaddr structure contains the protocol-independent part
  * of the structure and is assumed to be first.
  */
 struct in_ifaddr {
 	struct	ifaddr ia_ifa;		/* protocol-independent info */
 #define	ia_ifp		ia_ifa.ifa_ifp
 #define ia_flags	ia_ifa.ifa_flags
 					/* ia_subnet{,mask} in host order */
 	u_long	ia_subnet;		/* subnet address */
 	u_long	ia_subnetmask;		/* mask of subnet */
 	LIST_ENTRY(in_ifaddr) ia_hash;	/* entry in bucket of inet addresses */
 	CK_STAILQ_ENTRY(in_ifaddr) ia_link;	/* list of internet addresses */
 	struct	sockaddr_in ia_addr;	/* reserve space for interface name */
 	struct	sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */
 #define	ia_broadaddr	ia_dstaddr
 	struct	sockaddr_in ia_sockmask; /* reserve space for general netmask */
 	struct	callout ia_garp_timer;	/* timer for retransmitting GARPs */
 	int	ia_garp_count;		/* count of retransmitted GARPs */
 };
 
 /*
  * Given a pointer to an in_ifaddr (ifaddr),
  * return a pointer to the addr as a sockaddr_in.
  */
 #define IA_SIN(ia)    (&(((struct in_ifaddr *)(ia))->ia_addr))
 #define IA_DSTSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_dstaddr))
 #define IA_MASKSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_sockmask))
 
 #define IN_LNAOF(in, ifa) \
 	((ntohl((in).s_addr) & ~((struct in_ifaddr *)(ifa)->ia_subnetmask))
 
 extern	u_char	inetctlerrmap[];
 
 #define LLTABLE(ifp)	\
 	((struct in_ifinfo *)(ifp)->if_afdata[AF_INET])->ii_llt
 /*
  * Hash table for IP addresses.
  */
 CK_STAILQ_HEAD(in_ifaddrhead, in_ifaddr);
 LIST_HEAD(in_ifaddrhashhead, in_ifaddr);
 
 VNET_DECLARE(struct in_ifaddrhashhead *, in_ifaddrhashtbl);
 VNET_DECLARE(struct in_ifaddrhead, in_ifaddrhead);
 VNET_DECLARE(u_long, in_ifaddrhmask);		/* mask for hash table */
 
 #define	V_in_ifaddrhashtbl	VNET(in_ifaddrhashtbl)
 #define	V_in_ifaddrhead		VNET(in_ifaddrhead)
 #define	V_in_ifaddrhmask	VNET(in_ifaddrhmask)
 
 #define INADDR_NHASH_LOG2       9
 #define INADDR_NHASH		(1 << INADDR_NHASH_LOG2)
 #define INADDR_HASHVAL(x)	fnv_32_buf((&(x)), sizeof(x), FNV1_32_INIT)
 #define INADDR_HASH(x) \
 	(&V_in_ifaddrhashtbl[INADDR_HASHVAL(x) & V_in_ifaddrhmask])
 
 extern	struct rmlock in_ifaddr_lock;
 
 #define	IN_IFADDR_LOCK_ASSERT()	rm_assert(&in_ifaddr_lock, RA_LOCKED)
 #define	IN_IFADDR_RLOCK(t)	rm_rlock(&in_ifaddr_lock, (t))
 #define	IN_IFADDR_RLOCK_ASSERT()	rm_assert(&in_ifaddr_lock, RA_RLOCKED)
 #define	IN_IFADDR_RUNLOCK(t)	rm_runlock(&in_ifaddr_lock, (t))
 #define	IN_IFADDR_WLOCK()	rm_wlock(&in_ifaddr_lock)
 #define	IN_IFADDR_WLOCK_ASSERT()	rm_assert(&in_ifaddr_lock, RA_WLOCKED)
 #define	IN_IFADDR_WUNLOCK()	rm_wunlock(&in_ifaddr_lock)
 
 /*
  * Macro for finding the internet address structure (in_ifaddr)
  * corresponding to one of our IP addresses (in_addr).
  */
 #define INADDR_TO_IFADDR(addr, ia) \
 	/* struct in_addr addr; */ \
 	/* struct in_ifaddr *ia; */ \
 do { \
 \
 	LIST_FOREACH(ia, INADDR_HASH((addr).s_addr), ia_hash) \
 		if (IA_SIN(ia)->sin_addr.s_addr == (addr).s_addr) \
 			break; \
 } while (0)
 
 /*
  * Macro for finding the interface (ifnet structure) corresponding to one
  * of our IP addresses.
  */
 #define INADDR_TO_IFP(addr, ifp) \
 	/* struct in_addr addr; */ \
 	/* struct ifnet *ifp; */ \
 { \
 	struct in_ifaddr *ia; \
 \
 	INADDR_TO_IFADDR(addr, ia); \
 	(ifp) = (ia == NULL) ? NULL : ia->ia_ifp; \
 }
 
 /*
  * Macro for finding the internet address structure (in_ifaddr) corresponding
  * to a given interface (ifnet structure).
  */
 #define IFP_TO_IA(ifp, ia, t)						\
 	/* struct ifnet *ifp; */					\
 	/* struct in_ifaddr *ia; */					\
 	/* struct rm_priotracker *t; */					\
 do {									\
 	IN_IFADDR_RLOCK((t));						\
 	for ((ia) = CK_STAILQ_FIRST(&V_in_ifaddrhead);			\
 	    (ia) != NULL && (ia)->ia_ifp != (ifp);			\
 	    (ia) = CK_STAILQ_NEXT((ia), ia_link))				\
 		continue;						\
 	IN_IFADDR_RUNLOCK((t));						\
 } while (0)
 
 /*
  * Legacy IPv4 IGMP per-link structure.
  */
 struct router_info {
 	struct ifnet *rti_ifp;
 	int    rti_type; /* type of router which is querier on this interface */
 	int    rti_time; /* # of slow timeouts since last old query */
 	SLIST_ENTRY(router_info) rti_list;
 };
 
 /*
  * IPv4 multicast IGMP-layer source entry.
  */
 struct ip_msource {
 	RB_ENTRY(ip_msource)	ims_link;	/* RB tree links */
 	in_addr_t		ims_haddr;	/* host byte order */
 	struct ims_st {
 		uint16_t	ex;		/* # of exclusive members */
 		uint16_t	in;		/* # of inclusive members */
 	}			ims_st[2];	/* state at t0, t1 */
 	uint8_t			ims_stp;	/* pending query */
 };
 
 /*
  * IPv4 multicast PCB-layer source entry.
  */
 struct in_msource {
 	RB_ENTRY(ip_msource)	ims_link;	/* RB tree links */
 	in_addr_t		ims_haddr;	/* host byte order */
 	uint8_t			imsl_st[2];	/* state before/at commit */
 };
 
 RB_HEAD(ip_msource_tree, ip_msource);	/* define struct ip_msource_tree */
 
 static __inline int
 ip_msource_cmp(const struct ip_msource *a, const struct ip_msource *b)
 {
 
 	if (a->ims_haddr < b->ims_haddr)
 		return (-1);
 	if (a->ims_haddr == b->ims_haddr)
 		return (0);
 	return (1);
 }
 RB_PROTOTYPE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp);
 
 /*
  * IPv4 multicast PCB-layer group filter descriptor.
  */
 struct in_mfilter {
 	struct ip_msource_tree	imf_sources; /* source list for (S,G) */
 	u_long			imf_nsrc;    /* # of source entries */
 	uint8_t			imf_st[2];   /* state before/at commit */
+	struct in_multi	       *imf_inm;     /* associated multicast address */
+	STAILQ_ENTRY(in_mfilter) imf_entry;  /* list entry */
 };
+
+/*
+ * Helper types and functions for IPv4 multicast filters.
+ */
+STAILQ_HEAD(ip_mfilter_head, in_mfilter);
+
+struct in_mfilter *ip_mfilter_alloc(int mflags, int st0, int st1);
+void ip_mfilter_free(struct in_mfilter *);
+
+static inline void
+ip_mfilter_init(struct ip_mfilter_head *head)
+{
+
+	STAILQ_INIT(head);
+}
+
+static inline struct in_mfilter *
+ip_mfilter_first(const struct ip_mfilter_head *head)
+{
+
+	return (STAILQ_FIRST(head));
+}
+
+static inline void
+ip_mfilter_insert(struct ip_mfilter_head *head, struct in_mfilter *imf)
+{
+
+	STAILQ_INSERT_TAIL(head, imf, imf_entry);
+}
+
+static inline void
+ip_mfilter_remove(struct ip_mfilter_head *head, struct in_mfilter *imf)
+{
+
+	STAILQ_REMOVE(head, imf, in_mfilter, imf_entry);
+}
+
+#define	IP_MFILTER_FOREACH(imf, head) \
+	STAILQ_FOREACH(imf, head, imf_entry)
+
+static inline size_t
+ip_mfilter_count(struct ip_mfilter_head *head)
+{
+	struct in_mfilter *imf;
+	size_t num = 0;
+
+	STAILQ_FOREACH(imf, head, imf_entry)
+		num++;
+	return (num);
+}
 
 /*
  * IPv4 group descriptor.
  *
  * For every entry on an ifnet's if_multiaddrs list which represents
  * an IP multicast group, there is one of these structures.
  *
  * If any source filters are present, then a node will exist in the RB-tree
  * to permit fast lookup by source whenever an operation takes place.
  * This permits pre-order traversal when we issue reports.
  * Source filter trees are kept separately from the socket layer to
  * greatly simplify locking.
  *
  * When IGMPv3 is active, inm_timer is the response to group query timer.
  * The state-change timer inm_sctimer is separate; whenever state changes
  * for the group the state change record is generated and transmitted,
  * and kept if retransmissions are necessary.
  *
  * FUTURE: inm_link is now only used when groups are being purged
  * on a detaching ifnet. It could be demoted to a SLIST_ENTRY, but
  * because it is at the very start of the struct, we can't do this
  * w/o breaking the ABI for ifmcstat.
  */
 struct in_multi {
 	LIST_ENTRY(in_multi) inm_link;	/* to-be-released by in_ifdetach */
 	struct	in_addr inm_addr;	/* IP multicast address, convenience */
 	struct	ifnet *inm_ifp;		/* back pointer to ifnet */
 	struct	ifmultiaddr *inm_ifma;	/* back pointer to ifmultiaddr */
 	u_int	inm_timer;		/* IGMPv1/v2 group / v3 query timer */
 	u_int	inm_state;		/* state of the membership */
 	void	*inm_rti;		/* unused, legacy field */
 	u_int	inm_refcount;		/* reference count */
 
 	/* New fields for IGMPv3 follow. */
 	struct igmp_ifsoftc	*inm_igi;	/* IGMP info */
 	SLIST_ENTRY(in_multi)	 inm_nrele;	/* to-be-released by IGMP */
 	struct ip_msource_tree	 inm_srcs;	/* tree of sources */
 	u_long			 inm_nsrc;	/* # of tree entries */
 
 	struct mbufq		 inm_scq;	/* queue of pending
 						 * state-change packets */
 	struct timeval		 inm_lastgsrtv;	/* Time of last G-S-R query */
 	uint16_t		 inm_sctimer;	/* state-change timer */
 	uint16_t		 inm_scrv;	/* state-change rexmit count */
 
 	/*
 	 * SSM state counters which track state at T0 (the time the last
 	 * state-change report's RV timer went to zero) and T1
 	 * (time of pending report, i.e. now).
 	 * Used for computing IGMPv3 state-change reports. Several refcounts
 	 * are maintained here to optimize for common use-cases.
 	 */
 	struct inm_st {
 		uint16_t	iss_fmode;	/* IGMP filter mode */
 		uint16_t	iss_asm;	/* # of ASM listeners */
 		uint16_t	iss_ex;		/* # of exclusive members */
 		uint16_t	iss_in;		/* # of inclusive members */
 		uint16_t	iss_rec;	/* # of recorded sources */
 	}			inm_st[2];	/* state at t0, t1 */
 };
 
 /*
  * Helper function to derive the filter mode on a source entry
  * from its internal counters. Predicates are:
  *  A source is only excluded if all listeners exclude it.
  *  A source is only included if no listeners exclude it,
  *  and at least one listener includes it.
  * May be used by ifmcstat(8).
  */
 static __inline uint8_t
 ims_get_mode(const struct in_multi *inm, const struct ip_msource *ims,
     uint8_t t)
 {
 
 	t = !!t;
 	if (inm->inm_st[t].iss_ex > 0 &&
 	    inm->inm_st[t].iss_ex == ims->ims_st[t].ex)
 		return (MCAST_EXCLUDE);
 	else if (ims->ims_st[t].in > 0 && ims->ims_st[t].ex == 0)
 		return (MCAST_INCLUDE);
 	return (MCAST_UNDEFINED);
 }
 
 #ifdef SYSCTL_DECL
 SYSCTL_DECL(_net_inet);
 SYSCTL_DECL(_net_inet_ip);
 SYSCTL_DECL(_net_inet_raw);
 #endif
 
 /*
  * Lock macros for IPv4 layer multicast address lists.  IPv4 lock goes
  * before link layer multicast locks in the lock order.  In most cases,
  * consumers of IN_*_MULTI() macros should acquire the locks before
  * calling them; users of the in_{add,del}multi() functions should not.
  */
 extern struct mtx in_multi_list_mtx;
 extern struct sx in_multi_sx;
 
 #define	IN_MULTI_LIST_LOCK()		mtx_lock(&in_multi_list_mtx)
 #define	IN_MULTI_LIST_UNLOCK()	mtx_unlock(&in_multi_list_mtx)
 #define	IN_MULTI_LIST_LOCK_ASSERT()	mtx_assert(&in_multi_list_mtx, MA_OWNED)
 #define	IN_MULTI_LIST_UNLOCK_ASSERT() mtx_assert(&in_multi_list_mtx, MA_NOTOWNED)
 
 #define	IN_MULTI_LOCK()		sx_xlock(&in_multi_sx)
 #define	IN_MULTI_UNLOCK()	sx_xunlock(&in_multi_sx)
 #define	IN_MULTI_LOCK_ASSERT()	sx_assert(&in_multi_sx, SA_XLOCKED)
 #define	IN_MULTI_UNLOCK_ASSERT() sx_assert(&in_multi_sx, SA_XUNLOCKED)
 
 void inm_disconnect(struct in_multi *inm);
 extern int ifma_restart;
 
 /* Acquire an in_multi record. */
 static __inline void
 inm_acquire_locked(struct in_multi *inm)
 {
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 	++inm->inm_refcount;
 }
 
 static __inline void
 inm_acquire(struct in_multi *inm)
 {
 	IN_MULTI_LIST_LOCK();
 	inm_acquire_locked(inm);
 	IN_MULTI_LIST_UNLOCK();
 }
 
 static __inline void
 inm_rele_locked(struct in_multi_head *inmh, struct in_multi *inm)
 {
 	MPASS(inm->inm_refcount > 0);
 	IN_MULTI_LIST_LOCK_ASSERT();
 
 	if (--inm->inm_refcount == 0) {
 		MPASS(inmh != NULL);
 		inm_disconnect(inm);
 		inm->inm_ifma->ifma_protospec = NULL;
 		SLIST_INSERT_HEAD(inmh, inm, inm_nrele);
 	}
 }
 
 /*
  * Return values for imo_multi_filter().
  */
 #define MCAST_PASS		0	/* Pass */
 #define MCAST_NOTGMEMBER	1	/* This host not a member of group */
 #define MCAST_NOTSMEMBER	2	/* This host excluded source */
 #define MCAST_MUTED		3	/* [deprecated] */
 
 struct	rtentry;
 struct	route;
 struct	ip_moptions;
 
 struct in_multi *inm_lookup_locked(struct ifnet *, const struct in_addr);
 struct in_multi *inm_lookup(struct ifnet *, const struct in_addr);
 int	imo_multi_filter(const struct ip_moptions *, const struct ifnet *,
 	    const struct sockaddr *, const struct sockaddr *);
 void	inm_commit(struct in_multi *);
 void	inm_clear_recorded(struct in_multi *);
 void	inm_print(const struct in_multi *);
 int	inm_record_source(struct in_multi *inm, const in_addr_t);
 void	inm_release_deferred(struct in_multi *);
 void	inm_release_list_deferred(struct in_multi_head *);
 struct	in_multi *
 in_addmulti(struct in_addr *, struct ifnet *);
 int	in_joingroup(struct ifnet *, const struct in_addr *,
 	    /*const*/ struct in_mfilter *, struct in_multi **);
 int	in_joingroup_locked(struct ifnet *, const struct in_addr *,
 	    /*const*/ struct in_mfilter *, struct in_multi **);
 int	in_leavegroup(struct in_multi *, /*const*/ struct in_mfilter *);
 int	in_leavegroup_locked(struct in_multi *,
 	    /*const*/ struct in_mfilter *);
 int	in_control(struct socket *, u_long, caddr_t, struct ifnet *,
 	    struct thread *);
 int	in_addprefix(struct in_ifaddr *, int);
 int	in_scrubprefix(struct in_ifaddr *, u_int);
 void	in_ifscrub_all(void);
 void	ip_input(struct mbuf *);
 void	ip_direct_input(struct mbuf *);
 void	in_ifadown(struct ifaddr *ifa, int);
 struct	mbuf	*ip_tryforward(struct mbuf *);
 void	*in_domifattach(struct ifnet *);
 void	in_domifdetach(struct ifnet *, void *);
 
 
 /* XXX */
 void	 in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum);
 void	 in_rtredirect(struct sockaddr *, struct sockaddr *,
 	    struct sockaddr *, int, struct sockaddr *, u_int);
 #endif /* _KERNEL */
 
 /* INET6 stuff */
 #include <netinet6/in6_var.h>
 
 #endif /* _NETINET_IN_VAR_H_ */
Index: head/sys/netinet/ip_carp.c
===================================================================
--- head/sys/netinet/ip_carp.c	(revision 349368)
+++ head/sys/netinet/ip_carp.c	(revision 349369)
@@ -1,2303 +1,2303 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002 Michael Shalayeff.
  * Copyright (c) 2003 Ryan McBride.
  * Copyright (c) 2011 Gleb Smirnoff <glebius@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_bpf.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/taskqueue.h>
 #include <sys/counter.h>
 
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_llatbl.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_carp.h>
 #include <netinet/ip.h>
 #include <machine/in_cksum.h>
 #endif
 #ifdef INET
 #include <netinet/ip_var.h>
 #include <netinet/if_ether.h>
 #endif
 
 #ifdef INET6
 #include <netinet/icmp6.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #endif
 
 #include <crypto/sha1.h>
 
 static MALLOC_DEFINE(M_CARP, "CARP", "CARP addresses");
 
 struct carp_softc {
 	struct ifnet		*sc_carpdev;	/* Pointer to parent ifnet. */
 	struct ifaddr		**sc_ifas;	/* Our ifaddrs. */
 	struct sockaddr_dl	sc_addr;	/* Our link level address. */
 	struct callout		sc_ad_tmo;	/* Advertising timeout. */
 #ifdef INET
 	struct callout		sc_md_tmo;	/* Master down timeout. */
 #endif
 #ifdef INET6
 	struct callout 		sc_md6_tmo;	/* XXX: Master down timeout. */
 #endif
 	struct mtx		sc_mtx;
 
 	int			sc_vhid;
 	int			sc_advskew;
 	int			sc_advbase;
 
 	int			sc_naddrs;
 	int			sc_naddrs6;
 	int			sc_ifasiz;
 	enum { INIT = 0, BACKUP, MASTER }	sc_state;
 	int			sc_suppress;
 	int			sc_sendad_errors;
 #define	CARP_SENDAD_MAX_ERRORS	3
 	int			sc_sendad_success;
 #define	CARP_SENDAD_MIN_SUCCESS 3
 
 	int			sc_init_counter;
 	uint64_t		sc_counter;
 
 	/* authentication */
 #define	CARP_HMAC_PAD	64
 	unsigned char sc_key[CARP_KEY_LEN];
 	unsigned char sc_pad[CARP_HMAC_PAD];
 	SHA1_CTX sc_sha1;
 
 	TAILQ_ENTRY(carp_softc)	sc_list;	/* On the carp_if list. */
 	LIST_ENTRY(carp_softc)	sc_next;	/* On the global list. */
 };
 
 struct carp_if {
 #ifdef INET
 	int	cif_naddrs;
 #endif
 #ifdef INET6
 	int	cif_naddrs6;
 #endif
 	TAILQ_HEAD(, carp_softc) cif_vrs;
 #ifdef INET
 	struct ip_moptions 	 cif_imo;
 #endif
 #ifdef INET6
 	struct ip6_moptions 	 cif_im6o;
 #endif
 	struct ifnet	*cif_ifp;
 	struct mtx	cif_mtx;
 	uint32_t	cif_flags;
 #define	CIF_PROMISC	0x00000001
 };
 
 #define	CARP_INET	0
 #define	CARP_INET6	1
 static int proto_reg[] = {-1, -1};
 
 /*
  * Brief design of carp(4).
  *
  * Any carp-capable ifnet may have a list of carp softcs hanging off
  * its ifp->if_carp pointer. Each softc represents one unique virtual
  * host id, or vhid. The softc has a back pointer to the ifnet. All
  * softcs are joined in a global list, which has quite limited use.
  *
  * Any interface address that takes part in CARP negotiation has a
  * pointer to the softc of its vhid, ifa->ifa_carp. That could be either
  * AF_INET or AF_INET6 address.
  *
  * Although, one can get the softc's backpointer to ifnet and traverse
  * through its ifp->if_addrhead queue to find all interface addresses
  * involved in CARP, we keep a growable array of ifaddr pointers. This
  * allows us to avoid grabbing the IF_ADDR_LOCK() in many traversals that
  * do calls into the network stack, thus avoiding LORs.
  *
  * Locking:
  *
  * Each softc has a lock sc_mtx. It is used to synchronise carp_input_c(),
  * callout-driven events and ioctl()s.
  *
  * To traverse the list of softcs on an ifnet we use CIF_LOCK() or carp_sx.
  * To traverse the global list we use the mutex carp_mtx.
  *
  * Known issues with locking:
  *
  * - Sending ad, we put the pointer to the softc in an mtag, and no reference
  *   counting is done on the softc.
  * - On module unload we may race (?) with packet processing thread
  *   dereferencing our function pointers.
  */
 
 /* Accept incoming CARP packets. */
 VNET_DEFINE_STATIC(int, carp_allow) = 1;
 #define	V_carp_allow	VNET(carp_allow)
 
 /* Set DSCP in outgoing CARP packets. */
 VNET_DEFINE_STATIC(int, carp_dscp) = 56;
 #define	V_carp_dscp	VNET(carp_dscp)
 
 /* Preempt slower nodes. */
 VNET_DEFINE_STATIC(int, carp_preempt) = 0;
 #define	V_carp_preempt	VNET(carp_preempt)
 
 /* Log level. */
 VNET_DEFINE_STATIC(int, carp_log) = 1;
 #define	V_carp_log	VNET(carp_log)
 
 /* Global advskew demotion. */
 VNET_DEFINE_STATIC(int, carp_demotion) = 0;
 #define	V_carp_demotion	VNET(carp_demotion)
 
 /* Send error demotion factor. */
 VNET_DEFINE_STATIC(int, carp_senderr_adj) = CARP_MAXSKEW;
 #define	V_carp_senderr_adj	VNET(carp_senderr_adj)
 
 /* Iface down demotion factor. */
 VNET_DEFINE_STATIC(int, carp_ifdown_adj) = CARP_MAXSKEW;
 #define	V_carp_ifdown_adj	VNET(carp_ifdown_adj)
 
 static int carp_allow_sysctl(SYSCTL_HANDLER_ARGS);
 static int carp_dscp_sysctl(SYSCTL_HANDLER_ARGS);
 static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_NODE(_net_inet, IPPROTO_CARP,	carp,	CTLFLAG_RW, 0,	"CARP");
 SYSCTL_PROC(_net_inet_carp, OID_AUTO, allow,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, 0, 0, carp_allow_sysctl, "I",
     "Accept incoming CARP packets");
 SYSCTL_PROC(_net_inet_carp, OID_AUTO, dscp,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, 0, 0, carp_dscp_sysctl, "I",
     "DSCP value for carp packets");
 SYSCTL_INT(_net_inet_carp, OID_AUTO, preempt, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(carp_preempt), 0, "High-priority backup preemption mode");
 SYSCTL_INT(_net_inet_carp, OID_AUTO, log, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(carp_log), 0, "CARP log level");
 SYSCTL_PROC(_net_inet_carp, OID_AUTO, demotion,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
     0, 0, carp_demote_adj_sysctl, "I",
     "Adjust demotion factor (skew of advskew)");
 SYSCTL_INT(_net_inet_carp, OID_AUTO, senderr_demotion_factor,
     CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(carp_senderr_adj), 0, "Send error demotion factor adjustment");
 SYSCTL_INT(_net_inet_carp, OID_AUTO, ifdown_demotion_factor,
     CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(carp_ifdown_adj), 0,
     "Interface down demotion factor adjustment");
 
 VNET_PCPUSTAT_DEFINE(struct carpstats, carpstats);
 VNET_PCPUSTAT_SYSINIT(carpstats);
 VNET_PCPUSTAT_SYSUNINIT(carpstats);
 
 #define	CARPSTATS_ADD(name, val)	\
     counter_u64_add(VNET(carpstats)[offsetof(struct carpstats, name) / \
 	sizeof(uint64_t)], (val))
 #define	CARPSTATS_INC(name)		CARPSTATS_ADD(name, 1)
 
 SYSCTL_VNET_PCPUSTAT(_net_inet_carp, OID_AUTO, stats, struct carpstats,
     carpstats, "CARP statistics (struct carpstats, netinet/ip_carp.h)");
 
 #define	CARP_LOCK_INIT(sc)	mtx_init(&(sc)->sc_mtx, "carp_softc",   \
 	NULL, MTX_DEF)
 #define	CARP_LOCK_DESTROY(sc)	mtx_destroy(&(sc)->sc_mtx)
 #define	CARP_LOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_mtx, MA_OWNED)
 #define	CARP_LOCK(sc)		mtx_lock(&(sc)->sc_mtx)
 #define	CARP_UNLOCK(sc)		mtx_unlock(&(sc)->sc_mtx)
 #define	CIF_LOCK_INIT(cif)	mtx_init(&(cif)->cif_mtx, "carp_if",   \
 	NULL, MTX_DEF)
 #define	CIF_LOCK_DESTROY(cif)	mtx_destroy(&(cif)->cif_mtx)
 #define	CIF_LOCK_ASSERT(cif)	mtx_assert(&(cif)->cif_mtx, MA_OWNED)
 #define	CIF_LOCK(cif)		mtx_lock(&(cif)->cif_mtx)
 #define	CIF_UNLOCK(cif)		mtx_unlock(&(cif)->cif_mtx)
 #define	CIF_FREE(cif)	do {				\
 		CIF_LOCK(cif);				\
 		if (TAILQ_EMPTY(&(cif)->cif_vrs))	\
 			carp_free_if(cif);		\
 		else					\
 			CIF_UNLOCK(cif);		\
 } while (0)
 
 #define	CARP_LOG(...)	do {				\
 	if (V_carp_log > 0)				\
 		log(LOG_INFO, "carp: " __VA_ARGS__);	\
 } while (0)
 
 #define	CARP_DEBUG(...)	do {				\
 	if (V_carp_log > 1)				\
 		log(LOG_DEBUG, __VA_ARGS__);		\
 } while (0)
 
 #define	IFNET_FOREACH_IFA(ifp, ifa)					\
 	CK_STAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link)	\
 		if ((ifa)->ifa_carp != NULL)
 
 #define	CARP_FOREACH_IFA(sc, ifa)					\
 	CARP_LOCK_ASSERT(sc);						\
 	for (int _i = 0;						\
 		_i < (sc)->sc_naddrs + (sc)->sc_naddrs6 &&		\
 		((ifa) = sc->sc_ifas[_i]) != NULL;			\
 		++_i)
 
 #define	IFNET_FOREACH_CARP(ifp, sc)					\
 	KASSERT(mtx_owned(&ifp->if_carp->cif_mtx) ||			\
 	    sx_xlocked(&carp_sx), ("cif_vrs not locked"));		\
 	TAILQ_FOREACH((sc), &(ifp)->if_carp->cif_vrs, sc_list)
 
 #define	DEMOTE_ADVSKEW(sc)					\
     (((sc)->sc_advskew + V_carp_demotion > CARP_MAXSKEW) ?	\
     CARP_MAXSKEW : ((sc)->sc_advskew + V_carp_demotion))
 
 static void	carp_input_c(struct mbuf *, struct carp_header *, sa_family_t);
 static struct carp_softc
 		*carp_alloc(struct ifnet *);
 static void	carp_destroy(struct carp_softc *);
 static struct carp_if
 		*carp_alloc_if(struct ifnet *);
 static void	carp_free_if(struct carp_if *);
 static void	carp_set_state(struct carp_softc *, int, const char* reason);
 static void	carp_sc_state(struct carp_softc *);
 static void	carp_setrun(struct carp_softc *, sa_family_t);
 static void	carp_master_down(void *);
 static void	carp_master_down_locked(struct carp_softc *,
     		    const char* reason);
 static void	carp_send_ad(void *);
 static void	carp_send_ad_locked(struct carp_softc *);
 static void	carp_addroute(struct carp_softc *);
 static void	carp_ifa_addroute(struct ifaddr *);
 static void	carp_delroute(struct carp_softc *);
 static void	carp_ifa_delroute(struct ifaddr *);
 static void	carp_send_ad_all(void *, int);
 static void	carp_demote_adj(int, char *);
 
 static LIST_HEAD(, carp_softc) carp_list;
 static struct mtx carp_mtx;
 static struct sx carp_sx;
 static struct task carp_sendall_task =
     TASK_INITIALIZER(0, carp_send_ad_all, NULL);
 
 static void
 carp_hmac_prepare(struct carp_softc *sc)
 {
 	uint8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT;
 	uint8_t vhid = sc->sc_vhid & 0xff;
 	struct ifaddr *ifa;
 	int i, found;
 #ifdef INET
 	struct in_addr last, cur, in;
 #endif
 #ifdef INET6
 	struct in6_addr last6, cur6, in6;
 #endif
 
 	CARP_LOCK_ASSERT(sc);
 
 	/* Compute ipad from key. */
 	bzero(sc->sc_pad, sizeof(sc->sc_pad));
 	bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key));
 	for (i = 0; i < sizeof(sc->sc_pad); i++)
 		sc->sc_pad[i] ^= 0x36;
 
 	/* Precompute first part of inner hash. */
 	SHA1Init(&sc->sc_sha1);
 	SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad));
 	SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version));
 	SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type));
 	SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid));
 #ifdef INET
 	cur.s_addr = 0;
 	do {
 		found = 0;
 		last = cur;
 		cur.s_addr = 0xffffffff;
 		CARP_FOREACH_IFA(sc, ifa) {
 			in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr;
 			if (ifa->ifa_addr->sa_family == AF_INET &&
 			    ntohl(in.s_addr) > ntohl(last.s_addr) &&
 			    ntohl(in.s_addr) < ntohl(cur.s_addr)) {
 				cur.s_addr = in.s_addr;
 				found++;
 			}
 		}
 		if (found)
 			SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur));
 	} while (found);
 #endif /* INET */
 #ifdef INET6
 	memset(&cur6, 0, sizeof(cur6));
 	do {
 		found = 0;
 		last6 = cur6;
 		memset(&cur6, 0xff, sizeof(cur6));
 		CARP_FOREACH_IFA(sc, ifa) {
 			in6 = ifatoia6(ifa)->ia_addr.sin6_addr;
 			if (IN6_IS_SCOPE_EMBED(&in6))
 				in6.s6_addr16[1] = 0;
 			if (ifa->ifa_addr->sa_family == AF_INET6 &&
 			    memcmp(&in6, &last6, sizeof(in6)) > 0 &&
 			    memcmp(&in6, &cur6, sizeof(in6)) < 0) {
 				cur6 = in6;
 				found++;
 			}
 		}
 		if (found)
 			SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6));
 	} while (found);
 #endif /* INET6 */
 
 	/* convert ipad to opad */
 	for (i = 0; i < sizeof(sc->sc_pad); i++)
 		sc->sc_pad[i] ^= 0x36 ^ 0x5c;
 }
 
 static void
 carp_hmac_generate(struct carp_softc *sc, uint32_t counter[2],
     unsigned char md[20])
 {
 	SHA1_CTX sha1ctx;
 
 	CARP_LOCK_ASSERT(sc);
 
 	/* fetch first half of inner hash */
 	bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx));
 
 	SHA1Update(&sha1ctx, (void *)counter, sizeof(sc->sc_counter));
 	SHA1Final(md, &sha1ctx);
 
 	/* outer hash */
 	SHA1Init(&sha1ctx);
 	SHA1Update(&sha1ctx, sc->sc_pad, sizeof(sc->sc_pad));
 	SHA1Update(&sha1ctx, md, 20);
 	SHA1Final(md, &sha1ctx);
 }
 
 static int
 carp_hmac_verify(struct carp_softc *sc, uint32_t counter[2],
     unsigned char md[20])
 {
 	unsigned char md2[20];
 
 	CARP_LOCK_ASSERT(sc);
 
 	carp_hmac_generate(sc, counter, md2);
 
 	return (bcmp(md, md2, sizeof(md2)));
 }
 
 /*
  * process input packet.
  * we have rearranged checks order compared to the rfc,
  * but it seems more efficient this way or not possible otherwise.
  */
 #ifdef INET
 int
 carp_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m = *mp;
 	struct ip *ip = mtod(m, struct ip *);
 	struct carp_header *ch;
 	int iplen, len;
 
 	iplen = *offp;
 	*mp = NULL;
 
 	CARPSTATS_INC(carps_ipackets);
 
 	if (!V_carp_allow) {
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/* verify that the IP TTL is 255.  */
 	if (ip->ip_ttl != CARP_DFLTTL) {
 		CARPSTATS_INC(carps_badttl);
 		CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__,
 		    ip->ip_ttl,
 		    m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	iplen = ip->ip_hl << 2;
 
 	if (m->m_pkthdr.len < iplen + sizeof(*ch)) {
 		CARPSTATS_INC(carps_badlen);
 		CARP_DEBUG("%s: received len %zd < sizeof(struct carp_header) "
 		    "on %s\n", __func__, m->m_len - sizeof(struct ip),
 		    m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	if (iplen + sizeof(*ch) < m->m_len) {
 		if ((m = m_pullup(m, iplen + sizeof(*ch))) == NULL) {
 			CARPSTATS_INC(carps_hdrops);
 			CARP_DEBUG("%s: pullup failed\n", __func__);
 			return (IPPROTO_DONE);
 		}
 		ip = mtod(m, struct ip *);
 	}
 	ch = (struct carp_header *)((char *)ip + iplen);
 
 	/*
 	 * verify that the received packet length is
 	 * equal to the CARP header
 	 */
 	len = iplen + sizeof(*ch);
 	if (len > m->m_pkthdr.len) {
 		CARPSTATS_INC(carps_badlen);
 		CARP_DEBUG("%s: packet too short %d on %s\n", __func__,
 		    m->m_pkthdr.len,
 		    m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	if ((m = m_pullup(m, len)) == NULL) {
 		CARPSTATS_INC(carps_hdrops);
 		return (IPPROTO_DONE);
 	}
 	ip = mtod(m, struct ip *);
 	ch = (struct carp_header *)((char *)ip + iplen);
 
 	/* verify the CARP checksum */
 	m->m_data += iplen;
 	if (in_cksum(m, len - iplen)) {
 		CARPSTATS_INC(carps_badsum);
 		CARP_DEBUG("%s: checksum failed on %s\n", __func__,
 		    m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 	m->m_data -= iplen;
 
 	carp_input_c(m, ch, AF_INET);
 	return (IPPROTO_DONE);
 }
 #endif
 
 #ifdef INET6
 int
 carp6_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m = *mp;
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	struct carp_header *ch;
 	u_int len;
 
 	CARPSTATS_INC(carps_ipackets6);
 
 	if (!V_carp_allow) {
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/* check if received on a valid carp interface */
 	if (m->m_pkthdr.rcvif->if_carp == NULL) {
 		CARPSTATS_INC(carps_badif);
 		CARP_DEBUG("%s: packet received on non-carp interface: %s\n",
 		    __func__, m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/* verify that the IP TTL is 255 */
 	if (ip6->ip6_hlim != CARP_DFLTTL) {
 		CARPSTATS_INC(carps_badttl);
 		CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__,
 		    ip6->ip6_hlim, m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/* verify that we have a complete carp packet */
 	len = m->m_len;
 	IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch));
 	if (ch == NULL) {
 		CARPSTATS_INC(carps_badlen);
 		CARP_DEBUG("%s: packet size %u too small\n", __func__, len);
 		return (IPPROTO_DONE);
 	}
 
 
 	/* verify the CARP checksum */
 	m->m_data += *offp;
 	if (in_cksum(m, sizeof(*ch))) {
 		CARPSTATS_INC(carps_badsum);
 		CARP_DEBUG("%s: checksum failed, on %s\n", __func__,
 		    m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 	m->m_data -= *offp;
 
 	carp_input_c(m, ch, AF_INET6);
 	return (IPPROTO_DONE);
 }
 #endif /* INET6 */
 
 /*
  * This routine should not be necessary at all, but some switches
  * (VMWare ESX vswitches) can echo our own packets back at us,
  * and we must ignore them or they will cause us to drop out of
  * MASTER mode.
  *
  * We cannot catch all cases of network loops.  Instead, what we
  * do here is catch any packet that arrives with a carp header
  * with a VHID of 0, that comes from an address that is our own.
  * These packets are by definition "from us" (even if they are from
  * a misconfigured host that is pretending to be us).
  *
  * The VHID test is outside this mini-function.
  */
 static int
 carp_source_is_self(struct mbuf *m, struct ifaddr *ifa, sa_family_t af)
 {
 #ifdef INET
 	struct ip *ip4;
 	struct in_addr in4;
 #endif
 #ifdef INET6
 	struct ip6_hdr *ip6;
 	struct in6_addr in6;
 #endif
 
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		ip4 = mtod(m, struct ip *);
 		in4 = ifatoia(ifa)->ia_addr.sin_addr;
 		return (in4.s_addr == ip4->ip_src.s_addr);
 #endif
 #ifdef INET6
 	case AF_INET6:
 		ip6 = mtod(m, struct ip6_hdr *);
 		in6 = ifatoia6(ifa)->ia_addr.sin6_addr;
 		return (memcmp(&in6, &ip6->ip6_src, sizeof(in6)) == 0);
 #endif
 	default:
 		break;
 	}
 	return (0);
 }
 
 static void
 carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af)
 {
 	struct ifnet *ifp = m->m_pkthdr.rcvif;
 	struct ifaddr *ifa, *match;
 	struct carp_softc *sc;
 	uint64_t tmp_counter;
 	struct timeval sc_tv, ch_tv;
 	struct epoch_tracker et;
 	int error;
 
 	/*
 	 * Verify that the VHID is valid on the receiving interface.
 	 *
 	 * There should be just one match.  If there are none
 	 * the VHID is not valid and we drop the packet.  If
 	 * there are multiple VHID matches, take just the first
 	 * one, for compatibility with previous code.  While we're
 	 * scanning, check for obvious loops in the network topology
 	 * (these should never happen, and as noted above, we may
 	 * miss real loops; this is just a double-check).
 	 */
 	NET_EPOCH_ENTER(et);
 	error = 0;
 	match = NULL;
 	IFNET_FOREACH_IFA(ifp, ifa) {
 		if (match == NULL && ifa->ifa_carp != NULL &&
 		    ifa->ifa_addr->sa_family == af &&
 		    ifa->ifa_carp->sc_vhid == ch->carp_vhid)
 			match = ifa;
 		if (ch->carp_vhid == 0 && carp_source_is_self(m, ifa, af))
 			error = ELOOP;
 	}
 	ifa = error ? NULL : match;
 	if (ifa != NULL)
 		ifa_ref(ifa);
 	NET_EPOCH_EXIT(et);
 
 	if (ifa == NULL) {
 		if (error == ELOOP) {
 			CARP_DEBUG("dropping looped packet on interface %s\n",
 			    ifp->if_xname);
 			CARPSTATS_INC(carps_badif);	/* ??? */
 		} else {
 			CARPSTATS_INC(carps_badvhid);
 		}
 		m_freem(m);
 		return;
 	}
 
 	/* verify the CARP version. */
 	if (ch->carp_version != CARP_VERSION) {
 		CARPSTATS_INC(carps_badver);
 		CARP_DEBUG("%s: invalid version %d\n", ifp->if_xname,
 		    ch->carp_version);
 		ifa_free(ifa);
 		m_freem(m);
 		return;
 	}
 
 	sc = ifa->ifa_carp;
 	CARP_LOCK(sc);
 	ifa_free(ifa);
 
 	if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) {
 		CARPSTATS_INC(carps_badauth);
 		CARP_DEBUG("%s: incorrect hash for VHID %u@%s\n", __func__,
 		    sc->sc_vhid, ifp->if_xname);
 		goto out;
 	}
 
 	tmp_counter = ntohl(ch->carp_counter[0]);
 	tmp_counter = tmp_counter<<32;
 	tmp_counter += ntohl(ch->carp_counter[1]);
 
 	/* XXX Replay protection goes here */
 
 	sc->sc_init_counter = 0;
 	sc->sc_counter = tmp_counter;
 
 	sc_tv.tv_sec = sc->sc_advbase;
 	sc_tv.tv_usec = DEMOTE_ADVSKEW(sc) * 1000000 / 256;
 	ch_tv.tv_sec = ch->carp_advbase;
 	ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256;
 
 	switch (sc->sc_state) {
 	case INIT:
 		break;
 	case MASTER:
 		/*
 		 * If we receive an advertisement from a master who's going to
 		 * be more frequent than us, go into BACKUP state.
 		 */
 		if (timevalcmp(&sc_tv, &ch_tv, >) ||
 		    timevalcmp(&sc_tv, &ch_tv, ==)) {
 			callout_stop(&sc->sc_ad_tmo);
 			carp_set_state(sc, BACKUP,
 			    "more frequent advertisement received");
 			carp_setrun(sc, 0);
 			carp_delroute(sc);
 		}
 		break;
 	case BACKUP:
 		/*
 		 * If we're pre-empting masters who advertise slower than us,
 		 * and this one claims to be slower, treat him as down.
 		 */
 		if (V_carp_preempt && timevalcmp(&sc_tv, &ch_tv, <)) {
 			carp_master_down_locked(sc,
 			    "preempting a slower master");
 			break;
 		}
 
 		/*
 		 *  If the master is going to advertise at such a low frequency
 		 *  that he's guaranteed to time out, we'd might as well just
 		 *  treat him as timed out now.
 		 */
 		sc_tv.tv_sec = sc->sc_advbase * 3;
 		if (timevalcmp(&sc_tv, &ch_tv, <)) {
 			carp_master_down_locked(sc, "master will time out");
 			break;
 		}
 
 		/*
 		 * Otherwise, we reset the counter and wait for the next
 		 * advertisement.
 		 */
 		carp_setrun(sc, af);
 		break;
 	}
 
 out:
 	CARP_UNLOCK(sc);
 	m_freem(m);
 }
 
 static int
 carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch)
 {
 	struct m_tag *mtag;
 
 	if (sc->sc_init_counter) {
 		/* this could also be seconds since unix epoch */
 		sc->sc_counter = arc4random();
 		sc->sc_counter = sc->sc_counter << 32;
 		sc->sc_counter += arc4random();
 	} else
 		sc->sc_counter++;
 
 	ch->carp_counter[0] = htonl((sc->sc_counter>>32)&0xffffffff);
 	ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff);
 
 	carp_hmac_generate(sc, ch->carp_counter, ch->carp_md);
 
 	/* Tag packet for carp_output */
 	if ((mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *),
 	    M_NOWAIT)) == NULL) {
 		m_freem(m);
 		CARPSTATS_INC(carps_onomem);
 		return (ENOMEM);
 	}
 	bcopy(&sc, mtag + 1, sizeof(sc));
 	m_tag_prepend(m, mtag);
 
 	return (0);
 }
 
 /*
  * To avoid LORs and possible recursions this function shouldn't
  * be called directly, but scheduled via taskqueue.
  */
 static void
 carp_send_ad_all(void *ctx __unused, int pending __unused)
 {
 	struct carp_softc *sc;
 
 	mtx_lock(&carp_mtx);
 	LIST_FOREACH(sc, &carp_list, sc_next)
 		if (sc->sc_state == MASTER) {
 			CARP_LOCK(sc);
 			CURVNET_SET(sc->sc_carpdev->if_vnet);
 			carp_send_ad_locked(sc);
 			CURVNET_RESTORE();
 			CARP_UNLOCK(sc);
 		}
 	mtx_unlock(&carp_mtx);
 }
 
 /* Send a periodic advertisement, executed in callout context. */
 static void
 carp_send_ad(void *v)
 {
 	struct carp_softc *sc = v;
 
 	CARP_LOCK_ASSERT(sc);
 	CURVNET_SET(sc->sc_carpdev->if_vnet);
 	carp_send_ad_locked(sc);
 	CURVNET_RESTORE();
 	CARP_UNLOCK(sc);
 }
 
 static void
 carp_send_ad_error(struct carp_softc *sc, int error)
 {
 
 	if (error) {
 		if (sc->sc_sendad_errors < INT_MAX)
 			sc->sc_sendad_errors++;
 		if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
 			static const char fmt[] = "send error %d on %s";
 			char msg[sizeof(fmt) + IFNAMSIZ];
 
 			sprintf(msg, fmt, error, sc->sc_carpdev->if_xname);
 			carp_demote_adj(V_carp_senderr_adj, msg);
 		}
 		sc->sc_sendad_success = 0;
 	} else {
 		if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS &&
 		    ++sc->sc_sendad_success >= CARP_SENDAD_MIN_SUCCESS) {
 			static const char fmt[] = "send ok on %s";
 			char msg[sizeof(fmt) + IFNAMSIZ];
 
 			sprintf(msg, fmt, sc->sc_carpdev->if_xname);
 			carp_demote_adj(-V_carp_senderr_adj, msg);
 			sc->sc_sendad_errors = 0;
 		} else
 			sc->sc_sendad_errors = 0;
 	}
 }
 
 /*
  * Pick the best ifaddr on the given ifp for sending CARP
  * advertisements.
  *
  * "Best" here is defined by ifa_preferred().  This function is much
  * much like ifaof_ifpforaddr() except that we just use ifa_preferred().
  *
  * (This could be simplified to return the actual address, except that
  * it has a different format in AF_INET and AF_INET6.)
  */
 static struct ifaddr *
 carp_best_ifa(int af, struct ifnet *ifp)
 {
 	struct epoch_tracker et;
 	struct ifaddr *ifa, *best;
 
 	if (af >= AF_MAX)
 		return (NULL);
 	best = NULL;
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family == af &&
 		    (best == NULL || ifa_preferred(best, ifa)))
 			best = ifa;
 	}
 	NET_EPOCH_EXIT(et);
 	if (best != NULL)
 		ifa_ref(best);
 	return (best);
 }
 
 static void
 carp_send_ad_locked(struct carp_softc *sc)
 {
 	struct carp_header ch;
 	struct timeval tv;
 	struct ifaddr *ifa;
 	struct carp_header *ch_ptr;
 	struct mbuf *m;
 	int len, advskew;
 
 	CARP_LOCK_ASSERT(sc);
 
 	advskew = DEMOTE_ADVSKEW(sc);
 	tv.tv_sec = sc->sc_advbase;
 	tv.tv_usec = advskew * 1000000 / 256;
 
 	ch.carp_version = CARP_VERSION;
 	ch.carp_type = CARP_ADVERTISEMENT;
 	ch.carp_vhid = sc->sc_vhid;
 	ch.carp_advbase = sc->sc_advbase;
 	ch.carp_advskew = advskew;
 	ch.carp_authlen = 7;	/* XXX DEFINE */
 	ch.carp_pad1 = 0;	/* must be zero */
 	ch.carp_cksum = 0;
 
 	/* XXXGL: OpenBSD picks first ifaddr with needed family. */
 
 #ifdef INET
 	if (sc->sc_naddrs) {
 		struct ip *ip;
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			CARPSTATS_INC(carps_onomem);
 			goto resched;
 		}
 		len = sizeof(*ip) + sizeof(ch);
 		m->m_pkthdr.len = len;
 		m->m_pkthdr.rcvif = NULL;
 		m->m_len = len;
 		M_ALIGN(m, m->m_len);
 		m->m_flags |= M_MCAST;
 		ip = mtod(m, struct ip *);
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = sizeof(*ip) >> 2;
 		ip->ip_tos = V_carp_dscp << IPTOS_DSCP_OFFSET;
 		ip->ip_len = htons(len);
 		ip->ip_off = htons(IP_DF);
 		ip->ip_ttl = CARP_DFLTTL;
 		ip->ip_p = IPPROTO_CARP;
 		ip->ip_sum = 0;
 		ip_fillid(ip);
 
 		ifa = carp_best_ifa(AF_INET, sc->sc_carpdev);
 		if (ifa != NULL) {
 			ip->ip_src.s_addr =
 			    ifatoia(ifa)->ia_addr.sin_addr.s_addr;
 			ifa_free(ifa);
 		} else
 			ip->ip_src.s_addr = 0;
 		ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP);
 
 		ch_ptr = (struct carp_header *)(&ip[1]);
 		bcopy(&ch, ch_ptr, sizeof(ch));
 		if (carp_prepare_ad(m, sc, ch_ptr))
 			goto resched;
 
 		m->m_data += sizeof(*ip);
 		ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip));
 		m->m_data -= sizeof(*ip);
 
 		CARPSTATS_INC(carps_opackets);
 
 		carp_send_ad_error(sc, ip_output(m, NULL, NULL, IP_RAWOUTPUT,
 		    &sc->sc_carpdev->if_carp->cif_imo, NULL));
 	}
 #endif /* INET */
 #ifdef INET6
 	if (sc->sc_naddrs6) {
 		struct ip6_hdr *ip6;
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			CARPSTATS_INC(carps_onomem);
 			goto resched;
 		}
 		len = sizeof(*ip6) + sizeof(ch);
 		m->m_pkthdr.len = len;
 		m->m_pkthdr.rcvif = NULL;
 		m->m_len = len;
 		M_ALIGN(m, m->m_len);
 		m->m_flags |= M_MCAST;
 		ip6 = mtod(m, struct ip6_hdr *);
 		bzero(ip6, sizeof(*ip6));
 		ip6->ip6_vfc |= IPV6_VERSION;
 		/* Traffic class isn't defined in ip6 struct instead
 		 * it gets offset into flowid field */
 		ip6->ip6_flow |= htonl(V_carp_dscp << (IPV6_FLOWLABEL_LEN +
 		    IPTOS_DSCP_OFFSET));
 		ip6->ip6_hlim = CARP_DFLTTL;
 		ip6->ip6_nxt = IPPROTO_CARP;
 
 		/* set the source address */
 		ifa = carp_best_ifa(AF_INET6, sc->sc_carpdev);
 		if (ifa != NULL) {
 			bcopy(IFA_IN6(ifa), &ip6->ip6_src,
 			    sizeof(struct in6_addr));
 			ifa_free(ifa);
 		} else
 			/* This should never happen with IPv6. */
 			bzero(&ip6->ip6_src, sizeof(struct in6_addr));
 
 		/* Set the multicast destination. */
 		ip6->ip6_dst.s6_addr16[0] = htons(0xff02);
 		ip6->ip6_dst.s6_addr8[15] = 0x12;
 		if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) {
 			m_freem(m);
 			CARP_DEBUG("%s: in6_setscope failed\n", __func__);
 			goto resched;
 		}
 
 		ch_ptr = (struct carp_header *)(&ip6[1]);
 		bcopy(&ch, ch_ptr, sizeof(ch));
 		if (carp_prepare_ad(m, sc, ch_ptr))
 			goto resched;
 
 		m->m_data += sizeof(*ip6);
 		ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip6));
 		m->m_data -= sizeof(*ip6);
 
 		CARPSTATS_INC(carps_opackets6);
 
 		carp_send_ad_error(sc, ip6_output(m, NULL, NULL, 0,
 		    &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL));
 	}
 #endif /* INET6 */
 
 resched:
 	callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc);
 }
 
 static void
 carp_addroute(struct carp_softc *sc)
 {
 	struct ifaddr *ifa;
 
 	CARP_FOREACH_IFA(sc, ifa)
 		carp_ifa_addroute(ifa);
 }
 
 static void
 carp_ifa_addroute(struct ifaddr *ifa)
 {
 
 	switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 	case AF_INET:
 		in_addprefix(ifatoia(ifa), RTF_UP);
 		ifa_add_loopback_route(ifa,
 		    (struct sockaddr *)&ifatoia(ifa)->ia_addr);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		ifa_add_loopback_route(ifa,
 		    (struct sockaddr *)&ifatoia6(ifa)->ia_addr);
 		nd6_add_ifa_lle(ifatoia6(ifa));
 		break;
 #endif
 	}
 }
 
 static void
 carp_delroute(struct carp_softc *sc)
 {
 	struct ifaddr *ifa;
 
 	CARP_FOREACH_IFA(sc, ifa)
 		carp_ifa_delroute(ifa);
 }
 
 static void
 carp_ifa_delroute(struct ifaddr *ifa)
 {
 
 	switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 	case AF_INET:
 		ifa_del_loopback_route(ifa,
 		    (struct sockaddr *)&ifatoia(ifa)->ia_addr);
 		in_scrubprefix(ifatoia(ifa), LLE_STATIC);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		ifa_del_loopback_route(ifa,
 		    (struct sockaddr *)&ifatoia6(ifa)->ia_addr);
 		nd6_rem_ifa_lle(ifatoia6(ifa), 1);
 		break;
 #endif
 	}
 }
 
 int
 carp_master(struct ifaddr *ifa)
 {
 	struct carp_softc *sc = ifa->ifa_carp;
 
 	return (sc->sc_state == MASTER);
 }
 
 #ifdef INET
 /*
  * Broadcast a gratuitous ARP request containing
  * the virtual router MAC address for each IP address
  * associated with the virtual router.
  */
 static void
 carp_send_arp(struct carp_softc *sc)
 {
 	struct ifaddr *ifa;
 	struct in_addr addr;
 
 	CARP_FOREACH_IFA(sc, ifa) {
 		if (ifa->ifa_addr->sa_family != AF_INET)
 			continue;
 		addr = ((struct sockaddr_in *)ifa->ifa_addr)->sin_addr;
 		arp_announce_ifaddr(sc->sc_carpdev, addr, LLADDR(&sc->sc_addr));
 	}
 }
 
 int
 carp_iamatch(struct ifaddr *ifa, uint8_t **enaddr)
 {
 	struct carp_softc *sc = ifa->ifa_carp;
 
 	if (sc->sc_state == MASTER) {
 		*enaddr = LLADDR(&sc->sc_addr);
 		return (1);
 	}
 
 	return (0);
 }
 #endif
 
 #ifdef INET6
 static void
 carp_send_na(struct carp_softc *sc)
 {
 	static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
 	struct ifaddr *ifa;
 	struct in6_addr *in6;
 
 	CARP_FOREACH_IFA(sc, ifa) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 
 		in6 = IFA_IN6(ifa);
 		nd6_na_output(sc->sc_carpdev, &mcast, in6,
 		    ND_NA_FLAG_OVERRIDE, 1, NULL);
 		DELAY(1000);	/* XXX */
 	}
 }
 
 /*
  * Returns ifa in case it's a carp address and it is MASTER, or if the address
  * matches and is not a carp address.  Returns NULL otherwise.
  */
 struct ifaddr *
 carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr)
 {
 	struct epoch_tracker et;
 	struct ifaddr *ifa;
 
 	ifa = NULL;
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		if (!IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa)))
 			continue;
 		if (ifa->ifa_carp && ifa->ifa_carp->sc_state != MASTER)
 			ifa = NULL;
 		else
 			ifa_ref(ifa);
 		break;
 	}
 	NET_EPOCH_EXIT(et);
 
 	return (ifa);
 }
 
 caddr_t
 carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr)
 {
 	struct epoch_tracker et;
 	struct ifaddr *ifa;
 
 	NET_EPOCH_ENTER(et);
 	IFNET_FOREACH_IFA(ifp, ifa)
 		if (ifa->ifa_addr->sa_family == AF_INET6 &&
 		    IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) {
 			struct carp_softc *sc = ifa->ifa_carp;
 			struct m_tag *mtag;
 
 			NET_EPOCH_EXIT(et);
 
 			mtag = m_tag_get(PACKET_TAG_CARP,
 			    sizeof(struct carp_softc *), M_NOWAIT);
 			if (mtag == NULL)
 				/* Better a bit than nothing. */
 				return (LLADDR(&sc->sc_addr));
 
 			bcopy(&sc, mtag + 1, sizeof(sc));
 			m_tag_prepend(m, mtag);
 
 			return (LLADDR(&sc->sc_addr));
 		}
 	NET_EPOCH_EXIT(et);
 
 	return (NULL);
 }
 #endif /* INET6 */
 
 int
 carp_forus(struct ifnet *ifp, u_char *dhost)
 {
 	struct carp_softc *sc;
 	uint8_t *ena = dhost;
 
 	if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1)
 		return (0);
 
 	CIF_LOCK(ifp->if_carp);
 	IFNET_FOREACH_CARP(ifp, sc) {
 		CARP_LOCK(sc);
 		if (sc->sc_state == MASTER && !bcmp(dhost, LLADDR(&sc->sc_addr),
 		    ETHER_ADDR_LEN)) {
 			CARP_UNLOCK(sc);
 			CIF_UNLOCK(ifp->if_carp);
 			return (1);
 		}
 		CARP_UNLOCK(sc);
 	}
 	CIF_UNLOCK(ifp->if_carp);
 
 	return (0);
 }
 
 /* Master down timeout event, executed in callout context. */
 static void
 carp_master_down(void *v)
 {
 	struct carp_softc *sc = v;
 
 	CARP_LOCK_ASSERT(sc);
 
 	CURVNET_SET(sc->sc_carpdev->if_vnet);
 	if (sc->sc_state == BACKUP) {
 		carp_master_down_locked(sc, "master timed out");
 	}
 	CURVNET_RESTORE();
 
 	CARP_UNLOCK(sc);
 }
 
 static void
 carp_master_down_locked(struct carp_softc *sc, const char *reason)
 {
 
 	CARP_LOCK_ASSERT(sc);
 
 	switch (sc->sc_state) {
 	case BACKUP:
 		carp_set_state(sc, MASTER, reason);
 		carp_send_ad_locked(sc);
 #ifdef INET
 		carp_send_arp(sc);
 #endif
 #ifdef INET6
 		carp_send_na(sc);
 #endif
 		carp_setrun(sc, 0);
 		carp_addroute(sc);
 		break;
 	case INIT:
 	case MASTER:
 #ifdef INVARIANTS
 		panic("carp: VHID %u@%s: master_down event in %s state\n",
 		    sc->sc_vhid,
 		    sc->sc_carpdev->if_xname,
 		    sc->sc_state ? "MASTER" : "INIT");
 #endif
 		break;
 	}
 }
 
 /*
  * When in backup state, af indicates whether to reset the master down timer
  * for v4 or v6. If it's set to zero, reset the ones which are already pending.
  */
 static void
 carp_setrun(struct carp_softc *sc, sa_family_t af)
 {
 	struct timeval tv;
 
 	CARP_LOCK_ASSERT(sc);
 
 	if ((sc->sc_carpdev->if_flags & IFF_UP) == 0 ||
 	    sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
 	    (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) ||
 	    !V_carp_allow)
 		return;
 
 	switch (sc->sc_state) {
 	case INIT:
 		carp_set_state(sc, BACKUP, "initialization complete");
 		carp_setrun(sc, 0);
 		break;
 	case BACKUP:
 		callout_stop(&sc->sc_ad_tmo);
 		tv.tv_sec = 3 * sc->sc_advbase;
 		tv.tv_usec = sc->sc_advskew * 1000000 / 256;
 		switch (af) {
 #ifdef INET
 		case AF_INET:
 			callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
 			    carp_master_down, sc);
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
 			    carp_master_down, sc);
 			break;
 #endif
 		default:
 #ifdef INET
 			if (sc->sc_naddrs)
 				callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
 				    carp_master_down, sc);
 #endif
 #ifdef INET6
 			if (sc->sc_naddrs6)
 				callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
 				    carp_master_down, sc);
 #endif
 			break;
 		}
 		break;
 	case MASTER:
 		tv.tv_sec = sc->sc_advbase;
 		tv.tv_usec = sc->sc_advskew * 1000000 / 256;
 		callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
 		    carp_send_ad, sc);
 		break;
 	}
 }
 
 /*
  * Setup multicast structures.
  */
 static int
 carp_multicast_setup(struct carp_if *cif, sa_family_t sa)
 {
 	struct ifnet *ifp = cif->cif_ifp;
 	int error = 0;
 
 	switch (sa) {
 #ifdef INET
 	case AF_INET:
 	    {
 		struct ip_moptions *imo = &cif->cif_imo;
+		struct in_mfilter *imf;
 		struct in_addr addr;
 
-		if (imo->imo_membership)
+		if (ip_mfilter_first(&imo->imo_head) != NULL)
 			return (0);
 
-		imo->imo_membership = (struct in_multi **)malloc(
-		    (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP,
-		    M_WAITOK);
-		imo->imo_mfilters = NULL;
-		imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
+		imf = ip_mfilter_alloc(M_WAITOK, 0, 0);
+		ip_mfilter_init(&imo->imo_head);
 		imo->imo_multicast_vif = -1;
 
 		addr.s_addr = htonl(INADDR_CARP_GROUP);
 		if ((error = in_joingroup(ifp, &addr, NULL,
-		    &imo->imo_membership[0])) != 0) {
-			free(imo->imo_membership, M_CARP);
+		    &imf->imf_inm)) != 0) {
+			ip_mfilter_free(imf);
 			break;
 		}
-		imo->imo_num_memberships++;
+
+		ip_mfilter_insert(&imo->imo_head, imf);
 		imo->imo_multicast_ifp = ifp;
 		imo->imo_multicast_ttl = CARP_DFLTTL;
 		imo->imo_multicast_loop = 0;
 		break;
 	   }
 #endif
 #ifdef INET6
 	case AF_INET6:
 	    {
 		struct ip6_moptions *im6o = &cif->cif_im6o;
+		struct in6_mfilter *im6f[2];
 		struct in6_addr in6;
-		struct in6_multi *in6m;
 
-		if (im6o->im6o_membership)
+		if (ip6_mfilter_first(&im6o->im6o_head))
 			return (0);
 
-		im6o->im6o_membership = (struct in6_multi **)malloc(
-		    (sizeof(struct in6_multi *) * IPV6_MIN_MEMBERSHIPS), M_CARP,
-		    M_ZERO | M_WAITOK);
-		im6o->im6o_mfilters = NULL;
-		im6o->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS;
+		im6f[0] = ip6_mfilter_alloc(M_WAITOK, 0, 0);
+		im6f[1] = ip6_mfilter_alloc(M_WAITOK, 0, 0);
+
+		ip6_mfilter_init(&im6o->im6o_head);
 		im6o->im6o_multicast_hlim = CARP_DFLTTL;
 		im6o->im6o_multicast_ifp = ifp;
 
 		/* Join IPv6 CARP multicast group. */
 		bzero(&in6, sizeof(in6));
 		in6.s6_addr16[0] = htons(0xff02);
 		in6.s6_addr8[15] = 0x12;
 		if ((error = in6_setscope(&in6, ifp, NULL)) != 0) {
-			free(im6o->im6o_membership, M_CARP);
+			ip6_mfilter_free(im6f[0]);
+			ip6_mfilter_free(im6f[1]);
 			break;
 		}
-		in6m = NULL;
-		if ((error = in6_joingroup(ifp, &in6, NULL, &in6m, 0)) != 0) {
-			free(im6o->im6o_membership, M_CARP);
+		if ((error = in6_joingroup(ifp, &in6, NULL, &im6f[0]->im6f_in6m, 0)) != 0) {
+			ip6_mfilter_free(im6f[0]);
+			ip6_mfilter_free(im6f[1]);
 			break;
 		}
-		in6m_acquire(in6m);
-		im6o->im6o_membership[0] = in6m;
-		im6o->im6o_num_memberships++;
 
 		/* Join solicited multicast address. */
 		bzero(&in6, sizeof(in6));
 		in6.s6_addr16[0] = htons(0xff02);
 		in6.s6_addr32[1] = 0;
 		in6.s6_addr32[2] = htonl(1);
 		in6.s6_addr32[3] = 0;
 		in6.s6_addr8[12] = 0xff;
+
 		if ((error = in6_setscope(&in6, ifp, NULL)) != 0) {
-			in6_leavegroup(im6o->im6o_membership[0], NULL);
-			free(im6o->im6o_membership, M_CARP);
+			ip6_mfilter_free(im6f[0]);
+			ip6_mfilter_free(im6f[1]);
 			break;
 		}
-		in6m = NULL;
-		if ((error = in6_joingroup(ifp, &in6, NULL, &in6m, 0)) != 0) {
-			in6_leavegroup(im6o->im6o_membership[0], NULL);
-			free(im6o->im6o_membership, M_CARP);
+
+		if ((error = in6_joingroup(ifp, &in6, NULL, &im6f[1]->im6f_in6m, 0)) != 0) {
+			in6_leavegroup(im6f[0]->im6f_in6m, NULL);
+			ip6_mfilter_free(im6f[0]);
+			ip6_mfilter_free(im6f[1]);
 			break;
 		}
-		in6m_acquire(in6m);
-		im6o->im6o_membership[1] = in6m;
-		im6o->im6o_num_memberships++;
+		ip6_mfilter_insert(&im6o->im6o_head, im6f[0]);
+		ip6_mfilter_insert(&im6o->im6o_head, im6f[1]);
 		break;
 	    }
 #endif
 	}
 
 	return (error);
 }
 
 /*
  * Free multicast structures.
  */
 static void
 carp_multicast_cleanup(struct carp_if *cif, sa_family_t sa)
 {
-
+#ifdef INET
+	struct ip_moptions *imo = &cif->cif_imo;
+	struct in_mfilter *imf;
+#endif
+#ifdef INET6
+	struct ip6_moptions *im6o = &cif->cif_im6o;
+	struct in6_mfilter *im6f;
+#endif
 	sx_assert(&carp_sx, SA_XLOCKED);
 
 	switch (sa) {
 #ifdef INET
 	case AF_INET:
-		if (cif->cif_naddrs == 0) {
-			struct ip_moptions *imo = &cif->cif_imo;
+		if (cif->cif_naddrs != 0)
+			break;
 
-			in_leavegroup(imo->imo_membership[0], NULL);
-			KASSERT(imo->imo_mfilters == NULL,
-			    ("%s: imo_mfilters != NULL", __func__));
-			free(imo->imo_membership, M_CARP);
-			imo->imo_membership = NULL;
-
+		while ((imf = ip_mfilter_first(&imo->imo_head)) != NULL) {
+			ip_mfilter_remove(&imo->imo_head, imf);
+			in_leavegroup(imf->imf_inm, NULL);
+			ip_mfilter_free(imf);
 		}
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
-		if (cif->cif_naddrs6 == 0) {
-			struct ip6_moptions *im6o = &cif->cif_im6o;
+		if (cif->cif_naddrs6 != 0)
+			break;
 
-			in6_leavegroup(im6o->im6o_membership[0], NULL);
-			in6_leavegroup(im6o->im6o_membership[1], NULL);
-			KASSERT(im6o->im6o_mfilters == NULL,
-			    ("%s: im6o_mfilters != NULL", __func__));
-			free(im6o->im6o_membership, M_CARP);
-			im6o->im6o_membership = NULL;
+		while ((im6f = ip6_mfilter_first(&im6o->im6o_head)) != NULL) {
+			ip6_mfilter_remove(&im6o->im6o_head, im6f);
+			in6_leavegroup(im6f->im6f_in6m, NULL);
+			ip6_mfilter_free(im6f);
 		}
 		break;
 #endif
 	}
 }
 
 int
 carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa)
 {
 	struct m_tag *mtag;
 	struct carp_softc *sc;
 
 	if (!sa)
 		return (0);
 
 	switch (sa->sa_family) {
 #ifdef INET
 	case AF_INET:
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		break;
 #endif
 	default:
 		return (0);
 	}
 
 	mtag = m_tag_find(m, PACKET_TAG_CARP, NULL);
 	if (mtag == NULL)
 		return (0);
 
 	bcopy(mtag + 1, &sc, sizeof(sc));
 
 	/* Set the source MAC address to the Virtual Router MAC Address. */
 	switch (ifp->if_type) {
 	case IFT_ETHER:
 	case IFT_BRIDGE:
 	case IFT_L2VLAN: {
 			struct ether_header *eh;
 
 			eh = mtod(m, struct ether_header *);
 			eh->ether_shost[0] = 0;
 			eh->ether_shost[1] = 0;
 			eh->ether_shost[2] = 0x5e;
 			eh->ether_shost[3] = 0;
 			eh->ether_shost[4] = 1;
 			eh->ether_shost[5] = sc->sc_vhid;
 		}
 		break;
 	default:
 		printf("%s: carp is not supported for the %d interface type\n",
 		    ifp->if_xname, ifp->if_type);
 		return (EOPNOTSUPP);
 	}
 
 	return (0);
 }
 
 static struct carp_softc*
 carp_alloc(struct ifnet *ifp)
 {
 	struct carp_softc *sc;
 	struct carp_if *cif;
 
 	sx_assert(&carp_sx, SA_XLOCKED);
 
 	if ((cif = ifp->if_carp) == NULL)
 		cif = carp_alloc_if(ifp);
 
 	sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO);
 
 	sc->sc_advbase = CARP_DFLTINTV;
 	sc->sc_vhid = -1;	/* required setting */
 	sc->sc_init_counter = 1;
 	sc->sc_state = INIT;
 
 	sc->sc_ifasiz = sizeof(struct ifaddr *);
 	sc->sc_ifas = malloc(sc->sc_ifasiz, M_CARP, M_WAITOK|M_ZERO);
 	sc->sc_carpdev = ifp;
 
 	CARP_LOCK_INIT(sc);
 #ifdef INET
 	callout_init_mtx(&sc->sc_md_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
 #endif
 #ifdef INET6
 	callout_init_mtx(&sc->sc_md6_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
 #endif
 	callout_init_mtx(&sc->sc_ad_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
 
 	CIF_LOCK(cif);
 	TAILQ_INSERT_TAIL(&cif->cif_vrs, sc, sc_list);
 	CIF_UNLOCK(cif);
 
 	mtx_lock(&carp_mtx);
 	LIST_INSERT_HEAD(&carp_list, sc, sc_next);
 	mtx_unlock(&carp_mtx);
 
 	return (sc);
 }
 
 static void
 carp_grow_ifas(struct carp_softc *sc)
 {
 	struct ifaddr **new;
 
 	new = malloc(sc->sc_ifasiz * 2, M_CARP, M_WAITOK | M_ZERO);
 	CARP_LOCK(sc);
 	bcopy(sc->sc_ifas, new, sc->sc_ifasiz);
 	free(sc->sc_ifas, M_CARP);
 	sc->sc_ifas = new;
 	sc->sc_ifasiz *= 2;
 	CARP_UNLOCK(sc);
 }
 
 static void
 carp_destroy(struct carp_softc *sc)
 {
 	struct ifnet *ifp = sc->sc_carpdev;
 	struct carp_if *cif = ifp->if_carp;
 
 	sx_assert(&carp_sx, SA_XLOCKED);
 
 	if (sc->sc_suppress)
 		carp_demote_adj(-V_carp_ifdown_adj, "vhid removed");
 	CARP_UNLOCK(sc);
 
 	CIF_LOCK(cif);
 	TAILQ_REMOVE(&cif->cif_vrs, sc, sc_list);
 	CIF_UNLOCK(cif);
 
 	mtx_lock(&carp_mtx);
 	LIST_REMOVE(sc, sc_next);
 	mtx_unlock(&carp_mtx);
 
 	callout_drain(&sc->sc_ad_tmo);
 #ifdef INET
 	callout_drain(&sc->sc_md_tmo);
 #endif
 #ifdef INET6
 	callout_drain(&sc->sc_md6_tmo);
 #endif
 	CARP_LOCK_DESTROY(sc);
 
 	free(sc->sc_ifas, M_CARP);
 	free(sc, M_CARP);
 }
 
 static struct carp_if*
 carp_alloc_if(struct ifnet *ifp)
 {
 	struct carp_if *cif;
 	int error;
 
 	cif = malloc(sizeof(*cif), M_CARP, M_WAITOK|M_ZERO);
 
 	if ((error = ifpromisc(ifp, 1)) != 0)
 		printf("%s: ifpromisc(%s) failed: %d\n",
 		    __func__, ifp->if_xname, error);
 	else
 		cif->cif_flags |= CIF_PROMISC;
 
 	CIF_LOCK_INIT(cif);
 	cif->cif_ifp = ifp;
 	TAILQ_INIT(&cif->cif_vrs);
 
 	IF_ADDR_WLOCK(ifp);
 	ifp->if_carp = cif;
 	if_ref(ifp);
 	IF_ADDR_WUNLOCK(ifp);
 
 	return (cif);
 }
 
 static void
 carp_free_if(struct carp_if *cif)
 {
 	struct ifnet *ifp = cif->cif_ifp;
 
 	CIF_LOCK_ASSERT(cif);
 	KASSERT(TAILQ_EMPTY(&cif->cif_vrs), ("%s: softc list not empty",
 	    __func__));
 
 	IF_ADDR_WLOCK(ifp);
 	ifp->if_carp = NULL;
 	IF_ADDR_WUNLOCK(ifp);
 
 	CIF_LOCK_DESTROY(cif);
 
 	if (cif->cif_flags & CIF_PROMISC)
 		ifpromisc(ifp, 0);
 	if_rele(ifp);
 
 	free(cif, M_CARP);
 }
 
 static void
 carp_carprcp(struct carpreq *carpr, struct carp_softc *sc, int priv)
 {
 
 	CARP_LOCK(sc);
 	carpr->carpr_state = sc->sc_state;
 	carpr->carpr_vhid = sc->sc_vhid;
 	carpr->carpr_advbase = sc->sc_advbase;
 	carpr->carpr_advskew = sc->sc_advskew;
 	if (priv)
 		bcopy(sc->sc_key, carpr->carpr_key, sizeof(carpr->carpr_key));
 	else
 		bzero(carpr->carpr_key, sizeof(carpr->carpr_key));
 	CARP_UNLOCK(sc);
 }
 
 int
 carp_ioctl(struct ifreq *ifr, u_long cmd, struct thread *td)
 {
 	struct carpreq carpr;
 	struct ifnet *ifp;
 	struct carp_softc *sc = NULL;
 	int error = 0, locked = 0;
 
 	if ((error = copyin(ifr_data_get_ptr(ifr), &carpr, sizeof carpr)))
 		return (error);
 
 	ifp = ifunit_ref(ifr->ifr_name);
 	if (ifp == NULL)
 		return (ENXIO);
 
 	switch (ifp->if_type) {
 	case IFT_ETHER:
 	case IFT_L2VLAN:
 	case IFT_BRIDGE:
 		break;
 	default:
 		error = EOPNOTSUPP;
 		goto out;
 	}
 
 	if ((ifp->if_flags & IFF_MULTICAST) == 0) {
 		error = EADDRNOTAVAIL;
 		goto out;
 	}
 
 	sx_xlock(&carp_sx);
 	switch (cmd) {
 	case SIOCSVH:
 		if ((error = priv_check(td, PRIV_NETINET_CARP)))
 			break;
 		if (carpr.carpr_vhid <= 0 || carpr.carpr_vhid > CARP_MAXVHID ||
 		    carpr.carpr_advbase < 0 || carpr.carpr_advskew < 0) {
 			error = EINVAL;
 			break;
 		}
 
 		if (ifp->if_carp) {
 			IFNET_FOREACH_CARP(ifp, sc)
 				if (sc->sc_vhid == carpr.carpr_vhid)
 					break;
 		}
 		if (sc == NULL) {
 			sc = carp_alloc(ifp);
 			CARP_LOCK(sc);
 			sc->sc_vhid = carpr.carpr_vhid;
 			LLADDR(&sc->sc_addr)[0] = 0;
 			LLADDR(&sc->sc_addr)[1] = 0;
 			LLADDR(&sc->sc_addr)[2] = 0x5e;
 			LLADDR(&sc->sc_addr)[3] = 0;
 			LLADDR(&sc->sc_addr)[4] = 1;
 			LLADDR(&sc->sc_addr)[5] = sc->sc_vhid;
 		} else
 			CARP_LOCK(sc);
 		locked = 1;
 		if (carpr.carpr_advbase > 0) {
 			if (carpr.carpr_advbase > 255 ||
 			    carpr.carpr_advbase < CARP_DFLTINTV) {
 				error = EINVAL;
 				break;
 			}
 			sc->sc_advbase = carpr.carpr_advbase;
 		}
 		if (carpr.carpr_advskew >= 255) {
 			error = EINVAL;
 			break;
 		}
 		sc->sc_advskew = carpr.carpr_advskew;
 		if (carpr.carpr_key[0] != '\0') {
 			bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key));
 			carp_hmac_prepare(sc);
 		}
 		if (sc->sc_state != INIT &&
 		    carpr.carpr_state != sc->sc_state) {
 			switch (carpr.carpr_state) {
 			case BACKUP:
 				callout_stop(&sc->sc_ad_tmo);
 				carp_set_state(sc, BACKUP,
 				    "user requested via ifconfig");
 				carp_setrun(sc, 0);
 				carp_delroute(sc);
 				break;
 			case MASTER:
 				carp_master_down_locked(sc,
 				    "user requested via ifconfig");
 				break;
 			default:
 				break;
 			}
 		}
 		break;
 
 	case SIOCGVH:
 	    {
 		int priveleged;
 
 		if (carpr.carpr_vhid < 0 || carpr.carpr_vhid > CARP_MAXVHID) {
 			error = EINVAL;
 			break;
 		}
 		if (carpr.carpr_count < 1) {
 			error = EMSGSIZE;
 			break;
 		}
 		if (ifp->if_carp == NULL) {
 			error = ENOENT;
 			break;
 		}
 
 		priveleged = (priv_check(td, PRIV_NETINET_CARP) == 0);
 		if (carpr.carpr_vhid != 0) {
 			IFNET_FOREACH_CARP(ifp, sc)
 				if (sc->sc_vhid == carpr.carpr_vhid)
 					break;
 			if (sc == NULL) {
 				error = ENOENT;
 				break;
 			}
 			carp_carprcp(&carpr, sc, priveleged);
 			error = copyout(&carpr, ifr_data_get_ptr(ifr),
 			    sizeof(carpr));
 		} else  {
 			int i, count;
 
 			count = 0;
 			IFNET_FOREACH_CARP(ifp, sc)
 				count++;
 
 			if (count > carpr.carpr_count) {
 				CIF_UNLOCK(ifp->if_carp);
 				error = EMSGSIZE;
 				break;
 			}
 
 			i = 0;
 			IFNET_FOREACH_CARP(ifp, sc) {
 				carp_carprcp(&carpr, sc, priveleged);
 				carpr.carpr_count = count;
 				error = copyout(&carpr,
 				    (caddr_t)ifr_data_get_ptr(ifr) +
 				    (i * sizeof(carpr)), sizeof(carpr));
 				if (error) {
 					CIF_UNLOCK(ifp->if_carp);
 					break;
 				}
 				i++;
 			}
 		}
 		break;
 	    }
 	default:
 		error = EINVAL;
 	}
 	sx_xunlock(&carp_sx);
 
 out:
 	if (locked)
 		CARP_UNLOCK(sc);
 	if_rele(ifp);
 
 	return (error);
 }
 
 static int
 carp_get_vhid(struct ifaddr *ifa)
 {
 
 	if (ifa == NULL || ifa->ifa_carp == NULL)
 		return (0);
 
 	return (ifa->ifa_carp->sc_vhid);
 }
 
 int
 carp_attach(struct ifaddr *ifa, int vhid)
 {
 	struct ifnet *ifp = ifa->ifa_ifp;
 	struct carp_if *cif = ifp->if_carp;
 	struct carp_softc *sc;
 	int index, error;
 
 	KASSERT(ifa->ifa_carp == NULL, ("%s: ifa %p attached", __func__, ifa));
 
 	switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 	case AF_INET:
 #endif
 #ifdef INET6
 	case AF_INET6:
 #endif
 		break;
 	default:
 		return (EPROTOTYPE);
 	}
 
 	sx_xlock(&carp_sx);
 	if (ifp->if_carp == NULL) {
 		sx_xunlock(&carp_sx);
 		return (ENOPROTOOPT);
 	}
 
 	IFNET_FOREACH_CARP(ifp, sc)
 		if (sc->sc_vhid == vhid)
 			break;
 	if (sc == NULL) {
 		sx_xunlock(&carp_sx);
 		return (ENOENT);
 	}
 
 	error = carp_multicast_setup(cif, ifa->ifa_addr->sa_family);
 	if (error) {
 		CIF_FREE(cif);
 		sx_xunlock(&carp_sx);
 		return (error);
 	}
 
 	index = sc->sc_naddrs + sc->sc_naddrs6 + 1;
 	if (index > sc->sc_ifasiz / sizeof(struct ifaddr *))
 		carp_grow_ifas(sc);
 
 	switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 	case AF_INET:
 		cif->cif_naddrs++;
 		sc->sc_naddrs++;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		cif->cif_naddrs6++;
 		sc->sc_naddrs6++;
 		break;
 #endif
 	}
 
 	ifa_ref(ifa);
 
 	CARP_LOCK(sc);
 	sc->sc_ifas[index - 1] = ifa;
 	ifa->ifa_carp = sc;
 	carp_hmac_prepare(sc);
 	carp_sc_state(sc);
 	CARP_UNLOCK(sc);
 
 	sx_xunlock(&carp_sx);
 
 	return (0);
 }
 
 void
 carp_detach(struct ifaddr *ifa, bool keep_cif)
 {
 	struct ifnet *ifp = ifa->ifa_ifp;
 	struct carp_if *cif = ifp->if_carp;
 	struct carp_softc *sc = ifa->ifa_carp;
 	int i, index;
 
 	KASSERT(sc != NULL, ("%s: %p not attached", __func__, ifa));
 
 	sx_xlock(&carp_sx);
 
 	CARP_LOCK(sc);
 	/* Shift array. */
 	index = sc->sc_naddrs + sc->sc_naddrs6;
 	for (i = 0; i < index; i++)
 		if (sc->sc_ifas[i] == ifa)
 			break;
 	KASSERT(i < index, ("%s: %p no backref", __func__, ifa));
 	for (; i < index - 1; i++)
 		sc->sc_ifas[i] = sc->sc_ifas[i+1];
 	sc->sc_ifas[index - 1] = NULL;
 
 	switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 	case AF_INET:
 		cif->cif_naddrs--;
 		sc->sc_naddrs--;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		cif->cif_naddrs6--;
 		sc->sc_naddrs6--;
 		break;
 #endif
 	}
 
 	carp_ifa_delroute(ifa);
 	carp_multicast_cleanup(cif, ifa->ifa_addr->sa_family);
 
 	ifa->ifa_carp = NULL;
 	ifa_free(ifa);
 
 	carp_hmac_prepare(sc);
 	carp_sc_state(sc);
 
 	if (!keep_cif && sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0)
 		carp_destroy(sc);
 	else
 		CARP_UNLOCK(sc);
 
 	if (!keep_cif)
 		CIF_FREE(cif);
 
 	sx_xunlock(&carp_sx);
 }
 
 static void
 carp_set_state(struct carp_softc *sc, int state, const char *reason)
 {
 
 	CARP_LOCK_ASSERT(sc);
 
 	if (sc->sc_state != state) {
 		const char *carp_states[] = { CARP_STATES };
 		char subsys[IFNAMSIZ+5];
 
 		snprintf(subsys, IFNAMSIZ+5, "%u@%s", sc->sc_vhid,
 		    sc->sc_carpdev->if_xname);
 
 		CARP_LOG("%s: %s -> %s (%s)\n", subsys,
 		    carp_states[sc->sc_state], carp_states[state], reason);
 
 		sc->sc_state = state;
 
 		devctl_notify("CARP", subsys, carp_states[state], NULL);
 	}
 }
 
 static void
 carp_linkstate(struct ifnet *ifp)
 {
 	struct carp_softc *sc;
 
 	CIF_LOCK(ifp->if_carp);
 	IFNET_FOREACH_CARP(ifp, sc) {
 		CARP_LOCK(sc);
 		carp_sc_state(sc);
 		CARP_UNLOCK(sc);
 	}
 	CIF_UNLOCK(ifp->if_carp);
 }
 
 static void
 carp_sc_state(struct carp_softc *sc)
 {
 
 	CARP_LOCK_ASSERT(sc);
 
 	if (sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
 	    !(sc->sc_carpdev->if_flags & IFF_UP) ||
 	    !V_carp_allow) {
 		callout_stop(&sc->sc_ad_tmo);
 #ifdef INET
 		callout_stop(&sc->sc_md_tmo);
 #endif
 #ifdef INET6
 		callout_stop(&sc->sc_md6_tmo);
 #endif
 		carp_set_state(sc, INIT, "hardware interface down");
 		carp_setrun(sc, 0);
 		if (!sc->sc_suppress)
 			carp_demote_adj(V_carp_ifdown_adj, "interface down");
 		sc->sc_suppress = 1;
 	} else {
 		carp_set_state(sc, INIT, "hardware interface up");
 		carp_setrun(sc, 0);
 		if (sc->sc_suppress)
 			carp_demote_adj(-V_carp_ifdown_adj, "interface up");
 		sc->sc_suppress = 0;
 	}
 }
 
 static void
 carp_demote_adj(int adj, char *reason)
 {
 	atomic_add_int(&V_carp_demotion, adj);
 	CARP_LOG("demoted by %d to %d (%s)\n", adj, V_carp_demotion, reason);
 	taskqueue_enqueue(taskqueue_swi, &carp_sendall_task);
 }
 
 static int
 carp_allow_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	int new, error;
 	struct carp_softc *sc;
 
 	new = V_carp_allow;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	if (V_carp_allow != new) {
 		V_carp_allow = new;
 
 		mtx_lock(&carp_mtx);
 		LIST_FOREACH(sc, &carp_list, sc_next) {
 			CARP_LOCK(sc);
 			if (curvnet == sc->sc_carpdev->if_vnet)
 				carp_sc_state(sc);
 			CARP_UNLOCK(sc);
 		}
 		mtx_unlock(&carp_mtx);
 	}
 
 	return (0);
 }
 
 static int
 carp_dscp_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	int new, error;
 
 	new = V_carp_dscp;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	if (new < 0 || new > 63)
 		return (EINVAL);
 
 	V_carp_dscp = new;
 
 	return (0);
 }
 
 static int
 carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	int new, error;
 
 	new = V_carp_demotion;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	carp_demote_adj(new, "sysctl");
 
 	return (0);
 }
 
 #ifdef INET
 extern  struct domain inetdomain;
 static struct protosw in_carp_protosw = {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inetdomain,
 	.pr_protocol =		IPPROTO_CARP,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_input =		carp_input,
 	.pr_output =		rip_output,
 	.pr_ctloutput =		rip_ctloutput,
 	.pr_usrreqs =		&rip_usrreqs
 };
 #endif
 
 #ifdef INET6
 extern	struct domain inet6domain;
 static struct protosw in6_carp_protosw = {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inet6domain,
 	.pr_protocol =		IPPROTO_CARP,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_input =		carp6_input,
 	.pr_output =		rip6_output,
 	.pr_ctloutput =		rip6_ctloutput,
 	.pr_usrreqs =		&rip6_usrreqs
 };
 #endif
 
 static void
 carp_mod_cleanup(void)
 {
 
 #ifdef INET
 	if (proto_reg[CARP_INET] == 0) {
 		(void)ipproto_unregister(IPPROTO_CARP);
 		pf_proto_unregister(PF_INET, IPPROTO_CARP, SOCK_RAW);
 		proto_reg[CARP_INET] = -1;
 	}
 	carp_iamatch_p = NULL;
 #endif
 #ifdef INET6
 	if (proto_reg[CARP_INET6] == 0) {
 		(void)ip6proto_unregister(IPPROTO_CARP);
 		pf_proto_unregister(PF_INET6, IPPROTO_CARP, SOCK_RAW);
 		proto_reg[CARP_INET6] = -1;
 	}
 	carp_iamatch6_p = NULL;
 	carp_macmatch6_p = NULL;
 #endif
 	carp_ioctl_p = NULL;
 	carp_attach_p = NULL;
 	carp_detach_p = NULL;
 	carp_get_vhid_p = NULL;
 	carp_linkstate_p = NULL;
 	carp_forus_p = NULL;
 	carp_output_p = NULL;
 	carp_demote_adj_p = NULL;
 	carp_master_p = NULL;
 	mtx_unlock(&carp_mtx);
 	taskqueue_drain(taskqueue_swi, &carp_sendall_task);
 	mtx_destroy(&carp_mtx);
 	sx_destroy(&carp_sx);
 }
 
 static int
 carp_mod_load(void)
 {
 	int err;
 
 	mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF);
 	sx_init(&carp_sx, "carp_sx");
 	LIST_INIT(&carp_list);
 	carp_get_vhid_p = carp_get_vhid;
 	carp_forus_p = carp_forus;
 	carp_output_p = carp_output;
 	carp_linkstate_p = carp_linkstate;
 	carp_ioctl_p = carp_ioctl;
 	carp_attach_p = carp_attach;
 	carp_detach_p = carp_detach;
 	carp_demote_adj_p = carp_demote_adj;
 	carp_master_p = carp_master;
 #ifdef INET6
 	carp_iamatch6_p = carp_iamatch6;
 	carp_macmatch6_p = carp_macmatch6;
 	proto_reg[CARP_INET6] = pf_proto_register(PF_INET6,
 	    (struct protosw *)&in6_carp_protosw);
 	if (proto_reg[CARP_INET6]) {
 		printf("carp: error %d attaching to PF_INET6\n",
 		    proto_reg[CARP_INET6]);
 		carp_mod_cleanup();
 		return (proto_reg[CARP_INET6]);
 	}
 	err = ip6proto_register(IPPROTO_CARP);
 	if (err) {
 		printf("carp: error %d registering with INET6\n", err);
 		carp_mod_cleanup();
 		return (err);
 	}
 #endif
 #ifdef INET
 	carp_iamatch_p = carp_iamatch;
 	proto_reg[CARP_INET] = pf_proto_register(PF_INET, &in_carp_protosw);
 	if (proto_reg[CARP_INET]) {
 		printf("carp: error %d attaching to PF_INET\n",
 		    proto_reg[CARP_INET]);
 		carp_mod_cleanup();
 		return (proto_reg[CARP_INET]);
 	}
 	err = ipproto_register(IPPROTO_CARP);
 	if (err) {
 		printf("carp: error %d registering with INET\n", err);
 		carp_mod_cleanup();
 		return (err);
 	}
 #endif
 	return (0);
 }
 
 static int
 carp_modevent(module_t mod, int type, void *data)
 {
 	switch (type) {
 	case MOD_LOAD:
 		return carp_mod_load();
 		/* NOTREACHED */
 	case MOD_UNLOAD:
 		mtx_lock(&carp_mtx);
 		if (LIST_EMPTY(&carp_list))
 			carp_mod_cleanup();
 		else {
 			mtx_unlock(&carp_mtx);
 			return (EBUSY);
 		}
 		break;
 
 	default:
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 static moduledata_t carp_mod = {
 	"carp",
 	carp_modevent,
 	0
 };
 
 DECLARE_MODULE(carp, carp_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
Index: head/sys/netinet/ip_mroute.c
===================================================================
--- head/sys/netinet/ip_mroute.c	(revision 349368)
+++ head/sys/netinet/ip_mroute.c	(revision 349369)
@@ -1,2942 +1,2939 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989 Stephen Deering
  * Copyright (c) 1992, 1993
  *      The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Stephen Deering of Stanford University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *      @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
  */
 
 /*
  * IP multicast forwarding procedures
  *
  * Written by David Waitzman, BBN Labs, August 1988.
  * Modified by Steve Deering, Stanford, February 1989.
  * Modified by Mark J. Steiglitz, Stanford, May, 1991
  * Modified by Van Jacobson, LBL, January 1993
  * Modified by Ajit Thyagarajan, PARC, August 1993
  * Modified by Bill Fenner, PARC, April 1995
  * Modified by Ahmed Helmy, SGI, June 1996
  * Modified by George Edmond Eddy (Rusty), ISI, February 1998
  * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000
  * Modified by Hitoshi Asaeda, WIDE, August 2000
  * Modified by Pavlin Radoslavov, ICSI, October 2002
  *
  * MROUTING Revision: 3.5
  * and PIM-SMv2 and PIM-DM support, advanced API support,
  * bandwidth metering and signaling
  */
 
 /*
  * TODO: Prefix functions with ipmf_.
  * TODO: Maintain a refcount on if_allmulti() in ifnet or in the protocol
  * domain attachment (if_afdata) so we can track consumers of that service.
  * TODO: Deprecate routing socket path for SIOCGETSGCNT and SIOCGETVIFCNT,
  * move it to socket options.
  * TODO: Cleanup LSRR removal further.
  * TODO: Push RSVP stubs into raw_ip.c.
  * TODO: Use bitstring.h for vif set.
  * TODO: Fix mrt6_ioctl dangling ref when dynamically loaded.
  * TODO: Sync ip6_mroute.c with this file.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_mrouting.h"
 
 #define _PIM_VT 1
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/stddef.h>
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/ktr.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/protosw.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 #include <sys/time.h>
 #include <sys/counter.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/igmp.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_encap.h>
 #include <netinet/ip_mroute.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/pim.h>
 #include <netinet/pim_var.h>
 #include <netinet/udp.h>
 
 #include <machine/in_cksum.h>
 
 #ifndef KTR_IPMF
 #define KTR_IPMF KTR_INET
 #endif
 
 #define		VIFI_INVALID	((vifi_t) -1)
 
 VNET_DEFINE_STATIC(uint32_t, last_tv_sec); /* last time we processed this */
 #define	V_last_tv_sec	VNET(last_tv_sec)
 
 static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast forwarding cache");
 
 /*
  * Locking.  We use two locks: one for the virtual interface table and
  * one for the forwarding table.  These locks may be nested in which case
  * the VIF lock must always be taken first.  Note that each lock is used
  * to cover not only the specific data structure but also related data
  * structures.
  */
 
 static struct mtx mrouter_mtx;
 #define	MROUTER_LOCK()		mtx_lock(&mrouter_mtx)
 #define	MROUTER_UNLOCK()	mtx_unlock(&mrouter_mtx)
 #define	MROUTER_LOCK_ASSERT()	mtx_assert(&mrouter_mtx, MA_OWNED)
 #define	MROUTER_LOCK_INIT()						\
 	mtx_init(&mrouter_mtx, "IPv4 multicast forwarding", NULL, MTX_DEF)
 #define	MROUTER_LOCK_DESTROY()	mtx_destroy(&mrouter_mtx)
 
 static int ip_mrouter_cnt;	/* # of vnets with active mrouters */
 static int ip_mrouter_unloading; /* Allow no more V_ip_mrouter sockets */
 
 VNET_PCPUSTAT_DEFINE_STATIC(struct mrtstat, mrtstat);
 VNET_PCPUSTAT_SYSINIT(mrtstat);
 VNET_PCPUSTAT_SYSUNINIT(mrtstat);
 SYSCTL_VNET_PCPUSTAT(_net_inet_ip, OID_AUTO, mrtstat, struct mrtstat,
     mrtstat, "IPv4 Multicast Forwarding Statistics (struct mrtstat, "
     "netinet/ip_mroute.h)");
 
 VNET_DEFINE_STATIC(u_long, mfchash);
 #define	V_mfchash		VNET(mfchash)
 #define	MFCHASH(a, g)							\
 	((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^ \
 	  ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & V_mfchash)
 #define	MFCHASHSIZE	256
 
 static u_long mfchashsize;			/* Hash size */
 VNET_DEFINE_STATIC(u_char *, nexpire);		/* 0..mfchashsize-1 */
 #define	V_nexpire		VNET(nexpire)
 VNET_DEFINE_STATIC(LIST_HEAD(mfchashhdr, mfc)*, mfchashtbl);
 #define	V_mfchashtbl		VNET(mfchashtbl)
 
 static struct mtx mfc_mtx;
 #define	MFC_LOCK()		mtx_lock(&mfc_mtx)
 #define	MFC_UNLOCK()		mtx_unlock(&mfc_mtx)
 #define	MFC_LOCK_ASSERT()	mtx_assert(&mfc_mtx, MA_OWNED)
 #define	MFC_LOCK_INIT()							\
 	mtx_init(&mfc_mtx, "IPv4 multicast forwarding cache", NULL, MTX_DEF)
 #define	MFC_LOCK_DESTROY()	mtx_destroy(&mfc_mtx)
 
 VNET_DEFINE_STATIC(vifi_t, numvifs);
 #define	V_numvifs		VNET(numvifs)
 VNET_DEFINE_STATIC(struct vif, viftable[MAXVIFS]);
 #define	V_viftable		VNET(viftable)
 SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_VNET | CTLFLAG_RD,
     &VNET_NAME(viftable), sizeof(V_viftable), "S,vif[MAXVIFS]",
     "IPv4 Multicast Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)");
 
 static struct mtx vif_mtx;
 #define	VIF_LOCK()		mtx_lock(&vif_mtx)
 #define	VIF_UNLOCK()		mtx_unlock(&vif_mtx)
 #define	VIF_LOCK_ASSERT()	mtx_assert(&vif_mtx, MA_OWNED)
 #define	VIF_LOCK_INIT()							\
 	mtx_init(&vif_mtx, "IPv4 multicast interfaces", NULL, MTX_DEF)
 #define	VIF_LOCK_DESTROY()	mtx_destroy(&vif_mtx)
 
 static eventhandler_tag if_detach_event_tag = NULL;
 
 VNET_DEFINE_STATIC(struct callout, expire_upcalls_ch);
 #define	V_expire_upcalls_ch	VNET(expire_upcalls_ch)
 
 #define		EXPIRE_TIMEOUT	(hz / 4)	/* 4x / second		*/
 #define		UPCALL_EXPIRE	6		/* number of timeouts	*/
 
 /*
  * Bandwidth meter variables and constants
  */
 static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters");
 /*
  * Pending timeouts are stored in a hash table, the key being the
  * expiration time. Periodically, the entries are analysed and processed.
  */
 #define	BW_METER_BUCKETS	1024
 VNET_DEFINE_STATIC(struct bw_meter*, bw_meter_timers[BW_METER_BUCKETS]);
 #define	V_bw_meter_timers	VNET(bw_meter_timers)
 VNET_DEFINE_STATIC(struct callout, bw_meter_ch);
 #define	V_bw_meter_ch		VNET(bw_meter_ch)
 #define	BW_METER_PERIOD (hz)		/* periodical handling of bw meters */
 
 /*
  * Pending upcalls are stored in a vector which is flushed when
  * full, or periodically
  */
 VNET_DEFINE_STATIC(struct bw_upcall, bw_upcalls[BW_UPCALLS_MAX]);
 #define	V_bw_upcalls		VNET(bw_upcalls)
 VNET_DEFINE_STATIC(u_int, bw_upcalls_n); /* # of pending upcalls */
 #define	V_bw_upcalls_n    	VNET(bw_upcalls_n)
 VNET_DEFINE_STATIC(struct callout, bw_upcalls_ch);
 #define	V_bw_upcalls_ch		VNET(bw_upcalls_ch)
 
 #define BW_UPCALLS_PERIOD (hz)		/* periodical flush of bw upcalls */
 
 VNET_PCPUSTAT_DEFINE_STATIC(struct pimstat, pimstat);
 VNET_PCPUSTAT_SYSINIT(pimstat);
 VNET_PCPUSTAT_SYSUNINIT(pimstat);
 
 SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM");
 SYSCTL_VNET_PCPUSTAT(_net_inet_pim, PIMCTL_STATS, stats, struct pimstat,
     pimstat, "PIM Statistics (struct pimstat, netinet/pim_var.h)");
 
 static u_long	pim_squelch_wholepkt = 0;
 SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RW,
     &pim_squelch_wholepkt, 0,
     "Disable IGMP_WHOLEPKT notifications if rendezvous point is unspecified");
 
 static const struct encaptab *pim_encap_cookie;
 static int pim_encapcheck(const struct mbuf *, int, int, void *);
 static int pim_input(struct mbuf *, int, int, void *);
 
 static const struct encap_config ipv4_encap_cfg = {
 	.proto = IPPROTO_PIM,
 	.min_length = sizeof(struct ip) + PIM_MINLEN,
 	.exact_match = 8,
 	.check = pim_encapcheck,
 	.input = pim_input
 };
 
 /*
  * Note: the PIM Register encapsulation adds the following in front of a
  * data packet:
  *
  * struct pim_encap_hdr {
  *    struct ip ip;
  *    struct pim_encap_pimhdr  pim;
  * }
  *
  */
 
 struct pim_encap_pimhdr {
 	struct pim pim;
 	uint32_t   flags;
 };
 #define		PIM_ENCAP_TTL	64
 
 static struct ip pim_encap_iphdr = {
 #if BYTE_ORDER == LITTLE_ENDIAN
 	sizeof(struct ip) >> 2,
 	IPVERSION,
 #else
 	IPVERSION,
 	sizeof(struct ip) >> 2,
 #endif
 	0,			/* tos */
 	sizeof(struct ip),	/* total length */
 	0,			/* id */
 	0,			/* frag offset */
 	PIM_ENCAP_TTL,
 	IPPROTO_PIM,
 	0,			/* checksum */
 };
 
 static struct pim_encap_pimhdr pim_encap_pimhdr = {
     {
 	PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */
 	0,			/* reserved */
 	0,			/* checksum */
     },
     0				/* flags */
 };
 
 VNET_DEFINE_STATIC(vifi_t, reg_vif_num) = VIFI_INVALID;
 #define	V_reg_vif_num		VNET(reg_vif_num)
 VNET_DEFINE_STATIC(struct ifnet, multicast_register_if);
 #define	V_multicast_register_if	VNET(multicast_register_if)
 
 /*
  * Private variables.
  */
 
 static u_long	X_ip_mcast_src(int);
 static int	X_ip_mforward(struct ip *, struct ifnet *, struct mbuf *,
 		    struct ip_moptions *);
 static int	X_ip_mrouter_done(void);
 static int	X_ip_mrouter_get(struct socket *, struct sockopt *);
 static int	X_ip_mrouter_set(struct socket *, struct sockopt *);
 static int	X_legal_vif_num(int);
 static int	X_mrt_ioctl(u_long, caddr_t, int);
 
 static int	add_bw_upcall(struct bw_upcall *);
 static int	add_mfc(struct mfcctl2 *);
 static int	add_vif(struct vifctl *);
 static void	bw_meter_prepare_upcall(struct bw_meter *, struct timeval *);
 static void	bw_meter_process(void);
 static void	bw_meter_receive_packet(struct bw_meter *, int,
 		    struct timeval *);
 static void	bw_upcalls_send(void);
 static int	del_bw_upcall(struct bw_upcall *);
 static int	del_mfc(struct mfcctl2 *);
 static int	del_vif(vifi_t);
 static int	del_vif_locked(vifi_t);
 static void	expire_bw_meter_process(void *);
 static void	expire_bw_upcalls_send(void *);
 static void	expire_mfc(struct mfc *);
 static void	expire_upcalls(void *);
 static void	free_bw_list(struct bw_meter *);
 static int	get_sg_cnt(struct sioc_sg_req *);
 static int	get_vif_cnt(struct sioc_vif_req *);
 static void	if_detached_event(void *, struct ifnet *);
 static int	ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t);
 static int	ip_mrouter_init(struct socket *, int);
 static __inline struct mfc *
 		mfc_find(struct in_addr *, struct in_addr *);
 static void	phyint_send(struct ip *, struct vif *, struct mbuf *);
 static struct mbuf *
 		pim_register_prepare(struct ip *, struct mbuf *);
 static int	pim_register_send(struct ip *, struct vif *,
 		    struct mbuf *, struct mfc *);
 static int	pim_register_send_rp(struct ip *, struct vif *,
 		    struct mbuf *, struct mfc *);
 static int	pim_register_send_upcall(struct ip *, struct vif *,
 		    struct mbuf *, struct mfc *);
 static void	schedule_bw_meter(struct bw_meter *, struct timeval *);
 static void	send_packet(struct vif *, struct mbuf *);
 static int	set_api_config(uint32_t *);
 static int	set_assert(int);
 static int	socket_send(struct socket *, struct mbuf *,
 		    struct sockaddr_in *);
 static void	unschedule_bw_meter(struct bw_meter *);
 
 /*
  * Kernel multicast forwarding API capabilities and setup.
  * If more API capabilities are added to the kernel, they should be
  * recorded in `mrt_api_support'.
  */
 #define MRT_API_VERSION		0x0305
 
 static const int mrt_api_version = MRT_API_VERSION;
 static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF |
 					 MRT_MFC_FLAGS_BORDER_VIF |
 					 MRT_MFC_RP |
 					 MRT_MFC_BW_UPCALL);
 VNET_DEFINE_STATIC(uint32_t, mrt_api_config);
 #define	V_mrt_api_config	VNET(mrt_api_config)
 VNET_DEFINE_STATIC(int, pim_assert_enabled);
 #define	V_pim_assert_enabled	VNET(pim_assert_enabled)
 static struct timeval pim_assert_interval = { 3, 0 };	/* Rate limit */
 
 /*
  * Find a route for a given origin IP address and multicast group address.
  * Statistics must be updated by the caller.
  */
 static __inline struct mfc *
 mfc_find(struct in_addr *o, struct in_addr *g)
 {
 	struct mfc *rt;
 
 	MFC_LOCK_ASSERT();
 
 	LIST_FOREACH(rt, &V_mfchashtbl[MFCHASH(*o, *g)], mfc_hash) {
 		if (in_hosteq(rt->mfc_origin, *o) &&
 		    in_hosteq(rt->mfc_mcastgrp, *g) &&
 		    TAILQ_EMPTY(&rt->mfc_stall))
 			break;
 	}
 
 	return (rt);
 }
 
 /*
  * Handle MRT setsockopt commands to modify the multicast forwarding tables.
  */
 static int
 X_ip_mrouter_set(struct socket *so, struct sockopt *sopt)
 {
     int	error, optval;
     vifi_t	vifi;
     struct	vifctl vifc;
     struct	mfcctl2 mfc;
     struct	bw_upcall bw_upcall;
     uint32_t	i;
 
     if (so != V_ip_mrouter && sopt->sopt_name != MRT_INIT)
 	return EPERM;
 
     error = 0;
     switch (sopt->sopt_name) {
     case MRT_INIT:
 	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
 	if (error)
 	    break;
 	error = ip_mrouter_init(so, optval);
 	break;
 
     case MRT_DONE:
 	error = ip_mrouter_done();
 	break;
 
     case MRT_ADD_VIF:
 	error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc);
 	if (error)
 	    break;
 	error = add_vif(&vifc);
 	break;
 
     case MRT_DEL_VIF:
 	error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi);
 	if (error)
 	    break;
 	error = del_vif(vifi);
 	break;
 
     case MRT_ADD_MFC:
     case MRT_DEL_MFC:
 	/*
 	 * select data size depending on API version.
 	 */
 	if (sopt->sopt_name == MRT_ADD_MFC &&
 		V_mrt_api_config & MRT_API_FLAGS_ALL) {
 	    error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2),
 				sizeof(struct mfcctl2));
 	} else {
 	    error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl),
 				sizeof(struct mfcctl));
 	    bzero((caddr_t)&mfc + sizeof(struct mfcctl),
 			sizeof(mfc) - sizeof(struct mfcctl));
 	}
 	if (error)
 	    break;
 	if (sopt->sopt_name == MRT_ADD_MFC)
 	    error = add_mfc(&mfc);
 	else
 	    error = del_mfc(&mfc);
 	break;
 
     case MRT_ASSERT:
 	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
 	if (error)
 	    break;
 	set_assert(optval);
 	break;
 
     case MRT_API_CONFIG:
 	error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
 	if (!error)
 	    error = set_api_config(&i);
 	if (!error)
 	    error = sooptcopyout(sopt, &i, sizeof i);
 	break;
 
     case MRT_ADD_BW_UPCALL:
     case MRT_DEL_BW_UPCALL:
 	error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall,
 				sizeof bw_upcall);
 	if (error)
 	    break;
 	if (sopt->sopt_name == MRT_ADD_BW_UPCALL)
 	    error = add_bw_upcall(&bw_upcall);
 	else
 	    error = del_bw_upcall(&bw_upcall);
 	break;
 
     default:
 	error = EOPNOTSUPP;
 	break;
     }
     return error;
 }
 
 /*
  * Handle MRT getsockopt commands
  */
 static int
 X_ip_mrouter_get(struct socket *so, struct sockopt *sopt)
 {
     int error;
 
     switch (sopt->sopt_name) {
     case MRT_VERSION:
 	error = sooptcopyout(sopt, &mrt_api_version, sizeof mrt_api_version);
 	break;
 
     case MRT_ASSERT:
 	error = sooptcopyout(sopt, &V_pim_assert_enabled,
 	    sizeof V_pim_assert_enabled);
 	break;
 
     case MRT_API_SUPPORT:
 	error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support);
 	break;
 
     case MRT_API_CONFIG:
 	error = sooptcopyout(sopt, &V_mrt_api_config, sizeof V_mrt_api_config);
 	break;
 
     default:
 	error = EOPNOTSUPP;
 	break;
     }
     return error;
 }
 
 /*
  * Handle ioctl commands to obtain information from the cache
  */
 static int
 X_mrt_ioctl(u_long cmd, caddr_t data, int fibnum __unused)
 {
     int error = 0;
 
     /*
      * Currently the only function calling this ioctl routine is rtioctl_fib().
      * Typically, only root can create the raw socket in order to execute
      * this ioctl method, however the request might be coming from a prison
      */
     error = priv_check(curthread, PRIV_NETINET_MROUTE);
     if (error)
 	return (error);
     switch (cmd) {
     case (SIOCGETVIFCNT):
 	error = get_vif_cnt((struct sioc_vif_req *)data);
 	break;
 
     case (SIOCGETSGCNT):
 	error = get_sg_cnt((struct sioc_sg_req *)data);
 	break;
 
     default:
 	error = EINVAL;
 	break;
     }
     return error;
 }
 
 /*
  * returns the packet, byte, rpf-failure count for the source group provided
  */
 static int
 get_sg_cnt(struct sioc_sg_req *req)
 {
     struct mfc *rt;
 
     MFC_LOCK();
     rt = mfc_find(&req->src, &req->grp);
     if (rt == NULL) {
 	MFC_UNLOCK();
 	req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
 	return EADDRNOTAVAIL;
     }
     req->pktcnt = rt->mfc_pkt_cnt;
     req->bytecnt = rt->mfc_byte_cnt;
     req->wrong_if = rt->mfc_wrong_if;
     MFC_UNLOCK();
     return 0;
 }
 
 /*
  * returns the input and output packet and byte counts on the vif provided
  */
 static int
 get_vif_cnt(struct sioc_vif_req *req)
 {
     vifi_t vifi = req->vifi;
 
     VIF_LOCK();
     if (vifi >= V_numvifs) {
 	VIF_UNLOCK();
 	return EINVAL;
     }
 
     req->icount = V_viftable[vifi].v_pkt_in;
     req->ocount = V_viftable[vifi].v_pkt_out;
     req->ibytes = V_viftable[vifi].v_bytes_in;
     req->obytes = V_viftable[vifi].v_bytes_out;
     VIF_UNLOCK();
 
     return 0;
 }
 
 static void
 if_detached_event(void *arg __unused, struct ifnet *ifp)
 {
     vifi_t vifi;
     u_long i;
 
     MROUTER_LOCK();
 
     if (V_ip_mrouter == NULL) {
 	MROUTER_UNLOCK();
 	return;
     }
 
     VIF_LOCK();
     MFC_LOCK();
 
     /*
      * Tear down multicast forwarder state associated with this ifnet.
      * 1. Walk the vif list, matching vifs against this ifnet.
      * 2. Walk the multicast forwarding cache (mfc) looking for
      *    inner matches with this vif's index.
      * 3. Expire any matching multicast forwarding cache entries.
      * 4. Free vif state. This should disable ALLMULTI on the interface.
      */
     for (vifi = 0; vifi < V_numvifs; vifi++) {
 	if (V_viftable[vifi].v_ifp != ifp)
 		continue;
 	for (i = 0; i < mfchashsize; i++) {
 		struct mfc *rt, *nrt;
 
 		LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) {
 			if (rt->mfc_parent == vifi) {
 				expire_mfc(rt);
 			}
 		}
 	}
 	del_vif_locked(vifi);
     }
 
     MFC_UNLOCK();
     VIF_UNLOCK();
 
     MROUTER_UNLOCK();
 }
                         
 /*
  * Enable multicast forwarding.
  */
 static int
 ip_mrouter_init(struct socket *so, int version)
 {
 
     CTR3(KTR_IPMF, "%s: so_type %d, pr_protocol %d", __func__,
         so->so_type, so->so_proto->pr_protocol);
 
     if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP)
 	return EOPNOTSUPP;
 
     if (version != 1)
 	return ENOPROTOOPT;
 
     MROUTER_LOCK();
 
     if (ip_mrouter_unloading) {
 	MROUTER_UNLOCK();
 	return ENOPROTOOPT;
     }
 
     if (V_ip_mrouter != NULL) {
 	MROUTER_UNLOCK();
 	return EADDRINUSE;
     }
 
     V_mfchashtbl = hashinit_flags(mfchashsize, M_MRTABLE, &V_mfchash,
 	HASH_NOWAIT);
 
     callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls,
 	curvnet);
     callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send,
 	curvnet);
     callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process,
 	curvnet);
 
     V_ip_mrouter = so;
     ip_mrouter_cnt++;
 
     MROUTER_UNLOCK();
 
     CTR1(KTR_IPMF, "%s: done", __func__);
 
     return 0;
 }
 
 /*
  * Disable multicast forwarding.
  */
 static int
 X_ip_mrouter_done(void)
 {
     struct ifnet *ifp;
     u_long i;
     vifi_t vifi;
 
     MROUTER_LOCK();
 
     if (V_ip_mrouter == NULL) {
 	MROUTER_UNLOCK();
 	return EINVAL;
     }
 
     /*
      * Detach/disable hooks to the reset of the system.
      */
     V_ip_mrouter = NULL;
     ip_mrouter_cnt--;
     V_mrt_api_config = 0;
 
     VIF_LOCK();
 
     /*
      * For each phyint in use, disable promiscuous reception of all IP
      * multicasts.
      */
     for (vifi = 0; vifi < V_numvifs; vifi++) {
 	if (!in_nullhost(V_viftable[vifi].v_lcl_addr) &&
 		!(V_viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
 	    ifp = V_viftable[vifi].v_ifp;
 	    if_allmulti(ifp, 0);
 	}
     }
     bzero((caddr_t)V_viftable, sizeof(V_viftable));
     V_numvifs = 0;
     V_pim_assert_enabled = 0;
     
     VIF_UNLOCK();
 
     callout_stop(&V_expire_upcalls_ch);
     callout_stop(&V_bw_upcalls_ch);
     callout_stop(&V_bw_meter_ch);
 
     MFC_LOCK();
 
     /*
      * Free all multicast forwarding cache entries.
      * Do not use hashdestroy(), as we must perform other cleanup.
      */
     for (i = 0; i < mfchashsize; i++) {
 	struct mfc *rt, *nrt;
 
 	LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) {
 		expire_mfc(rt);
 	}
     }
     free(V_mfchashtbl, M_MRTABLE);
     V_mfchashtbl = NULL;
 
     bzero(V_nexpire, sizeof(V_nexpire[0]) * mfchashsize);
 
     V_bw_upcalls_n = 0;
     bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers));
 
     MFC_UNLOCK();
 
     V_reg_vif_num = VIFI_INVALID;
 
     MROUTER_UNLOCK();
 
     CTR1(KTR_IPMF, "%s: done", __func__);
 
     return 0;
 }
 
 /*
  * Set PIM assert processing global
  */
 static int
 set_assert(int i)
 {
     if ((i != 1) && (i != 0))
 	return EINVAL;
 
     V_pim_assert_enabled = i;
 
     return 0;
 }
 
 /*
  * Configure API capabilities
  */
 int
 set_api_config(uint32_t *apival)
 {
     u_long i;
 
     /*
      * We can set the API capabilities only if it is the first operation
      * after MRT_INIT. I.e.:
      *  - there are no vifs installed
      *  - pim_assert is not enabled
      *  - the MFC table is empty
      */
     if (V_numvifs > 0) {
 	*apival = 0;
 	return EPERM;
     }
     if (V_pim_assert_enabled) {
 	*apival = 0;
 	return EPERM;
     }
 
     MFC_LOCK();
 
     for (i = 0; i < mfchashsize; i++) {
 	if (LIST_FIRST(&V_mfchashtbl[i]) != NULL) {
 	    MFC_UNLOCK();
 	    *apival = 0;
 	    return EPERM;
 	}
     }
 
     MFC_UNLOCK();
 
     V_mrt_api_config = *apival & mrt_api_support;
     *apival = V_mrt_api_config;
 
     return 0;
 }
 
 /*
  * Add a vif to the vif table
  */
 static int
 add_vif(struct vifctl *vifcp)
 {
     struct vif *vifp = V_viftable + vifcp->vifc_vifi;
     struct sockaddr_in sin = {sizeof sin, AF_INET};
     struct ifaddr *ifa;
     struct ifnet *ifp;
     int error;
 
     VIF_LOCK();
     if (vifcp->vifc_vifi >= MAXVIFS) {
 	VIF_UNLOCK();
 	return EINVAL;
     }
     /* rate limiting is no longer supported by this code */
     if (vifcp->vifc_rate_limit != 0) {
 	log(LOG_ERR, "rate limiting is no longer supported\n");
 	VIF_UNLOCK();
 	return EINVAL;
     }
     if (!in_nullhost(vifp->v_lcl_addr)) {
 	VIF_UNLOCK();
 	return EADDRINUSE;
     }
     if (in_nullhost(vifcp->vifc_lcl_addr)) {
 	VIF_UNLOCK();
 	return EADDRNOTAVAIL;
     }
 
     /* Find the interface with an address in AF_INET family */
     if (vifcp->vifc_flags & VIFF_REGISTER) {
 	/*
 	 * XXX: Because VIFF_REGISTER does not really need a valid
 	 * local interface (e.g. it could be 127.0.0.2), we don't
 	 * check its address.
 	 */
 	ifp = NULL;
     } else {
 	struct epoch_tracker et;
 
 	sin.sin_addr = vifcp->vifc_lcl_addr;
 	NET_EPOCH_ENTER(et);
 	ifa = ifa_ifwithaddr((struct sockaddr *)&sin);
 	if (ifa == NULL) {
 		NET_EPOCH_EXIT(et);
 	    VIF_UNLOCK();
 	    return EADDRNOTAVAIL;
 	}
 	ifp = ifa->ifa_ifp;
 	NET_EPOCH_EXIT(et);
     }
 
     if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) {
 	CTR1(KTR_IPMF, "%s: tunnels are no longer supported", __func__);
 	VIF_UNLOCK();
 	return EOPNOTSUPP;
     } else if (vifcp->vifc_flags & VIFF_REGISTER) {
 	ifp = &V_multicast_register_if;
 	CTR2(KTR_IPMF, "%s: add register vif for ifp %p", __func__, ifp);
 	if (V_reg_vif_num == VIFI_INVALID) {
 	    if_initname(&V_multicast_register_if, "register_vif", 0);
 	    V_multicast_register_if.if_flags = IFF_LOOPBACK;
 	    V_reg_vif_num = vifcp->vifc_vifi;
 	}
     } else {		/* Make sure the interface supports multicast */
 	if ((ifp->if_flags & IFF_MULTICAST) == 0) {
 	    VIF_UNLOCK();
 	    return EOPNOTSUPP;
 	}
 
 	/* Enable promiscuous reception of all IP multicasts from the if */
 	error = if_allmulti(ifp, 1);
 	if (error) {
 	    VIF_UNLOCK();
 	    return error;
 	}
     }
 
     vifp->v_flags     = vifcp->vifc_flags;
     vifp->v_threshold = vifcp->vifc_threshold;
     vifp->v_lcl_addr  = vifcp->vifc_lcl_addr;
     vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;
     vifp->v_ifp       = ifp;
     /* initialize per vif pkt counters */
     vifp->v_pkt_in    = 0;
     vifp->v_pkt_out   = 0;
     vifp->v_bytes_in  = 0;
     vifp->v_bytes_out = 0;
 
     /* Adjust numvifs up if the vifi is higher than numvifs */
     if (V_numvifs <= vifcp->vifc_vifi)
 	V_numvifs = vifcp->vifc_vifi + 1;
 
     VIF_UNLOCK();
 
     CTR4(KTR_IPMF, "%s: add vif %d laddr 0x%08x thresh %x", __func__,
 	(int)vifcp->vifc_vifi, ntohl(vifcp->vifc_lcl_addr.s_addr),
 	(int)vifcp->vifc_threshold);
 
     return 0;
 }
 
 /*
  * Delete a vif from the vif table
  */
 static int
 del_vif_locked(vifi_t vifi)
 {
     struct vif *vifp;
 
     VIF_LOCK_ASSERT();
 
     if (vifi >= V_numvifs) {
 	return EINVAL;
     }
     vifp = &V_viftable[vifi];
     if (in_nullhost(vifp->v_lcl_addr)) {
 	return EADDRNOTAVAIL;
     }
 
     if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER)))
 	if_allmulti(vifp->v_ifp, 0);
 
     if (vifp->v_flags & VIFF_REGISTER)
 	V_reg_vif_num = VIFI_INVALID;
 
     bzero((caddr_t)vifp, sizeof (*vifp));
 
     CTR2(KTR_IPMF, "%s: delete vif %d", __func__, (int)vifi);
 
     /* Adjust numvifs down */
     for (vifi = V_numvifs; vifi > 0; vifi--)
 	if (!in_nullhost(V_viftable[vifi-1].v_lcl_addr))
 	    break;
     V_numvifs = vifi;
 
     return 0;
 }
 
 static int
 del_vif(vifi_t vifi)
 {
     int cc;
 
     VIF_LOCK();
     cc = del_vif_locked(vifi);
     VIF_UNLOCK();
 
     return cc;
 }
 
 /*
  * update an mfc entry without resetting counters and S,G addresses.
  */
 static void
 update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
 {
     int i;
 
     rt->mfc_parent = mfccp->mfcc_parent;
     for (i = 0; i < V_numvifs; i++) {
 	rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
 	rt->mfc_flags[i] = mfccp->mfcc_flags[i] & V_mrt_api_config &
 	    MRT_MFC_FLAGS_ALL;
     }
     /* set the RP address */
     if (V_mrt_api_config & MRT_MFC_RP)
 	rt->mfc_rp = mfccp->mfcc_rp;
     else
 	rt->mfc_rp.s_addr = INADDR_ANY;
 }
 
 /*
  * fully initialize an mfc entry from the parameter.
  */
 static void
 init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
 {
     rt->mfc_origin     = mfccp->mfcc_origin;
     rt->mfc_mcastgrp   = mfccp->mfcc_mcastgrp;
 
     update_mfc_params(rt, mfccp);
 
     /* initialize pkt counters per src-grp */
     rt->mfc_pkt_cnt    = 0;
     rt->mfc_byte_cnt   = 0;
     rt->mfc_wrong_if   = 0;
     timevalclear(&rt->mfc_last_assert);
 }
 
 static void
 expire_mfc(struct mfc *rt)
 {
 	struct rtdetq *rte, *nrte;
 
 	MFC_LOCK_ASSERT();
 
 	free_bw_list(rt->mfc_bw_meter);
 
 	TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) {
 		m_freem(rte->m);
 		TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link);
 		free(rte, M_MRTABLE);
 	}
 
 	LIST_REMOVE(rt, mfc_hash);
 	free(rt, M_MRTABLE);
 }
 
 /*
  * Add an mfc entry
  */
 static int
 add_mfc(struct mfcctl2 *mfccp)
 {
     struct mfc *rt;
     struct rtdetq *rte, *nrte;
     u_long hash = 0;
     u_short nstl;
 
     VIF_LOCK();
     MFC_LOCK();
 
     rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);
 
     /* If an entry already exists, just update the fields */
     if (rt) {
 	CTR4(KTR_IPMF, "%s: update mfc orig 0x%08x group %lx parent %x",
 	    __func__, ntohl(mfccp->mfcc_origin.s_addr),
 	    (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
 	    mfccp->mfcc_parent);
 	update_mfc_params(rt, mfccp);
 	MFC_UNLOCK();
 	VIF_UNLOCK();
 	return (0);
     }
 
     /*
      * Find the entry for which the upcall was made and update
      */
     nstl = 0;
     hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp);
     LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) {
 	if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
 	    in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) &&
 	    !TAILQ_EMPTY(&rt->mfc_stall)) {
 		CTR5(KTR_IPMF,
 		    "%s: add mfc orig 0x%08x group %lx parent %x qh %p",
 		    __func__, ntohl(mfccp->mfcc_origin.s_addr),
 		    (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
 		    mfccp->mfcc_parent,
 		    TAILQ_FIRST(&rt->mfc_stall));
 		if (nstl++)
 			CTR1(KTR_IPMF, "%s: multiple matches", __func__);
 
 		init_mfc_params(rt, mfccp);
 		rt->mfc_expire = 0;	/* Don't clean this guy up */
 		V_nexpire[hash]--;
 
 		/* Free queued packets, but attempt to forward them first. */
 		TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) {
 			if (rte->ifp != NULL)
 				ip_mdq(rte->m, rte->ifp, rt, -1);
 			m_freem(rte->m);
 			TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link);
 			rt->mfc_nstall--;
 			free(rte, M_MRTABLE);
 		}
 	}
     }
 
     /*
      * It is possible that an entry is being inserted without an upcall
      */
     if (nstl == 0) {
 	CTR1(KTR_IPMF, "%s: adding mfc w/o upcall", __func__);
 	LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) {
 		if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
 		    in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) {
 			init_mfc_params(rt, mfccp);
 			if (rt->mfc_expire)
 			    V_nexpire[hash]--;
 			rt->mfc_expire = 0;
 			break; /* XXX */
 		}
 	}
 
 	if (rt == NULL) {		/* no upcall, so make a new entry */
 	    rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
 	    if (rt == NULL) {
 		MFC_UNLOCK();
 		VIF_UNLOCK();
 		return (ENOBUFS);
 	    }
 
 	    init_mfc_params(rt, mfccp);
 	    TAILQ_INIT(&rt->mfc_stall);
 	    rt->mfc_nstall = 0;
 
 	    rt->mfc_expire     = 0;
 	    rt->mfc_bw_meter = NULL;
 
 	    /* insert new entry at head of hash chain */
 	    LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash);
 	}
     }
 
     MFC_UNLOCK();
     VIF_UNLOCK();
 
     return (0);
 }
 
 /*
  * Delete an mfc entry
  */
 static int
 del_mfc(struct mfcctl2 *mfccp)
 {
     struct in_addr	origin;
     struct in_addr	mcastgrp;
     struct mfc		*rt;
 
     origin = mfccp->mfcc_origin;
     mcastgrp = mfccp->mfcc_mcastgrp;
 
     CTR3(KTR_IPMF, "%s: delete mfc orig 0x%08x group %lx", __func__,
 	ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr));
 
     MFC_LOCK();
 
     rt = mfc_find(&origin, &mcastgrp);
     if (rt == NULL) {
 	MFC_UNLOCK();
 	return EADDRNOTAVAIL;
     }
 
     /*
      * free the bw_meter entries
      */
     free_bw_list(rt->mfc_bw_meter);
     rt->mfc_bw_meter = NULL;
 
     LIST_REMOVE(rt, mfc_hash);
     free(rt, M_MRTABLE);
 
     MFC_UNLOCK();
 
     return (0);
 }
 
 /*
  * Send a message to the routing daemon on the multicast routing socket.
  */
 static int
 socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src)
 {
     if (s) {
 	SOCKBUF_LOCK(&s->so_rcv);
 	if (sbappendaddr_locked(&s->so_rcv, (struct sockaddr *)src, mm,
 	    NULL) != 0) {
 	    sorwakeup_locked(s);
 	    return 0;
 	}
 	SOCKBUF_UNLOCK(&s->so_rcv);
     }
     m_freem(mm);
     return -1;
 }
 
 /*
  * IP multicast forwarding function. This function assumes that the packet
  * pointed to by "ip" has arrived on (or is about to be sent to) the interface
  * pointed to by "ifp", and the packet is to be relayed to other networks
  * that have members of the packet's destination IP multicast group.
  *
  * The packet is returned unscathed to the caller, unless it is
  * erroneous, in which case a non-zero return value tells the caller to
  * discard it.
  */
 
 #define TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
 
 static int
 X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m,
     struct ip_moptions *imo)
 {
     struct mfc *rt;
     int error;
     vifi_t vifi;
 
     CTR3(KTR_IPMF, "ip_mforward: delete mfc orig 0x%08x group %lx ifp %p",
 	ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr), ifp);
 
     if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 ||
 		((u_char *)(ip + 1))[1] != IPOPT_LSRR ) {
 	/*
 	 * Packet arrived via a physical interface or
 	 * an encapsulated tunnel or a register_vif.
 	 */
     } else {
 	/*
 	 * Packet arrived through a source-route tunnel.
 	 * Source-route tunnels are no longer supported.
 	 */
 	return (1);
     }
 
     VIF_LOCK();
     MFC_LOCK();
     if (imo && ((vifi = imo->imo_multicast_vif) < V_numvifs)) {
 	if (ip->ip_ttl < MAXTTL)
 	    ip->ip_ttl++;	/* compensate for -1 in *_send routines */
 	error = ip_mdq(m, ifp, NULL, vifi);
 	MFC_UNLOCK();
 	VIF_UNLOCK();
 	return error;
     }
 
     /*
      * Don't forward a packet with time-to-live of zero or one,
      * or a packet destined to a local-only group.
      */
     if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ntohl(ip->ip_dst.s_addr))) {
 	MFC_UNLOCK();
 	VIF_UNLOCK();
 	return 0;
     }
 
     /*
      * Determine forwarding vifs from the forwarding cache table
      */
     MRTSTAT_INC(mrts_mfc_lookups);
     rt = mfc_find(&ip->ip_src, &ip->ip_dst);
 
     /* Entry exists, so forward if necessary */
     if (rt != NULL) {
 	error = ip_mdq(m, ifp, rt, -1);
 	MFC_UNLOCK();
 	VIF_UNLOCK();
 	return error;
     } else {
 	/*
 	 * If we don't have a route for packet's origin,
 	 * Make a copy of the packet & send message to routing daemon
 	 */
 
 	struct mbuf *mb0;
 	struct rtdetq *rte;
 	u_long hash;
 	int hlen = ip->ip_hl << 2;
 
 	MRTSTAT_INC(mrts_mfc_misses);
 	MRTSTAT_INC(mrts_no_route);
 	CTR2(KTR_IPMF, "ip_mforward: no mfc for (0x%08x,%lx)",
 	    ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr));
 
 	/*
 	 * Allocate mbufs early so that we don't do extra work if we are
 	 * just going to fail anyway.  Make sure to pullup the header so
 	 * that other people can't step on it.
 	 */
 	rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE,
 	    M_NOWAIT|M_ZERO);
 	if (rte == NULL) {
 	    MFC_UNLOCK();
 	    VIF_UNLOCK();
 	    return ENOBUFS;
 	}
 
 	mb0 = m_copypacket(m, M_NOWAIT);
 	if (mb0 && (!M_WRITABLE(mb0) || mb0->m_len < hlen))
 	    mb0 = m_pullup(mb0, hlen);
 	if (mb0 == NULL) {
 	    free(rte, M_MRTABLE);
 	    MFC_UNLOCK();
 	    VIF_UNLOCK();
 	    return ENOBUFS;
 	}
 
 	/* is there an upcall waiting for this flow ? */
 	hash = MFCHASH(ip->ip_src, ip->ip_dst);
 	LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) {
 		if (in_hosteq(ip->ip_src, rt->mfc_origin) &&
 		    in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) &&
 		    !TAILQ_EMPTY(&rt->mfc_stall))
 			break;
 	}
 
 	if (rt == NULL) {
 	    int i;
 	    struct igmpmsg *im;
 	    struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
 	    struct mbuf *mm;
 
 	    /*
 	     * Locate the vifi for the incoming interface for this packet.
 	     * If none found, drop packet.
 	     */
 	    for (vifi = 0; vifi < V_numvifs &&
 		    V_viftable[vifi].v_ifp != ifp; vifi++)
 		;
 	    if (vifi >= V_numvifs)	/* vif not found, drop packet */
 		goto non_fatal;
 
 	    /* no upcall, so make a new entry */
 	    rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
 	    if (rt == NULL)
 		goto fail;
 
 	    /* Make a copy of the header to send to the user level process */
 	    mm = m_copym(mb0, 0, hlen, M_NOWAIT);
 	    if (mm == NULL)
 		goto fail1;
 
 	    /*
 	     * Send message to routing daemon to install
 	     * a route into the kernel table
 	     */
 
 	    im = mtod(mm, struct igmpmsg *);
 	    im->im_msgtype = IGMPMSG_NOCACHE;
 	    im->im_mbz = 0;
 	    im->im_vif = vifi;
 
 	    MRTSTAT_INC(mrts_upcalls);
 
 	    k_igmpsrc.sin_addr = ip->ip_src;
 	    if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) {
 		CTR0(KTR_IPMF, "ip_mforward: socket queue full");
 		MRTSTAT_INC(mrts_upq_sockfull);
 fail1:
 		free(rt, M_MRTABLE);
 fail:
 		free(rte, M_MRTABLE);
 		m_freem(mb0);
 		MFC_UNLOCK();
 		VIF_UNLOCK();
 		return ENOBUFS;
 	    }
 
 	    /* insert new entry at head of hash chain */
 	    rt->mfc_origin.s_addr     = ip->ip_src.s_addr;
 	    rt->mfc_mcastgrp.s_addr   = ip->ip_dst.s_addr;
 	    rt->mfc_expire	      = UPCALL_EXPIRE;
 	    V_nexpire[hash]++;
 	    for (i = 0; i < V_numvifs; i++) {
 		rt->mfc_ttls[i] = 0;
 		rt->mfc_flags[i] = 0;
 	    }
 	    rt->mfc_parent = -1;
 
 	    /* clear the RP address */
 	    rt->mfc_rp.s_addr = INADDR_ANY;
 	    rt->mfc_bw_meter = NULL;
 
 	    /* initialize pkt counters per src-grp */
 	    rt->mfc_pkt_cnt = 0;
 	    rt->mfc_byte_cnt = 0;
 	    rt->mfc_wrong_if = 0;
 	    timevalclear(&rt->mfc_last_assert);
 
 	    TAILQ_INIT(&rt->mfc_stall);
 	    rt->mfc_nstall = 0;
 
 	    /* link into table */
 	    LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash);
 	    TAILQ_INSERT_HEAD(&rt->mfc_stall, rte, rte_link);
 	    rt->mfc_nstall++;
 
 	} else {
 	    /* determine if queue has overflowed */
 	    if (rt->mfc_nstall > MAX_UPQ) {
 		MRTSTAT_INC(mrts_upq_ovflw);
 non_fatal:
 		free(rte, M_MRTABLE);
 		m_freem(mb0);
 		MFC_UNLOCK();
 		VIF_UNLOCK();
 		return (0);
 	    }
 	    TAILQ_INSERT_TAIL(&rt->mfc_stall, rte, rte_link);
 	    rt->mfc_nstall++;
 	}
 
 	rte->m			= mb0;
 	rte->ifp		= ifp;
 
 	MFC_UNLOCK();
 	VIF_UNLOCK();
 
 	return 0;
     }
 }
 
 /*
  * Clean up the cache entry if upcall is not serviced
  */
 static void
 expire_upcalls(void *arg)
 {
     u_long i;
 
     CURVNET_SET((struct vnet *) arg);
 
     MFC_LOCK();
 
     for (i = 0; i < mfchashsize; i++) {
 	struct mfc *rt, *nrt;
 
 	if (V_nexpire[i] == 0)
 	    continue;
 
 	LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) {
 		if (TAILQ_EMPTY(&rt->mfc_stall))
 			continue;
 
 		if (rt->mfc_expire == 0 || --rt->mfc_expire > 0)
 			continue;
 
 		/*
 		 * free the bw_meter entries
 		 */
 		while (rt->mfc_bw_meter != NULL) {
 		    struct bw_meter *x = rt->mfc_bw_meter;
 
 		    rt->mfc_bw_meter = x->bm_mfc_next;
 		    free(x, M_BWMETER);
 		}
 
 		MRTSTAT_INC(mrts_cache_cleanups);
 		CTR3(KTR_IPMF, "%s: expire (%lx, %lx)", __func__,
 		    (u_long)ntohl(rt->mfc_origin.s_addr),
 		    (u_long)ntohl(rt->mfc_mcastgrp.s_addr));
 
 		expire_mfc(rt);
 	    }
     }
 
     MFC_UNLOCK();
 
     callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls,
 	curvnet);
 
     CURVNET_RESTORE();
 }
 
 /*
  * Packet forwarding routine once entry in the cache is made
  */
 static int
 ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif)
 {
     struct ip  *ip = mtod(m, struct ip *);
     vifi_t vifi;
     int plen = ntohs(ip->ip_len);
 
     VIF_LOCK_ASSERT();
 
     /*
      * If xmt_vif is not -1, send on only the requested vif.
      *
      * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.)
      */
     if (xmt_vif < V_numvifs) {
 	if (V_viftable[xmt_vif].v_flags & VIFF_REGISTER)
 		pim_register_send(ip, V_viftable + xmt_vif, m, rt);
 	else
 		phyint_send(ip, V_viftable + xmt_vif, m);
 	return 1;
     }
 
     /*
      * Don't forward if it didn't arrive from the parent vif for its origin.
      */
     vifi = rt->mfc_parent;
     if ((vifi >= V_numvifs) || (V_viftable[vifi].v_ifp != ifp)) {
 	CTR4(KTR_IPMF, "%s: rx on wrong ifp %p (vifi %d, v_ifp %p)",
 	    __func__, ifp, (int)vifi, V_viftable[vifi].v_ifp);
 	MRTSTAT_INC(mrts_wrong_if);
 	++rt->mfc_wrong_if;
 	/*
 	 * If we are doing PIM assert processing, send a message
 	 * to the routing daemon.
 	 *
 	 * XXX: A PIM-SM router needs the WRONGVIF detection so it
 	 * can complete the SPT switch, regardless of the type
 	 * of the iif (broadcast media, GRE tunnel, etc).
 	 */
 	if (V_pim_assert_enabled && (vifi < V_numvifs) &&
 	    V_viftable[vifi].v_ifp) {
 
 	    if (ifp == &V_multicast_register_if)
 		PIMSTAT_INC(pims_rcv_registers_wrongiif);
 
 	    /* Get vifi for the incoming packet */
 	    for (vifi = 0; vifi < V_numvifs && V_viftable[vifi].v_ifp != ifp;
 		vifi++)
 		;
 	    if (vifi >= V_numvifs)
 		return 0;	/* The iif is not found: ignore the packet. */
 
 	    if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF)
 		return 0;	/* WRONGVIF disabled: ignore the packet */
 
 	    if (ratecheck(&rt->mfc_last_assert, &pim_assert_interval)) {
 		struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
 		struct igmpmsg *im;
 		int hlen = ip->ip_hl << 2;
 		struct mbuf *mm = m_copym(m, 0, hlen, M_NOWAIT);
 
 		if (mm && (!M_WRITABLE(mm) || mm->m_len < hlen))
 		    mm = m_pullup(mm, hlen);
 		if (mm == NULL)
 		    return ENOBUFS;
 
 		im = mtod(mm, struct igmpmsg *);
 		im->im_msgtype	= IGMPMSG_WRONGVIF;
 		im->im_mbz		= 0;
 		im->im_vif		= vifi;
 
 		MRTSTAT_INC(mrts_upcalls);
 
 		k_igmpsrc.sin_addr = im->im_src;
 		if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) {
 		    CTR1(KTR_IPMF, "%s: socket queue full", __func__);
 		    MRTSTAT_INC(mrts_upq_sockfull);
 		    return ENOBUFS;
 		}
 	    }
 	}
 	return 0;
     }
 
 
     /* If I sourced this packet, it counts as output, else it was input. */
     if (in_hosteq(ip->ip_src, V_viftable[vifi].v_lcl_addr)) {
 	V_viftable[vifi].v_pkt_out++;
 	V_viftable[vifi].v_bytes_out += plen;
     } else {
 	V_viftable[vifi].v_pkt_in++;
 	V_viftable[vifi].v_bytes_in += plen;
     }
     rt->mfc_pkt_cnt++;
     rt->mfc_byte_cnt += plen;
 
     /*
      * For each vif, decide if a copy of the packet should be forwarded.
      * Forward if:
      *		- the ttl exceeds the vif's threshold
      *		- there are group members downstream on interface
      */
     for (vifi = 0; vifi < V_numvifs; vifi++)
 	if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) {
 	    V_viftable[vifi].v_pkt_out++;
 	    V_viftable[vifi].v_bytes_out += plen;
 	    if (V_viftable[vifi].v_flags & VIFF_REGISTER)
 		pim_register_send(ip, V_viftable + vifi, m, rt);
 	    else
 		phyint_send(ip, V_viftable + vifi, m);
 	}
 
     /*
      * Perform upcall-related bw measuring.
      */
     if (rt->mfc_bw_meter != NULL) {
 	struct bw_meter *x;
 	struct timeval now;
 
 	microtime(&now);
 	MFC_LOCK_ASSERT();
 	for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next)
 	    bw_meter_receive_packet(x, plen, &now);
     }
 
     return 0;
 }
 
 /*
  * Check if a vif number is legal/ok. This is used by in_mcast.c.
  */
 static int
 X_legal_vif_num(int vif)
 {
 	int ret;
 
 	ret = 0;
 	if (vif < 0)
 		return (ret);
 
 	VIF_LOCK();
 	if (vif < V_numvifs)
 		ret = 1;
 	VIF_UNLOCK();
 
 	return (ret);
 }
 
 /*
  * Return the local address used by this vif
  */
 static u_long
 X_ip_mcast_src(int vifi)
 {
 	in_addr_t addr;
 
 	addr = INADDR_ANY;
 	if (vifi < 0)
 		return (addr);
 
 	VIF_LOCK();
 	if (vifi < V_numvifs)
 		addr = V_viftable[vifi].v_lcl_addr.s_addr;
 	VIF_UNLOCK();
 
 	return (addr);
 }
 
 static void
 phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
 {
     struct mbuf *mb_copy;
     int hlen = ip->ip_hl << 2;
 
     VIF_LOCK_ASSERT();
 
     /*
      * Make a new reference to the packet; make sure that
      * the IP header is actually copied, not just referenced,
      * so that ip_output() only scribbles on the copy.
      */
     mb_copy = m_copypacket(m, M_NOWAIT);
     if (mb_copy && (!M_WRITABLE(mb_copy) || mb_copy->m_len < hlen))
 	mb_copy = m_pullup(mb_copy, hlen);
     if (mb_copy == NULL)
 	return;
 
     send_packet(vifp, mb_copy);
 }
 
 static void
 send_packet(struct vif *vifp, struct mbuf *m)
 {
 	struct ip_moptions imo;
-	struct in_multi *imm[2];
 	int error __unused;
 
 	VIF_LOCK_ASSERT();
 
 	imo.imo_multicast_ifp  = vifp->v_ifp;
 	imo.imo_multicast_ttl  = mtod(m, struct ip *)->ip_ttl - 1;
 	imo.imo_multicast_loop = 1;
 	imo.imo_multicast_vif  = -1;
-	imo.imo_num_memberships = 0;
-	imo.imo_max_memberships = 2;
-	imo.imo_membership  = &imm[0];
+	STAILQ_INIT(&imo.imo_head);
 
 	/*
 	 * Re-entrancy should not be a problem here, because
 	 * the packets that we send out and are looped back at us
 	 * should get rejected because they appear to come from
 	 * the loopback interface, thus preventing looping.
 	 */
 	error = ip_output(m, NULL, NULL, IP_FORWARDING, &imo, NULL);
 	CTR3(KTR_IPMF, "%s: vif %td err %d", __func__,
 	    (ptrdiff_t)(vifp - V_viftable), error);
 }
 
 /*
  * Stubs for old RSVP socket shim implementation.
  */
 
 static int
 X_ip_rsvp_vif(struct socket *so __unused, struct sockopt *sopt __unused)
 {
 
 	return (EOPNOTSUPP);
 }
 
 static void
 X_ip_rsvp_force_done(struct socket *so __unused)
 {
 
 }
 
 static int
 X_rsvp_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m;
 
 	m = *mp;
 	*mp = NULL;
 	if (!V_rsvp_on)
 		m_freem(m);
 	return (IPPROTO_DONE);
 }
 
 /*
  * Code for bandwidth monitors
  */
 
 /*
  * Define common interface for timeval-related methods
  */
 #define	BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp)
 #define	BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp))
 #define	BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp))
 
 static uint32_t
 compute_bw_meter_flags(struct bw_upcall *req)
 {
     uint32_t flags = 0;
 
     if (req->bu_flags & BW_UPCALL_UNIT_PACKETS)
 	flags |= BW_METER_UNIT_PACKETS;
     if (req->bu_flags & BW_UPCALL_UNIT_BYTES)
 	flags |= BW_METER_UNIT_BYTES;
     if (req->bu_flags & BW_UPCALL_GEQ)
 	flags |= BW_METER_GEQ;
     if (req->bu_flags & BW_UPCALL_LEQ)
 	flags |= BW_METER_LEQ;
 
     return flags;
 }
 
 /*
  * Add a bw_meter entry
  */
 static int
 add_bw_upcall(struct bw_upcall *req)
 {
     struct mfc *mfc;
     struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
 		BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
     struct timeval now;
     struct bw_meter *x;
     uint32_t flags;
 
     if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL))
 	return EOPNOTSUPP;
 
     /* Test if the flags are valid */
     if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
 	return EINVAL;
     if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
 	return EINVAL;
     if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
 	    == (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
 	return EINVAL;
 
     /* Test if the threshold time interval is valid */
     if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
 	return EINVAL;
 
     flags = compute_bw_meter_flags(req);
 
     /*
      * Find if we have already same bw_meter entry
      */
     MFC_LOCK();
     mfc = mfc_find(&req->bu_src, &req->bu_dst);
     if (mfc == NULL) {
 	MFC_UNLOCK();
 	return EADDRNOTAVAIL;
     }
     for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) {
 	if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
 			   &req->bu_threshold.b_time, ==)) &&
 	    (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
 	    (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
 	    (x->bm_flags & BW_METER_USER_FLAGS) == flags)  {
 	    MFC_UNLOCK();
 	    return 0;		/* XXX Already installed */
 	}
     }
 
     /* Allocate the new bw_meter entry */
     x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT);
     if (x == NULL) {
 	MFC_UNLOCK();
 	return ENOBUFS;
     }
 
     /* Set the new bw_meter entry */
     x->bm_threshold.b_time = req->bu_threshold.b_time;
     microtime(&now);
     x->bm_start_time = now;
     x->bm_threshold.b_packets = req->bu_threshold.b_packets;
     x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
     x->bm_measured.b_packets = 0;
     x->bm_measured.b_bytes = 0;
     x->bm_flags = flags;
     x->bm_time_next = NULL;
     x->bm_time_hash = BW_METER_BUCKETS;
 
     /* Add the new bw_meter entry to the front of entries for this MFC */
     x->bm_mfc = mfc;
     x->bm_mfc_next = mfc->mfc_bw_meter;
     mfc->mfc_bw_meter = x;
     schedule_bw_meter(x, &now);
     MFC_UNLOCK();
 
     return 0;
 }
 
 static void
 free_bw_list(struct bw_meter *list)
 {
     while (list != NULL) {
 	struct bw_meter *x = list;
 
 	list = list->bm_mfc_next;
 	unschedule_bw_meter(x);
 	free(x, M_BWMETER);
     }
 }
 
 /*
  * Delete one or multiple bw_meter entries
  */
 static int
 del_bw_upcall(struct bw_upcall *req)
 {
     struct mfc *mfc;
     struct bw_meter *x;
 
     if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL))
 	return EOPNOTSUPP;
 
     MFC_LOCK();
 
     /* Find the corresponding MFC entry */
     mfc = mfc_find(&req->bu_src, &req->bu_dst);
     if (mfc == NULL) {
 	MFC_UNLOCK();
 	return EADDRNOTAVAIL;
     } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) {
 	/*
 	 * Delete all bw_meter entries for this mfc
 	 */
 	struct bw_meter *list;
 
 	list = mfc->mfc_bw_meter;
 	mfc->mfc_bw_meter = NULL;
 	free_bw_list(list);
 	MFC_UNLOCK();
 	return 0;
     } else {			/* Delete a single bw_meter entry */
 	struct bw_meter *prev;
 	uint32_t flags = 0;
 
 	flags = compute_bw_meter_flags(req);
 
 	/* Find the bw_meter entry to delete */
 	for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL;
 	     prev = x, x = x->bm_mfc_next) {
 	    if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
 			       &req->bu_threshold.b_time, ==)) &&
 		(x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
 		(x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
 		(x->bm_flags & BW_METER_USER_FLAGS) == flags)
 		break;
 	}
 	if (x != NULL) { /* Delete entry from the list for this MFC */
 	    if (prev != NULL)
 		prev->bm_mfc_next = x->bm_mfc_next;	/* remove from middle*/
 	    else
 		x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */
 
 	    unschedule_bw_meter(x);
 	    MFC_UNLOCK();
 	    /* Free the bw_meter entry */
 	    free(x, M_BWMETER);
 	    return 0;
 	} else {
 	    MFC_UNLOCK();
 	    return EINVAL;
 	}
     }
     /* NOTREACHED */
 }
 
 /*
  * Perform bandwidth measurement processing that may result in an upcall
  */
 static void
 bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
 {
     struct timeval delta;
 
     MFC_LOCK_ASSERT();
 
     delta = *nowp;
     BW_TIMEVALDECR(&delta, &x->bm_start_time);
 
     if (x->bm_flags & BW_METER_GEQ) {
 	/*
 	 * Processing for ">=" type of bw_meter entry
 	 */
 	if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
 	    /* Reset the bw_meter entry */
 	    x->bm_start_time = *nowp;
 	    x->bm_measured.b_packets = 0;
 	    x->bm_measured.b_bytes = 0;
 	    x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 	}
 
 	/* Record that a packet is received */
 	x->bm_measured.b_packets++;
 	x->bm_measured.b_bytes += plen;
 
 	/*
 	 * Test if we should deliver an upcall
 	 */
 	if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) {
 	    if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 		 (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) ||
 		((x->bm_flags & BW_METER_UNIT_BYTES) &&
 		 (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) {
 		/* Prepare an upcall for delivery */
 		bw_meter_prepare_upcall(x, nowp);
 		x->bm_flags |= BW_METER_UPCALL_DELIVERED;
 	    }
 	}
     } else if (x->bm_flags & BW_METER_LEQ) {
 	/*
 	 * Processing for "<=" type of bw_meter entry
 	 */
 	if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
 	    /*
 	     * We are behind time with the multicast forwarding table
 	     * scanning for "<=" type of bw_meter entries, so test now
 	     * if we should deliver an upcall.
 	     */
 	    if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 		 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
 		((x->bm_flags & BW_METER_UNIT_BYTES) &&
 		 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
 		/* Prepare an upcall for delivery */
 		bw_meter_prepare_upcall(x, nowp);
 	    }
 	    /* Reschedule the bw_meter entry */
 	    unschedule_bw_meter(x);
 	    schedule_bw_meter(x, nowp);
 	}
 
 	/* Record that a packet is received */
 	x->bm_measured.b_packets++;
 	x->bm_measured.b_bytes += plen;
 
 	/*
 	 * Test if we should restart the measuring interval
 	 */
 	if ((x->bm_flags & BW_METER_UNIT_PACKETS &&
 	     x->bm_measured.b_packets <= x->bm_threshold.b_packets) ||
 	    (x->bm_flags & BW_METER_UNIT_BYTES &&
 	     x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) {
 	    /* Don't restart the measuring interval */
 	} else {
 	    /* Do restart the measuring interval */
 	    /*
 	     * XXX: note that we don't unschedule and schedule, because this
 	     * might be too much overhead per packet. Instead, when we process
 	     * all entries for a given timer hash bin, we check whether it is
 	     * really a timeout. If not, we reschedule at that time.
 	     */
 	    x->bm_start_time = *nowp;
 	    x->bm_measured.b_packets = 0;
 	    x->bm_measured.b_bytes = 0;
 	    x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 	}
     }
 }
 
 /*
  * Prepare a bandwidth-related upcall
  */
 static void
 bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
 {
     struct timeval delta;
     struct bw_upcall *u;
 
     MFC_LOCK_ASSERT();
 
     /*
      * Compute the measured time interval
      */
     delta = *nowp;
     BW_TIMEVALDECR(&delta, &x->bm_start_time);
 
     /*
      * If there are too many pending upcalls, deliver them now
      */
     if (V_bw_upcalls_n >= BW_UPCALLS_MAX)
 	bw_upcalls_send();
 
     /*
      * Set the bw_upcall entry
      */
     u = &V_bw_upcalls[V_bw_upcalls_n++];
     u->bu_src = x->bm_mfc->mfc_origin;
     u->bu_dst = x->bm_mfc->mfc_mcastgrp;
     u->bu_threshold.b_time = x->bm_threshold.b_time;
     u->bu_threshold.b_packets = x->bm_threshold.b_packets;
     u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
     u->bu_measured.b_time = delta;
     u->bu_measured.b_packets = x->bm_measured.b_packets;
     u->bu_measured.b_bytes = x->bm_measured.b_bytes;
     u->bu_flags = 0;
     if (x->bm_flags & BW_METER_UNIT_PACKETS)
 	u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
     if (x->bm_flags & BW_METER_UNIT_BYTES)
 	u->bu_flags |= BW_UPCALL_UNIT_BYTES;
     if (x->bm_flags & BW_METER_GEQ)
 	u->bu_flags |= BW_UPCALL_GEQ;
     if (x->bm_flags & BW_METER_LEQ)
 	u->bu_flags |= BW_UPCALL_LEQ;
 }
 
 /*
  * Send the pending bandwidth-related upcalls
  */
 static void
 bw_upcalls_send(void)
 {
     struct mbuf *m;
     int len = V_bw_upcalls_n * sizeof(V_bw_upcalls[0]);
     struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
     static struct igmpmsg igmpmsg = { 0,		/* unused1 */
 				      0,		/* unused2 */
 				      IGMPMSG_BW_UPCALL,/* im_msgtype */
 				      0,		/* im_mbz  */
 				      0,		/* im_vif  */
 				      0,		/* unused3 */
 				      { 0 },		/* im_src  */
 				      { 0 } };		/* im_dst  */
 
     MFC_LOCK_ASSERT();
 
     if (V_bw_upcalls_n == 0)
 	return;			/* No pending upcalls */
 
     V_bw_upcalls_n = 0;
 
     /*
      * Allocate a new mbuf, initialize it with the header and
      * the payload for the pending calls.
      */
     m = m_gethdr(M_NOWAIT, MT_DATA);
     if (m == NULL) {
 	log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n");
 	return;
     }
 
     m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg);
     m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&V_bw_upcalls[0]);
 
     /*
      * Send the upcalls
      * XXX do we need to set the address in k_igmpsrc ?
      */
     MRTSTAT_INC(mrts_upcalls);
     if (socket_send(V_ip_mrouter, m, &k_igmpsrc) < 0) {
 	log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n");
 	MRTSTAT_INC(mrts_upq_sockfull);
     }
 }
 
 /*
  * Compute the timeout hash value for the bw_meter entries
  */
 #define	BW_METER_TIMEHASH(bw_meter, hash)				\
     do {								\
 	struct timeval next_timeval = (bw_meter)->bm_start_time;	\
 									\
 	BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \
 	(hash) = next_timeval.tv_sec;					\
 	if (next_timeval.tv_usec)					\
 	    (hash)++; /* XXX: make sure we don't timeout early */	\
 	(hash) %= BW_METER_BUCKETS;					\
     } while (0)
 
 /*
  * Schedule a timer to process periodically bw_meter entry of type "<="
  * by linking the entry in the proper hash bucket.
  */
 static void
 schedule_bw_meter(struct bw_meter *x, struct timeval *nowp)
 {
     int time_hash;
 
     MFC_LOCK_ASSERT();
 
     if (!(x->bm_flags & BW_METER_LEQ))
 	return;		/* XXX: we schedule timers only for "<=" entries */
 
     /*
      * Reset the bw_meter entry
      */
     x->bm_start_time = *nowp;
     x->bm_measured.b_packets = 0;
     x->bm_measured.b_bytes = 0;
     x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 
     /*
      * Compute the timeout hash value and insert the entry
      */
     BW_METER_TIMEHASH(x, time_hash);
     x->bm_time_next = V_bw_meter_timers[time_hash];
     V_bw_meter_timers[time_hash] = x;
     x->bm_time_hash = time_hash;
 }
 
 /*
  * Unschedule the periodic timer that processes bw_meter entry of type "<="
  * by removing the entry from the proper hash bucket.
  */
 static void
 unschedule_bw_meter(struct bw_meter *x)
 {
     int time_hash;
     struct bw_meter *prev, *tmp;
 
     MFC_LOCK_ASSERT();
 
     if (!(x->bm_flags & BW_METER_LEQ))
 	return;		/* XXX: we schedule timers only for "<=" entries */
 
     /*
      * Compute the timeout hash value and delete the entry
      */
     time_hash = x->bm_time_hash;
     if (time_hash >= BW_METER_BUCKETS)
 	return;		/* Entry was not scheduled */
 
     for (prev = NULL, tmp = V_bw_meter_timers[time_hash];
 	     tmp != NULL; prev = tmp, tmp = tmp->bm_time_next)
 	if (tmp == x)
 	    break;
 
     if (tmp == NULL)
 	panic("unschedule_bw_meter: bw_meter entry not found");
 
     if (prev != NULL)
 	prev->bm_time_next = x->bm_time_next;
     else
 	V_bw_meter_timers[time_hash] = x->bm_time_next;
 
     x->bm_time_next = NULL;
     x->bm_time_hash = BW_METER_BUCKETS;
 }
 
 
 /*
  * Process all "<=" type of bw_meter that should be processed now,
  * and for each entry prepare an upcall if necessary. Each processed
  * entry is rescheduled again for the (periodic) processing.
  *
  * This is run periodically (once per second normally). On each round,
  * all the potentially matching entries are in the hash slot that we are
  * looking at.
  */
 static void
 bw_meter_process()
 {
     uint32_t loops;
     int i;
     struct timeval now, process_endtime;
 
     microtime(&now);
     if (V_last_tv_sec == now.tv_sec)
 	return;		/* nothing to do */
 
     loops = now.tv_sec - V_last_tv_sec;
     V_last_tv_sec = now.tv_sec;
     if (loops > BW_METER_BUCKETS)
 	loops = BW_METER_BUCKETS;
 
     MFC_LOCK();
     /*
      * Process all bins of bw_meter entries from the one after the last
      * processed to the current one. On entry, i points to the last bucket
      * visited, so we need to increment i at the beginning of the loop.
      */
     for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) {
 	struct bw_meter *x, *tmp_list;
 
 	if (++i >= BW_METER_BUCKETS)
 	    i = 0;
 
 	/* Disconnect the list of bw_meter entries from the bin */
 	tmp_list = V_bw_meter_timers[i];
 	V_bw_meter_timers[i] = NULL;
 
 	/* Process the list of bw_meter entries */
 	while (tmp_list != NULL) {
 	    x = tmp_list;
 	    tmp_list = tmp_list->bm_time_next;
 
 	    /* Test if the time interval is over */
 	    process_endtime = x->bm_start_time;
 	    BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time);
 	    if (BW_TIMEVALCMP(&process_endtime, &now, >)) {
 		/* Not yet: reschedule, but don't reset */
 		int time_hash;
 
 		BW_METER_TIMEHASH(x, time_hash);
 		if (time_hash == i && process_endtime.tv_sec == now.tv_sec) {
 		    /*
 		     * XXX: somehow the bin processing is a bit ahead of time.
 		     * Put the entry in the next bin.
 		     */
 		    if (++time_hash >= BW_METER_BUCKETS)
 			time_hash = 0;
 		}
 		x->bm_time_next = V_bw_meter_timers[time_hash];
 		V_bw_meter_timers[time_hash] = x;
 		x->bm_time_hash = time_hash;
 
 		continue;
 	    }
 
 	    /*
 	     * Test if we should deliver an upcall
 	     */
 	    if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 		 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
 		((x->bm_flags & BW_METER_UNIT_BYTES) &&
 		 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
 		/* Prepare an upcall for delivery */
 		bw_meter_prepare_upcall(x, &now);
 	    }
 
 	    /*
 	     * Reschedule for next processing
 	     */
 	    schedule_bw_meter(x, &now);
 	}
     }
 
     /* Send all upcalls that are pending delivery */
     bw_upcalls_send();
 
     MFC_UNLOCK();
 }
 
 /*
  * A periodic function for sending all upcalls that are pending delivery
  */
 static void
 expire_bw_upcalls_send(void *arg)
 {
     CURVNET_SET((struct vnet *) arg);
 
     MFC_LOCK();
     bw_upcalls_send();
     MFC_UNLOCK();
 
     callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send,
 	curvnet);
     CURVNET_RESTORE();
 }
 
 /*
  * A periodic function for periodic scanning of the multicast forwarding
  * table for processing all "<=" bw_meter entries.
  */
 static void
 expire_bw_meter_process(void *arg)
 {
     CURVNET_SET((struct vnet *) arg);
 
     if (V_mrt_api_config & MRT_MFC_BW_UPCALL)
 	bw_meter_process();
 
     callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process,
 	curvnet);
     CURVNET_RESTORE();
 }
 
 /*
  * End of bandwidth monitoring code
  */
 
 /*
  * Send the packet up to the user daemon, or eventually do kernel encapsulation
  *
  */
 static int
 pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m,
     struct mfc *rt)
 {
     struct mbuf *mb_copy, *mm;
 
     /*
      * Do not send IGMP_WHOLEPKT notifications to userland, if the
      * rendezvous point was unspecified, and we were told not to.
      */
     if (pim_squelch_wholepkt != 0 && (V_mrt_api_config & MRT_MFC_RP) &&
 	in_nullhost(rt->mfc_rp))
 	return 0;
 
     mb_copy = pim_register_prepare(ip, m);
     if (mb_copy == NULL)
 	return ENOBUFS;
 
     /*
      * Send all the fragments. Note that the mbuf for each fragment
      * is freed by the sending machinery.
      */
     for (mm = mb_copy; mm; mm = mb_copy) {
 	mb_copy = mm->m_nextpkt;
 	mm->m_nextpkt = 0;
 	mm = m_pullup(mm, sizeof(struct ip));
 	if (mm != NULL) {
 	    ip = mtod(mm, struct ip *);
 	    if ((V_mrt_api_config & MRT_MFC_RP) && !in_nullhost(rt->mfc_rp)) {
 		pim_register_send_rp(ip, vifp, mm, rt);
 	    } else {
 		pim_register_send_upcall(ip, vifp, mm, rt);
 	    }
 	}
     }
 
     return 0;
 }
 
 /*
  * Return a copy of the data packet that is ready for PIM Register
  * encapsulation.
  * XXX: Note that in the returned copy the IP header is a valid one.
  */
 static struct mbuf *
 pim_register_prepare(struct ip *ip, struct mbuf *m)
 {
     struct mbuf *mb_copy = NULL;
     int mtu;
 
     /* Take care of delayed checksums */
     if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 	in_delayed_cksum(m);
 	m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
     }
 
     /*
      * Copy the old packet & pullup its IP header into the
      * new mbuf so we can modify it.
      */
     mb_copy = m_copypacket(m, M_NOWAIT);
     if (mb_copy == NULL)
 	return NULL;
     mb_copy = m_pullup(mb_copy, ip->ip_hl << 2);
     if (mb_copy == NULL)
 	return NULL;
 
     /* take care of the TTL */
     ip = mtod(mb_copy, struct ip *);
     --ip->ip_ttl;
 
     /* Compute the MTU after the PIM Register encapsulation */
     mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr);
 
     if (ntohs(ip->ip_len) <= mtu) {
 	/* Turn the IP header into a valid one */
 	ip->ip_sum = 0;
 	ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
     } else {
 	/* Fragment the packet */
 	mb_copy->m_pkthdr.csum_flags |= CSUM_IP;
 	if (ip_fragment(ip, &mb_copy, mtu, 0) != 0) {
 	    m_freem(mb_copy);
 	    return NULL;
 	}
     }
     return mb_copy;
 }
 
 /*
  * Send an upcall with the data packet to the user-level process.
  */
 static int
 pim_register_send_upcall(struct ip *ip, struct vif *vifp,
     struct mbuf *mb_copy, struct mfc *rt)
 {
     struct mbuf *mb_first;
     int len = ntohs(ip->ip_len);
     struct igmpmsg *im;
     struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
 
     VIF_LOCK_ASSERT();
 
     /*
      * Add a new mbuf with an upcall header
      */
     mb_first = m_gethdr(M_NOWAIT, MT_DATA);
     if (mb_first == NULL) {
 	m_freem(mb_copy);
 	return ENOBUFS;
     }
     mb_first->m_data += max_linkhdr;
     mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg);
     mb_first->m_len = sizeof(struct igmpmsg);
     mb_first->m_next = mb_copy;
 
     /* Send message to routing daemon */
     im = mtod(mb_first, struct igmpmsg *);
     im->im_msgtype	= IGMPMSG_WHOLEPKT;
     im->im_mbz		= 0;
     im->im_vif		= vifp - V_viftable;
     im->im_src		= ip->ip_src;
     im->im_dst		= ip->ip_dst;
 
     k_igmpsrc.sin_addr	= ip->ip_src;
 
     MRTSTAT_INC(mrts_upcalls);
 
     if (socket_send(V_ip_mrouter, mb_first, &k_igmpsrc) < 0) {
 	CTR1(KTR_IPMF, "%s: socket queue full", __func__);
 	MRTSTAT_INC(mrts_upq_sockfull);
 	return ENOBUFS;
     }
 
     /* Keep statistics */
     PIMSTAT_INC(pims_snd_registers_msgs);
     PIMSTAT_ADD(pims_snd_registers_bytes, len);
 
     return 0;
 }
 
 /*
  * Encapsulate the data packet in PIM Register message and send it to the RP.
  */
 static int
 pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy,
     struct mfc *rt)
 {
     struct mbuf *mb_first;
     struct ip *ip_outer;
     struct pim_encap_pimhdr *pimhdr;
     int len = ntohs(ip->ip_len);
     vifi_t vifi = rt->mfc_parent;
 
     VIF_LOCK_ASSERT();
 
     if ((vifi >= V_numvifs) || in_nullhost(V_viftable[vifi].v_lcl_addr)) {
 	m_freem(mb_copy);
 	return EADDRNOTAVAIL;		/* The iif vif is invalid */
     }
 
     /*
      * Add a new mbuf with the encapsulating header
      */
     mb_first = m_gethdr(M_NOWAIT, MT_DATA);
     if (mb_first == NULL) {
 	m_freem(mb_copy);
 	return ENOBUFS;
     }
     mb_first->m_data += max_linkhdr;
     mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
     mb_first->m_next = mb_copy;
 
     mb_first->m_pkthdr.len = len + mb_first->m_len;
 
     /*
      * Fill in the encapsulating IP and PIM header
      */
     ip_outer = mtod(mb_first, struct ip *);
     *ip_outer = pim_encap_iphdr;
     ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) +
 	sizeof(pim_encap_pimhdr));
     ip_outer->ip_src = V_viftable[vifi].v_lcl_addr;
     ip_outer->ip_dst = rt->mfc_rp;
     /*
      * Copy the inner header TOS to the outer header, and take care of the
      * IP_DF bit.
      */
     ip_outer->ip_tos = ip->ip_tos;
     if (ip->ip_off & htons(IP_DF))
 	ip_outer->ip_off |= htons(IP_DF);
     ip_fillid(ip_outer);
     pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer
 					 + sizeof(pim_encap_iphdr));
     *pimhdr = pim_encap_pimhdr;
     /* If the iif crosses a border, set the Border-bit */
     if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & V_mrt_api_config)
 	pimhdr->flags |= htonl(PIM_BORDER_REGISTER);
 
     mb_first->m_data += sizeof(pim_encap_iphdr);
     pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr));
     mb_first->m_data -= sizeof(pim_encap_iphdr);
 
     send_packet(vifp, mb_first);
 
     /* Keep statistics */
     PIMSTAT_INC(pims_snd_registers_msgs);
     PIMSTAT_ADD(pims_snd_registers_bytes, len);
 
     return 0;
 }
 
 /*
  * pim_encapcheck() is called by the encap4_input() path at runtime to
  * determine if a packet is for PIM; allowing PIM to be dynamically loaded
  * into the kernel.
  */
 static int
 pim_encapcheck(const struct mbuf *m __unused, int off __unused,
     int proto __unused, void *arg __unused)
 {
 
     KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM"));
     return (8);		/* claim the datagram. */
 }
 
 /*
  * PIM-SMv2 and PIM-DM messages processing.
  * Receives and verifies the PIM control messages, and passes them
  * up to the listening socket, using rip_input().
  * The only message with special processing is the PIM_REGISTER message
  * (used by PIM-SM): the PIM header is stripped off, and the inner packet
  * is passed to if_simloop().
  */
 static int
 pim_input(struct mbuf *m, int off, int proto, void *arg __unused)
 {
     struct ip *ip = mtod(m, struct ip *);
     struct pim *pim;
     int iphlen = off;
     int minlen;
     int datalen = ntohs(ip->ip_len) - iphlen;
     int ip_tos;
 
     /* Keep statistics */
     PIMSTAT_INC(pims_rcv_total_msgs);
     PIMSTAT_ADD(pims_rcv_total_bytes, datalen);
 
     /*
      * Validate lengths
      */
     if (datalen < PIM_MINLEN) {
 	PIMSTAT_INC(pims_rcv_tooshort);
 	CTR3(KTR_IPMF, "%s: short packet (%d) from 0x%08x",
 	    __func__, datalen, ntohl(ip->ip_src.s_addr));
 	m_freem(m);
 	return (IPPROTO_DONE);
     }
 
     /*
      * If the packet is at least as big as a REGISTER, go agead
      * and grab the PIM REGISTER header size, to avoid another
      * possible m_pullup() later.
      *
      * PIM_MINLEN       == pimhdr + u_int32_t == 4 + 4 = 8
      * PIM_REG_MINLEN   == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28
      */
     minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN);
     /*
      * Get the IP and PIM headers in contiguous memory, and
      * possibly the PIM REGISTER header.
      */
     if (m->m_len < minlen && (m = m_pullup(m, minlen)) == NULL) {
 	CTR1(KTR_IPMF, "%s: m_pullup() failed", __func__);
 	return (IPPROTO_DONE);
     }
 
     /* m_pullup() may have given us a new mbuf so reset ip. */
     ip = mtod(m, struct ip *);
     ip_tos = ip->ip_tos;
 
     /* adjust mbuf to point to the PIM header */
     m->m_data += iphlen;
     m->m_len  -= iphlen;
     pim = mtod(m, struct pim *);
 
     /*
      * Validate checksum. If PIM REGISTER, exclude the data packet.
      *
      * XXX: some older PIMv2 implementations don't make this distinction,
      * so for compatibility reason perform the checksum over part of the
      * message, and if error, then over the whole message.
      */
     if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) {
 	/* do nothing, checksum okay */
     } else if (in_cksum(m, datalen)) {
 	PIMSTAT_INC(pims_rcv_badsum);
 	CTR1(KTR_IPMF, "%s: invalid checksum", __func__);
 	m_freem(m);
 	return (IPPROTO_DONE);
     }
 
     /* PIM version check */
     if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) {
 	PIMSTAT_INC(pims_rcv_badversion);
 	CTR3(KTR_IPMF, "%s: bad version %d expect %d", __func__,
 	    (int)PIM_VT_V(pim->pim_vt), PIM_VERSION);
 	m_freem(m);
 	return (IPPROTO_DONE);
     }
 
     /* restore mbuf back to the outer IP */
     m->m_data -= iphlen;
     m->m_len  += iphlen;
 
     if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) {
 	/*
 	 * Since this is a REGISTER, we'll make a copy of the register
 	 * headers ip + pim + u_int32 + encap_ip, to be passed up to the
 	 * routing daemon.
 	 */
 	struct sockaddr_in dst = { sizeof(dst), AF_INET };
 	struct mbuf *mcp;
 	struct ip *encap_ip;
 	u_int32_t *reghdr;
 	struct ifnet *vifp;
 
 	VIF_LOCK();
 	if ((V_reg_vif_num >= V_numvifs) || (V_reg_vif_num == VIFI_INVALID)) {
 	    VIF_UNLOCK();
 	    CTR2(KTR_IPMF, "%s: register vif not set: %d", __func__,
 		(int)V_reg_vif_num);
 	    m_freem(m);
 	    return (IPPROTO_DONE);
 	}
 	/* XXX need refcnt? */
 	vifp = V_viftable[V_reg_vif_num].v_ifp;
 	VIF_UNLOCK();
 
 	/*
 	 * Validate length
 	 */
 	if (datalen < PIM_REG_MINLEN) {
 	    PIMSTAT_INC(pims_rcv_tooshort);
 	    PIMSTAT_INC(pims_rcv_badregisters);
 	    CTR1(KTR_IPMF, "%s: register packet size too small", __func__);
 	    m_freem(m);
 	    return (IPPROTO_DONE);
 	}
 
 	reghdr = (u_int32_t *)(pim + 1);
 	encap_ip = (struct ip *)(reghdr + 1);
 
 	CTR3(KTR_IPMF, "%s: register: encap ip src 0x%08x len %d",
 	    __func__, ntohl(encap_ip->ip_src.s_addr),
 	    ntohs(encap_ip->ip_len));
 
 	/* verify the version number of the inner packet */
 	if (encap_ip->ip_v != IPVERSION) {
 	    PIMSTAT_INC(pims_rcv_badregisters);
 	    CTR1(KTR_IPMF, "%s: bad encap ip version", __func__);
 	    m_freem(m);
 	    return (IPPROTO_DONE);
 	}
 
 	/* verify the inner packet is destined to a mcast group */
 	if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) {
 	    PIMSTAT_INC(pims_rcv_badregisters);
 	    CTR2(KTR_IPMF, "%s: bad encap ip dest 0x%08x", __func__,
 		ntohl(encap_ip->ip_dst.s_addr));
 	    m_freem(m);
 	    return (IPPROTO_DONE);
 	}
 
 	/* If a NULL_REGISTER, pass it to the daemon */
 	if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
 	    goto pim_input_to_daemon;
 
 	/*
 	 * Copy the TOS from the outer IP header to the inner IP header.
 	 */
 	if (encap_ip->ip_tos != ip_tos) {
 	    /* Outer TOS -> inner TOS */
 	    encap_ip->ip_tos = ip_tos;
 	    /* Recompute the inner header checksum. Sigh... */
 
 	    /* adjust mbuf to point to the inner IP header */
 	    m->m_data += (iphlen + PIM_MINLEN);
 	    m->m_len  -= (iphlen + PIM_MINLEN);
 
 	    encap_ip->ip_sum = 0;
 	    encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2);
 
 	    /* restore mbuf to point back to the outer IP header */
 	    m->m_data -= (iphlen + PIM_MINLEN);
 	    m->m_len  += (iphlen + PIM_MINLEN);
 	}
 
 	/*
 	 * Decapsulate the inner IP packet and loopback to forward it
 	 * as a normal multicast packet. Also, make a copy of the
 	 *     outer_iphdr + pimhdr + reghdr + encap_iphdr
 	 * to pass to the daemon later, so it can take the appropriate
 	 * actions (e.g., send back PIM_REGISTER_STOP).
 	 * XXX: here m->m_data points to the outer IP header.
 	 */
 	mcp = m_copym(m, 0, iphlen + PIM_REG_MINLEN, M_NOWAIT);
 	if (mcp == NULL) {
 	    CTR1(KTR_IPMF, "%s: m_copym() failed", __func__);
 	    m_freem(m);
 	    return (IPPROTO_DONE);
 	}
 
 	/* Keep statistics */
 	/* XXX: registers_bytes include only the encap. mcast pkt */
 	PIMSTAT_INC(pims_rcv_registers_msgs);
 	PIMSTAT_ADD(pims_rcv_registers_bytes, ntohs(encap_ip->ip_len));
 
 	/*
 	 * forward the inner ip packet; point m_data at the inner ip.
 	 */
 	m_adj(m, iphlen + PIM_MINLEN);
 
 	CTR4(KTR_IPMF,
 	    "%s: forward decap'd REGISTER: src %lx dst %lx vif %d",
 	    __func__,
 	    (u_long)ntohl(encap_ip->ip_src.s_addr),
 	    (u_long)ntohl(encap_ip->ip_dst.s_addr),
 	    (int)V_reg_vif_num);
 
 	/* NB: vifp was collected above; can it change on us? */
 	if_simloop(vifp, m, dst.sin_family, 0);
 
 	/* prepare the register head to send to the mrouting daemon */
 	m = mcp;
     }
 
 pim_input_to_daemon:
     /*
      * Pass the PIM message up to the daemon; if it is a Register message,
      * pass the 'head' only up to the daemon. This includes the
      * outer IP header, PIM header, PIM-Register header and the
      * inner IP header.
      * XXX: the outer IP header pkt size of a Register is not adjust to
      * reflect the fact that the inner multicast data is truncated.
      */
     return (rip_input(&m, &off, proto));
 }
 
 static int
 sysctl_mfctable(SYSCTL_HANDLER_ARGS)
 {
 	struct mfc	*rt;
 	int		 error, i;
 
 	if (req->newptr)
 		return (EPERM);
 	if (V_mfchashtbl == NULL)	/* XXX unlocked */
 		return (0);
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error)
 		return (error);
 
 	MFC_LOCK();
 	for (i = 0; i < mfchashsize; i++) {
 		LIST_FOREACH(rt, &V_mfchashtbl[i], mfc_hash) {
 			error = SYSCTL_OUT(req, rt, sizeof(struct mfc));
 			if (error)
 				goto out_locked;
 		}
 	}
 out_locked:
 	MFC_UNLOCK();
 	return (error);
 }
 
 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD,
     sysctl_mfctable, "IPv4 Multicast Forwarding Table "
     "(struct *mfc[mfchashsize], netinet/ip_mroute.h)");
 
 static void
 vnet_mroute_init(const void *unused __unused)
 {
 
 	V_nexpire = malloc(mfchashsize, M_MRTABLE, M_WAITOK|M_ZERO);
 	bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers));
 	callout_init(&V_expire_upcalls_ch, 1);
 	callout_init(&V_bw_upcalls_ch, 1);
 	callout_init(&V_bw_meter_ch, 1);
 }
 
 VNET_SYSINIT(vnet_mroute_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mroute_init,
 	NULL);
 
 static void
 vnet_mroute_uninit(const void *unused __unused)
 {
 
 	free(V_nexpire, M_MRTABLE);
 	V_nexpire = NULL;
 }
 
 VNET_SYSUNINIT(vnet_mroute_uninit, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, 
 	vnet_mroute_uninit, NULL);
 
 static int
 ip_mroute_modevent(module_t mod, int type, void *unused)
 {
 
     switch (type) {
     case MOD_LOAD:
 	MROUTER_LOCK_INIT();
 
 	if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, 
 	    if_detached_event, NULL, EVENTHANDLER_PRI_ANY);
 	if (if_detach_event_tag == NULL) {
 		printf("ip_mroute: unable to register "
 		    "ifnet_departure_event handler\n");
 		MROUTER_LOCK_DESTROY();
 		return (EINVAL);
 	}
 
 	MFC_LOCK_INIT();
 	VIF_LOCK_INIT();
 
 	mfchashsize = MFCHASHSIZE;
 	if (TUNABLE_ULONG_FETCH("net.inet.ip.mfchashsize", &mfchashsize) &&
 	    !powerof2(mfchashsize)) {
 		printf("WARNING: %s not a power of 2; using default\n",
 		    "net.inet.ip.mfchashsize");
 		mfchashsize = MFCHASHSIZE;
 	}
 
 	pim_squelch_wholepkt = 0;
 	TUNABLE_ULONG_FETCH("net.inet.pim.squelch_wholepkt",
 	    &pim_squelch_wholepkt);
 
 	pim_encap_cookie = ip_encap_attach(&ipv4_encap_cfg, NULL, M_WAITOK);
 	if (pim_encap_cookie == NULL) {
 		printf("ip_mroute: unable to attach pim encap\n");
 		VIF_LOCK_DESTROY();
 		MFC_LOCK_DESTROY();
 		MROUTER_LOCK_DESTROY();
 		return (EINVAL);
 	}
 
 	ip_mcast_src = X_ip_mcast_src;
 	ip_mforward = X_ip_mforward;
 	ip_mrouter_done = X_ip_mrouter_done;
 	ip_mrouter_get = X_ip_mrouter_get;
 	ip_mrouter_set = X_ip_mrouter_set;
 
 	ip_rsvp_force_done = X_ip_rsvp_force_done;
 	ip_rsvp_vif = X_ip_rsvp_vif;
 
 	legal_vif_num = X_legal_vif_num;
 	mrt_ioctl = X_mrt_ioctl;
 	rsvp_input_p = X_rsvp_input;
 	break;
 
     case MOD_UNLOAD:
 	/*
 	 * Typically module unload happens after the user-level
 	 * process has shutdown the kernel services (the check
 	 * below insures someone can't just yank the module out
 	 * from under a running process).  But if the module is
 	 * just loaded and then unloaded w/o starting up a user
 	 * process we still need to cleanup.
 	 */
 	MROUTER_LOCK();
 	if (ip_mrouter_cnt != 0) {
 	    MROUTER_UNLOCK();
 	    return (EINVAL);
 	}
 	ip_mrouter_unloading = 1;
 	MROUTER_UNLOCK();
 
 	EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag);
 
 	if (pim_encap_cookie) {
 	    ip_encap_detach(pim_encap_cookie);
 	    pim_encap_cookie = NULL;
 	}
 
 	ip_mcast_src = NULL;
 	ip_mforward = NULL;
 	ip_mrouter_done = NULL;
 	ip_mrouter_get = NULL;
 	ip_mrouter_set = NULL;
 
 	ip_rsvp_force_done = NULL;
 	ip_rsvp_vif = NULL;
 
 	legal_vif_num = NULL;
 	mrt_ioctl = NULL;
 	rsvp_input_p = NULL;
 
 	VIF_LOCK_DESTROY();
 	MFC_LOCK_DESTROY();
 	MROUTER_LOCK_DESTROY();
 	break;
 
     default:
 	return EOPNOTSUPP;
     }
     return 0;
 }
 
 static moduledata_t ip_mroutemod = {
     "ip_mroute",
     ip_mroute_modevent,
     0
 };
 
 DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE);
Index: head/sys/netinet/ip_var.h
===================================================================
--- head/sys/netinet/ip_var.h	(revision 349368)
+++ head/sys/netinet/ip_var.h	(revision 349369)
@@ -1,302 +1,302 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_var.h	8.2 (Berkeley) 1/9/95
  * $FreeBSD$
  */
 
 #ifndef _NETINET_IP_VAR_H_
 #define	_NETINET_IP_VAR_H_
 
 #include <sys/queue.h>
 #include <sys/epoch.h>
 
 /*
  * Overlay for ip header used by other protocols (tcp, udp).
  */
 struct ipovly {
 	u_char	ih_x1[9];		/* (unused) */
 	u_char	ih_pr;			/* protocol */
 	u_short	ih_len;			/* protocol length */
 	struct	in_addr ih_src;		/* source internet address */
 	struct	in_addr ih_dst;		/* destination internet address */
 };
 
 #ifdef _KERNEL
 /*
  * Ip reassembly queue structure.  Each fragment
  * being reassembled is attached to one of these structures.
  * They are timed out after ipq_ttl drops to 0, and may also
  * be reclaimed if memory becomes tight.
  */
 struct ipq {
 	TAILQ_ENTRY(ipq) ipq_list;	/* to other reass headers */
 	u_char	ipq_ttl;		/* time for reass q to live */
 	u_char	ipq_p;			/* protocol of this fragment */
 	u_short	ipq_id;			/* sequence id for reassembly */
 	int	ipq_maxoff;		/* total length of packet */
 	struct mbuf *ipq_frags;		/* to ip headers of fragments */
 	struct	in_addr ipq_src,ipq_dst;
 	u_char	ipq_nfrags;		/* # frags in this packet */
 	struct label *ipq_label;	/* MAC label */
 };
 #endif /* _KERNEL */
 
 /*
  * Structure stored in mbuf in inpcb.ip_options
  * and passed to ip_output when ip options are in use.
  * The actual length of the options (including ipopt_dst)
  * is in m_len.
  */
 #define MAX_IPOPTLEN	40
 
 struct ipoption {
 	struct	in_addr ipopt_dst;	/* first-hop dst if source routed */
 	char	ipopt_list[MAX_IPOPTLEN];	/* options proper */
 };
 
+#if defined(_NETINET_IN_VAR_H_) && defined(_KERNEL)
 /*
  * Structure attached to inpcb.ip_moptions and
  * passed to ip_output when IP multicast options are in use.
  * This structure is lazy-allocated.
  */
 struct ip_moptions {
 	struct	ifnet *imo_multicast_ifp; /* ifp for outgoing multicasts */
 	struct in_addr imo_multicast_addr; /* ifindex/addr on MULTICAST_IF */
 	u_long	imo_multicast_vif;	/* vif num outgoing multicasts */
 	u_char	imo_multicast_ttl;	/* TTL for outgoing multicasts */
 	u_char	imo_multicast_loop;	/* 1 => hear sends if a member */
-	u_short	imo_num_memberships;	/* no. memberships this socket */
-	u_short	imo_max_memberships;	/* max memberships this socket */
-	struct	in_multi **imo_membership;	/* group memberships */
-	struct	in_mfilter *imo_mfilters;	/* source filters */
-	struct	epoch_context imo_epoch_ctx;
+	struct ip_mfilter_head imo_head; /* group membership list */
 };
+#else
+struct ip_moptions;
+#endif
 
 struct	ipstat {
 	uint64_t ips_total;		/* total packets received */
 	uint64_t ips_badsum;		/* checksum bad */
 	uint64_t ips_tooshort;		/* packet too short */
 	uint64_t ips_toosmall;		/* not enough data */
 	uint64_t ips_badhlen;		/* ip header length < data size */
 	uint64_t ips_badlen;		/* ip length < ip header length */
 	uint64_t ips_fragments;		/* fragments received */
 	uint64_t ips_fragdropped;	/* frags dropped (dups, out of space) */
 	uint64_t ips_fragtimeout;	/* fragments timed out */
 	uint64_t ips_forward;		/* packets forwarded */
 	uint64_t ips_fastforward;	/* packets fast forwarded */
 	uint64_t ips_cantforward;	/* packets rcvd for unreachable dest */
 	uint64_t ips_redirectsent;	/* packets forwarded on same net */
 	uint64_t ips_noproto;		/* unknown or unsupported protocol */
 	uint64_t ips_delivered;		/* datagrams delivered to upper level*/
 	uint64_t ips_localout;		/* total ip packets generated here */
 	uint64_t ips_odropped;		/* lost packets due to nobufs, etc. */
 	uint64_t ips_reassembled;	/* total packets reassembled ok */
 	uint64_t ips_fragmented;	/* datagrams successfully fragmented */
 	uint64_t ips_ofragments;	/* output fragments created */
 	uint64_t ips_cantfrag;		/* don't fragment flag was set, etc. */
 	uint64_t ips_badoptions;		/* error in option processing */
 	uint64_t ips_noroute;		/* packets discarded due to no route */
 	uint64_t ips_badvers;		/* ip version != 4 */
 	uint64_t ips_rawout;		/* total raw ip packets generated */
 	uint64_t ips_toolong;		/* ip length > max ip packet size */
 	uint64_t ips_notmember;		/* multicasts for unregistered grps */
 	uint64_t ips_nogif;		/* no match gif found */
 	uint64_t ips_badaddr;		/* invalid address on header */
 };
 
 #ifdef _KERNEL
 
 #include <sys/counter.h>
 #include <net/vnet.h>
 
 VNET_PCPUSTAT_DECLARE(struct ipstat, ipstat);
 /*
  * In-kernel consumers can use these accessor macros directly to update
  * stats.
  */
 #define	IPSTAT_ADD(name, val)	\
     VNET_PCPUSTAT_ADD(struct ipstat, ipstat, name, (val))
 #define	IPSTAT_SUB(name, val)	IPSTAT_ADD(name, -(val))
 #define	IPSTAT_INC(name)	IPSTAT_ADD(name, 1)
 #define	IPSTAT_DEC(name)	IPSTAT_SUB(name, 1)
 
 /*
  * Kernel module consumers must use this accessor macro.
  */
 void	kmod_ipstat_inc(int statnum);
 #define	KMOD_IPSTAT_INC(name)	\
     kmod_ipstat_inc(offsetof(struct ipstat, name) / sizeof(uint64_t))
 void	kmod_ipstat_dec(int statnum);
 #define	KMOD_IPSTAT_DEC(name)	\
     kmod_ipstat_dec(offsetof(struct ipstat, name) / sizeof(uint64_t))
 
 /* flags passed to ip_output as last parameter */
 #define	IP_FORWARDING		0x1		/* most of ip header exists */
 #define	IP_RAWOUTPUT		0x2		/* raw ip header exists */
 #define	IP_SENDONES		0x4		/* send all-ones broadcast */
 #define	IP_SENDTOIF		0x8		/* send on specific ifnet */
 #define IP_ROUTETOIF		SO_DONTROUTE	/* 0x10 bypass routing tables */
 #define IP_ALLOWBROADCAST	SO_BROADCAST	/* 0x20 can send broadcast packets */
 #define	IP_NODEFAULTFLOWID	0x40		/* Don't set the flowid from inp */
 
 #ifdef __NO_STRICT_ALIGNMENT
 #define IP_HDR_ALIGNED_P(ip)	1
 #else
 #define IP_HDR_ALIGNED_P(ip)	((((intptr_t) (ip)) & 3) == 0)
 #endif
 
 struct ip;
 struct inpcb;
 struct route;
 struct sockopt;
 struct inpcbinfo;
 
 VNET_DECLARE(int, ip_defttl);			/* default IP ttl */
 VNET_DECLARE(int, ipforwarding);		/* ip forwarding */
 #ifdef IPSTEALTH
 VNET_DECLARE(int, ipstealth);			/* stealth forwarding */
 #endif
 extern u_char	ip_protox[];
 VNET_DECLARE(struct socket *, ip_rsvpd);	/* reservation protocol daemon*/
 VNET_DECLARE(struct socket *, ip_mrouter);	/* multicast routing daemon */
 extern int	(*legal_vif_num)(int);
 extern u_long	(*ip_mcast_src)(int);
 VNET_DECLARE(int, rsvp_on);
 VNET_DECLARE(int, drop_redirect);
 extern struct	pr_usrreqs rip_usrreqs;
 
 #define	V_ip_id			VNET(ip_id)
 #define	V_ip_defttl		VNET(ip_defttl)
 #define	V_ipforwarding		VNET(ipforwarding)
 #ifdef IPSTEALTH
 #define	V_ipstealth		VNET(ipstealth)
 #endif
 #define	V_ip_rsvpd		VNET(ip_rsvpd)
 #define	V_ip_mrouter		VNET(ip_mrouter)
 #define	V_rsvp_on		VNET(rsvp_on)
 #define	V_drop_redirect		VNET(drop_redirect)
 
 void	inp_freemoptions(struct ip_moptions *);
 int	inp_getmoptions(struct inpcb *, struct sockopt *);
 int	inp_setmoptions(struct inpcb *, struct sockopt *);
 
 int	ip_ctloutput(struct socket *, struct sockopt *sopt);
 void	ip_drain(void);
 int	ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
 	    u_long if_hwassist_flags);
 void	ip_forward(struct mbuf *m, int srcrt);
 void	ip_init(void);
 extern int
 	(*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
 	    struct ip_moptions *);
 int	ip_output(struct mbuf *,
 	    struct mbuf *, struct route *, int, struct ip_moptions *,
 	    struct inpcb *);
 int	ipproto_register(short);
 int	ipproto_unregister(short);
 struct mbuf *
 	ip_reass(struct mbuf *);
 void	ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *,
 	    struct mbuf *);
 void	ip_slowtimo(void);
 void	ip_fillid(struct ip *);
 int	rip_ctloutput(struct socket *, struct sockopt *);
 void	rip_ctlinput(int, struct sockaddr *, void *);
 void	rip_init(void);
 int	rip_input(struct mbuf **, int *, int);
 int	rip_output(struct mbuf *, struct socket *, ...);
 int	ipip_input(struct mbuf **, int *, int);
 int	rsvp_input(struct mbuf **, int *, int);
 int	ip_rsvp_init(struct socket *);
 int	ip_rsvp_done(void);
 extern int	(*ip_rsvp_vif)(struct socket *, struct sockopt *);
 extern void	(*ip_rsvp_force_done)(struct socket *);
 extern int	(*rsvp_input_p)(struct mbuf **, int *, int);
 
 VNET_DECLARE(struct pfil_head *, inet_pfil_head);
 #define	V_inet_pfil_head	VNET(inet_pfil_head)
 #define	PFIL_INET_NAME		"inet"
 
 void	in_delayed_cksum(struct mbuf *m);
 
 /* Hooks for ipfw, dummynet, divert etc. Most are declared in raw_ip.c */
 /*
  * Reference to an ipfw or packet filter rule that can be carried
  * outside critical sections.
  * A rule is identified by rulenum:rule_id which is ordered.
  * In version chain_id the rule can be found in slot 'slot', so
  * we don't need a lookup if chain_id == chain->id.
  *
  * On exit from the firewall this structure refers to the rule after
  * the matching one (slot points to the new rule; rulenum:rule_id-1
  * is the matching rule), and additional info (e.g. info often contains
  * the insn argument or tablearg in the low 16 bits, in host format).
  * On entry, the structure is valid if slot>0, and refers to the starting
  * rules. 'info' contains the reason for reinject, e.g. divert port,
  * divert direction, and so on.
  */
 struct ipfw_rule_ref {
 	uint32_t	slot;		/* slot for matching rule	*/
 	uint32_t	rulenum;	/* matching rule number		*/
 	uint32_t	rule_id;	/* matching rule id		*/
 	uint32_t	chain_id;	/* ruleset id			*/
 	uint32_t	info;		/* see below			*/
 };
 
 enum {
 	IPFW_INFO_MASK	= 0x0000ffff,
 	IPFW_INFO_OUT	= 0x00000000,	/* outgoing, just for convenience */
 	IPFW_INFO_IN	= 0x80000000,	/* incoming, overloads dir */
 	IPFW_ONEPASS	= 0x40000000,	/* One-pass, do not reinject */
 	IPFW_IS_MASK	= 0x30000000,	/* which source ? */
 	IPFW_IS_DIVERT	= 0x20000000,
 	IPFW_IS_DUMMYNET =0x10000000,
 	IPFW_IS_PIPE	= 0x08000000,	/* pipe=1, queue = 0 */
 };
 #define MTAG_IPFW	1148380143	/* IPFW-tagged cookie */
 #define MTAG_IPFW_RULE	1262273568	/* rule reference */
 #define	MTAG_IPFW_CALL	1308397630	/* call stack */
 
 struct ip_fw_args;
 typedef int	(*ip_fw_chk_ptr_t)(struct ip_fw_args *args);
 typedef int	(*ip_fw_ctl_ptr_t)(struct sockopt *);
 VNET_DECLARE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr);
 #define	V_ip_fw_ctl_ptr		VNET(ip_fw_ctl_ptr)
 
 /* Divert hooks. */
 extern void	(*ip_divert_ptr)(struct mbuf *m, bool incoming);
 /* ng_ipfw hooks -- XXX make it the same as divert and dummynet */
 extern int	(*ng_ipfw_input_p)(struct mbuf **, struct ip_fw_args *, bool);
 extern int	(*ip_dn_ctl_ptr)(struct sockopt *);
 extern int	(*ip_dn_io_ptr)(struct mbuf **, struct ip_fw_args *);
 #endif /* _KERNEL */
 
 #endif /* !_NETINET_IP_VAR_H_ */
Index: head/sys/netinet6/in6.h
===================================================================
--- head/sys/netinet6/in6.h	(revision 349368)
+++ head/sys/netinet6/in6.h	(revision 349369)
@@ -1,753 +1,750 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: in6.h,v 1.89 2001/05/27 13:28:35 itojun Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in.h	8.3 (Berkeley) 1/3/94
  * $FreeBSD$
  */
 
 #ifndef __KAME_NETINET_IN_H_INCLUDED_
 #error "do not include netinet6/in6.h directly, include netinet/in.h.  see RFC2553"
 #endif
 
 #ifndef _NETINET6_IN6_H_
 #define _NETINET6_IN6_H_
 
 /*
  * Identification of the network protocol stack
  * for *BSD-current/release: http://www.kame.net/dev/cvsweb.cgi/kame/COVERAGE
  * has the table of implementation/integration differences.
  */
 #define __KAME__
 #define __KAME_VERSION		"FreeBSD"
 
 /*
  * IPv6 port allocation rules should mirror the IPv4 rules and are controlled
  * by the net.inet.ip.portrange sysctl tree. The following defines exist
  * for compatibility with userland applications that need them.
  */
 #if __BSD_VISIBLE
 #define	IPV6PORT_RESERVED	1024
 #define	IPV6PORT_ANONMIN	49152
 #define	IPV6PORT_ANONMAX	65535
 #define	IPV6PORT_RESERVEDMIN	600
 #define	IPV6PORT_RESERVEDMAX	(IPV6PORT_RESERVED-1)
 #endif
 
 /*
  * IPv6 address
  */
 struct in6_addr {
 	union {
 		uint8_t		__u6_addr8[16];
 		uint16_t	__u6_addr16[8];
 		uint32_t	__u6_addr32[4];
 	} __u6_addr;			/* 128-bit IP6 address */
 };
 
 #define s6_addr   __u6_addr.__u6_addr8
 #ifdef _KERNEL	/* XXX nonstandard */
 #define s6_addr8  __u6_addr.__u6_addr8
 #define s6_addr16 __u6_addr.__u6_addr16
 #define s6_addr32 __u6_addr.__u6_addr32
 #endif
 
 #define INET6_ADDRSTRLEN	46
 
 /*
  * XXX missing POSIX.1-2001 macro IPPROTO_IPV6.
  */
 
 /*
  * Socket address for IPv6
  */
 #if __BSD_VISIBLE
 #define SIN6_LEN
 #endif
 
 struct sockaddr_in6 {
 	uint8_t		sin6_len;	/* length of this struct */
 	sa_family_t	sin6_family;	/* AF_INET6 */
 	in_port_t	sin6_port;	/* Transport layer port # */
 	uint32_t	sin6_flowinfo;	/* IP6 flow information */
 	struct in6_addr	sin6_addr;	/* IP6 address */
 	uint32_t	sin6_scope_id;	/* scope zone index */
 };
 
 /*
  * Local definition for masks
  */
 #ifdef _KERNEL	/* XXX nonstandard */
 #define IN6MASK0	{{{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }}}
 #define IN6MASK32	{{{ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, \
 			    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }}}
 #define IN6MASK64	{{{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, \
 			    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }}}
 #define IN6MASK96	{{{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, \
 			    0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }}}
 #define IN6MASK128	{{{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, \
 			    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }}}
 #endif
 
 #ifdef _KERNEL
 extern const struct sockaddr_in6 sa6_any;
 
 extern const struct in6_addr in6mask0;
 extern const struct in6_addr in6mask32;
 extern const struct in6_addr in6mask64;
 extern const struct in6_addr in6mask96;
 extern const struct in6_addr in6mask128;
 #endif /* _KERNEL */
 
 /*
  * Macros started with IPV6_ADDR is KAME local
  */
 #ifdef _KERNEL	/* XXX nonstandard */
 #if _BYTE_ORDER == _BIG_ENDIAN
 #define IPV6_ADDR_INT32_ONE	1
 #define IPV6_ADDR_INT32_TWO	2
 #define IPV6_ADDR_INT32_MNL	0xff010000
 #define IPV6_ADDR_INT32_MLL	0xff020000
 #define IPV6_ADDR_INT32_SMP	0x0000ffff
 #define IPV6_ADDR_INT16_ULL	0xfe80
 #define IPV6_ADDR_INT16_USL	0xfec0
 #define IPV6_ADDR_INT16_MLL	0xff02
 #elif _BYTE_ORDER == _LITTLE_ENDIAN
 #define IPV6_ADDR_INT32_ONE	0x01000000
 #define IPV6_ADDR_INT32_TWO	0x02000000
 #define IPV6_ADDR_INT32_MNL	0x000001ff
 #define IPV6_ADDR_INT32_MLL	0x000002ff
 #define IPV6_ADDR_INT32_SMP	0xffff0000
 #define IPV6_ADDR_INT16_ULL	0x80fe
 #define IPV6_ADDR_INT16_USL	0xc0fe
 #define IPV6_ADDR_INT16_MLL	0x02ff
 #endif
 #endif
 
 /*
  * Definition of some useful macros to handle IP6 addresses
  */
 #if __BSD_VISIBLE
 #define IN6ADDR_ANY_INIT \
 	{{{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
 	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }}}
 #define IN6ADDR_LOOPBACK_INIT \
 	{{{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
 	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }}}
 #define IN6ADDR_NODELOCAL_ALLNODES_INIT \
 	{{{ 0xff, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
 	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }}}
 #define IN6ADDR_INTFACELOCAL_ALLNODES_INIT \
 	{{{ 0xff, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
 	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }}}
 #define IN6ADDR_LINKLOCAL_ALLNODES_INIT \
 	{{{ 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
 	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }}}
 #define IN6ADDR_LINKLOCAL_ALLROUTERS_INIT \
 	{{{ 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
 	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02 }}}
 #define IN6ADDR_LINKLOCAL_ALLV2ROUTERS_INIT \
 	{{{ 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
 	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x16 }}}
 #endif
 
 extern const struct in6_addr in6addr_any;
 extern const struct in6_addr in6addr_loopback;
 #if __BSD_VISIBLE
 extern const struct in6_addr in6addr_nodelocal_allnodes;
 extern const struct in6_addr in6addr_linklocal_allnodes;
 extern const struct in6_addr in6addr_linklocal_allrouters;
 extern const struct in6_addr in6addr_linklocal_allv2routers;
 #endif
 
 /*
  * Equality
  * NOTE: Some of kernel programming environment (for example, openbsd/sparc)
  * does not supply memcmp().  For userland memcmp() is preferred as it is
  * in ANSI standard.
  */
 #ifdef _KERNEL
 #define IN6_ARE_ADDR_EQUAL(a, b)			\
     (bcmp(&(a)->s6_addr[0], &(b)->s6_addr[0], sizeof(struct in6_addr)) == 0)
 #else
 #if __BSD_VISIBLE
 #define IN6_ARE_ADDR_EQUAL(a, b)			\
     (memcmp(&(a)->s6_addr[0], &(b)->s6_addr[0], sizeof(struct in6_addr)) == 0)
 #endif
 #endif
 
 /*
  * Unspecified
  */
 #define IN6_IS_ADDR_UNSPECIFIED(a)	\
 	((a)->__u6_addr.__u6_addr32[0] == 0 &&	\
 	 (a)->__u6_addr.__u6_addr32[1] == 0 &&	\
 	 (a)->__u6_addr.__u6_addr32[2] == 0 &&	\
 	 (a)->__u6_addr.__u6_addr32[3] == 0)
 
 /*
  * Loopback
  */
 #define IN6_IS_ADDR_LOOPBACK(a)		\
 	((a)->__u6_addr.__u6_addr32[0] == 0 &&	\
 	 (a)->__u6_addr.__u6_addr32[1] == 0 &&	\
 	 (a)->__u6_addr.__u6_addr32[2] == 0 &&	\
 	 (a)->__u6_addr.__u6_addr32[3] == ntohl(1))
 
 /*
  * IPv4 compatible
  */
 #define IN6_IS_ADDR_V4COMPAT(a)		\
 	((a)->__u6_addr.__u6_addr32[0] == 0 &&	\
 	 (a)->__u6_addr.__u6_addr32[1] == 0 &&	\
 	 (a)->__u6_addr.__u6_addr32[2] == 0 &&	\
 	 (a)->__u6_addr.__u6_addr32[3] != 0 &&	\
 	 (a)->__u6_addr.__u6_addr32[3] != ntohl(1))
 
 /*
  * Mapped
  */
 #define IN6_IS_ADDR_V4MAPPED(a)		      \
 	((a)->__u6_addr.__u6_addr32[0] == 0 &&	\
 	 (a)->__u6_addr.__u6_addr32[1] == 0 &&	\
 	 (a)->__u6_addr.__u6_addr32[2] == ntohl(0x0000ffff))
 
 /*
  * KAME Scope Values
  */
 
 #ifdef _KERNEL	/* XXX nonstandard */
 #define IPV6_ADDR_SCOPE_NODELOCAL	0x01
 #define IPV6_ADDR_SCOPE_INTFACELOCAL	0x01
 #define IPV6_ADDR_SCOPE_LINKLOCAL	0x02
 #define IPV6_ADDR_SCOPE_SITELOCAL	0x05
 #define IPV6_ADDR_SCOPE_ORGLOCAL	0x08	/* just used in this file */
 #define IPV6_ADDR_SCOPE_GLOBAL		0x0e
 #else
 #define __IPV6_ADDR_SCOPE_NODELOCAL	0x01
 #define __IPV6_ADDR_SCOPE_INTFACELOCAL	0x01
 #define __IPV6_ADDR_SCOPE_LINKLOCAL	0x02
 #define __IPV6_ADDR_SCOPE_SITELOCAL	0x05
 #define __IPV6_ADDR_SCOPE_ORGLOCAL	0x08	/* just used in this file */
 #define __IPV6_ADDR_SCOPE_GLOBAL	0x0e
 #endif
 
 /*
  * Unicast Scope
  * Note that we must check topmost 10 bits only, not 16 bits (see RFC2373).
  */
 #define IN6_IS_ADDR_LINKLOCAL(a)	\
 	(((a)->s6_addr[0] == 0xfe) && (((a)->s6_addr[1] & 0xc0) == 0x80))
 #define IN6_IS_ADDR_SITELOCAL(a)	\
 	(((a)->s6_addr[0] == 0xfe) && (((a)->s6_addr[1] & 0xc0) == 0xc0))
 
 /*
  * Multicast
  */
 #define IN6_IS_ADDR_MULTICAST(a)	((a)->s6_addr[0] == 0xff)
 
 #ifdef _KERNEL	/* XXX nonstandard */
 #define IPV6_ADDR_MC_SCOPE(a)		((a)->s6_addr[1] & 0x0f)
 #else
 #define __IPV6_ADDR_MC_SCOPE(a)		((a)->s6_addr[1] & 0x0f)
 #endif
 
 /*
  * Multicast Scope
  */
 #ifdef _KERNEL	/* refers nonstandard items */
 #define IN6_IS_ADDR_MC_NODELOCAL(a)	\
 	(IN6_IS_ADDR_MULTICAST(a) &&	\
 	 (IPV6_ADDR_MC_SCOPE(a) == IPV6_ADDR_SCOPE_NODELOCAL))
 #define IN6_IS_ADDR_MC_INTFACELOCAL(a)	\
 	(IN6_IS_ADDR_MULTICAST(a) &&	\
 	 (IPV6_ADDR_MC_SCOPE(a) == IPV6_ADDR_SCOPE_INTFACELOCAL))
 #define IN6_IS_ADDR_MC_LINKLOCAL(a)	\
 	(IN6_IS_ADDR_MULTICAST(a) &&	\
 	 (IPV6_ADDR_MC_SCOPE(a) == IPV6_ADDR_SCOPE_LINKLOCAL))
 #define IN6_IS_ADDR_MC_SITELOCAL(a)	\
 	(IN6_IS_ADDR_MULTICAST(a) &&	\
 	 (IPV6_ADDR_MC_SCOPE(a) == IPV6_ADDR_SCOPE_SITELOCAL))
 #define IN6_IS_ADDR_MC_ORGLOCAL(a)	\
 	(IN6_IS_ADDR_MULTICAST(a) &&	\
 	 (IPV6_ADDR_MC_SCOPE(a) == IPV6_ADDR_SCOPE_ORGLOCAL))
 #define IN6_IS_ADDR_MC_GLOBAL(a)	\
 	(IN6_IS_ADDR_MULTICAST(a) &&	\
 	 (IPV6_ADDR_MC_SCOPE(a) == IPV6_ADDR_SCOPE_GLOBAL))
 #else
 #define IN6_IS_ADDR_MC_NODELOCAL(a)	\
 	(IN6_IS_ADDR_MULTICAST(a) &&	\
 	 (__IPV6_ADDR_MC_SCOPE(a) == __IPV6_ADDR_SCOPE_NODELOCAL))
 #define IN6_IS_ADDR_MC_LINKLOCAL(a)	\
 	(IN6_IS_ADDR_MULTICAST(a) &&	\
 	 (__IPV6_ADDR_MC_SCOPE(a) == __IPV6_ADDR_SCOPE_LINKLOCAL))
 #define IN6_IS_ADDR_MC_SITELOCAL(a)	\
 	(IN6_IS_ADDR_MULTICAST(a) &&	\
 	 (__IPV6_ADDR_MC_SCOPE(a) == __IPV6_ADDR_SCOPE_SITELOCAL))
 #define IN6_IS_ADDR_MC_ORGLOCAL(a)	\
 	(IN6_IS_ADDR_MULTICAST(a) &&	\
 	 (__IPV6_ADDR_MC_SCOPE(a) == __IPV6_ADDR_SCOPE_ORGLOCAL))
 #define IN6_IS_ADDR_MC_GLOBAL(a)	\
 	(IN6_IS_ADDR_MULTICAST(a) &&	\
 	 (__IPV6_ADDR_MC_SCOPE(a) == __IPV6_ADDR_SCOPE_GLOBAL))
 #endif
 
 #ifdef _KERNEL	/* nonstandard */
 /*
  * KAME Scope
  */
 #define IN6_IS_SCOPE_LINKLOCAL(a)	\
 	((IN6_IS_ADDR_LINKLOCAL(a)) ||	\
 	 (IN6_IS_ADDR_MC_LINKLOCAL(a)))
 #define	IN6_IS_SCOPE_EMBED(a)			\
 	((IN6_IS_ADDR_LINKLOCAL(a)) ||		\
 	 (IN6_IS_ADDR_MC_LINKLOCAL(a)) ||	\
 	 (IN6_IS_ADDR_MC_INTFACELOCAL(a)))
 
 #define IFA6_IS_DEPRECATED(a) \
 	((a)->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME && \
 	 (u_int32_t)((time_uptime - (a)->ia6_updatetime)) > \
 	 (a)->ia6_lifetime.ia6t_pltime)
 #define IFA6_IS_INVALID(a) \
 	((a)->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME && \
 	 (u_int32_t)((time_uptime - (a)->ia6_updatetime)) > \
 	 (a)->ia6_lifetime.ia6t_vltime)
 #endif /* _KERNEL */
 
 /*
  * IP6 route structure
  */
 #if __BSD_VISIBLE
 struct route_in6 {
 	struct	rtentry *ro_rt;
 	struct	llentry *ro_lle;
 	/*
 	 * ro_prepend and ro_plen are only used for bpf to pass in a
 	 * preformed header.  They are not cacheable.
 	 */
 	char		*ro_prepend;
 	uint16_t	ro_plen;
 	uint16_t	ro_flags;
 	uint16_t	ro_mtu;	/* saved ro_rt mtu */
 	uint16_t	spare;
 	struct	sockaddr_in6 ro_dst;
 };
 #endif
 
 #ifdef _KERNEL
 #define MTAG_ABI_IPV6		1444287380	/* IPv6 ABI */
 #define IPV6_TAG_DIRECT		0		/* direct-dispatch IPv6 */
 #endif /* _KERNEL */
 
 /*
  * Options for use with [gs]etsockopt at the IPV6 level.
  * First word of comment is data type; bool is stored in int.
  */
 /* no hdrincl */
 #if 0 /* the followings are relic in IPv4 and hence are disabled */
 #define IPV6_OPTIONS		1  /* buf/ip6_opts; set/get IP6 options */
 #define IPV6_RECVOPTS		5  /* bool; receive all IP6 opts w/dgram */
 #define IPV6_RECVRETOPTS	6  /* bool; receive IP6 opts for response */
 #define IPV6_RECVDSTADDR	7  /* bool; receive IP6 dst addr w/dgram */
 #define IPV6_RETOPTS		8  /* ip6_opts; set/get IP6 options */
 #endif
 #define IPV6_SOCKOPT_RESERVED1	3  /* reserved for future use */
 #define IPV6_UNICAST_HOPS	4  /* int; IP6 hops */
 #define IPV6_MULTICAST_IF	9  /* u_int; set/get IP6 multicast i/f  */
 #define IPV6_MULTICAST_HOPS	10 /* int; set/get IP6 multicast hops */
 #define IPV6_MULTICAST_LOOP	11 /* u_int; set/get IP6 multicast loopback */
 #define IPV6_JOIN_GROUP		12 /* ipv6_mreq; join a group membership */
 #define IPV6_LEAVE_GROUP	13 /* ipv6_mreq; leave a group membership */
 #define IPV6_PORTRANGE		14 /* int; range to choose for unspec port */
 #define ICMP6_FILTER		18 /* icmp6_filter; icmp6 filter */
 /* RFC2292 options */
 #ifdef _KERNEL
 #define IPV6_2292PKTINFO	19 /* bool; send/recv if, src/dst addr */
 #define IPV6_2292HOPLIMIT	20 /* bool; hop limit */
 #define IPV6_2292NEXTHOP	21 /* bool; next hop addr */
 #define IPV6_2292HOPOPTS	22 /* bool; hop-by-hop option */
 #define IPV6_2292DSTOPTS	23 /* bool; destinaion option */
 #define IPV6_2292RTHDR		24 /* bool; routing header */
 #define IPV6_2292PKTOPTIONS	25 /* buf/cmsghdr; set/get IPv6 options */
 #endif
 
 #define IPV6_CHECKSUM		26 /* int; checksum offset for raw socket */
 #define IPV6_V6ONLY		27 /* bool; make AF_INET6 sockets v6 only */
 #ifndef _KERNEL
 #define IPV6_BINDV6ONLY		IPV6_V6ONLY
 #endif
 
 #define IPV6_IPSEC_POLICY	28 /* struct; get/set security policy */
 				   /* 29; unused; was IPV6_FAITH */
 #if 1 /* IPV6FIREWALL */
 #define IPV6_FW_ADD		30 /* add a firewall rule to chain */
 #define IPV6_FW_DEL		31 /* delete a firewall rule from chain */
 #define IPV6_FW_FLUSH		32 /* flush firewall rule chain */
 #define IPV6_FW_ZERO		33 /* clear single/all firewall counter(s) */
 #define IPV6_FW_GET		34 /* get entire firewall rule chain */
 #endif
 
 /* new socket options introduced in RFC3542 */
 #define IPV6_RTHDRDSTOPTS	35 /* ip6_dest; send dst option before rthdr */
 
 #define IPV6_RECVPKTINFO	36 /* bool; recv if, dst addr */
 #define IPV6_RECVHOPLIMIT	37 /* bool; recv hop limit */
 #define IPV6_RECVRTHDR		38 /* bool; recv routing header */
 #define IPV6_RECVHOPOPTS	39 /* bool; recv hop-by-hop option */
 #define IPV6_RECVDSTOPTS	40 /* bool; recv dst option after rthdr */
 #ifdef _KERNEL
 #define IPV6_RECVRTHDRDSTOPTS	41 /* bool; recv dst option before rthdr */
 #endif
 
 #define IPV6_USE_MIN_MTU	42 /* bool; send packets at the minimum MTU */
 #define IPV6_RECVPATHMTU	43 /* bool; notify an according MTU */
 
 #define IPV6_PATHMTU		44 /* mtuinfo; get the current path MTU (sopt),
 				      4 bytes int; MTU notification (cmsg) */
 #if 0 /*obsoleted during 2292bis -> 3542*/
 #define IPV6_REACHCONF		45 /* no data; ND reachability confirm
 				      (cmsg only/not in of RFC3542) */
 #endif
 
 /* more new socket options introduced in RFC3542 */
 #define IPV6_PKTINFO		46 /* in6_pktinfo; send if, src addr */
 #define IPV6_HOPLIMIT		47 /* int; send hop limit */
 #define IPV6_NEXTHOP		48 /* sockaddr; next hop addr */
 #define IPV6_HOPOPTS		49 /* ip6_hbh; send hop-by-hop option */
 #define IPV6_DSTOPTS		50 /* ip6_dest; send dst option befor rthdr */
 #define IPV6_RTHDR		51 /* ip6_rthdr; send routing header */
 #if 0
 #define IPV6_PKTOPTIONS		52 /* buf/cmsghdr; set/get IPv6 options */
 				   /* obsoleted by RFC3542 */
 #endif
 
 #define IPV6_RECVTCLASS		57 /* bool; recv traffic class values */
 
 #define IPV6_AUTOFLOWLABEL	59 /* bool; attach flowlabel automagically */
 
 #define IPV6_TCLASS		61 /* int; send traffic class value */
 #define IPV6_DONTFRAG		62 /* bool; disable IPv6 fragmentation */
 
 #define IPV6_PREFER_TEMPADDR	63 /* int; prefer temporary addresses as
 				    * the source address.
 				    */
 
 #define	IPV6_BINDANY		64 /* bool: allow bind to any address */
 
 #define	IPV6_BINDMULTI		65 /* bool; allow multibind to same addr/port */
 #define	IPV6_RSS_LISTEN_BUCKET	66 /* int; set RSS listen bucket */
 #define	IPV6_FLOWID		67 /* int; flowid of given socket */
 #define	IPV6_FLOWTYPE		68 /* int; flowtype of given socket */
 #define	IPV6_RSSBUCKETID	69 /* int; RSS bucket ID of given socket */
 #define	IPV6_RECVFLOWID		70 /* bool; receive IP6 flowid/flowtype w/ datagram */
 #define	IPV6_RECVRSSBUCKETID	71 /* bool; receive IP6 RSS bucket id w/ datagram */
 
 #define	IPV6_ORIGDSTADDR	72 /* bool: allow getting dstaddr /port info */
 #define	IPV6_RECVORIGDSTADDR	IPV6_ORIGDSTADDR
 
 /*
  * The following option is private; do not use it from user applications.
  * It is deliberately defined to the same value as IP_MSFILTER.
  */
 #define	IPV6_MSFILTER		74 /* struct __msfilterreq;
 				    * set/get multicast source filter list.
 				    */
 
 /* to define items, should talk with KAME guys first, for *BSD compatibility */
 
 #define IPV6_RTHDR_LOOSE     0 /* this hop need not be a neighbor. XXX old spec */
 #define IPV6_RTHDR_STRICT    1 /* this hop must be a neighbor. XXX old spec */
 #define IPV6_RTHDR_TYPE_0    0 /* IPv6 routing header type 0 */
 
 /*
  * Defaults and limits for options
  */
 #define IPV6_DEFAULT_MULTICAST_HOPS 1	/* normally limit m'casts to 1 hop */
 #define IPV6_DEFAULT_MULTICAST_LOOP 1	/* normally hear sends if a member */
 
 /*
- * The im6o_membership vector for each socket is now dynamically allocated at
- * run-time, bounded by USHRT_MAX, and is reallocated when needed, sized
- * according to a power-of-two increment.
+ * Limit for IPv6 multicast memberships
  */
-#define	IPV6_MIN_MEMBERSHIPS	31
 #define	IPV6_MAX_MEMBERSHIPS	4095
 
 /*
  * Default resource limits for IPv6 multicast source filtering.
  * These may be modified by sysctl.
  */
 #define	IPV6_MAX_GROUP_SRC_FILTER	512	/* sources per group */
 #define	IPV6_MAX_SOCK_SRC_FILTER	128	/* sources per socket/group */
 
 /*
  * Argument structure for IPV6_JOIN_GROUP and IPV6_LEAVE_GROUP.
  */
 struct ipv6_mreq {
 	struct in6_addr	ipv6mr_multiaddr;
 	unsigned int	ipv6mr_interface;
 };
 
 /*
  * IPV6_PKTINFO: Packet information(RFC2292 sec 5)
  */
 struct in6_pktinfo {
 	struct in6_addr	ipi6_addr;	/* src/dst IPv6 address */
 	unsigned int	ipi6_ifindex;	/* send/recv interface index */
 };
 
 /*
  * Control structure for IPV6_RECVPATHMTU socket option.
  */
 struct ip6_mtuinfo {
 	struct sockaddr_in6 ip6m_addr;	/* or sockaddr_storage? */
 	uint32_t ip6m_mtu;
 };
 
 /*
  * Argument for IPV6_PORTRANGE:
  * - which range to search when port is unspecified at bind() or connect()
  */
 #define	IPV6_PORTRANGE_DEFAULT	0	/* default range */
 #define	IPV6_PORTRANGE_HIGH	1	/* "high" - request firewall bypass */
 #define	IPV6_PORTRANGE_LOW	2	/* "low" - vouchsafe security */
 
 #if __BSD_VISIBLE
 /*
  * Definitions for inet6 sysctl operations.
  *
  * Third level is protocol number.
  * Fourth level is desired variable within that protocol.
  */
 #define IPV6PROTO_MAXID	(IPPROTO_PIM + 1)	/* don't list to IPV6PROTO_MAX */
 
 /*
  * Names for IP sysctl objects
  */
 #define IPV6CTL_FORWARDING	1	/* act as router */
 #define IPV6CTL_SENDREDIRECTS	2	/* may send redirects when forwarding*/
 #define IPV6CTL_DEFHLIM		3	/* default Hop-Limit */
 #ifdef notyet
 #define IPV6CTL_DEFMTU		4	/* default MTU */
 #endif
 #define IPV6CTL_FORWSRCRT	5	/* forward source-routed dgrams */
 #define IPV6CTL_STATS		6	/* stats */
 #define IPV6CTL_MRTSTATS	7	/* multicast forwarding stats */
 #define IPV6CTL_MRTPROTO	8	/* multicast routing protocol */
 #define IPV6CTL_MAXFRAGPACKETS	9	/* max packets reassembly queue */
 #define IPV6CTL_SOURCECHECK	10	/* verify source route and intf */
 #define IPV6CTL_SOURCECHECK_LOGINT 11	/* minimume logging interval */
 #define IPV6CTL_ACCEPT_RTADV	12
 					/* 13; unused; was: IPV6CTL_KEEPFAITH */
 #define IPV6CTL_LOG_INTERVAL	14
 #define IPV6CTL_HDRNESTLIMIT	15
 #define IPV6CTL_DAD_COUNT	16
 #define IPV6CTL_AUTO_FLOWLABEL	17
 #define IPV6CTL_DEFMCASTHLIM	18
 #define IPV6CTL_GIF_HLIM	19	/* default HLIM for gif encap packet */
 #define IPV6CTL_KAME_VERSION	20
 #define IPV6CTL_USE_DEPRECATED	21	/* use deprecated addr (RFC2462 5.5.4) */
 #define IPV6CTL_RR_PRUNE	22	/* walk timer for router renumbering */
 #if 0	/* obsolete */
 #define IPV6CTL_MAPPED_ADDR	23
 #endif
 #define IPV6CTL_V6ONLY		24
 /*	IPV6CTL_RTEXPIRE	25	deprecated */
 /*	IPV6CTL_RTMINEXPIRE	26	deprecated */
 /*	IPV6CTL_RTMAXCACHE	27	deprecated */
 
 #define IPV6CTL_USETEMPADDR	32	/* use temporary addresses (RFC3041) */
 #define IPV6CTL_TEMPPLTIME	33	/* preferred lifetime for tmpaddrs */
 #define IPV6CTL_TEMPVLTIME	34	/* valid lifetime for tmpaddrs */
 #define IPV6CTL_AUTO_LINKLOCAL	35	/* automatic link-local addr assign */
 #define IPV6CTL_RIP6STATS	36	/* raw_ip6 stats */
 #define IPV6CTL_PREFER_TEMPADDR	37	/* prefer temporary addr as src */
 #define IPV6CTL_ADDRCTLPOLICY	38	/* get/set address selection policy */
 #define IPV6CTL_USE_DEFAULTZONE	39	/* use default scope zone */
 
 #define IPV6CTL_MAXFRAGS	41	/* max fragments */
 #if 0
 #define IPV6CTL_IFQ		42	/* ip6intrq node */
 #define IPV6CTL_ISATAPRTR	43	/* isatap router */
 #endif
 #define IPV6CTL_MCAST_PMTU	44	/* enable pMTU discovery for multicast? */
 
 /* New entries should be added here from current IPV6CTL_MAXID value. */
 /* to define items, should talk with KAME guys first, for *BSD compatibility */
 #define IPV6CTL_STEALTH		45
 
 #define	ICMPV6CTL_ND6_ONLINKNSRFC4861	47
 #define	IPV6CTL_NO_RADR		48	/* No defroute from RA */
 #define	IPV6CTL_NORBIT_RAIF	49	/* Disable R-bit in NA on RA
 					 * receiving IF. */
 #define	IPV6CTL_RFC6204W3	50	/* Accept defroute even when forwarding
 					   enabled */
 #define	IPV6CTL_INTRQMAXLEN	51	/* max length of IPv6 netisr queue */
 #define	IPV6CTL_INTRDQMAXLEN	52	/* max length of direct IPv6 netisr
 					 * queue */
 #define	IPV6CTL_MAXFRAGSPERPACKET	53 /* Max fragments per packet */
 #define	IPV6CTL_MAXFRAGBUCKETSIZE	54 /* Max reassembly queues per bucket */
 #define	IPV6CTL_MAXID		55
 #endif /* __BSD_VISIBLE */
 
 /*
  * Since both netinet/ and netinet6/ call into netipsec/ and netpfil/,
  * the protocol specific mbuf flags are shared between them.
  */
 #define	M_FASTFWD_OURS		M_PROTO1	/* changed dst to local */
 #define	M_IP6_NEXTHOP		M_PROTO2	/* explicit ip nexthop */
 #define	M_IP_NEXTHOP		M_PROTO2	/* explicit ip nexthop */
 #define	M_SKIP_FIREWALL		M_PROTO3	/* skip firewall processing */
 #define	M_AUTHIPHDR		M_PROTO4
 #define	M_DECRYPTED		M_PROTO5
 #define	M_LOOP			M_PROTO6
 #define	M_AUTHIPDGM		M_PROTO7
 #define	M_RTALERT_MLD		M_PROTO8
 #define	M_FRAGMENTED		M_PROTO9	/* contained fragment header */
 
 #ifdef _KERNEL
 struct cmsghdr;
 struct ip6_hdr;
 
 int	in6_cksum_pseudo(struct ip6_hdr *, uint32_t, uint8_t, uint16_t);
 int	in6_cksum(struct mbuf *, u_int8_t, u_int32_t, u_int32_t);
 int	in6_cksum_partial(struct mbuf *, u_int8_t, u_int32_t, u_int32_t,
 			  u_int32_t);
 int	in6_localaddr(struct in6_addr *);
 int	in6_localip(struct in6_addr *);
 int	in6_ifhasaddr(struct ifnet *, struct in6_addr *);
 int	in6_addrscope(const struct in6_addr *);
 char	*ip6_sprintf(char *, const struct in6_addr *);
 struct	in6_ifaddr *in6_ifawithifp(struct ifnet *, struct in6_addr *);
 extern void in6_if_up(struct ifnet *);
 struct sockaddr;
 extern	u_char	ip6_protox[];
 
 void	in6_sin6_2_sin(struct sockaddr_in *sin,
 			    struct sockaddr_in6 *sin6);
 void	in6_sin_2_v4mapsin6(struct sockaddr_in *sin,
 				 struct sockaddr_in6 *sin6);
 void	in6_sin6_2_sin_in_sock(struct sockaddr *nam);
 void	in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam);
 extern void addrsel_policy_init(void);
 
 #define	satosin6(sa)	((struct sockaddr_in6 *)(sa))
 #define	sin6tosa(sin6)	((struct sockaddr *)(sin6))
 #define	ifatoia6(ifa)	((struct in6_ifaddr *)(ifa))
 
 #endif /* _KERNEL */
 
 #ifndef _SIZE_T_DECLARED
 typedef	__size_t	size_t;
 #define	_SIZE_T_DECLARED
 #endif
 
 #ifndef _SOCKLEN_T_DECLARED
 typedef	__socklen_t	socklen_t;
 #define	_SOCKLEN_T_DECLARED
 #endif
 
 #if __BSD_VISIBLE
 
 __BEGIN_DECLS
 struct cmsghdr;
 
 extern int inet6_option_space(int);
 extern int inet6_option_init(void *, struct cmsghdr **, int);
 extern int inet6_option_append(struct cmsghdr *, const uint8_t *,
 	int, int);
 extern uint8_t *inet6_option_alloc(struct cmsghdr *, int, int, int);
 extern int inet6_option_next(const struct cmsghdr *, uint8_t **);
 extern int inet6_option_find(const struct cmsghdr *, uint8_t **, int);
 
 extern size_t inet6_rthdr_space(int, int);
 extern struct cmsghdr *inet6_rthdr_init(void *, int);
 extern int inet6_rthdr_add(struct cmsghdr *, const struct in6_addr *,
 	unsigned int);
 extern int inet6_rthdr_lasthop(struct cmsghdr *, unsigned int);
 #if 0 /* not implemented yet */
 extern int inet6_rthdr_reverse(const struct cmsghdr *, struct cmsghdr *);
 #endif
 extern int inet6_rthdr_segments(const struct cmsghdr *);
 extern struct in6_addr *inet6_rthdr_getaddr(struct cmsghdr *, int);
 extern int inet6_rthdr_getflags(const struct cmsghdr *, int);
 
 extern int inet6_opt_init(void *, socklen_t);
 extern int inet6_opt_append(void *, socklen_t, int, uint8_t, socklen_t,
 	uint8_t, void **);
 extern int inet6_opt_finish(void *, socklen_t, int);
 extern int inet6_opt_set_val(void *, int, void *, socklen_t);
 
 extern int inet6_opt_next(void *, socklen_t, int, uint8_t *, socklen_t *,
 	void **);
 extern int inet6_opt_find(void *, socklen_t, int, uint8_t, socklen_t *,
 	void **);
 extern int inet6_opt_get_val(void *, int, void *, socklen_t);
 extern socklen_t inet6_rth_space(int, int);
 extern void *inet6_rth_init(void *, socklen_t, int, int);
 extern int inet6_rth_add(void *, const struct in6_addr *);
 extern int inet6_rth_reverse(const void *, void *);
 extern int inet6_rth_segments(const void *);
 extern struct in6_addr *inet6_rth_getaddr(const void *, int);
 __END_DECLS
 
 #endif /* __BSD_VISIBLE */
 
 #endif /* !_NETINET6_IN6_H_ */
Index: head/sys/netinet6/in6_ifattach.c
===================================================================
--- head/sys/netinet6/in6_ifattach.c	(revision 349368)
+++ head/sys/netinet6/in6_ifattach.c	(revision 349369)
@@ -1,908 +1,910 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: in6_ifattach.c,v 1.118 2001/05/24 07:44:00 itojun Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/proc.h>
 #include <sys/rmlock.h>
 #include <sys/syslog.h>
 #include <sys/md5.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/if_ether.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/in6_ifattach.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/mld6_var.h>
 #include <netinet6/scope6_var.h>
 
 VNET_DEFINE(unsigned long, in6_maxmtu) = 0;
 
 #ifdef IP6_AUTO_LINKLOCAL
 VNET_DEFINE(int, ip6_auto_linklocal) = IP6_AUTO_LINKLOCAL;
 #else
 VNET_DEFINE(int, ip6_auto_linklocal) = 1;	/* enabled by default */
 #endif
 
 VNET_DEFINE(struct callout, in6_tmpaddrtimer_ch);
 #define	V_in6_tmpaddrtimer_ch		VNET(in6_tmpaddrtimer_ch)
 
 VNET_DECLARE(struct inpcbinfo, ripcbinfo);
 #define	V_ripcbinfo			VNET(ripcbinfo)
 
 static int get_rand_ifid(struct ifnet *, struct in6_addr *);
 static int generate_tmp_ifid(u_int8_t *, const u_int8_t *, u_int8_t *);
 static int get_ifid(struct ifnet *, struct ifnet *, struct in6_addr *);
 static int in6_ifattach_linklocal(struct ifnet *, struct ifnet *);
 static int in6_ifattach_loopback(struct ifnet *);
 static void in6_purgemaddrs(struct ifnet *);
 
 #define EUI64_GBIT	0x01
 #define EUI64_UBIT	0x02
 #define EUI64_TO_IFID(in6)	do {(in6)->s6_addr[8] ^= EUI64_UBIT; } while (0)
 #define EUI64_GROUP(in6)	((in6)->s6_addr[8] & EUI64_GBIT)
 #define EUI64_INDIVIDUAL(in6)	(!EUI64_GROUP(in6))
 #define EUI64_LOCAL(in6)	((in6)->s6_addr[8] & EUI64_UBIT)
 #define EUI64_UNIVERSAL(in6)	(!EUI64_LOCAL(in6))
 
 #define IFID_LOCAL(in6)		(!EUI64_LOCAL(in6))
 #define IFID_UNIVERSAL(in6)	(!EUI64_UNIVERSAL(in6))
 
 /*
  * Generate a last-resort interface identifier, when the machine has no
  * IEEE802/EUI64 address sources.
  * The goal here is to get an interface identifier that is
  * (1) random enough and (2) does not change across reboot.
  * We currently use MD5(hostname) for it.
  *
  * in6 - upper 64bits are preserved
  */
 static int
 get_rand_ifid(struct ifnet *ifp, struct in6_addr *in6)
 {
 	MD5_CTX ctxt;
 	struct prison *pr;
 	u_int8_t digest[16];
 	int hostnamelen;
 
 	pr = curthread->td_ucred->cr_prison;
 	mtx_lock(&pr->pr_mtx);
 	hostnamelen = strlen(pr->pr_hostname);
 #if 0
 	/* we need at least several letters as seed for ifid */
 	if (hostnamelen < 3) {
 		mtx_unlock(&pr->pr_mtx);
 		return -1;
 	}
 #endif
 
 	/* generate 8 bytes of pseudo-random value. */
 	bzero(&ctxt, sizeof(ctxt));
 	MD5Init(&ctxt);
 	MD5Update(&ctxt, pr->pr_hostname, hostnamelen);
 	mtx_unlock(&pr->pr_mtx);
 	MD5Final(digest, &ctxt);
 
 	/* assumes sizeof(digest) > sizeof(ifid) */
 	bcopy(digest, &in6->s6_addr[8], 8);
 
 	/* make sure to set "u" bit to local, and "g" bit to individual. */
 	in6->s6_addr[8] &= ~EUI64_GBIT;	/* g bit to "individual" */
 	in6->s6_addr[8] |= EUI64_UBIT;	/* u bit to "local" */
 
 	/* convert EUI64 into IPv6 interface identifier */
 	EUI64_TO_IFID(in6);
 
 	return 0;
 }
 
 static int
 generate_tmp_ifid(u_int8_t *seed0, const u_int8_t *seed1, u_int8_t *ret)
 {
 	MD5_CTX ctxt;
 	u_int8_t seed[16], digest[16], nullbuf[8];
 	u_int32_t val32;
 
 	/* If there's no history, start with a random seed. */
 	bzero(nullbuf, sizeof(nullbuf));
 	if (bcmp(nullbuf, seed0, sizeof(nullbuf)) == 0) {
 		int i;
 
 		for (i = 0; i < 2; i++) {
 			val32 = arc4random();
 			bcopy(&val32, seed + sizeof(val32) * i, sizeof(val32));
 		}
 	} else
 		bcopy(seed0, seed, 8);
 
 	/* copy the right-most 64-bits of the given address */
 	/* XXX assumption on the size of IFID */
 	bcopy(seed1, &seed[8], 8);
 
 	if (0) {		/* for debugging purposes only */
 		int i;
 
 		printf("generate_tmp_ifid: new randomized ID from: ");
 		for (i = 0; i < 16; i++)
 			printf("%02x", seed[i]);
 		printf(" ");
 	}
 
 	/* generate 16 bytes of pseudo-random value. */
 	bzero(&ctxt, sizeof(ctxt));
 	MD5Init(&ctxt);
 	MD5Update(&ctxt, seed, sizeof(seed));
 	MD5Final(digest, &ctxt);
 
 	/*
 	 * RFC 3041 3.2.1. (3)
 	 * Take the left-most 64-bits of the MD5 digest and set bit 6 (the
 	 * left-most bit is numbered 0) to zero.
 	 */
 	bcopy(digest, ret, 8);
 	ret[0] &= ~EUI64_UBIT;
 
 	/*
 	 * XXX: we'd like to ensure that the generated value is not zero
 	 * for simplicity.  If the caclculated digest happens to be zero,
 	 * use a random non-zero value as the last resort.
 	 */
 	if (bcmp(nullbuf, ret, sizeof(nullbuf)) == 0) {
 		nd6log((LOG_INFO,
 		    "generate_tmp_ifid: computed MD5 value is zero.\n"));
 
 		val32 = arc4random();
 		val32 = 1 + (val32 % (0xffffffff - 1));
 	}
 
 	/*
 	 * RFC 3041 3.2.1. (4)
 	 * Take the rightmost 64-bits of the MD5 digest and save them in
 	 * stable storage as the history value to be used in the next
 	 * iteration of the algorithm.
 	 */
 	bcopy(&digest[8], seed0, 8);
 
 	if (0) {		/* for debugging purposes only */
 		int i;
 
 		printf("to: ");
 		for (i = 0; i < 16; i++)
 			printf("%02x", digest[i]);
 		printf("\n");
 	}
 
 	return 0;
 }
 
 /*
  * Get interface identifier for the specified interface.
  * XXX assumes single sockaddr_dl (AF_LINK address) per an interface
  *
  * in6 - upper 64bits are preserved
  */
 int
 in6_get_hw_ifid(struct ifnet *ifp, struct in6_addr *in6)
 {
 	struct epoch_tracker et;
 	struct ifaddr *ifa;
 	struct sockaddr_dl *sdl;
 	u_int8_t *addr;
 	size_t addrlen;
 	static u_int8_t allzero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
 	static u_int8_t allone[8] =
 		{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_LINK)
 			continue;
 		sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 		if (sdl == NULL)
 			continue;
 		if (sdl->sdl_alen == 0)
 			continue;
 
 		goto found;
 	}
 	NET_EPOCH_EXIT(et);
 
 	return -1;
 
 found:
 	IF_ADDR_LOCK_ASSERT(ifp);
 	addr = LLADDR(sdl);
 	addrlen = sdl->sdl_alen;
 
 	/* get EUI64 */
 	switch (ifp->if_type) {
 	case IFT_BRIDGE:
 	case IFT_ETHER:
 	case IFT_L2VLAN:
 	case IFT_ATM:
 	case IFT_IEEE1394:
 		/* IEEE802/EUI64 cases - what others? */
 		/* IEEE1394 uses 16byte length address starting with EUI64 */
 		if (addrlen > 8)
 			addrlen = 8;
 
 		/* look at IEEE802/EUI64 only */
 		if (addrlen != 8 && addrlen != 6) {
 			NET_EPOCH_EXIT(et);
 			return -1;
 		}
 
 		/*
 		 * check for invalid MAC address - on bsdi, we see it a lot
 		 * since wildboar configures all-zero MAC on pccard before
 		 * card insertion.
 		 */
 		if (bcmp(addr, allzero, addrlen) == 0) {
 			NET_EPOCH_EXIT(et);
 			return -1;
 		}
 		if (bcmp(addr, allone, addrlen) == 0) {
 			NET_EPOCH_EXIT(et);
 			return -1;
 		}
 
 		/* make EUI64 address */
 		if (addrlen == 8)
 			bcopy(addr, &in6->s6_addr[8], 8);
 		else if (addrlen == 6) {
 			in6->s6_addr[8] = addr[0];
 			in6->s6_addr[9] = addr[1];
 			in6->s6_addr[10] = addr[2];
 			in6->s6_addr[11] = 0xff;
 			in6->s6_addr[12] = 0xfe;
 			in6->s6_addr[13] = addr[3];
 			in6->s6_addr[14] = addr[4];
 			in6->s6_addr[15] = addr[5];
 		}
 		break;
 
 	case IFT_GIF:
 	case IFT_STF:
 		/*
 		 * RFC2893 says: "SHOULD use IPv4 address as ifid source".
 		 * however, IPv4 address is not very suitable as unique
 		 * identifier source (can be renumbered).
 		 * we don't do this.
 		 */
 		NET_EPOCH_EXIT(et);
 		return -1;
 
 	case IFT_INFINIBAND:
 		if (addrlen != 20) {
 			NET_EPOCH_EXIT(et);
 			return -1;
 		}
 		bcopy(addr + 12, &in6->s6_addr[8], 8);
 		break;
 
 	default:
 		NET_EPOCH_EXIT(et);
 		return -1;
 	}
 
 	/* sanity check: g bit must not indicate "group" */
 	if (EUI64_GROUP(in6)) {
 		NET_EPOCH_EXIT(et);
 		return -1;
 	}
 
 	/* convert EUI64 into IPv6 interface identifier */
 	EUI64_TO_IFID(in6);
 
 	/*
 	 * sanity check: ifid must not be all zero, avoid conflict with
 	 * subnet router anycast
 	 */
 	if ((in6->s6_addr[8] & ~(EUI64_GBIT | EUI64_UBIT)) == 0x00 &&
 	    bcmp(&in6->s6_addr[9], allzero, 7) == 0) {
 		NET_EPOCH_EXIT(et);
 		return -1;
 	}
 
 	NET_EPOCH_EXIT(et);
 	return 0;
 }
 
 /*
  * Get interface identifier for the specified interface.  If it is not
  * available on ifp0, borrow interface identifier from other information
  * sources.
  *
  * altifp - secondary EUI64 source
  */
 static int
 get_ifid(struct ifnet *ifp0, struct ifnet *altifp,
     struct in6_addr *in6)
 {
 	struct epoch_tracker et;
 	struct ifnet *ifp;
 
 	/* first, try to get it from the interface itself */
 	if (in6_get_hw_ifid(ifp0, in6) == 0) {
 		nd6log((LOG_DEBUG, "%s: got interface identifier from itself\n",
 		    if_name(ifp0)));
 		goto success;
 	}
 
 	/* try secondary EUI64 source. this basically is for ATM PVC */
 	if (altifp && in6_get_hw_ifid(altifp, in6) == 0) {
 		nd6log((LOG_DEBUG, "%s: got interface identifier from %s\n",
 		    if_name(ifp0), if_name(altifp)));
 		goto success;
 	}
 
 	/* next, try to get it from some other hardware interface */
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (ifp == ifp0)
 			continue;
 		if (in6_get_hw_ifid(ifp, in6) != 0)
 			continue;
 
 		/*
 		 * to borrow ifid from other interface, ifid needs to be
 		 * globally unique
 		 */
 		if (IFID_UNIVERSAL(in6)) {
 			nd6log((LOG_DEBUG,
 			    "%s: borrow interface identifier from %s\n",
 			    if_name(ifp0), if_name(ifp)));
 			NET_EPOCH_EXIT(et);
 			goto success;
 		}
 	}
 	NET_EPOCH_EXIT(et);
 
 	/* last resort: get from random number source */
 	if (get_rand_ifid(ifp, in6) == 0) {
 		nd6log((LOG_DEBUG,
 		    "%s: interface identifier generated by random number\n",
 		    if_name(ifp0)));
 		goto success;
 	}
 
 	printf("%s: failed to get interface identifier\n", if_name(ifp0));
 	return -1;
 
 success:
 	nd6log((LOG_INFO, "%s: ifid: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
 	    if_name(ifp0), in6->s6_addr[8], in6->s6_addr[9], in6->s6_addr[10],
 	    in6->s6_addr[11], in6->s6_addr[12], in6->s6_addr[13],
 	    in6->s6_addr[14], in6->s6_addr[15]));
 	return 0;
 }
 
 /*
  * altifp - secondary EUI64 source
  */
 static int
 in6_ifattach_linklocal(struct ifnet *ifp, struct ifnet *altifp)
 {
 	struct in6_ifaddr *ia;
 	struct in6_aliasreq ifra;
 	struct nd_prefixctl pr0;
 	struct nd_prefix *pr;
 	int error;
 
 	/*
 	 * configure link-local address.
 	 */
 	in6_prepare_ifra(&ifra, NULL, &in6mask64);
 
 	ifra.ifra_addr.sin6_addr.s6_addr32[0] = htonl(0xfe800000);
 	ifra.ifra_addr.sin6_addr.s6_addr32[1] = 0;
 	if ((ifp->if_flags & IFF_LOOPBACK) != 0) {
 		ifra.ifra_addr.sin6_addr.s6_addr32[2] = 0;
 		ifra.ifra_addr.sin6_addr.s6_addr32[3] = htonl(1);
 	} else {
 		if (get_ifid(ifp, altifp, &ifra.ifra_addr.sin6_addr) != 0) {
 			nd6log((LOG_ERR,
 			    "%s: no ifid available\n", if_name(ifp)));
 			return (-1);
 		}
 	}
 	if (in6_setscope(&ifra.ifra_addr.sin6_addr, ifp, NULL))
 		return (-1);
 
 	/* link-local addresses should NEVER expire. */
 	ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME;
 	ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME;
 
 	/*
 	 * Now call in6_update_ifa() to do a bunch of procedures to configure
 	 * a link-local address. We can set the 3rd argument to NULL, because
 	 * we know there's no other link-local address on the interface
 	 * and therefore we are adding one (instead of updating one).
 	 */
 	if ((error = in6_update_ifa(ifp, &ifra, NULL,
 				    IN6_IFAUPDATE_DADDELAY)) != 0) {
 		/*
 		 * XXX: When the interface does not support IPv6, this call
 		 * would fail in the SIOCSIFADDR ioctl.  I believe the
 		 * notification is rather confusing in this case, so just
 		 * suppress it.  (jinmei@kame.net 20010130)
 		 */
 		if (error != EAFNOSUPPORT)
 			nd6log((LOG_NOTICE, "in6_ifattach_linklocal: failed to "
 			    "configure a link-local address on %s "
 			    "(errno=%d)\n",
 			    if_name(ifp), error));
 		return (-1);
 	}
 
 	ia = in6ifa_ifpforlinklocal(ifp, 0);
 	if (ia == NULL) {
 		/*
 		 * Another thread removed the address that we just added.
 		 * This should be rare, but it happens.
 		 */
 		nd6log((LOG_NOTICE, "%s: %s: new link-local address "
 			"disappeared\n", __func__, if_name(ifp)));
 		return (-1);
 	}
 	ifa_free(&ia->ia_ifa);
 
 	/*
 	 * Make the link-local prefix (fe80::%link/64) as on-link.
 	 * Since we'd like to manage prefixes separately from addresses,
 	 * we make an ND6 prefix structure for the link-local prefix,
 	 * and add it to the prefix list as a never-expire prefix.
 	 * XXX: this change might affect some existing code base...
 	 */
 	bzero(&pr0, sizeof(pr0));
 	pr0.ndpr_ifp = ifp;
 	/* this should be 64 at this moment. */
 	pr0.ndpr_plen = in6_mask2len(&ifra.ifra_prefixmask.sin6_addr, NULL);
 	pr0.ndpr_prefix = ifra.ifra_addr;
 	/* apply the mask for safety. (nd6_prelist_add will apply it again) */
 	IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr, &in6mask64);
 	/*
 	 * Initialize parameters.  The link-local prefix must always be
 	 * on-link, and its lifetimes never expire.
 	 */
 	pr0.ndpr_raf_onlink = 1;
 	pr0.ndpr_raf_auto = 1;	/* probably meaningless */
 	pr0.ndpr_vltime = ND6_INFINITE_LIFETIME;
 	pr0.ndpr_pltime = ND6_INFINITE_LIFETIME;
 	/*
 	 * Since there is no other link-local addresses, nd6_prefix_lookup()
 	 * probably returns NULL.  However, we cannot always expect the result.
 	 * For example, if we first remove the (only) existing link-local
 	 * address, and then reconfigure another one, the prefix is still
 	 * valid with referring to the old link-local address.
 	 */
 	if ((pr = nd6_prefix_lookup(&pr0)) == NULL) {
 		if ((error = nd6_prelist_add(&pr0, NULL, NULL)) != 0)
 			return (error);
 	} else
 		nd6_prefix_rele(pr);
 
 	return 0;
 }
 
 /*
  * ifp - must be IFT_LOOP
  */
 static int
 in6_ifattach_loopback(struct ifnet *ifp)
 {
 	struct in6_aliasreq ifra;
 	int error;
 
 	in6_prepare_ifra(&ifra, &in6addr_loopback, &in6mask128);
 
 	/*
 	 * Always initialize ia_dstaddr (= broadcast address) to loopback
 	 * address.  Follows IPv4 practice - see in_ifinit().
 	 */
 	ifra.ifra_dstaddr.sin6_len = sizeof(struct sockaddr_in6);
 	ifra.ifra_dstaddr.sin6_family = AF_INET6;
 	ifra.ifra_dstaddr.sin6_addr = in6addr_loopback;
 
 	/* the loopback  address should NEVER expire. */
 	ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME;
 	ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME;
 
 	/*
 	 * We are sure that this is a newly assigned address, so we can set
 	 * NULL to the 3rd arg.
 	 */
 	if ((error = in6_update_ifa(ifp, &ifra, NULL, 0)) != 0) {
 		nd6log((LOG_ERR, "in6_ifattach_loopback: failed to configure "
 		    "the loopback address on %s (errno=%d)\n",
 		    if_name(ifp), error));
 		return (-1);
 	}
 
 	return 0;
 }
 
 /*
  * compute NI group address, based on the current hostname setting.
  * see RFC 4620.
  *
  * when ifp == NULL, the caller is responsible for filling scopeid.
  *
  * If oldmcprefix == 1, FF02:0:0:0:0:2::/96 is used for NI group address
  * while it is FF02:0:0:0:0:2:FF00::/104 in RFC 4620. 
  */
 static int
 in6_nigroup0(struct ifnet *ifp, const char *name, int namelen,
     struct in6_addr *in6, int oldmcprefix)
 {
 	struct prison *pr;
 	const char *p;
 	u_char *q;
 	MD5_CTX ctxt;
 	u_int8_t digest[16];
 	char l;
 	char n[64];	/* a single label must not exceed 63 chars */
 
 	/*
 	 * If no name is given and namelen is -1,
 	 * we try to do the hostname lookup ourselves.
 	 */
 	if (!name && namelen == -1) {
 		pr = curthread->td_ucred->cr_prison;
 		mtx_lock(&pr->pr_mtx);
 		name = pr->pr_hostname;
 		namelen = strlen(name);
 	} else
 		pr = NULL;
 	if (!name || !namelen) {
 		if (pr != NULL)
 			mtx_unlock(&pr->pr_mtx);
 		return -1;
 	}
 
 	p = name;
 	while (p && *p && *p != '.' && p - name < namelen)
 		p++;
 	if (p == name || p - name > sizeof(n) - 1) {
 		if (pr != NULL)
 			mtx_unlock(&pr->pr_mtx);
 		return -1;	/* label too long */
 	}
 	l = p - name;
 	strncpy(n, name, l);
 	if (pr != NULL)
 		mtx_unlock(&pr->pr_mtx);
 	n[(int)l] = '\0';
 	for (q = n; *q; q++) {
 		if ('A' <= *q && *q <= 'Z')
 			*q = *q - 'A' + 'a';
 	}
 
 	/* generate 16 bytes of pseudo-random value. */
 	bzero(&ctxt, sizeof(ctxt));
 	MD5Init(&ctxt);
 	MD5Update(&ctxt, &l, sizeof(l));
 	MD5Update(&ctxt, n, l);
 	MD5Final(digest, &ctxt);
 
 	bzero(in6, sizeof(*in6));
 	in6->s6_addr16[0] = IPV6_ADDR_INT16_MLL;
 	in6->s6_addr8[11] = 2;
 	if (oldmcprefix == 0) {
 		in6->s6_addr8[12] = 0xff;
 	 	/* Copy the first 24 bits of 128-bit hash into the address. */
 		bcopy(digest, &in6->s6_addr8[13], 3);
 	} else {
 	 	/* Copy the first 32 bits of 128-bit hash into the address. */
 		bcopy(digest, &in6->s6_addr32[3], sizeof(in6->s6_addr32[3]));
 	}
 	if (in6_setscope(in6, ifp, NULL))
 		return (-1); /* XXX: should not fail */
 
 	return 0;
 }
 
 int
 in6_nigroup(struct ifnet *ifp, const char *name, int namelen,
     struct in6_addr *in6)
 {
 
 	return (in6_nigroup0(ifp, name, namelen, in6, 0));
 }
 
 int
 in6_nigroup_oldmcprefix(struct ifnet *ifp, const char *name, int namelen,
     struct in6_addr *in6)
 {
 
 	return (in6_nigroup0(ifp, name, namelen, in6, 1));
 }
 
 /*
  * XXX multiple loopback interface needs more care.  for instance,
  * nodelocal address needs to be configured onto only one of them.
  * XXX multiple link-local address case
  *
  * altifp - secondary EUI64 source
  */
 void
 in6_ifattach(struct ifnet *ifp, struct ifnet *altifp)
 {
 	struct in6_ifaddr *ia;
 
 	if (ifp->if_afdata[AF_INET6] == NULL)
 		return;
 	/*
 	 * quirks based on interface type
 	 */
 	switch (ifp->if_type) {
 	case IFT_STF:
 		/*
 		 * 6to4 interface is a very special kind of beast.
 		 * no multicast, no linklocal.  RFC2529 specifies how to make
 		 * linklocals for 6to4 interface, but there's no use and
 		 * it is rather harmful to have one.
 		 */
 		ND_IFINFO(ifp)->flags &= ~ND6_IFF_AUTO_LINKLOCAL;
 		ND_IFINFO(ifp)->flags |= ND6_IFF_NO_DAD;
 		break;
 	default:
 		break;
 	}
 
 	/*
 	 * usually, we require multicast capability to the interface
 	 */
 	if ((ifp->if_flags & IFF_MULTICAST) == 0) {
 		nd6log((LOG_INFO, "in6_ifattach: "
 		    "%s is not multicast capable, IPv6 not enabled\n",
 		    if_name(ifp)));
 		return;
 	}
 
 	/*
 	 * assign loopback address for loopback interface.
 	 */
 	if ((ifp->if_flags & IFF_LOOPBACK) != 0) {
 		/*
 		 * check that loopback address doesn't exist yet.
 		 */
 		ia = in6ifa_ifwithaddr(&in6addr_loopback, 0);
 		if (ia == NULL)
 			in6_ifattach_loopback(ifp);
 		else
 			ifa_free(&ia->ia_ifa);
 	}
 
 	/*
 	 * assign a link-local address, if there's none.
 	 */
 	if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) &&
 	    ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL) {
 		ia = in6ifa_ifpforlinklocal(ifp, 0);
 		if (ia == NULL)
 			in6_ifattach_linklocal(ifp, altifp);
 		else
 			ifa_free(&ia->ia_ifa);
 	}
 
 	/* update dynamically. */
 	if (V_in6_maxmtu < ifp->if_mtu)
 		V_in6_maxmtu = ifp->if_mtu;
 }
 
 /*
  * NOTE: in6_ifdetach() does not support loopback if at this moment.
  *
  * When shutting down a VNET we clean up layers top-down.  In that case
  * upper layer protocols (ulp) are cleaned up already and locks are destroyed
  * and we must not call into these cleanup functions anymore, thus purgeulp
  * is set to 0 in that case by in6_ifdetach_destroy().
  * The normal case of destroying a (cloned) interface still needs to cleanup
  * everything related to the interface and will have purgeulp set to 1.
  */
 static void
 _in6_ifdetach(struct ifnet *ifp, int purgeulp)
 {
 	struct ifaddr *ifa, *next;
 
 	if (ifp->if_afdata[AF_INET6] == NULL)
 		return;
 
 	/*
 	 * nuke any of IPv6 addresses we have
 	 */
 	CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		in6_purgeaddr(ifa);
 	}
 	if (purgeulp) {
+		IN6_MULTI_LOCK();
 		in6_pcbpurgeif0(&V_udbinfo, ifp);
 		in6_pcbpurgeif0(&V_ulitecbinfo, ifp);
 		in6_pcbpurgeif0(&V_ripcbinfo, ifp);
+		IN6_MULTI_UNLOCK();
 	}
 	/* leave from all multicast groups joined */
 	in6_purgemaddrs(ifp);
 
 	/*
 	 * Remove neighbor management table.
 	 * Enabling the nd6_purge will panic on vmove for interfaces on VNET
 	 * teardown as the IPv6 layer is cleaned up already and the locks
 	 * are destroyed.
 	 */
 	if (purgeulp)
 		nd6_purge(ifp);
 }
 
 void
 in6_ifdetach(struct ifnet *ifp)
 {
 
 	_in6_ifdetach(ifp, 1);
 }
 
 void
 in6_ifdetach_destroy(struct ifnet *ifp)
 {
 
 	_in6_ifdetach(ifp, 0);
 }
 
 int
 in6_get_tmpifid(struct ifnet *ifp, u_int8_t *retbuf,
     const u_int8_t *baseid, int generate)
 {
 	u_int8_t nullbuf[8];
 	struct nd_ifinfo *ndi = ND_IFINFO(ifp);
 
 	bzero(nullbuf, sizeof(nullbuf));
 	if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) == 0) {
 		/* we've never created a random ID.  Create a new one. */
 		generate = 1;
 	}
 
 	if (generate) {
 		bcopy(baseid, ndi->randomseed1, sizeof(ndi->randomseed1));
 
 		/* generate_tmp_ifid will update seedn and buf */
 		(void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1,
 		    ndi->randomid);
 	}
 	bcopy(ndi->randomid, retbuf, 8);
 
 	return (0);
 }
 
 void
 in6_tmpaddrtimer(void *arg)
 {
 	CURVNET_SET((struct vnet *) arg);
 	struct nd_ifinfo *ndi;
 	u_int8_t nullbuf[8];
 	struct ifnet *ifp;
 
 	callout_reset(&V_in6_tmpaddrtimer_ch,
 	    (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor -
 	    V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, curvnet);
 
 	bzero(nullbuf, sizeof(nullbuf));
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (ifp->if_afdata[AF_INET6] == NULL)
 			continue;
 		ndi = ND_IFINFO(ifp);
 		if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) != 0) {
 			/*
 			 * We've been generating a random ID on this interface.
 			 * Create a new one.
 			 */
 			(void)generate_tmp_ifid(ndi->randomseed0,
 			    ndi->randomseed1, ndi->randomid);
 		}
 	}
 
 	CURVNET_RESTORE();
 }
 
 static void
 in6_purgemaddrs(struct ifnet *ifp)
 {
 	struct in6_multi_head inmh;
 
 	SLIST_INIT(&inmh);
 	IN6_MULTI_LOCK();
 	IN6_MULTI_LIST_LOCK();
 	mld_ifdetach(ifp, &inmh);
 	IN6_MULTI_LIST_UNLOCK();
 	IN6_MULTI_UNLOCK();
 	in6m_release_list_deferred(&inmh);
 
 	/*
 	 * Make sure all multicast deletions invoking if_ioctl() are
 	 * completed before returning. Else we risk accessing a freed
 	 * ifnet structure pointer.
 	 */
 	in6m_release_wait();
 }
 
 void
 in6_ifattach_destroy(void)
 {
 
 	callout_drain(&V_in6_tmpaddrtimer_ch);
 }
 
 static void
 in6_ifattach_init(void *dummy)
 {
 
 	/* Timer for regeneranation of temporary addresses randomize ID. */
 	callout_init(&V_in6_tmpaddrtimer_ch, 0);
 	callout_reset(&V_in6_tmpaddrtimer_ch,
 	    (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor -
 	    V_ip6_temp_regen_advance) * hz,
 	    in6_tmpaddrtimer, curvnet);
 }
 
 /*
  * Cheat.
  * This must be after route_init(), which is now SI_ORDER_THIRD.
  */
 SYSINIT(in6_ifattach_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
     in6_ifattach_init, NULL);
Index: head/sys/netinet6/in6_mcast.c
===================================================================
--- head/sys/netinet6/in6_mcast.c	(revision 349368)
+++ head/sys/netinet6/in6_mcast.c	(revision 349369)
@@ -1,2999 +1,2908 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2009 Bruce Simpson.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * IPv6 multicast socket, group, and socket option processing module.
  * Normative references: RFC 2292, RFC 3492, RFC 3542, RFC 3678, RFC 3810.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/gtaskqueue.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/priv.h>
 #include <sys/ktr.h>
 #include <sys/tree.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 
 #include <netinet/in.h>
 #include <netinet/udp.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/udp_var.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/mld6_var.h>
 #include <netinet6/scope6_var.h>
 
 #ifndef KTR_MLD
 #define KTR_MLD KTR_INET6
 #endif
 
 #ifndef __SOCKUNION_DECLARED
 union sockunion {
 	struct sockaddr_storage	ss;
 	struct sockaddr		sa;
 	struct sockaddr_dl	sdl;
 	struct sockaddr_in6	sin6;
 };
 typedef union sockunion sockunion_t;
 #define __SOCKUNION_DECLARED
 #endif /* __SOCKUNION_DECLARED */
 
 static MALLOC_DEFINE(M_IN6MFILTER, "in6_mfilter",
     "IPv6 multicast PCB-layer source filter");
 MALLOC_DEFINE(M_IP6MADDR, "in6_multi", "IPv6 multicast group");
 static MALLOC_DEFINE(M_IP6MOPTS, "ip6_moptions", "IPv6 multicast options");
 static MALLOC_DEFINE(M_IP6MSOURCE, "ip6_msource",
     "IPv6 multicast MLD-layer source filter");
 
 RB_GENERATE(ip6_msource_tree, ip6_msource, im6s_link, ip6_msource_cmp);
 
 /*
  * Locking:
- * - Lock order is: Giant, INP_WLOCK, IN6_MULTI_LOCK, MLD_LOCK, IF_ADDR_LOCK.
+ * - Lock order is: Giant, IN6_MULTI_LOCK, INP_WLOCK,
+ *   IN6_MULTI_LIST_LOCK, MLD_LOCK, IF_ADDR_LOCK.
  * - The IF_ADDR_LOCK is implicitly taken by in6m_lookup() earlier, however
  *   it can be taken by code in net/if.c also.
  * - ip6_moptions and in6_mfilter are covered by the INP_WLOCK.
  *
  * struct in6_multi is covered by IN6_MULTI_LOCK. There isn't strictly
  * any need for in6_multi itself to be virtualized -- it is bound to an ifp
  * anyway no matter what happens.
  */
 struct mtx in6_multi_list_mtx;
 MTX_SYSINIT(in6_multi_mtx, &in6_multi_list_mtx, "in6_multi_list_mtx", MTX_DEF);
 
 struct mtx in6_multi_free_mtx;
 MTX_SYSINIT(in6_multi_free_mtx, &in6_multi_free_mtx, "in6_multi_free_mtx", MTX_DEF);
 
 struct sx in6_multi_sx;
 SX_SYSINIT(in6_multi_sx, &in6_multi_sx, "in6_multi_sx");
 
 
 
 static void	im6f_commit(struct in6_mfilter *);
 static int	im6f_get_source(struct in6_mfilter *imf,
 		    const struct sockaddr_in6 *psin,
 		    struct in6_msource **);
 static struct in6_msource *
 		im6f_graft(struct in6_mfilter *, const uint8_t,
 		    const struct sockaddr_in6 *);
 static void	im6f_leave(struct in6_mfilter *);
 static int	im6f_prune(struct in6_mfilter *, const struct sockaddr_in6 *);
 static void	im6f_purge(struct in6_mfilter *);
 static void	im6f_rollback(struct in6_mfilter *);
 static void	im6f_reap(struct in6_mfilter *);
-static int	im6o_grow(struct ip6_moptions *);
-static size_t	im6o_match_group(const struct ip6_moptions *,
+static struct in6_mfilter *
+		im6o_match_group(const struct ip6_moptions *,
 		    const struct ifnet *, const struct sockaddr *);
 static struct in6_msource *
-		im6o_match_source(const struct ip6_moptions *, const size_t,
-		    const struct sockaddr *);
+		im6o_match_source(struct in6_mfilter *, const struct sockaddr *);
 static void	im6s_merge(struct ip6_msource *ims,
 		    const struct in6_msource *lims, const int rollback);
 static int	in6_getmulti(struct ifnet *, const struct in6_addr *,
 		    struct in6_multi **);
 static int	in6m_get_source(struct in6_multi *inm,
 		    const struct in6_addr *addr, const int noalloc,
 		    struct ip6_msource **pims);
 #ifdef KTR
 static int	in6m_is_ifp_detached(const struct in6_multi *);
 #endif
 static int	in6m_merge(struct in6_multi *, /*const*/ struct in6_mfilter *);
 static void	in6m_purge(struct in6_multi *);
 static void	in6m_reap(struct in6_multi *);
 static struct ip6_moptions *
 		in6p_findmoptions(struct inpcb *);
 static int	in6p_get_source_filters(struct inpcb *, struct sockopt *);
 static int	in6p_join_group(struct inpcb *, struct sockopt *);
 static int	in6p_leave_group(struct inpcb *, struct sockopt *);
 static struct ifnet *
 		in6p_lookup_mcast_ifp(const struct inpcb *,
 		    const struct sockaddr_in6 *);
 static int	in6p_block_unblock_source(struct inpcb *, struct sockopt *);
 static int	in6p_set_multicast_if(struct inpcb *, struct sockopt *);
 static int	in6p_set_source_filters(struct inpcb *, struct sockopt *);
 static int	sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_DECL(_net_inet6_ip6);	/* XXX Not in any common header. */
 
 static SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, mcast, CTLFLAG_RW, 0,
     "IPv6 multicast");
 
 static u_long in6_mcast_maxgrpsrc = IPV6_MAX_GROUP_SRC_FILTER;
 SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxgrpsrc,
     CTLFLAG_RWTUN, &in6_mcast_maxgrpsrc, 0,
     "Max source filters per group");
 
 static u_long in6_mcast_maxsocksrc = IPV6_MAX_SOCK_SRC_FILTER;
 SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxsocksrc,
     CTLFLAG_RWTUN, &in6_mcast_maxsocksrc, 0,
     "Max source filters per socket");
 
 /* TODO Virtualize this switch. */
 int in6_mcast_loop = IPV6_DEFAULT_MULTICAST_LOOP;
 SYSCTL_INT(_net_inet6_ip6_mcast, OID_AUTO, loop, CTLFLAG_RWTUN,
     &in6_mcast_loop, 0, "Loopback multicast datagrams by default");
 
 static SYSCTL_NODE(_net_inet6_ip6_mcast, OID_AUTO, filters,
     CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip6_mcast_filters,
     "Per-interface stack-wide source filters");
 
 #ifdef KTR
 /*
  * Inline function which wraps assertions for a valid ifp.
  * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp
  * is detached.
  */
 static int __inline
 in6m_is_ifp_detached(const struct in6_multi *inm)
 {
 	struct ifnet *ifp;
 
 	KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__));
 	ifp = inm->in6m_ifma->ifma_ifp;
 	if (ifp != NULL) {
 		/*
 		 * Sanity check that network-layer notion of ifp is the
 		 * same as that of link-layer.
 		 */
 		KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__));
 	}
 
 	return (ifp == NULL);
 }
 #endif
 
 /*
  * Initialize an in6_mfilter structure to a known state at t0, t1
  * with an empty source filter list.
  */
 static __inline void
 im6f_init(struct in6_mfilter *imf, const int st0, const int st1)
 {
 	memset(imf, 0, sizeof(struct in6_mfilter));
 	RB_INIT(&imf->im6f_sources);
 	imf->im6f_st[0] = st0;
 	imf->im6f_st[1] = st1;
 }
 
-/*
- * Resize the ip6_moptions vector to the next power-of-two minus 1.
- * May be called with locks held; do not sleep.
- */
-static int
-im6o_grow(struct ip6_moptions *imo)
+struct in6_mfilter *
+ip6_mfilter_alloc(const int mflags, const int st0, const int st1)
 {
-	struct in6_multi	**nmships;
-	struct in6_multi	**omships;
-	struct in6_mfilter	 *nmfilters;
-	struct in6_mfilter	 *omfilters;
-	size_t			  idx;
-	size_t			  newmax;
-	size_t			  oldmax;
+	struct in6_mfilter *imf;
 
-	nmships = NULL;
-	nmfilters = NULL;
-	omships = imo->im6o_membership;
-	omfilters = imo->im6o_mfilters;
-	oldmax = imo->im6o_max_memberships;
-	newmax = ((oldmax + 1) * 2) - 1;
+	imf = malloc(sizeof(*imf), M_IN6MFILTER, mflags);
 
-	if (newmax <= IPV6_MAX_MEMBERSHIPS) {
-		nmships = (struct in6_multi **)realloc(omships,
-		    sizeof(struct in6_multi *) * newmax, M_IP6MOPTS, M_NOWAIT);
-		nmfilters = (struct in6_mfilter *)realloc(omfilters,
-		    sizeof(struct in6_mfilter) * newmax, M_IN6MFILTER,
-		    M_NOWAIT);
-		if (nmships != NULL && nmfilters != NULL) {
-			/* Initialize newly allocated source filter heads. */
-			for (idx = oldmax; idx < newmax; idx++) {
-				im6f_init(&nmfilters[idx], MCAST_UNDEFINED,
-				    MCAST_EXCLUDE);
-			}
-			imo->im6o_max_memberships = newmax;
-			imo->im6o_membership = nmships;
-			imo->im6o_mfilters = nmfilters;
-		}
-	}
+	if (imf != NULL)
+		im6f_init(imf, st0, st1);
 
-	if (nmships == NULL || nmfilters == NULL) {
-		if (nmships != NULL)
-			free(nmships, M_IP6MOPTS);
-		if (nmfilters != NULL)
-			free(nmfilters, M_IN6MFILTER);
-		return (ETOOMANYREFS);
-	}
+	return (imf);
+}
 
-	return (0);
+void
+ip6_mfilter_free(struct in6_mfilter *imf)
+{
+
+	im6f_purge(imf);
+	free(imf, M_IN6MFILTER);
 }
 
 /*
  * Find an IPv6 multicast group entry for this ip6_moptions instance
  * which matches the specified group, and optionally an interface.
  * Return its index into the array, or -1 if not found.
  */
-static size_t
+static struct in6_mfilter *
 im6o_match_group(const struct ip6_moptions *imo, const struct ifnet *ifp,
     const struct sockaddr *group)
 {
 	const struct sockaddr_in6 *gsin6;
-	struct in6_multi	**pinm;
-	int		  idx;
-	int		  nmships;
+        struct in6_mfilter *imf;
+        struct in6_multi *inm;
 
-	gsin6 = (const struct sockaddr_in6 *)group;
+        gsin6 = (const struct sockaddr_in6 *)group;
 
-	/* The im6o_membership array may be lazy allocated. */
-	if (imo->im6o_membership == NULL || imo->im6o_num_memberships == 0)
-		return (-1);
-
-	nmships = imo->im6o_num_memberships;
-	pinm = &imo->im6o_membership[0];
-	for (idx = 0; idx < nmships; idx++, pinm++) {
-		if (*pinm == NULL)
+	IP6_MFILTER_FOREACH(imf, &imo->im6o_head) {
+		inm = imf->im6f_in6m;
+		if (inm == NULL)
 			continue;
-		if ((ifp == NULL || ((*pinm)->in6m_ifp == ifp)) &&
-		    IN6_ARE_ADDR_EQUAL(&(*pinm)->in6m_addr,
+		if ((ifp == NULL || (inm->in6m_ifp == ifp)) &&
+		    IN6_ARE_ADDR_EQUAL(&inm->in6m_addr,
 		    &gsin6->sin6_addr)) {
 			break;
 		}
 	}
-	if (idx >= nmships)
-		idx = -1;
-
-	return (idx);
+	return (imf);
 }
 
 /*
  * Find an IPv6 multicast source entry for this imo which matches
  * the given group index for this socket, and source address.
  *
  * XXX TODO: The scope ID, if present in src, is stripped before
  * any comparison. We SHOULD enforce scope/zone checks where the source
  * filter entry has a link scope.
  *
  * NOTE: This does not check if the entry is in-mode, merely if
  * it exists, which may not be the desired behaviour.
  */
 static struct in6_msource *
-im6o_match_source(const struct ip6_moptions *imo, const size_t gidx,
-    const struct sockaddr *src)
+im6o_match_source(struct in6_mfilter *imf, const struct sockaddr *src)
 {
 	struct ip6_msource	 find;
-	struct in6_mfilter	*imf;
 	struct ip6_msource	*ims;
 	const sockunion_t	*psa;
 
 	KASSERT(src->sa_family == AF_INET6, ("%s: !AF_INET6", __func__));
-	KASSERT(gidx != -1 && gidx < imo->im6o_num_memberships,
-	    ("%s: invalid index %d\n", __func__, (int)gidx));
 
-	/* The im6o_mfilters array may be lazy allocated. */
-	if (imo->im6o_mfilters == NULL)
-		return (NULL);
-	imf = &imo->im6o_mfilters[gidx];
-
 	psa = (const sockunion_t *)src;
 	find.im6s_addr = psa->sin6.sin6_addr;
 	in6_clearscope(&find.im6s_addr);		/* XXX */
 	ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
 
 	return ((struct in6_msource *)ims);
 }
 
 /*
  * Perform filtering for multicast datagrams on a socket by group and source.
  *
  * Returns 0 if a datagram should be allowed through, or various error codes
  * if the socket was not a member of the group, or the source was muted, etc.
  */
 int
 im6o_mc_filter(const struct ip6_moptions *imo, const struct ifnet *ifp,
     const struct sockaddr *group, const struct sockaddr *src)
 {
-	size_t gidx;
+	struct in6_mfilter *imf;
 	struct in6_msource *ims;
 	int mode;
 
 	KASSERT(ifp != NULL, ("%s: null ifp", __func__));
 
-	gidx = im6o_match_group(imo, ifp, group);
-	if (gidx == -1)
+	imf = im6o_match_group(imo, ifp, group);
+	if (imf == NULL)
 		return (MCAST_NOTGMEMBER);
 
 	/*
 	 * Check if the source was included in an (S,G) join.
 	 * Allow reception on exclusive memberships by default,
 	 * reject reception on inclusive memberships by default.
 	 * Exclude source only if an in-mode exclude filter exists.
 	 * Include source only if an in-mode include filter exists.
 	 * NOTE: We are comparing group state here at MLD t1 (now)
 	 * with socket-layer t0 (since last downcall).
 	 */
-	mode = imo->im6o_mfilters[gidx].im6f_st[1];
-	ims = im6o_match_source(imo, gidx, src);
+	mode = imf->im6f_st[1];
+	ims = im6o_match_source(imf, src);
 
 	if ((ims == NULL && mode == MCAST_INCLUDE) ||
 	    (ims != NULL && ims->im6sl_st[0] != mode))
 		return (MCAST_NOTSMEMBER);
 
 	return (MCAST_PASS);
 }
 
 /*
  * Find and return a reference to an in6_multi record for (ifp, group),
  * and bump its reference count.
  * If one does not exist, try to allocate it, and update link-layer multicast
  * filters on ifp to listen for group.
  * Assumes the IN6_MULTI lock is held across the call.
  * Return 0 if successful, otherwise return an appropriate error code.
  */
 static int
 in6_getmulti(struct ifnet *ifp, const struct in6_addr *group,
     struct in6_multi **pinm)
 {
 	struct epoch_tracker	 et;
 	struct sockaddr_in6	 gsin6;
 	struct ifmultiaddr	*ifma;
 	struct in6_multi	*inm;
 	int			 error;
 
 	error = 0;
 
 	/*
 	 * XXX: Accesses to ifma_protospec must be covered by IF_ADDR_LOCK;
 	 * if_addmulti() takes this mutex itself, so we must drop and
 	 * re-acquire around the call.
 	 */
 	IN6_MULTI_LOCK_ASSERT();
 	IN6_MULTI_LIST_LOCK();
 	IF_ADDR_WLOCK(ifp);
 	NET_EPOCH_ENTER(et);
 	inm = in6m_lookup_locked(ifp, group);
 	NET_EPOCH_EXIT(et);
 
 	if (inm != NULL) {
 		/*
 		 * If we already joined this group, just bump the
 		 * refcount and return it.
 		 */
 		KASSERT(inm->in6m_refcount >= 1,
 		    ("%s: bad refcount %d", __func__, inm->in6m_refcount));
 		in6m_acquire_locked(inm);
 		*pinm = inm;
 		goto out_locked;
 	}
 
 	memset(&gsin6, 0, sizeof(gsin6));
 	gsin6.sin6_family = AF_INET6;
 	gsin6.sin6_len = sizeof(struct sockaddr_in6);
 	gsin6.sin6_addr = *group;
 
 	/*
 	 * Check if a link-layer group is already associated
 	 * with this network-layer group on the given ifnet.
 	 */
 	IN6_MULTI_LIST_UNLOCK();
 	IF_ADDR_WUNLOCK(ifp);
 	error = if_addmulti(ifp, (struct sockaddr *)&gsin6, &ifma);
 	if (error != 0)
 		return (error);
 	IN6_MULTI_LIST_LOCK();
 	IF_ADDR_WLOCK(ifp);
 
 	/*
 	 * If something other than netinet6 is occupying the link-layer
 	 * group, print a meaningful error message and back out of
 	 * the allocation.
 	 * Otherwise, bump the refcount on the existing network-layer
 	 * group association and return it.
 	 */
 	if (ifma->ifma_protospec != NULL) {
 		inm = (struct in6_multi *)ifma->ifma_protospec;
 #ifdef INVARIANTS
 		KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr",
 		    __func__));
 		KASSERT(ifma->ifma_addr->sa_family == AF_INET6,
 		    ("%s: ifma not AF_INET6", __func__));
 		KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__));
 		if (inm->in6m_ifma != ifma || inm->in6m_ifp != ifp ||
 		    !IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, group))
 			panic("%s: ifma %p is inconsistent with %p (%p)",
 			    __func__, ifma, inm, group);
 #endif
 		in6m_acquire_locked(inm);
 		*pinm = inm;
 		goto out_locked;
 	}
 
 	IF_ADDR_WLOCK_ASSERT(ifp);
 
 	/*
 	 * A new in6_multi record is needed; allocate and initialize it.
 	 * We DO NOT perform an MLD join as the in6_ layer may need to
 	 * push an initial source list down to MLD to support SSM.
 	 *
 	 * The initial source filter state is INCLUDE, {} as per the RFC.
 	 * Pending state-changes per group are subject to a bounds check.
 	 */
 	inm = malloc(sizeof(*inm), M_IP6MADDR, M_NOWAIT | M_ZERO);
 	if (inm == NULL) {
 		IN6_MULTI_LIST_UNLOCK();
 		IF_ADDR_WUNLOCK(ifp);
 		if_delmulti_ifma(ifma);
 		return (ENOMEM);
 	}
 	inm->in6m_addr = *group;
 	inm->in6m_ifp = ifp;
 	inm->in6m_mli = MLD_IFINFO(ifp);
 	inm->in6m_ifma = ifma;
 	inm->in6m_refcount = 1;
 	inm->in6m_state = MLD_NOT_MEMBER;
 	mbufq_init(&inm->in6m_scq, MLD_MAX_STATE_CHANGES);
 
 	inm->in6m_st[0].iss_fmode = MCAST_UNDEFINED;
 	inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED;
 	RB_INIT(&inm->in6m_srcs);
 
 	ifma->ifma_protospec = inm;
 	*pinm = inm;
 
  out_locked:
 	IN6_MULTI_LIST_UNLOCK();
 	IF_ADDR_WUNLOCK(ifp);
 	return (error);
 }
 
 /*
  * Drop a reference to an in6_multi record.
  *
  * If the refcount drops to 0, free the in6_multi record and
  * delete the underlying link-layer membership.
  */
 static void
 in6m_release(struct in6_multi *inm)
 {
 	struct ifmultiaddr *ifma;
 	struct ifnet *ifp;
 
 	CTR2(KTR_MLD, "%s: refcount is %d", __func__, inm->in6m_refcount);
 
 	MPASS(inm->in6m_refcount == 0);
 	CTR2(KTR_MLD, "%s: freeing inm %p", __func__, inm);
 
 	ifma = inm->in6m_ifma;
 	ifp = inm->in6m_ifp;
 	MPASS(ifma->ifma_llifma == NULL);
 
 	/* XXX this access is not covered by IF_ADDR_LOCK */
 	CTR2(KTR_MLD, "%s: purging ifma %p", __func__, ifma);
 	KASSERT(ifma->ifma_protospec == NULL,
 	    ("%s: ifma_protospec != NULL", __func__));
 	if (ifp == NULL)
 		ifp = ifma->ifma_ifp;
 
 	if (ifp != NULL) {
 		CURVNET_SET(ifp->if_vnet);
 		in6m_purge(inm);
 		free(inm, M_IP6MADDR);
 		if_delmulti_ifma_flags(ifma, 1);
 		CURVNET_RESTORE();
 		if_rele(ifp);
 	} else {
 		in6m_purge(inm);
 		free(inm, M_IP6MADDR);
 		if_delmulti_ifma_flags(ifma, 1);
 	}
 }
 
 static struct grouptask free_gtask;
 static struct in6_multi_head in6m_free_list;
 static void in6m_release_task(void *arg __unused);
 static void in6m_init(void)
 {
 	SLIST_INIT(&in6m_free_list);
 	taskqgroup_config_gtask_init(NULL, &free_gtask, in6m_release_task, "in6m release task");
 }
 
 #ifdef EARLY_AP_STARTUP
 SYSINIT(in6m_init, SI_SUB_SMP + 1, SI_ORDER_FIRST,
 	in6m_init, NULL);
 #else
 SYSINIT(in6m_init, SI_SUB_ROOT_CONF - 1, SI_ORDER_SECOND,
 	in6m_init, NULL);
 #endif
 
 
 void
 in6m_release_list_deferred(struct in6_multi_head *inmh)
 {
 	if (SLIST_EMPTY(inmh))
 		return;
 	mtx_lock(&in6_multi_free_mtx);
 	SLIST_CONCAT(&in6m_free_list, inmh, in6_multi, in6m_nrele);
 	mtx_unlock(&in6_multi_free_mtx);
 	GROUPTASK_ENQUEUE(&free_gtask);
 }
 
 void
 in6m_release_wait(void)
 {
 
 	/* Wait for all jobs to complete. */
 	gtaskqueue_drain_all(free_gtask.gt_taskqueue);
 }
 
 void
 in6m_disconnect_locked(struct in6_multi_head *inmh, struct in6_multi *inm)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct in6_ifaddr *ifa6;
 	struct in6_multi_mship *imm, *imm_tmp;
 	struct ifmultiaddr *ifma, *ll_ifma;
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 
 	ifp = inm->in6m_ifp;
 	if (ifp == NULL)
 		return;		/* already called */
 
 	inm->in6m_ifp = NULL;
 	IF_ADDR_WLOCK_ASSERT(ifp);
 	ifma = inm->in6m_ifma;
 	if (ifma == NULL)
 		return;
 
 	if_ref(ifp);
 	if (ifma->ifma_flags & IFMA_F_ENQUEUED) {
 		CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link);
 		ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
 	}
 	MCDPRINTF("removed ifma: %p from %s\n", ifma, ifp->if_xname);
 	if ((ll_ifma = ifma->ifma_llifma) != NULL) {
 		MPASS(ifma != ll_ifma);
 		ifma->ifma_llifma = NULL;
 		MPASS(ll_ifma->ifma_llifma == NULL);
 		MPASS(ll_ifma->ifma_ifp == ifp);
 		if (--ll_ifma->ifma_refcount == 0) {
 			if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) {
 				CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link);
 				ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
 			}
 			MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname);
 			if_freemulti(ll_ifma);
 		}
 	}
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		ifa6 = (void *)ifa;
 		LIST_FOREACH_SAFE(imm, &ifa6->ia6_memberships,
 		    i6mm_chain, imm_tmp) {
 			if (inm == imm->i6mm_maddr) {
 				LIST_REMOVE(imm, i6mm_chain);
 				free(imm, M_IP6MADDR);
 				in6m_rele_locked(inmh, inm);
 			}
 		}
 	}
 }
 
 static void
 in6m_release_task(void *arg __unused)
 {
 	struct in6_multi_head in6m_free_tmp;
 	struct in6_multi *inm, *tinm;
 
 	SLIST_INIT(&in6m_free_tmp);
 	mtx_lock(&in6_multi_free_mtx);
 	SLIST_CONCAT(&in6m_free_tmp, &in6m_free_list, in6_multi, in6m_nrele);
 	mtx_unlock(&in6_multi_free_mtx);
 	IN6_MULTI_LOCK();
 	SLIST_FOREACH_SAFE(inm, &in6m_free_tmp, in6m_nrele, tinm) {
 		SLIST_REMOVE_HEAD(&in6m_free_tmp, in6m_nrele);
 		in6m_release(inm);
 	}
 	IN6_MULTI_UNLOCK();
 }
 
 /*
  * Clear recorded source entries for a group.
  * Used by the MLD code. Caller must hold the IN6_MULTI lock.
  * FIXME: Should reap.
  */
 void
 in6m_clear_recorded(struct in6_multi *inm)
 {
 	struct ip6_msource	*ims;
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 
 	RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) {
 		if (ims->im6s_stp) {
 			ims->im6s_stp = 0;
 			--inm->in6m_st[1].iss_rec;
 		}
 	}
 	KASSERT(inm->in6m_st[1].iss_rec == 0,
 	    ("%s: iss_rec %d not 0", __func__, inm->in6m_st[1].iss_rec));
 }
 
 /*
  * Record a source as pending for a Source-Group MLDv2 query.
  * This lives here as it modifies the shared tree.
  *
  * inm is the group descriptor.
  * naddr is the address of the source to record in network-byte order.
  *
  * If the net.inet6.mld.sgalloc sysctl is non-zero, we will
  * lazy-allocate a source node in response to an SG query.
  * Otherwise, no allocation is performed. This saves some memory
  * with the trade-off that the source will not be reported to the
  * router if joined in the window between the query response and
  * the group actually being joined on the local host.
  *
  * VIMAGE: XXX: Currently the mld_sgalloc feature has been removed.
  * This turns off the allocation of a recorded source entry if
  * the group has not been joined.
  *
  * Return 0 if the source didn't exist or was already marked as recorded.
  * Return 1 if the source was marked as recorded by this function.
  * Return <0 if any error occurred (negated errno code).
  */
 int
 in6m_record_source(struct in6_multi *inm, const struct in6_addr *addr)
 {
 	struct ip6_msource	 find;
 	struct ip6_msource	*ims, *nims;
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 
 	find.im6s_addr = *addr;
 	ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find);
 	if (ims && ims->im6s_stp)
 		return (0);
 	if (ims == NULL) {
 		if (inm->in6m_nsrc == in6_mcast_maxgrpsrc)
 			return (-ENOSPC);
 		nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (-ENOMEM);
 		nims->im6s_addr = find.im6s_addr;
 		RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims);
 		++inm->in6m_nsrc;
 		ims = nims;
 	}
 
 	/*
 	 * Mark the source as recorded and update the recorded
 	 * source count.
 	 */
 	++ims->im6s_stp;
 	++inm->in6m_st[1].iss_rec;
 
 	return (1);
 }
 
 /*
  * Return a pointer to an in6_msource owned by an in6_mfilter,
  * given its source address.
  * Lazy-allocate if needed. If this is a new entry its filter state is
  * undefined at t0.
  *
  * imf is the filter set being modified.
  * addr is the source address.
  *
  * SMPng: May be called with locks held; malloc must not block.
  */
 static int
 im6f_get_source(struct in6_mfilter *imf, const struct sockaddr_in6 *psin,
     struct in6_msource **plims)
 {
 	struct ip6_msource	 find;
 	struct ip6_msource	*ims, *nims;
 	struct in6_msource	*lims;
 	int			 error;
 
 	error = 0;
 	ims = NULL;
 	lims = NULL;
 
 	find.im6s_addr = psin->sin6_addr;
 	ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
 	lims = (struct in6_msource *)ims;
 	if (lims == NULL) {
 		if (imf->im6f_nsrc == in6_mcast_maxsocksrc)
 			return (ENOSPC);
 		nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (ENOMEM);
 		lims = (struct in6_msource *)nims;
 		lims->im6s_addr = find.im6s_addr;
 		lims->im6sl_st[0] = MCAST_UNDEFINED;
 		RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims);
 		++imf->im6f_nsrc;
 	}
 
 	*plims = lims;
 
 	return (error);
 }
 
 /*
  * Graft a source entry into an existing socket-layer filter set,
  * maintaining any required invariants and checking allocations.
  *
  * The source is marked as being in the new filter mode at t1.
  *
  * Return the pointer to the new node, otherwise return NULL.
  */
 static struct in6_msource *
 im6f_graft(struct in6_mfilter *imf, const uint8_t st1,
     const struct sockaddr_in6 *psin)
 {
 	struct ip6_msource	*nims;
 	struct in6_msource	*lims;
 
 	nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER,
 	    M_NOWAIT | M_ZERO);
 	if (nims == NULL)
 		return (NULL);
 	lims = (struct in6_msource *)nims;
 	lims->im6s_addr = psin->sin6_addr;
 	lims->im6sl_st[0] = MCAST_UNDEFINED;
 	lims->im6sl_st[1] = st1;
 	RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims);
 	++imf->im6f_nsrc;
 
 	return (lims);
 }
 
 /*
  * Prune a source entry from an existing socket-layer filter set,
  * maintaining any required invariants and checking allocations.
  *
  * The source is marked as being left at t1, it is not freed.
  *
  * Return 0 if no error occurred, otherwise return an errno value.
  */
 static int
 im6f_prune(struct in6_mfilter *imf, const struct sockaddr_in6 *psin)
 {
 	struct ip6_msource	 find;
 	struct ip6_msource	*ims;
 	struct in6_msource	*lims;
 
 	find.im6s_addr = psin->sin6_addr;
 	ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
 	if (ims == NULL)
 		return (ENOENT);
 	lims = (struct in6_msource *)ims;
 	lims->im6sl_st[1] = MCAST_UNDEFINED;
 	return (0);
 }
 
 /*
  * Revert socket-layer filter set deltas at t1 to t0 state.
  */
 static void
 im6f_rollback(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims, *tims;
 	struct in6_msource	*lims;
 
 	RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) {
 		lims = (struct in6_msource *)ims;
 		if (lims->im6sl_st[0] == lims->im6sl_st[1]) {
 			/* no change at t1 */
 			continue;
 		} else if (lims->im6sl_st[0] != MCAST_UNDEFINED) {
 			/* revert change to existing source at t1 */
 			lims->im6sl_st[1] = lims->im6sl_st[0];
 		} else {
 			/* revert source added t1 */
 			CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
 			RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims);
 			free(ims, M_IN6MFILTER);
 			imf->im6f_nsrc--;
 		}
 	}
 	imf->im6f_st[1] = imf->im6f_st[0];
 }
 
 /*
  * Mark socket-layer filter set as INCLUDE {} at t1.
  */
 static void
 im6f_leave(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims;
 	struct in6_msource	*lims;
 
 	RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
 		lims = (struct in6_msource *)ims;
 		lims->im6sl_st[1] = MCAST_UNDEFINED;
 	}
 	imf->im6f_st[1] = MCAST_INCLUDE;
 }
 
 /*
  * Mark socket-layer filter set deltas as committed.
  */
 static void
 im6f_commit(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims;
 	struct in6_msource	*lims;
 
 	RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
 		lims = (struct in6_msource *)ims;
 		lims->im6sl_st[0] = lims->im6sl_st[1];
 	}
 	imf->im6f_st[0] = imf->im6f_st[1];
 }
 
 /*
  * Reap unreferenced sources from socket-layer filter set.
  */
 static void
 im6f_reap(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims, *tims;
 	struct in6_msource	*lims;
 
 	RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) {
 		lims = (struct in6_msource *)ims;
 		if ((lims->im6sl_st[0] == MCAST_UNDEFINED) &&
 		    (lims->im6sl_st[1] == MCAST_UNDEFINED)) {
 			CTR2(KTR_MLD, "%s: free lims %p", __func__, ims);
 			RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims);
 			free(ims, M_IN6MFILTER);
 			imf->im6f_nsrc--;
 		}
 	}
 }
 
 /*
  * Purge socket-layer filter set.
  */
 static void
 im6f_purge(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) {
 		CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims);
 		free(ims, M_IN6MFILTER);
 		imf->im6f_nsrc--;
 	}
 	imf->im6f_st[0] = imf->im6f_st[1] = MCAST_UNDEFINED;
 	KASSERT(RB_EMPTY(&imf->im6f_sources),
 	    ("%s: im6f_sources not empty", __func__));
 }
 
 /*
  * Look up a source filter entry for a multicast group.
  *
  * inm is the group descriptor to work with.
  * addr is the IPv6 address to look up.
  * noalloc may be non-zero to suppress allocation of sources.
  * *pims will be set to the address of the retrieved or allocated source.
  *
  * SMPng: NOTE: may be called with locks held.
  * Return 0 if successful, otherwise return a non-zero error code.
  */
 static int
 in6m_get_source(struct in6_multi *inm, const struct in6_addr *addr,
     const int noalloc, struct ip6_msource **pims)
 {
 	struct ip6_msource	 find;
 	struct ip6_msource	*ims, *nims;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	find.im6s_addr = *addr;
 	ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find);
 	if (ims == NULL && !noalloc) {
 		if (inm->in6m_nsrc == in6_mcast_maxgrpsrc)
 			return (ENOSPC);
 		nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (ENOMEM);
 		nims->im6s_addr = *addr;
 		RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims);
 		++inm->in6m_nsrc;
 		ims = nims;
 		CTR3(KTR_MLD, "%s: allocated %s as %p", __func__,
 		    ip6_sprintf(ip6tbuf, addr), ims);
 	}
 
 	*pims = ims;
 	return (0);
 }
 
 /*
  * Merge socket-layer source into MLD-layer source.
  * If rollback is non-zero, perform the inverse of the merge.
  */
 static void
 im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims,
     const int rollback)
 {
 	int n = rollback ? -1 : 1;
 #ifdef KTR
 	char ip6tbuf[INET6_ADDRSTRLEN];
 
 	ip6_sprintf(ip6tbuf, &lims->im6s_addr);
 #endif
 
 	if (lims->im6sl_st[0] == MCAST_EXCLUDE) {
 		CTR3(KTR_MLD, "%s: t1 ex -= %d on %s", __func__, n, ip6tbuf);
 		ims->im6s_st[1].ex -= n;
 	} else if (lims->im6sl_st[0] == MCAST_INCLUDE) {
 		CTR3(KTR_MLD, "%s: t1 in -= %d on %s", __func__, n, ip6tbuf);
 		ims->im6s_st[1].in -= n;
 	}
 
 	if (lims->im6sl_st[1] == MCAST_EXCLUDE) {
 		CTR3(KTR_MLD, "%s: t1 ex += %d on %s", __func__, n, ip6tbuf);
 		ims->im6s_st[1].ex += n;
 	} else if (lims->im6sl_st[1] == MCAST_INCLUDE) {
 		CTR3(KTR_MLD, "%s: t1 in += %d on %s", __func__, n, ip6tbuf);
 		ims->im6s_st[1].in += n;
 	}
 }
 
 /*
  * Atomically update the global in6_multi state, when a membership's
  * filter list is being updated in any way.
  *
  * imf is the per-inpcb-membership group filter pointer.
  * A fake imf may be passed for in-kernel consumers.
  *
  * XXX This is a candidate for a set-symmetric-difference style loop
  * which would eliminate the repeated lookup from root of ims nodes,
  * as they share the same key space.
  *
  * If any error occurred this function will back out of refcounts
  * and return a non-zero value.
  */
 static int
 in6m_merge(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims, *nims;
 	struct in6_msource	*lims;
 	int			 schanged, error;
 	int			 nsrc0, nsrc1;
 
 	schanged = 0;
 	error = 0;
 	nsrc1 = nsrc0 = 0;
 	IN6_MULTI_LIST_LOCK_ASSERT();
 
 	/*
 	 * Update the source filters first, as this may fail.
 	 * Maintain count of in-mode filters at t0, t1. These are
 	 * used to work out if we transition into ASM mode or not.
 	 * Maintain a count of source filters whose state was
 	 * actually modified by this operation.
 	 */
 	RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
 		lims = (struct in6_msource *)ims;
 		if (lims->im6sl_st[0] == imf->im6f_st[0]) nsrc0++;
 		if (lims->im6sl_st[1] == imf->im6f_st[1]) nsrc1++;
 		if (lims->im6sl_st[0] == lims->im6sl_st[1]) continue;
 		error = in6m_get_source(inm, &lims->im6s_addr, 0, &nims);
 		++schanged;
 		if (error)
 			break;
 		im6s_merge(nims, lims, 0);
 	}
 	if (error) {
 		struct ip6_msource *bims;
 
 		RB_FOREACH_REVERSE_FROM(ims, ip6_msource_tree, nims) {
 			lims = (struct in6_msource *)ims;
 			if (lims->im6sl_st[0] == lims->im6sl_st[1])
 				continue;
 			(void)in6m_get_source(inm, &lims->im6s_addr, 1, &bims);
 			if (bims == NULL)
 				continue;
 			im6s_merge(bims, lims, 1);
 		}
 		goto out_reap;
 	}
 
 	CTR3(KTR_MLD, "%s: imf filters in-mode: %d at t0, %d at t1",
 	    __func__, nsrc0, nsrc1);
 
 	/* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */
 	if (imf->im6f_st[0] == imf->im6f_st[1] &&
 	    imf->im6f_st[1] == MCAST_INCLUDE) {
 		if (nsrc1 == 0) {
 			CTR1(KTR_MLD, "%s: --in on inm at t1", __func__);
 			--inm->in6m_st[1].iss_in;
 		}
 	}
 
 	/* Handle filter mode transition on socket. */
 	if (imf->im6f_st[0] != imf->im6f_st[1]) {
 		CTR3(KTR_MLD, "%s: imf transition %d to %d",
 		    __func__, imf->im6f_st[0], imf->im6f_st[1]);
 
 		if (imf->im6f_st[0] == MCAST_EXCLUDE) {
 			CTR1(KTR_MLD, "%s: --ex on inm at t1", __func__);
 			--inm->in6m_st[1].iss_ex;
 		} else if (imf->im6f_st[0] == MCAST_INCLUDE) {
 			CTR1(KTR_MLD, "%s: --in on inm at t1", __func__);
 			--inm->in6m_st[1].iss_in;
 		}
 
 		if (imf->im6f_st[1] == MCAST_EXCLUDE) {
 			CTR1(KTR_MLD, "%s: ex++ on inm at t1", __func__);
 			inm->in6m_st[1].iss_ex++;
 		} else if (imf->im6f_st[1] == MCAST_INCLUDE && nsrc1 > 0) {
 			CTR1(KTR_MLD, "%s: in++ on inm at t1", __func__);
 			inm->in6m_st[1].iss_in++;
 		}
 	}
 
 	/*
 	 * Track inm filter state in terms of listener counts.
 	 * If there are any exclusive listeners, stack-wide
 	 * membership is exclusive.
 	 * Otherwise, if only inclusive listeners, stack-wide is inclusive.
 	 * If no listeners remain, state is undefined at t1,
 	 * and the MLD lifecycle for this group should finish.
 	 */
 	if (inm->in6m_st[1].iss_ex > 0) {
 		CTR1(KTR_MLD, "%s: transition to EX", __func__);
 		inm->in6m_st[1].iss_fmode = MCAST_EXCLUDE;
 	} else if (inm->in6m_st[1].iss_in > 0) {
 		CTR1(KTR_MLD, "%s: transition to IN", __func__);
 		inm->in6m_st[1].iss_fmode = MCAST_INCLUDE;
 	} else {
 		CTR1(KTR_MLD, "%s: transition to UNDEF", __func__);
 		inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED;
 	}
 
 	/* Decrement ASM listener count on transition out of ASM mode. */
 	if (imf->im6f_st[0] == MCAST_EXCLUDE && nsrc0 == 0) {
 		if ((imf->im6f_st[1] != MCAST_EXCLUDE) ||
 		    (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) {
 			CTR1(KTR_MLD, "%s: --asm on inm at t1", __func__);
 			--inm->in6m_st[1].iss_asm;
 		}
 	}
 
 	/* Increment ASM listener count on transition to ASM mode. */
 	if (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 == 0) {
 		CTR1(KTR_MLD, "%s: asm++ on inm at t1", __func__);
 		inm->in6m_st[1].iss_asm++;
 	}
 
 	CTR3(KTR_MLD, "%s: merged imf %p to inm %p", __func__, imf, inm);
 	in6m_print(inm);
 
 out_reap:
 	if (schanged > 0) {
 		CTR1(KTR_MLD, "%s: sources changed; reaping", __func__);
 		in6m_reap(inm);
 	}
 	return (error);
 }
 
 /*
  * Mark an in6_multi's filter set deltas as committed.
  * Called by MLD after a state change has been enqueued.
  */
 void
 in6m_commit(struct in6_multi *inm)
 {
 	struct ip6_msource	*ims;
 
 	CTR2(KTR_MLD, "%s: commit inm %p", __func__, inm);
 	CTR1(KTR_MLD, "%s: pre commit:", __func__);
 	in6m_print(inm);
 
 	RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) {
 		ims->im6s_st[0] = ims->im6s_st[1];
 	}
 	inm->in6m_st[0] = inm->in6m_st[1];
 }
 
 /*
  * Reap unreferenced nodes from an in6_multi's filter set.
  */
 static void
 in6m_reap(struct in6_multi *inm)
 {
 	struct ip6_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) {
 		if (ims->im6s_st[0].ex > 0 || ims->im6s_st[0].in > 0 ||
 		    ims->im6s_st[1].ex > 0 || ims->im6s_st[1].in > 0 ||
 		    ims->im6s_stp != 0)
 			continue;
 		CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims);
 		free(ims, M_IP6MSOURCE);
 		inm->in6m_nsrc--;
 	}
 }
 
 /*
  * Purge all source nodes from an in6_multi's filter set.
  */
 static void
 in6m_purge(struct in6_multi *inm)
 {
 	struct ip6_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) {
 		CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims);
 		free(ims, M_IP6MSOURCE);
 		inm->in6m_nsrc--;
 	}
 	/* Free state-change requests that might be queued. */
 	mbufq_drain(&inm->in6m_scq);
 }
 
 /*
  * Join a multicast address w/o sources.
  * KAME compatibility entry point.
  *
  * SMPng: Assume no mc locks held by caller.
  */
 int
 in6_joingroup(struct ifnet *ifp, const struct in6_addr *mcaddr,
     /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm,
     const int delay)
 {
 	int error;
 
 	IN6_MULTI_LOCK();
 	error = in6_joingroup_locked(ifp, mcaddr, NULL, pinm, delay);
 	IN6_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Join a multicast group; real entry point.
  *
  * Only preserves atomicity at inm level.
  * NOTE: imf argument cannot be const due to sys/tree.h limitations.
  *
  * If the MLD downcall fails, the group is not joined, and an error
  * code is returned.
  */
 int
 in6_joingroup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr,
     /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm,
     const int delay)
 {
 	struct in6_multi_head    inmh;
 	struct in6_mfilter	 timf;
 	struct in6_multi	*inm;
 	struct ifmultiaddr *ifma;
 	int			 error;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	/*
 	 * Sanity: Check scope zone ID was set for ifp, if and
 	 * only if group is scoped to an interface.
 	 */
 	KASSERT(IN6_IS_ADDR_MULTICAST(mcaddr),
 	    ("%s: not a multicast address", __func__));
 	if (IN6_IS_ADDR_MC_LINKLOCAL(mcaddr) ||
 	    IN6_IS_ADDR_MC_INTFACELOCAL(mcaddr)) {
 		KASSERT(mcaddr->s6_addr16[1] != 0,
 		    ("%s: scope zone ID not set", __func__));
 	}
 
 	IN6_MULTI_LOCK_ASSERT();
 	IN6_MULTI_LIST_UNLOCK_ASSERT();
 
 	CTR4(KTR_MLD, "%s: join %s on %p(%s))", __func__,
 	    ip6_sprintf(ip6tbuf, mcaddr), ifp, if_name(ifp));
 
 	error = 0;
 	inm = NULL;
 
 	/*
 	 * If no imf was specified (i.e. kernel consumer),
 	 * fake one up and assume it is an ASM join.
 	 */
 	if (imf == NULL) {
 		im6f_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE);
 		imf = &timf;
 	}
 	error = in6_getmulti(ifp, mcaddr, &inm);
 	if (error) {
 		CTR1(KTR_MLD, "%s: in6_getmulti() failure", __func__);
 		return (error);
 	}
 
 	IN6_MULTI_LIST_LOCK();
 	CTR1(KTR_MLD, "%s: merge inm state", __func__);
 	error = in6m_merge(inm, imf);
 	if (error) {
 		CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
 		goto out_in6m_release;
 	}
 
 	CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 	error = mld_change_state(inm, delay);
 	if (error) {
 		CTR1(KTR_MLD, "%s: failed to update source", __func__);
 		goto out_in6m_release;
 	}
 
 out_in6m_release:
 	SLIST_INIT(&inmh);
 	if (error) {
 		struct epoch_tracker et;
 
 		CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm);
 		NET_EPOCH_ENTER(et);
 		CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (ifma->ifma_protospec == inm) {
 				ifma->ifma_protospec = NULL;
 				break;
 			}
 		}
 		in6m_disconnect_locked(&inmh, inm);
 		in6m_rele_locked(&inmh, inm);
 		NET_EPOCH_EXIT(et);
 	} else {
 		*pinm = inm;
 	}
 	IN6_MULTI_LIST_UNLOCK();
 	in6m_release_list_deferred(&inmh);
 	return (error);
 }
 
 /*
  * Leave a multicast group; unlocked entry point.
  */
 int
 in6_leavegroup(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
 {
 	int error;
 
 	IN6_MULTI_LOCK();
 	error = in6_leavegroup_locked(inm, imf);
 	IN6_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Leave a multicast group; real entry point.
  * All source filters will be expunged.
  *
  * Only preserves atomicity at inm level.
  *
  * Holding the write lock for the INP which contains imf
  * is highly advisable. We can't assert for it as imf does not
  * contain a back-pointer to the owning inp.
  *
  * Note: This is not the same as in6m_release(*) as this function also
  * makes a state change downcall into MLD.
  */
 int
 in6_leavegroup_locked(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
 {
 	struct in6_multi_head	 inmh;
 	struct in6_mfilter	 timf;
 	struct ifnet *ifp;
 	int			 error;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	error = 0;
 
 	IN6_MULTI_LOCK_ASSERT();
 
 	CTR5(KTR_MLD, "%s: leave inm %p, %s/%s, imf %p", __func__,
 	    inm, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    (in6m_is_ifp_detached(inm) ? "null" : if_name(inm->in6m_ifp)),
 	    imf);
 
 	/*
 	 * If no imf was specified (i.e. kernel consumer),
 	 * fake one up and assume it is an ASM join.
 	 */
 	if (imf == NULL) {
 		im6f_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED);
 		imf = &timf;
 	}
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 *
 	 * As this particular invocation should not cause any memory
 	 * to be allocated, and there is no opportunity to roll back
 	 * the transaction, it MUST NOT fail.
 	 */
 
 	ifp = inm->in6m_ifp;
 	IN6_MULTI_LIST_LOCK();
 	CTR1(KTR_MLD, "%s: merge inm state", __func__);
 	error = in6m_merge(inm, imf);
 	KASSERT(error == 0, ("%s: failed to merge inm state", __func__));
 
 	CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 	error = 0;
 	if (ifp)
 		error = mld_change_state(inm, 0);
 	if (error)
 		CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
 
 	CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm);
 	if (ifp)
 		IF_ADDR_WLOCK(ifp);
 
 	SLIST_INIT(&inmh);
 	if (inm->in6m_refcount == 1)
 		in6m_disconnect_locked(&inmh, inm);
 	in6m_rele_locked(&inmh, inm);
 	if (ifp)
 		IF_ADDR_WUNLOCK(ifp);
 	IN6_MULTI_LIST_UNLOCK();
 	in6m_release_list_deferred(&inmh);
 	return (error);
 }
 
 
 /*
  * Block or unblock an ASM multicast source on an inpcb.
  * This implements the delta-based API described in RFC 3678.
  *
  * The delta-based API applies only to exclusive-mode memberships.
  * An MLD downcall will be performed.
  *
  * SMPng: NOTE: Must take Giant as a join may create a new ifma.
  *
  * Return 0 if successful, otherwise return an appropriate error code.
  */
 static int
 in6p_block_unblock_source(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct group_source_req		 gsr;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in6_mfilter		*imf;
 	struct ip6_moptions		*imo;
 	struct in6_msource		*ims;
 	struct in6_multi			*inm;
-	size_t				 idx;
 	uint16_t			 fmode;
 	int				 error, doblock;
 #ifdef KTR
 	char				 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	ifp = NULL;
 	error = 0;
 	doblock = 0;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 
 	switch (sopt->sopt_name) {
 	case MCAST_BLOCK_SOURCE:
 	case MCAST_UNBLOCK_SOURCE:
 		error = sooptcopyin(sopt, &gsr,
 		    sizeof(struct group_source_req),
 		    sizeof(struct group_source_req));
 		if (error)
 			return (error);
 
 		if (gsa->sin6.sin6_family != AF_INET6 ||
 		    gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 
 		if (ssa->sin6.sin6_family != AF_INET6 ||
 		    ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 
 		if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
 			return (EADDRNOTAVAIL);
 
 		ifp = ifnet_byindex(gsr.gsr_interface);
 
 		if (sopt->sopt_name == MCAST_BLOCK_SOURCE)
 			doblock = 1;
 		break;
 
 	default:
 		CTR2(KTR_MLD, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 
 	/*
 	 * Check if we are actually a member of this group.
 	 */
 	imo = in6p_findmoptions(inp);
-	idx = im6o_match_group(imo, ifp, &gsa->sa);
-	if (idx == -1 || imo->im6o_mfilters == NULL) {
+	imf = im6o_match_group(imo, ifp, &gsa->sa);
+	if (imf == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_in6p_locked;
 	}
+	inm = imf->im6f_in6m;
 
-	KASSERT(imo->im6o_mfilters != NULL,
-	    ("%s: im6o_mfilters not allocated", __func__));
-	imf = &imo->im6o_mfilters[idx];
-	inm = imo->im6o_membership[idx];
-
 	/*
 	 * Attempting to use the delta-based API on an
 	 * non exclusive-mode membership is an error.
 	 */
 	fmode = imf->im6f_st[0];
 	if (fmode != MCAST_EXCLUDE) {
 		error = EINVAL;
 		goto out_in6p_locked;
 	}
 
 	/*
 	 * Deal with error cases up-front:
 	 *  Asked to block, but already blocked; or
 	 *  Asked to unblock, but nothing to unblock.
 	 * If adding a new block entry, allocate it.
 	 */
-	ims = im6o_match_source(imo, idx, &ssa->sa);
+	ims = im6o_match_source(imf, &ssa->sa);
 	if ((ims != NULL && doblock) || (ims == NULL && !doblock)) {
 		CTR3(KTR_MLD, "%s: source %s %spresent", __func__,
 		    ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr),
 		    doblock ? "" : "not ");
 		error = EADDRNOTAVAIL;
 		goto out_in6p_locked;
 	}
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	if (doblock) {
 		CTR2(KTR_MLD, "%s: %s source", __func__, "block");
 		ims = im6f_graft(imf, fmode, &ssa->sin6);
 		if (ims == NULL)
 			error = ENOMEM;
 	} else {
 		CTR2(KTR_MLD, "%s: %s source", __func__, "allow");
 		error = im6f_prune(imf, &ssa->sin6);
 	}
 
 	if (error) {
 		CTR1(KTR_MLD, "%s: merge imf state failed", __func__);
 		goto out_im6f_rollback;
 	}
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 */
 	IN6_MULTI_LIST_LOCK();
 	CTR1(KTR_MLD, "%s: merge inm state", __func__);
 	error = in6m_merge(inm, imf);
 	if (error)
 		CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
 	else {
 		CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 		error = mld_change_state(inm, 0);
 		if (error)
 			CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
 	}
 
 	IN6_MULTI_LIST_UNLOCK();
 
 out_im6f_rollback:
 	if (error)
 		im6f_rollback(imf);
 	else
 		im6f_commit(imf);
 
 	im6f_reap(imf);
 
 out_in6p_locked:
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Given an inpcb, return its multicast options structure pointer.  Accepts
  * an unlocked inpcb pointer, but will return it locked.  May sleep.
  *
  * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
  * SMPng: NOTE: Returns with the INP write lock held.
  */
 static struct ip6_moptions *
 in6p_findmoptions(struct inpcb *inp)
 {
 	struct ip6_moptions	 *imo;
-	struct in6_multi		**immp;
-	struct in6_mfilter	 *imfp;
-	size_t			  idx;
 
 	INP_WLOCK(inp);
 	if (inp->in6p_moptions != NULL)
 		return (inp->in6p_moptions);
 
 	INP_WUNLOCK(inp);
 
 	imo = malloc(sizeof(*imo), M_IP6MOPTS, M_WAITOK);
-	immp = malloc(sizeof(*immp) * IPV6_MIN_MEMBERSHIPS, M_IP6MOPTS,
-	    M_WAITOK | M_ZERO);
-	imfp = malloc(sizeof(struct in6_mfilter) * IPV6_MIN_MEMBERSHIPS,
-	    M_IN6MFILTER, M_WAITOK);
 
 	imo->im6o_multicast_ifp = NULL;
 	imo->im6o_multicast_hlim = V_ip6_defmcasthlim;
 	imo->im6o_multicast_loop = in6_mcast_loop;
-	imo->im6o_num_memberships = 0;
-	imo->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS;
-	imo->im6o_membership = immp;
+	STAILQ_INIT(&imo->im6o_head);
 
-	/* Initialize per-group source filters. */
-	for (idx = 0; idx < IPV6_MIN_MEMBERSHIPS; idx++)
-		im6f_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE);
-	imo->im6o_mfilters = imfp;
-
 	INP_WLOCK(inp);
 	if (inp->in6p_moptions != NULL) {
-		free(imfp, M_IN6MFILTER);
-		free(immp, M_IP6MOPTS);
 		free(imo, M_IP6MOPTS);
 		return (inp->in6p_moptions);
 	}
 	inp->in6p_moptions = imo;
 	return (imo);
 }
 
 /*
  * Discard the IPv6 multicast options (and source filters).
  *
  * SMPng: NOTE: assumes INP write lock is held.
  *
  * XXX can all be safely deferred to epoch_call
  *
  */
 
 static void
 inp_gcmoptions(struct ip6_moptions *imo)
 {
-	struct in6_mfilter	*imf;
+	struct in6_mfilter *imf;
 	struct in6_multi *inm;
 	struct ifnet *ifp;
-	size_t			 idx, nmships;
 
-	nmships = imo->im6o_num_memberships;
-	for (idx = 0; idx < nmships; ++idx) {
-		imf = imo->im6o_mfilters ? &imo->im6o_mfilters[idx] : NULL;
-		if (imf)
-			im6f_leave(imf);
-		inm = imo->im6o_membership[idx];
-		ifp = inm->in6m_ifp;
-		if (ifp != NULL) {
-			CURVNET_SET(ifp->if_vnet);
-			(void)in6_leavegroup(inm, imf);
-			CURVNET_RESTORE();
-		} else {
-			(void)in6_leavegroup(inm, imf);
-		}
-		if (imf)
-			im6f_purge(imf);
-	}
+	while ((imf = ip6_mfilter_first(&imo->im6o_head)) != NULL) {
+                ip6_mfilter_remove(&imo->im6o_head, imf);
 
-	if (imo->im6o_mfilters)
-		free(imo->im6o_mfilters, M_IN6MFILTER);
-	free(imo->im6o_membership, M_IP6MOPTS);
-	free(imo, M_IP6MOPTS);
+                im6f_leave(imf);
+                if ((inm = imf->im6f_in6m) != NULL) {
+                        if ((ifp = inm->in6m_ifp) != NULL) {
+                                CURVNET_SET(ifp->if_vnet);
+                                (void)in6_leavegroup(inm, imf);
+                                CURVNET_RESTORE();
+                        } else {
+                                (void)in6_leavegroup(inm, imf);
+                        }
+                }
+                ip6_mfilter_free(imf);
+        }
+        free(imo, M_IP6MOPTS);
 }
 
 void
 ip6_freemoptions(struct ip6_moptions *imo)
 {
 	if (imo == NULL)
 		return;
 	inp_gcmoptions(imo);
 }
 
 /*
  * Atomically get source filters on a socket for an IPv6 multicast group.
  * Called with INP lock held; returns with lock released.
  */
 static int
 in6p_get_source_filters(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct __msfilterreq	 msfr;
 	sockunion_t		*gsa;
 	struct ifnet		*ifp;
 	struct ip6_moptions	*imo;
 	struct in6_mfilter	*imf;
 	struct ip6_msource	*ims;
 	struct in6_msource	*lims;
 	struct sockaddr_in6	*psin;
 	struct sockaddr_storage	*ptss;
 	struct sockaddr_storage	*tss;
 	int			 error;
-	size_t			 idx, nsrcs, ncsrcs;
+	size_t			 nsrcs, ncsrcs;
 
 	INP_WLOCK_ASSERT(inp);
 
 	imo = inp->in6p_moptions;
 	KASSERT(imo != NULL, ("%s: null ip6_moptions", __func__));
 
 	INP_WUNLOCK(inp);
 
 	error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
 	    sizeof(struct __msfilterreq));
 	if (error)
 		return (error);
 
 	if (msfr.msfr_group.ss_family != AF_INET6 ||
 	    msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6))
 		return (EINVAL);
 
 	gsa = (sockunion_t *)&msfr.msfr_group;
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
 		return (EADDRNOTAVAIL);
 	ifp = ifnet_byindex(msfr.msfr_ifindex);
 	if (ifp == NULL)
 		return (EADDRNOTAVAIL);
 	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 
 	INP_WLOCK(inp);
 
 	/*
 	 * Lookup group on the socket.
 	 */
-	idx = im6o_match_group(imo, ifp, &gsa->sa);
-	if (idx == -1 || imo->im6o_mfilters == NULL) {
+	imf = im6o_match_group(imo, ifp, &gsa->sa);
+	if (imf == NULL) {
 		INP_WUNLOCK(inp);
 		return (EADDRNOTAVAIL);
 	}
-	imf = &imo->im6o_mfilters[idx];
 
 	/*
 	 * Ignore memberships which are in limbo.
 	 */
 	if (imf->im6f_st[1] == MCAST_UNDEFINED) {
 		INP_WUNLOCK(inp);
 		return (EAGAIN);
 	}
 	msfr.msfr_fmode = imf->im6f_st[1];
 
 	/*
 	 * If the user specified a buffer, copy out the source filter
 	 * entries to userland gracefully.
 	 * We only copy out the number of entries which userland
 	 * has asked for, but we always tell userland how big the
 	 * buffer really needs to be.
 	 */
 	if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc)
 		msfr.msfr_nsrcs = in6_mcast_maxsocksrc;
 	tss = NULL;
 	if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) {
 		tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
 		    M_TEMP, M_NOWAIT | M_ZERO);
 		if (tss == NULL) {
 			INP_WUNLOCK(inp);
 			return (ENOBUFS);
 		}
 	}
 
 	/*
 	 * Count number of sources in-mode at t0.
 	 * If buffer space exists and remains, copy out source entries.
 	 */
 	nsrcs = msfr.msfr_nsrcs;
 	ncsrcs = 0;
 	ptss = tss;
 	RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
 		lims = (struct in6_msource *)ims;
 		if (lims->im6sl_st[0] == MCAST_UNDEFINED ||
 		    lims->im6sl_st[0] != imf->im6f_st[0])
 			continue;
 		++ncsrcs;
 		if (tss != NULL && nsrcs > 0) {
 			psin = (struct sockaddr_in6 *)ptss;
 			psin->sin6_family = AF_INET6;
 			psin->sin6_len = sizeof(struct sockaddr_in6);
 			psin->sin6_addr = lims->im6s_addr;
 			psin->sin6_port = 0;
 			--nsrcs;
 			++ptss;
 		}
 	}
 
 	INP_WUNLOCK(inp);
 
 	if (tss != NULL) {
 		error = copyout(tss, msfr.msfr_srcs,
 		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
 		free(tss, M_TEMP);
 		if (error)
 			return (error);
 	}
 
 	msfr.msfr_nsrcs = ncsrcs;
 	error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq));
 
 	return (error);
 }
 
 /*
  * Return the IP multicast options in response to user getsockopt().
  */
 int
 ip6_getmoptions(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ip6_moptions	*im6o;
 	int			 error;
 	u_int			 optval;
 
 	INP_WLOCK(inp);
 	im6o = inp->in6p_moptions;
 	/*
 	 * If socket is neither of type SOCK_RAW or SOCK_DGRAM,
 	 * or is a divert socket, reject it.
 	 */
 	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
 	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
 	    inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) {
 		INP_WUNLOCK(inp);
 		return (EOPNOTSUPP);
 	}
 
 	error = 0;
 	switch (sopt->sopt_name) {
 	case IPV6_MULTICAST_IF:
 		if (im6o == NULL || im6o->im6o_multicast_ifp == NULL) {
 			optval = 0;
 		} else {
 			optval = im6o->im6o_multicast_ifp->if_index;
 		}
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &optval, sizeof(u_int));
 		break;
 
 	case IPV6_MULTICAST_HOPS:
 		if (im6o == NULL)
 			optval = V_ip6_defmcasthlim;
 		else
 			optval = im6o->im6o_multicast_hlim;
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &optval, sizeof(u_int));
 		break;
 
 	case IPV6_MULTICAST_LOOP:
 		if (im6o == NULL)
 			optval = in6_mcast_loop; /* XXX VIMAGE */
 		else
 			optval = im6o->im6o_multicast_loop;
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &optval, sizeof(u_int));
 		break;
 
 	case IPV6_MSFILTER:
 		if (im6o == NULL) {
 			error = EADDRNOTAVAIL;
 			INP_WUNLOCK(inp);
 		} else {
 			error = in6p_get_source_filters(inp, sopt);
 		}
 		break;
 
 	default:
 		INP_WUNLOCK(inp);
 		error = ENOPROTOOPT;
 		break;
 	}
 
 	INP_UNLOCK_ASSERT(inp);
 
 	return (error);
 }
 
 /*
  * Look up the ifnet to use for a multicast group membership,
  * given the address of an IPv6 group.
  *
  * This routine exists to support legacy IPv6 multicast applications.
  *
  * If inp is non-NULL, use this socket's current FIB number for any
  * required FIB lookup. Look up the group address in the unicast FIB,
  * and use its ifp; usually, this points to the default next-hop.
  * If the FIB lookup fails, return NULL.
  *
  * FUTURE: Support multiple forwarding tables for IPv6.
  *
  * Returns NULL if no ifp could be found.
  */
 static struct ifnet *
 in6p_lookup_mcast_ifp(const struct inpcb *in6p,
     const struct sockaddr_in6 *gsin6)
 {
 	struct nhop6_basic	nh6;
 	struct in6_addr		dst;
 	uint32_t		scopeid;
 	uint32_t		fibnum;
 
 	KASSERT(in6p->inp_vflag & INP_IPV6,
 	    ("%s: not INP_IPV6 inpcb", __func__));
 	KASSERT(gsin6->sin6_family == AF_INET6,
 	    ("%s: not AF_INET6 group", __func__));
 
 	in6_splitscope(&gsin6->sin6_addr, &dst, &scopeid);
 	fibnum = in6p ? in6p->inp_inc.inc_fibnum : RT_DEFAULT_FIB;
 	if (fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6) != 0)
 		return (NULL);
 
 	return (nh6.nh_ifp);
 }
 
 /*
  * Join an IPv6 multicast group, possibly with a source.
  *
  * FIXME: The KAME use of the unspecified address (::)
  * to join *all* multicast groups is currently unsupported.
  */
 static int
 in6p_join_group(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct in6_multi_head		 inmh;
 	struct group_source_req		 gsr;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in6_mfilter		*imf;
 	struct ip6_moptions		*imo;
 	struct in6_multi		*inm;
 	struct in6_msource		*lims;
-	size_t				 idx;
 	int				 error, is_new;
 
 	SLIST_INIT(&inmh);
 	ifp = NULL;
-	imf = NULL;
 	lims = NULL;
 	error = 0;
-	is_new = 0;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	gsa->ss.ss_family = AF_UNSPEC;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 	ssa->ss.ss_family = AF_UNSPEC;
 
 	/*
 	 * Chew everything into struct group_source_req.
 	 * Overwrite the port field if present, as the sockaddr
 	 * being copied in may be matched with a binary comparison.
 	 * Ignore passed-in scope ID.
 	 */
 	switch (sopt->sopt_name) {
 	case IPV6_JOIN_GROUP: {
 		struct ipv6_mreq mreq;
 
 		error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq),
 		    sizeof(struct ipv6_mreq));
 		if (error)
 			return (error);
 
 		gsa->sin6.sin6_family = AF_INET6;
 		gsa->sin6.sin6_len = sizeof(struct sockaddr_in6);
 		gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr;
 
 		if (mreq.ipv6mr_interface == 0) {
 			ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6);
 		} else {
 			if (V_if_index < mreq.ipv6mr_interface)
 				return (EADDRNOTAVAIL);
 			ifp = ifnet_byindex(mreq.ipv6mr_interface);
 		}
 		CTR3(KTR_MLD, "%s: ipv6mr_interface = %d, ifp = %p",
 		    __func__, mreq.ipv6mr_interface, ifp);
 	} break;
 
 	case MCAST_JOIN_GROUP:
 	case MCAST_JOIN_SOURCE_GROUP:
 		if (sopt->sopt_name == MCAST_JOIN_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_req),
 			    sizeof(struct group_req));
 		} else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_source_req),
 			    sizeof(struct group_source_req));
 		}
 		if (error)
 			return (error);
 
 		if (gsa->sin6.sin6_family != AF_INET6 ||
 		    gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 
 		if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
 			if (ssa->sin6.sin6_family != AF_INET6 ||
 			    ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 				return (EINVAL);
 			if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr))
 				return (EINVAL);
 			/*
 			 * TODO: Validate embedded scope ID in source
 			 * list entry against passed-in ifp, if and only
 			 * if source list filter entry is iface or node local.
 			 */
 			in6_clearscope(&ssa->sin6.sin6_addr);
 			ssa->sin6.sin6_port = 0;
 			ssa->sin6.sin6_scope_id = 0;
 		}
 
 		if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
 			return (EADDRNOTAVAIL);
 		ifp = ifnet_byindex(gsr.gsr_interface);
 		break;
 
 	default:
 		CTR2(KTR_MLD, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0)
 		return (EADDRNOTAVAIL);
 
 	gsa->sin6.sin6_port = 0;
 	gsa->sin6.sin6_scope_id = 0;
 
 	/*
 	 * Always set the scope zone ID on memberships created from userland.
 	 * Use the passed-in ifp to do this.
 	 * XXX The in6_setscope() return value is meaningless.
 	 * XXX SCOPE6_LOCK() is taken by in6_setscope().
 	 */
 	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 
+	IN6_MULTI_LOCK();
+
+	/*
+	 * Find the membership in the membership list.
+	 */
 	imo = in6p_findmoptions(inp);
-	idx = im6o_match_group(imo, ifp, &gsa->sa);
-	if (idx == -1) {
+	imf = im6o_match_group(imo, ifp, &gsa->sa);
+	if (imf == NULL) {
 		is_new = 1;
+		inm = NULL;
+
+		if (ip6_mfilter_count(&imo->im6o_head) >= IPV6_MAX_MEMBERSHIPS) {
+			error = ENOMEM;
+			goto out_in6p_locked;
+		}
 	} else {
-		inm = imo->im6o_membership[idx];
-		imf = &imo->im6o_mfilters[idx];
+		is_new = 0;
+		inm = imf->im6f_in6m;
+
 		if (ssa->ss.ss_family != AF_UNSPEC) {
 			/*
 			 * MCAST_JOIN_SOURCE_GROUP on an exclusive membership
 			 * is an error. On an existing inclusive membership,
 			 * it just adds the source to the filter list.
 			 */
 			if (imf->im6f_st[1] != MCAST_INCLUDE) {
 				error = EINVAL;
 				goto out_in6p_locked;
 			}
 			/*
 			 * Throw out duplicates.
 			 *
 			 * XXX FIXME: This makes a naive assumption that
 			 * even if entries exist for *ssa in this imf,
 			 * they will be rejected as dupes, even if they
 			 * are not valid in the current mode (in-mode).
 			 *
 			 * in6_msource is transactioned just as for anything
 			 * else in SSM -- but note naive use of in6m_graft()
 			 * below for allocating new filter entries.
 			 *
 			 * This is only an issue if someone mixes the
 			 * full-state SSM API with the delta-based API,
 			 * which is discouraged in the relevant RFCs.
 			 */
-			lims = im6o_match_source(imo, idx, &ssa->sa);
+			lims = im6o_match_source(imf, &ssa->sa);
 			if (lims != NULL /*&&
 			    lims->im6sl_st[1] == MCAST_INCLUDE*/) {
 				error = EADDRNOTAVAIL;
 				goto out_in6p_locked;
 			}
 		} else {
 			/*
 			 * MCAST_JOIN_GROUP alone, on any existing membership,
 			 * is rejected, to stop the same inpcb tying up
 			 * multiple refs to the in_multi.
 			 * On an existing inclusive membership, this is also
 			 * an error; if you want to change filter mode,
 			 * you must use the userland API setsourcefilter().
 			 * XXX We don't reject this for imf in UNDEFINED
 			 * state at t1, because allocation of a filter
 			 * is atomic with allocation of a membership.
 			 */
 			error = EINVAL;
 			goto out_in6p_locked;
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
-	if (is_new) {
-		if (imo->im6o_num_memberships == imo->im6o_max_memberships) {
-			error = im6o_grow(imo);
-			if (error)
-				goto out_in6p_locked;
-		}
-		/*
-		 * Allocate the new slot upfront so we can deal with
-		 * grafting the new source filter in same code path
-		 * as for join-source on existing membership.
-		 */
-		idx = imo->im6o_num_memberships;
-		imo->im6o_membership[idx] = NULL;
-		imo->im6o_num_memberships++;
-		KASSERT(imo->im6o_mfilters != NULL,
-		    ("%s: im6f_mfilters vector was not allocated", __func__));
-		imf = &imo->im6o_mfilters[idx];
-		KASSERT(RB_EMPTY(&imf->im6f_sources),
-		    ("%s: im6f_sources not empty", __func__));
-	}
-
 	/*
 	 * Graft new source into filter list for this inpcb's
 	 * membership of the group. The in6_multi may not have
 	 * been allocated yet if this is a new membership, however,
 	 * the in_mfilter slot will be allocated and must be initialized.
 	 *
 	 * Note: Grafting of exclusive mode filters doesn't happen
 	 * in this path.
 	 * XXX: Should check for non-NULL lims (node exists but may
 	 * not be in-mode) for interop with full-state API.
 	 */
 	if (ssa->ss.ss_family != AF_UNSPEC) {
 		/* Membership starts in IN mode */
 		if (is_new) {
 			CTR1(KTR_MLD, "%s: new join w/source", __func__);
-			im6f_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE);
+			imf = ip6_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_INCLUDE);
+			if (imf == NULL) {
+				error = ENOMEM;
+				goto out_in6p_locked;
+			}
 		} else {
 			CTR2(KTR_MLD, "%s: %s source", __func__, "allow");
 		}
 		lims = im6f_graft(imf, MCAST_INCLUDE, &ssa->sin6);
 		if (lims == NULL) {
 			CTR1(KTR_MLD, "%s: merge imf state failed",
 			    __func__);
 			error = ENOMEM;
-			goto out_im6o_free;
+			goto out_in6p_locked;
 		}
 	} else {
 		/* No address specified; Membership starts in EX mode */
 		if (is_new) {
 			CTR1(KTR_MLD, "%s: new join w/o source", __func__);
-			im6f_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE);
+			imf = ip6_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_EXCLUDE);
+			if (imf == NULL) {
+				error = ENOMEM;
+				goto out_in6p_locked;
+			}
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 */
-	in_pcbref(inp);
-	INP_WUNLOCK(inp);
-	IN6_MULTI_LOCK();
-
 	if (is_new) {
+		in_pcbref(inp);
+		INP_WUNLOCK(inp);
+
 		error = in6_joingroup_locked(ifp, &gsa->sin6.sin6_addr, imf,
-		    &inm, 0);
+		    &imf->im6f_in6m, 0);
+
+		INP_WLOCK(inp);
+		if (in_pcbrele_wlocked(inp)) {
+			error = ENXIO;
+			goto out_in6p_unlocked;
+		}
 		if (error) {
-			IN6_MULTI_UNLOCK();
-			goto out_im6o_free;
+			goto out_in6p_locked;
 		}
 		/*
 		 * NOTE: Refcount from in6_joingroup_locked()
 		 * is protecting membership.
 		 */
-		imo->im6o_membership[idx] = inm;
 	} else {
 		CTR1(KTR_MLD, "%s: merge inm state", __func__);
 		IN6_MULTI_LIST_LOCK();
 		error = in6m_merge(inm, imf);
-		if (error)
+		if (error) {
 			CTR1(KTR_MLD, "%s: failed to merge inm state",
 			    __func__);
-		else {
-			CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
-			error = mld_change_state(inm, 0);
-			if (error)
-				CTR1(KTR_MLD, "%s: failed mld downcall",
-				    __func__);
+			IN6_MULTI_LIST_UNLOCK();
+			im6f_rollback(imf);
+			im6f_reap(imf);
+			goto out_in6p_locked;
 		}
+		CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
+		error = mld_change_state(inm, 0);
 		IN6_MULTI_LIST_UNLOCK();
-	}
 
-	IN6_MULTI_UNLOCK();
-	INP_WLOCK(inp);
-	if (in_pcbrele_wlocked(inp))
-		return (ENXIO);
-	if (error) {
-		im6f_rollback(imf);
-		if (is_new)
-			im6f_purge(imf);
-		else
+		if (error) {
+			CTR1(KTR_MLD, "%s: failed mld downcall",
+			     __func__);
+			im6f_rollback(imf);
 			im6f_reap(imf);
-	} else {
-		im6f_commit(imf);
-	}
-
-out_im6o_free:
-	if (error && is_new) {
-		inm = imo->im6o_membership[idx];
-		if (inm != NULL) {
-			IN6_MULTI_LIST_LOCK();
-			in6m_rele_locked(&inmh, inm);
-			IN6_MULTI_LIST_UNLOCK();
+			goto out_in6p_locked;
 		}
-		imo->im6o_membership[idx] = NULL;
-		--imo->im6o_num_memberships;
 	}
 
+	if (is_new)
+		ip6_mfilter_insert(&imo->im6o_head, imf);
+
+	im6f_commit(imf);
+	imf = NULL;
+
 out_in6p_locked:
 	INP_WUNLOCK(inp);
-	in6m_release_list_deferred(&inmh);
+out_in6p_unlocked:
+	IN6_MULTI_UNLOCK();
+
+	if (is_new && imf) {
+		if (imf->im6f_in6m != NULL) {
+			struct in6_multi_head inmh;
+
+			SLIST_INIT(&inmh);
+			SLIST_INSERT_HEAD(&inmh, imf->im6f_in6m, in6m_defer);
+			in6m_release_list_deferred(&inmh);
+		}
+		ip6_mfilter_free(imf);
+	}
 	return (error);
 }
 
 /*
  * Leave an IPv6 multicast group on an inpcb, possibly with a source.
  */
 static int
 in6p_leave_group(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ipv6_mreq		 mreq;
 	struct group_source_req		 gsr;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in6_mfilter		*imf;
 	struct ip6_moptions		*imo;
 	struct in6_msource		*ims;
 	struct in6_multi		*inm;
 	uint32_t			 ifindex;
-	size_t				 idx;
-	int				 error, is_final;
+	int				 error;
+	bool				 is_final;
 #ifdef KTR
 	char				 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	ifp = NULL;
 	ifindex = 0;
 	error = 0;
-	is_final = 1;
+	is_final = true;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	gsa->ss.ss_family = AF_UNSPEC;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 	ssa->ss.ss_family = AF_UNSPEC;
 
 	/*
 	 * Chew everything passed in up into a struct group_source_req
 	 * as that is easier to process.
 	 * Note: Any embedded scope ID in the multicast group passed
 	 * in by userland is ignored, the interface index is the recommended
 	 * mechanism to specify an interface; see below.
 	 */
 	switch (sopt->sopt_name) {
 	case IPV6_LEAVE_GROUP:
 		error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq),
 		    sizeof(struct ipv6_mreq));
 		if (error)
 			return (error);
 		gsa->sin6.sin6_family = AF_INET6;
 		gsa->sin6.sin6_len = sizeof(struct sockaddr_in6);
 		gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr;
 		gsa->sin6.sin6_port = 0;
 		gsa->sin6.sin6_scope_id = 0;
 		ifindex = mreq.ipv6mr_interface;
 		break;
 
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 		if (sopt->sopt_name == MCAST_LEAVE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_req),
 			    sizeof(struct group_req));
 		} else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_source_req),
 			    sizeof(struct group_source_req));
 		}
 		if (error)
 			return (error);
 
 		if (gsa->sin6.sin6_family != AF_INET6 ||
 		    gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 		if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
 			if (ssa->sin6.sin6_family != AF_INET6 ||
 			    ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 				return (EINVAL);
 			if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr))
 				return (EINVAL);
 			/*
 			 * TODO: Validate embedded scope ID in source
 			 * list entry against passed-in ifp, if and only
 			 * if source list filter entry is iface or node local.
 			 */
 			in6_clearscope(&ssa->sin6.sin6_addr);
 		}
 		gsa->sin6.sin6_port = 0;
 		gsa->sin6.sin6_scope_id = 0;
 		ifindex = gsr.gsr_interface;
 		break;
 
 	default:
 		CTR2(KTR_MLD, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	/*
 	 * Validate interface index if provided. If no interface index
 	 * was provided separately, attempt to look the membership up
 	 * from the default scope as a last resort to disambiguate
 	 * the membership we are being asked to leave.
 	 * XXX SCOPE6 lock potentially taken here.
 	 */
 	if (ifindex != 0) {
 		if (V_if_index < ifindex)
 			return (EADDRNOTAVAIL);
 		ifp = ifnet_byindex(ifindex);
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 		(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 	} else {
 		error = sa6_embedscope(&gsa->sin6, V_ip6_use_defzone);
 		if (error)
 			return (EADDRNOTAVAIL);
 		/*
 		 * Some badly behaved applications don't pass an ifindex
 		 * or a scope ID, which is an API violation. In this case,
 		 * perform a lookup as per a v6 join.
 		 *
 		 * XXX For now, stomp on zone ID for the corner case.
 		 * This is not the 'KAME way', but we need to see the ifp
 		 * directly until such time as this implementation is
 		 * refactored, assuming the scope IDs are the way to go.
 		 */
 		ifindex = ntohs(gsa->sin6.sin6_addr.s6_addr16[1]);
 		if (ifindex == 0) {
 			CTR2(KTR_MLD, "%s: warning: no ifindex, looking up "
 			    "ifp for group %s.", __func__,
 			    ip6_sprintf(ip6tbuf, &gsa->sin6.sin6_addr));
 			ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6);
 		} else {
 			ifp = ifnet_byindex(ifindex);
 		}
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 	}
 
 	CTR2(KTR_MLD, "%s: ifp = %p", __func__, ifp);
 	KASSERT(ifp != NULL, ("%s: ifp did not resolve", __func__));
 
+	IN6_MULTI_LOCK();
+
 	/*
-	 * Find the membership in the membership array.
+	 * Find the membership in the membership list.
 	 */
 	imo = in6p_findmoptions(inp);
-	idx = im6o_match_group(imo, ifp, &gsa->sa);
-	if (idx == -1) {
+	imf = im6o_match_group(imo, ifp, &gsa->sa);
+	if (imf == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_in6p_locked;
 	}
-	inm = imo->im6o_membership[idx];
-	imf = &imo->im6o_mfilters[idx];
+	inm = imf->im6f_in6m;
 
 	if (ssa->ss.ss_family != AF_UNSPEC)
-		is_final = 0;
+		is_final = false;
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * If we were instructed only to leave a given source, do so.
 	 * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships.
 	 */
 	if (is_final) {
+		ip6_mfilter_remove(&imo->im6o_head, imf);
 		im6f_leave(imf);
 	} else {
 		if (imf->im6f_st[0] == MCAST_EXCLUDE) {
 			error = EADDRNOTAVAIL;
 			goto out_in6p_locked;
 		}
-		ims = im6o_match_source(imo, idx, &ssa->sa);
+		ims = im6o_match_source(imf, &ssa->sa);
 		if (ims == NULL) {
 			CTR3(KTR_MLD, "%s: source %p %spresent", __func__,
 			    ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr),
 			    "not ");
 			error = EADDRNOTAVAIL;
 			goto out_in6p_locked;
 		}
 		CTR2(KTR_MLD, "%s: %s source", __func__, "block");
 		error = im6f_prune(imf, &ssa->sin6);
 		if (error) {
 			CTR1(KTR_MLD, "%s: merge imf state failed",
 			    __func__);
 			goto out_in6p_locked;
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 */
-	in_pcbref(inp);
-	INP_WUNLOCK(inp);
-	IN6_MULTI_LOCK();
-
-	if (is_final) {
-		/*
-		 * Give up the multicast address record to which
-		 * the membership points.
-		 */
-		(void)in6_leavegroup_locked(inm, imf);
-	} else {
+	if (!is_final) {
 		CTR1(KTR_MLD, "%s: merge inm state", __func__);
 		IN6_MULTI_LIST_LOCK();
 		error = in6m_merge(inm, imf);
-		if (error)
+		if (error) {
 			CTR1(KTR_MLD, "%s: failed to merge inm state",
 			    __func__);
-		else {
-			CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
-			error = mld_change_state(inm, 0);
-			if (error)
-				CTR1(KTR_MLD, "%s: failed mld downcall",
-				    __func__);
+			IN6_MULTI_LIST_UNLOCK();
+			im6f_rollback(imf);
+			im6f_reap(imf);
+                        goto out_in6p_locked;
 		}
+
+		CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
+		error = mld_change_state(inm, 0);
 		IN6_MULTI_LIST_UNLOCK();
+		if (error) {
+			CTR1(KTR_MLD, "%s: failed mld downcall",
+			     __func__);
+			im6f_rollback(imf);
+			im6f_reap(imf);
+                        goto out_in6p_locked;
+		}
 	}
 
-	IN6_MULTI_UNLOCK();
-	INP_WLOCK(inp);
-	if (in_pcbrele_wlocked(inp))
-		return (ENXIO);
-
-	if (error)
-		im6f_rollback(imf);
-	else
-		im6f_commit(imf);
-
+	im6f_commit(imf);
 	im6f_reap(imf);
 
-	if (is_final) {
-		/* Remove the gap in the membership array. */
-		KASSERT(RB_EMPTY(&imf->im6f_sources),
-		    ("%s: im6f_sources not empty", __func__));
-		for (++idx; idx < imo->im6o_num_memberships; ++idx) {
-			imo->im6o_membership[idx - 1] = imo->im6o_membership[idx];
-			imo->im6o_mfilters[idx - 1] = imo->im6o_mfilters[idx];
-		}
-		im6f_init(&imo->im6o_mfilters[idx - 1], MCAST_UNDEFINED,
-		    MCAST_EXCLUDE);
-		imo->im6o_num_memberships--;
-	}
-
 out_in6p_locked:
 	INP_WUNLOCK(inp);
+
+	if (is_final && imf) {
+		/*
+		 * Give up the multicast address record to which
+		 * the membership points.
+		 */
+		(void)in6_leavegroup_locked(inm, imf);
+		ip6_mfilter_free(imf);
+	}
+
+	IN6_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Select the interface for transmitting IPv6 multicast datagrams.
  *
  * Either an instance of struct in6_addr or an instance of struct ipv6_mreqn
  * may be passed to this socket option. An address of in6addr_any or an
  * interface index of 0 is used to remove a previous selection.
  * When no interface is selected, one is chosen for every send.
  */
 static int
 in6p_set_multicast_if(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ifnet		*ifp;
 	struct ip6_moptions	*imo;
 	u_int			 ifindex;
 	int			 error;
 
 	if (sopt->sopt_valsize != sizeof(u_int))
 		return (EINVAL);
 
 	error = sooptcopyin(sopt, &ifindex, sizeof(u_int), sizeof(u_int));
 	if (error)
 		return (error);
 	if (V_if_index < ifindex)
 		return (EINVAL);
 	if (ifindex == 0)
 		ifp = NULL;
 	else {
 		ifp = ifnet_byindex(ifindex);
 		if (ifp == NULL)
 			return (EINVAL);
 		if ((ifp->if_flags & IFF_MULTICAST) == 0)
 			return (EADDRNOTAVAIL);
 	}
 	imo = in6p_findmoptions(inp);
 	imo->im6o_multicast_ifp = ifp;
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 /*
  * Atomically set source filters on a socket for an IPv6 multicast group.
  *
  * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
  */
 static int
 in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct __msfilterreq	 msfr;
 	sockunion_t		*gsa;
 	struct ifnet		*ifp;
 	struct in6_mfilter	*imf;
 	struct ip6_moptions	*imo;
 	struct in6_multi		*inm;
-	size_t			 idx;
 	int			 error;
 
 	error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
 	    sizeof(struct __msfilterreq));
 	if (error)
 		return (error);
 
 	if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc)
 		return (ENOBUFS);
 
 	if (msfr.msfr_fmode != MCAST_EXCLUDE &&
 	    msfr.msfr_fmode != MCAST_INCLUDE)
 		return (EINVAL);
 
 	if (msfr.msfr_group.ss_family != AF_INET6 ||
 	    msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6))
 		return (EINVAL);
 
 	gsa = (sockunion_t *)&msfr.msfr_group;
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	gsa->sin6.sin6_port = 0;	/* ignore port */
 
 	if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
 		return (EADDRNOTAVAIL);
 	ifp = ifnet_byindex(msfr.msfr_ifindex);
 	if (ifp == NULL)
 		return (EADDRNOTAVAIL);
 	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 
 	/*
 	 * Take the INP write lock.
 	 * Check if this socket is a member of this group.
 	 */
 	imo = in6p_findmoptions(inp);
-	idx = im6o_match_group(imo, ifp, &gsa->sa);
-	if (idx == -1 || imo->im6o_mfilters == NULL) {
+	imf = im6o_match_group(imo, ifp, &gsa->sa);
+	if (imf == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_in6p_locked;
 	}
-	inm = imo->im6o_membership[idx];
-	imf = &imo->im6o_mfilters[idx];
+	inm = imf->im6f_in6m;
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	imf->im6f_st[1] = msfr.msfr_fmode;
 
 	/*
 	 * Apply any new source filters, if present.
 	 * Make a copy of the user-space source vector so
 	 * that we may copy them with a single copyin. This
 	 * allows us to deal with page faults up-front.
 	 */
 	if (msfr.msfr_nsrcs > 0) {
 		struct in6_msource	*lims;
 		struct sockaddr_in6	*psin;
 		struct sockaddr_storage	*kss, *pkss;
 		int			 i;
 
 		INP_WUNLOCK(inp);
  
 		CTR2(KTR_MLD, "%s: loading %lu source list entries",
 		    __func__, (unsigned long)msfr.msfr_nsrcs);
 		kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
 		    M_TEMP, M_WAITOK);
 		error = copyin(msfr.msfr_srcs, kss,
 		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
 		if (error) {
 			free(kss, M_TEMP);
 			return (error);
 		}
 
 		INP_WLOCK(inp);
 
 		/*
 		 * Mark all source filters as UNDEFINED at t1.
 		 * Restore new group filter mode, as im6f_leave()
 		 * will set it to INCLUDE.
 		 */
 		im6f_leave(imf);
 		imf->im6f_st[1] = msfr.msfr_fmode;
 
 		/*
 		 * Update socket layer filters at t1, lazy-allocating
 		 * new entries. This saves a bunch of memory at the
 		 * cost of one RB_FIND() per source entry; duplicate
 		 * entries in the msfr_nsrcs vector are ignored.
 		 * If we encounter an error, rollback transaction.
 		 *
 		 * XXX This too could be replaced with a set-symmetric
 		 * difference like loop to avoid walking from root
 		 * every time, as the key space is common.
 		 */
 		for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) {
 			psin = (struct sockaddr_in6 *)pkss;
 			if (psin->sin6_family != AF_INET6) {
 				error = EAFNOSUPPORT;
 				break;
 			}
 			if (psin->sin6_len != sizeof(struct sockaddr_in6)) {
 				error = EINVAL;
 				break;
 			}
 			if (IN6_IS_ADDR_MULTICAST(&psin->sin6_addr)) {
 				error = EINVAL;
 				break;
 			}
 			/*
 			 * TODO: Validate embedded scope ID in source
 			 * list entry against passed-in ifp, if and only
 			 * if source list filter entry is iface or node local.
 			 */
 			in6_clearscope(&psin->sin6_addr);
 			error = im6f_get_source(imf, psin, &lims);
 			if (error)
 				break;
 			lims->im6sl_st[1] = imf->im6f_st[1];
 		}
 		free(kss, M_TEMP);
 	}
 
 	if (error)
 		goto out_im6f_rollback;
 
 	INP_WLOCK_ASSERT(inp);
 	IN6_MULTI_LIST_LOCK();
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 */
 	CTR1(KTR_MLD, "%s: merge inm state", __func__);
 	error = in6m_merge(inm, imf);
 	if (error)
 		CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
 	else {
 		CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 		error = mld_change_state(inm, 0);
 		if (error)
 			CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
 	}
 
 	IN6_MULTI_LIST_UNLOCK();
 
 out_im6f_rollback:
 	if (error)
 		im6f_rollback(imf);
 	else
 		im6f_commit(imf);
 
 	im6f_reap(imf);
 
 out_in6p_locked:
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Set the IP multicast options in response to user setsockopt().
  *
  * Many of the socket options handled in this function duplicate the
  * functionality of socket options in the regular unicast API. However,
  * it is not possible to merge the duplicate code, because the idempotence
  * of the IPv6 multicast part of the BSD Sockets API must be preserved;
  * the effects of these options must be treated as separate and distinct.
  *
  * SMPng: XXX: Unlocked read of inp_socket believed OK.
  */
 int
 ip6_setmoptions(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ip6_moptions	*im6o;
 	int			 error;
 
 	error = 0;
 
 	/*
 	 * If socket is neither of type SOCK_RAW or SOCK_DGRAM,
 	 * or is a divert socket, reject it.
 	 */
 	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
 	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
 	     inp->inp_socket->so_proto->pr_type != SOCK_DGRAM))
 		return (EOPNOTSUPP);
 
 	switch (sopt->sopt_name) {
 	case IPV6_MULTICAST_IF:
 		error = in6p_set_multicast_if(inp, sopt);
 		break;
 
 	case IPV6_MULTICAST_HOPS: {
 		int hlim;
 
 		if (sopt->sopt_valsize != sizeof(int)) {
 			error = EINVAL;
 			break;
 		}
 		error = sooptcopyin(sopt, &hlim, sizeof(hlim), sizeof(int));
 		if (error)
 			break;
 		if (hlim < -1 || hlim > 255) {
 			error = EINVAL;
 			break;
 		} else if (hlim == -1) {
 			hlim = V_ip6_defmcasthlim;
 		}
 		im6o = in6p_findmoptions(inp);
 		im6o->im6o_multicast_hlim = hlim;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IPV6_MULTICAST_LOOP: {
 		u_int loop;
 
 		/*
 		 * Set the loopback flag for outgoing multicast packets.
 		 * Must be zero or one.
 		 */
 		if (sopt->sopt_valsize != sizeof(u_int)) {
 			error = EINVAL;
 			break;
 		}
 		error = sooptcopyin(sopt, &loop, sizeof(u_int), sizeof(u_int));
 		if (error)
 			break;
 		if (loop > 1) {
 			error = EINVAL;
 			break;
 		}
 		im6o = in6p_findmoptions(inp);
 		im6o->im6o_multicast_loop = loop;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IPV6_JOIN_GROUP:
 	case MCAST_JOIN_GROUP:
 	case MCAST_JOIN_SOURCE_GROUP:
 		error = in6p_join_group(inp, sopt);
 		break;
 
 	case IPV6_LEAVE_GROUP:
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 		error = in6p_leave_group(inp, sopt);
 		break;
 
 	case MCAST_BLOCK_SOURCE:
 	case MCAST_UNBLOCK_SOURCE:
 		error = in6p_block_unblock_source(inp, sopt);
 		break;
 
 	case IPV6_MSFILTER:
 		error = in6p_set_source_filters(inp, sopt);
 		break;
 
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 
 	INP_UNLOCK_ASSERT(inp);
 
 	return (error);
 }
 
 /*
  * Expose MLD's multicast filter mode and source list(s) to userland,
  * keyed by (ifindex, group).
  * The filter mode is written out as a uint32_t, followed by
  * 0..n of struct in6_addr.
  * For use by ifmcstat(8).
  * SMPng: NOTE: unlocked read of ifindex space.
  */
 static int
 sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS)
 {
 	struct in6_addr			 mcaddr;
 	struct in6_addr			 src;
 	struct epoch_tracker		 et;
 	struct ifnet			*ifp;
 	struct ifmultiaddr		*ifma;
 	struct in6_multi		*inm;
 	struct ip6_msource		*ims;
 	int				*name;
 	int				 retval;
 	u_int				 namelen;
 	uint32_t			 fmode, ifindex;
 #ifdef KTR
 	char				 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	name = (int *)arg1;
 	namelen = arg2;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	/* int: ifindex + 4 * 32 bits of IPv6 address */
 	if (namelen != 5)
 		return (EINVAL);
 
 	ifindex = name[0];
 	if (ifindex <= 0 || ifindex > V_if_index) {
 		CTR2(KTR_MLD, "%s: ifindex %u out of range",
 		    __func__, ifindex);
 		return (ENOENT);
 	}
 
 	memcpy(&mcaddr, &name[1], sizeof(struct in6_addr));
 	if (!IN6_IS_ADDR_MULTICAST(&mcaddr)) {
 		CTR2(KTR_MLD, "%s: group %s is not multicast",
 		    __func__, ip6_sprintf(ip6tbuf, &mcaddr));
 		return (EINVAL);
 	}
 
 	ifp = ifnet_byindex(ifindex);
 	if (ifp == NULL) {
 		CTR2(KTR_MLD, "%s: no ifp for ifindex %u",
 		    __func__, ifindex);
 		return (ENOENT);
 	}
 	/*
 	 * Internal MLD lookups require that scope/zone ID is set.
 	 */
 	(void)in6_setscope(&mcaddr, ifp, NULL);
 
 	retval = sysctl_wire_old_buffer(req,
 	    sizeof(uint32_t) + (in6_mcast_maxgrpsrc * sizeof(struct in6_addr)));
 	if (retval)
 		return (retval);
 
 	IN6_MULTI_LOCK();
 	IN6_MULTI_LIST_LOCK();
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		inm = in6m_ifmultiaddr_get_inm(ifma);
 		if (inm == NULL)
 			continue;
 		if (!IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &mcaddr))
 			continue;
 		fmode = inm->in6m_st[1].iss_fmode;
 		retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t));
 		if (retval != 0)
 			break;
 		RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) {
 			CTR2(KTR_MLD, "%s: visit node %p", __func__, ims);
 			/*
 			 * Only copy-out sources which are in-mode.
 			 */
 			if (fmode != im6s_get_mode(inm, ims, 1)) {
 				CTR1(KTR_MLD, "%s: skip non-in-mode",
 				    __func__);
 				continue;
 			}
 			src = ims->im6s_addr;
 			retval = SYSCTL_OUT(req, &src,
 			    sizeof(struct in6_addr));
 			if (retval != 0)
 				break;
 		}
 	}
 	NET_EPOCH_EXIT(et);
 
 	IN6_MULTI_LIST_UNLOCK();
 	IN6_MULTI_UNLOCK();
 
 	return (retval);
 }
 
 #ifdef KTR
 
 static const char *in6m_modestrs[] = { "un", "in", "ex" };
 
 static const char *
 in6m_mode_str(const int mode)
 {
 
 	if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE)
 		return (in6m_modestrs[mode]);
 	return ("??");
 }
 
 static const char *in6m_statestrs[] = {
 	"not-member",
 	"silent",
 	"idle",
 	"lazy",
 	"sleeping",
 	"awakening",
 	"query-pending",
 	"sg-query-pending",
 	"leaving"
 };
 
 static const char *
 in6m_state_str(const int state)
 {
 
 	if (state >= MLD_NOT_MEMBER && state <= MLD_LEAVING_MEMBER)
 		return (in6m_statestrs[state]);
 	return ("??");
 }
 
 /*
  * Dump an in6_multi structure to the console.
  */
 void
 in6m_print(const struct in6_multi *inm)
 {
 	int t;
 	char ip6tbuf[INET6_ADDRSTRLEN];
 
 	if ((ktr_mask & KTR_MLD) == 0)
 		return;
 
 	printf("%s: --- begin in6m %p ---\n", __func__, inm);
 	printf("addr %s ifp %p(%s) ifma %p\n",
 	    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    inm->in6m_ifp,
 	    if_name(inm->in6m_ifp),
 	    inm->in6m_ifma);
 	printf("timer %u state %s refcount %u scq.len %u\n",
 	    inm->in6m_timer,
 	    in6m_state_str(inm->in6m_state),
 	    inm->in6m_refcount,
 	    mbufq_len(&inm->in6m_scq));
 	printf("mli %p nsrc %lu sctimer %u scrv %u\n",
 	    inm->in6m_mli,
 	    inm->in6m_nsrc,
 	    inm->in6m_sctimer,
 	    inm->in6m_scrv);
 	for (t = 0; t < 2; t++) {
 		printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t,
 		    in6m_mode_str(inm->in6m_st[t].iss_fmode),
 		    inm->in6m_st[t].iss_asm,
 		    inm->in6m_st[t].iss_ex,
 		    inm->in6m_st[t].iss_in,
 		    inm->in6m_st[t].iss_rec);
 	}
 	printf("%s: --- end in6m %p ---\n", __func__, inm);
 }
 
 #else /* !KTR */
 
 void
 in6m_print(const struct in6_multi *inm)
 {
 
 }
 
 #endif /* KTR */
Index: head/sys/netinet6/in6_pcb.c
===================================================================
--- head/sys/netinet6/in6_pcb.c	(revision 349368)
+++ head/sys/netinet6/in6_pcb.c	(revision 349369)
@@ -1,1375 +1,1376 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: in6_pcb.c,v 1.31 2001/05/21 05:45:10 jinmei Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_pcb.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_pcbgroup.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/errno.h>
 #include <sys/time.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/jail.h>
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_llatbl.h>
 #include <net/if_types.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_systm.h>
 #include <netinet/tcp_var.h>
 #include <netinet/ip6.h>
 #include <netinet/ip_var.h>
 
 #include <netinet6/ip6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet/in_pcb.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/scope6_var.h>
 
 static struct inpcb *in6_pcblookup_hash_locked(struct inpcbinfo *,
     struct in6_addr *, u_int, struct in6_addr *, u_int, int, struct ifnet *);
 
 int
 in6_pcbbind(struct inpcb *inp, struct sockaddr *nam,
     struct ucred *cred)
 {
 	struct socket *so = inp->inp_socket;
 	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)NULL;
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	u_short	lport = 0;
 	int error, lookupflags = 0;
 	int reuseport = (so->so_options & SO_REUSEPORT);
 
 	/*
 	 * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
 	 * so that we don't have to add to the (already messy) code below.
 	 */
 	int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	if (CK_STAILQ_EMPTY(&V_in6_ifaddrhead))	/* XXX broken! */
 		return (EADDRNOTAVAIL);
 	if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
 		return (EINVAL);
 	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
 		lookupflags = INPLOOKUP_WILDCARD;
 	if (nam == NULL) {
 		if ((error = prison_local_ip6(cred, &inp->in6p_laddr,
 		    ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0)
 			return (error);
 	} else {
 		sin6 = (struct sockaddr_in6 *)nam;
 		if (nam->sa_len != sizeof(*sin6))
 			return (EINVAL);
 		/*
 		 * family check.
 		 */
 		if (nam->sa_family != AF_INET6)
 			return (EAFNOSUPPORT);
 
 		if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0)
 			return(error);
 
 		if ((error = prison_local_ip6(cred, &sin6->sin6_addr,
 		    ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0)
 			return (error);
 
 		lport = sin6->sin6_port;
 		if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
 			/*
 			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
 			 * allow compepte duplication of binding if
 			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
 			 * and a multicast address is bound on both
 			 * new and duplicated sockets.
 			 */
 			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
 				reuseport = SO_REUSEADDR|SO_REUSEPORT;
 			/*
 			 * XXX: How to deal with SO_REUSEPORT_LB here?
 			 * Treat same as SO_REUSEPORT for now.
 			 */
 			if ((so->so_options &
 			    (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
 				reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 			struct epoch_tracker et;
 			struct ifaddr *ifa;
 
 			sin6->sin6_port = 0;		/* yech... */
 			NET_EPOCH_ENTER(et);
 			if ((ifa = ifa_ifwithaddr((struct sockaddr *)sin6)) ==
 			    NULL &&
 			    (inp->inp_flags & INP_BINDANY) == 0) {
 				NET_EPOCH_EXIT(et);
 				return (EADDRNOTAVAIL);
 			}
 
 			/*
 			 * XXX: bind to an anycast address might accidentally
 			 * cause sending a packet with anycast source address.
 			 * We should allow to bind to a deprecated address, since
 			 * the application dares to use it.
 			 */
 			if (ifa != NULL &&
 			    ((struct in6_ifaddr *)ifa)->ia6_flags &
 			    (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|IN6_IFF_DETACHED)) {
 				NET_EPOCH_EXIT(et);
 				return (EADDRNOTAVAIL);
 			}
 			NET_EPOCH_EXIT(et);
 		}
 		if (lport) {
 			struct inpcb *t;
 			struct tcptw *tw;
 
 			/* GROSS */
 			if (ntohs(lport) <= V_ipport_reservedhigh &&
 			    ntohs(lport) >= V_ipport_reservedlow &&
 			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
 				return (EACCES);
 			if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) &&
 			    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
 				t = in6_pcblookup_local(pcbinfo,
 				    &sin6->sin6_addr, lport,
 				    INPLOOKUP_WILDCARD, cred);
 				if (t &&
 				    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
 				    ((t->inp_flags & INP_TIMEWAIT) == 0) &&
 				    (so->so_type != SOCK_STREAM ||
 				     IN6_IS_ADDR_UNSPECIFIED(&t->in6p_faddr)) &&
 				    (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
 				     !IN6_IS_ADDR_UNSPECIFIED(&t->in6p_laddr) ||
 				     (t->inp_flags2 & INP_REUSEPORT) ||
 				     (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
 				    (inp->inp_cred->cr_uid !=
 				     t->inp_cred->cr_uid))
 					return (EADDRINUSE);
 
 				/*
 				 * If the socket is a BINDMULTI socket, then
 				 * the credentials need to match and the
 				 * original socket also has to have been bound
 				 * with BINDMULTI.
 				 */
 				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 					return (EADDRINUSE);
 
 #ifdef INET
 				if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 &&
 				    IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 					struct sockaddr_in sin;
 
 					in6_sin6_2_sin(&sin, sin6);
 					t = in_pcblookup_local(pcbinfo,
 					    sin.sin_addr, lport,
 					    INPLOOKUP_WILDCARD, cred);
 					if (t &&
 					    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
 					    ((t->inp_flags &
 					      INP_TIMEWAIT) == 0) &&
 					    (so->so_type != SOCK_STREAM ||
 					     ntohl(t->inp_faddr.s_addr) ==
 					      INADDR_ANY) &&
 					    (inp->inp_cred->cr_uid !=
 					     t->inp_cred->cr_uid))
 						return (EADDRINUSE);
 
 					if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 						return (EADDRINUSE);
 				}
 #endif
 			}
 			t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr,
 			    lport, lookupflags, cred);
 			if (t && (t->inp_flags & INP_TIMEWAIT)) {
 				/*
 				 * XXXRW: If an incpb has had its timewait
 				 * state recycled, we treat the address as
 				 * being in use (for now).  This is better
 				 * than a panic, but not desirable.
 				 */
 				tw = intotw(t);
 				if (tw == NULL ||
 				    ((reuseport & tw->tw_so_options) == 0 &&
 					 (reuseport_lb & tw->tw_so_options) == 0))
 					return (EADDRINUSE);
 			} else if (t && (reuseport & inp_so_options(t)) == 0 &&
 					   (reuseport_lb & inp_so_options(t)) == 0) {
 				return (EADDRINUSE);
 			}
 #ifdef INET
 			if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 &&
 			    IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 				struct sockaddr_in sin;
 
 				in6_sin6_2_sin(&sin, sin6);
 				t = in_pcblookup_local(pcbinfo, sin.sin_addr,
 				   lport, lookupflags, cred);
 				if (t && t->inp_flags & INP_TIMEWAIT) {
 					tw = intotw(t);
 					if (tw == NULL)
 						return (EADDRINUSE);
 					if ((reuseport & tw->tw_so_options) == 0
 					    && (reuseport_lb & tw->tw_so_options) == 0
 					    && (ntohl(t->inp_laddr.s_addr) !=
 					        INADDR_ANY || ((inp->inp_vflag &
 					                INP_IPV6PROTO) ==
 					            (t->inp_vflag & INP_IPV6PROTO))))
 						return (EADDRINUSE);
 				} else if (t &&
 				    (reuseport & inp_so_options(t)) == 0 &&
 				    (reuseport_lb & inp_so_options(t)) == 0 &&
 				    (ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
 				        (t->inp_vflag & INP_IPV6PROTO) != 0)) {
 					return (EADDRINUSE);
 				}
 			}
 #endif
 		}
 		inp->in6p_laddr = sin6->sin6_addr;
 	}
 	if (lport == 0) {
 		if ((error = in6_pcbsetport(&inp->in6p_laddr, inp, cred)) != 0) {
 			/* Undo an address bind that may have occurred. */
 			inp->in6p_laddr = in6addr_any;
 			return (error);
 		}
 	} else {
 		inp->inp_lport = lport;
 		if (in_pcbinshash(inp) != 0) {
 			inp->in6p_laddr = in6addr_any;
 			inp->inp_lport = 0;
 			return (EAGAIN);
 		}
 	}
 	return (0);
 }
 
 /*
  *   Transform old in6_pcbconnect() into an inner subroutine for new
  *   in6_pcbconnect(): Do some validity-checking on the remote
  *   address (in mbuf 'nam') and then determine local host address
  *   (i.e., which interface) to use to access that remote host.
  *
  *   This preserves definition of in6_pcbconnect(), while supporting a
  *   slightly different version for T/TCP.  (This is more than
  *   a bit of a kludge, but cleaning up the internal interfaces would
  *   have forced minor changes in every protocol).
  */
 static int
 in6_pcbladdr(struct inpcb *inp, struct sockaddr *nam,
     struct in6_addr *plocal_addr6)
 {
 	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
 	int error = 0;
 	int scope_ambiguous = 0;
 	struct in6_addr in6a;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);	/* XXXRW: why? */
 
 	if (nam->sa_len != sizeof (*sin6))
 		return (EINVAL);
 	if (sin6->sin6_family != AF_INET6)
 		return (EAFNOSUPPORT);
 	if (sin6->sin6_port == 0)
 		return (EADDRNOTAVAIL);
 
 	if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone)
 		scope_ambiguous = 1;
 	if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0)
 		return(error);
 
 	if (!CK_STAILQ_EMPTY(&V_in6_ifaddrhead)) {
 		/*
 		 * If the destination address is UNSPECIFIED addr,
 		 * use the loopback addr, e.g ::1.
 		 */
 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
 			sin6->sin6_addr = in6addr_loopback;
 	}
 	if ((error = prison_remote_ip6(inp->inp_cred, &sin6->sin6_addr)) != 0)
 		return (error);
 
 	error = in6_selectsrc_socket(sin6, inp->in6p_outputopts,
 	    inp, inp->inp_cred, scope_ambiguous, &in6a, NULL);
 	if (error)
 		return (error);
 
 	/*
 	 * Do not update this earlier, in case we return with an error.
 	 *
 	 * XXX: this in6_selectsrc_socket result might replace the bound local
 	 * address with the address specified by setsockopt(IPV6_PKTINFO).
 	 * Is it the intended behavior?
 	 */
 	*plocal_addr6 = in6a;
 
 	/*
 	 * Don't do pcblookup call here; return interface in
 	 * plocal_addr6
 	 * and exit to caller, that will do the lookup.
 	 */
 
 	return (0);
 }
 
 /*
  * Outer subroutine:
  * Connect from a socket to a specified address.
  * Both address and port must be specified in argument sin.
  * If don't have a local address for this socket yet,
  * then pick one.
  */
 int
 in6_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam,
     struct ucred *cred, struct mbuf *m)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
 	struct in6_addr addr6;
 	int error;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	/*
 	 * Call inner routine, to assign local interface address.
 	 * in6_pcbladdr() may automatically fill in sin6_scope_id.
 	 */
 	if ((error = in6_pcbladdr(inp, nam, &addr6)) != 0)
 		return (error);
 
 	if (in6_pcblookup_hash_locked(pcbinfo, &sin6->sin6_addr,
 			       sin6->sin6_port,
 			      IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
 			      ? &addr6 : &inp->in6p_laddr,
 			      inp->inp_lport, 0, NULL) != NULL) {
 		return (EADDRINUSE);
 	}
 	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
 		if (inp->inp_lport == 0) {
 			error = in6_pcbbind(inp, (struct sockaddr *)0, cred);
 			if (error)
 				return (error);
 		}
 		inp->in6p_laddr = addr6;
 	}
 	inp->in6p_faddr = sin6->sin6_addr;
 	inp->inp_fport = sin6->sin6_port;
 	/* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
 	inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
 	if (inp->inp_flags & IN6P_AUTOFLOWLABEL)
 		inp->inp_flow |=
 		    (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
 
 	in_pcbrehash_mbuf(inp, m);
 
 	return (0);
 }
 
 int
 in6_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
 {
 
 	return (in6_pcbconnect_mbuf(inp, nam, cred, NULL));
 }
 
 void
 in6_pcbdisconnect(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	bzero((caddr_t)&inp->in6p_faddr, sizeof(inp->in6p_faddr));
 	inp->inp_fport = 0;
 	/* clear flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
 	inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
 	in_pcbrehash(inp);
 }
 
 struct sockaddr *
 in6_sockaddr(in_port_t port, struct in6_addr *addr_p)
 {
 	struct sockaddr_in6 *sin6;
 
 	sin6 = malloc(sizeof *sin6, M_SONAME, M_WAITOK);
 	bzero(sin6, sizeof *sin6);
 	sin6->sin6_family = AF_INET6;
 	sin6->sin6_len = sizeof(*sin6);
 	sin6->sin6_port = port;
 	sin6->sin6_addr = *addr_p;
 	(void)sa6_recoverscope(sin6); /* XXX: should catch errors */
 
 	return (struct sockaddr *)sin6;
 }
 
 struct sockaddr *
 in6_v4mapsin6_sockaddr(in_port_t port, struct in_addr *addr_p)
 {
 	struct sockaddr_in sin;
 	struct sockaddr_in6 *sin6_p;
 
 	bzero(&sin, sizeof sin);
 	sin.sin_family = AF_INET;
 	sin.sin_len = sizeof(sin);
 	sin.sin_port = port;
 	sin.sin_addr = *addr_p;
 
 	sin6_p = malloc(sizeof *sin6_p, M_SONAME,
 		M_WAITOK);
 	in6_sin_2_v4mapsin6(&sin, sin6_p);
 
 	return (struct sockaddr *)sin6_p;
 }
 
 int
 in6_getsockaddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in6_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in6_getsockaddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_lport;
 	addr = inp->in6p_laddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in6_sockaddr(port, &addr);
 	return 0;
 }
 
 int
 in6_getpeeraddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in6_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in6_getpeeraddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_fport;
 	addr = inp->in6p_faddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in6_sockaddr(port, &addr);
 	return 0;
 }
 
 int
 in6_mapped_sockaddr(struct socket *so, struct sockaddr **nam)
 {
 	struct	inpcb *inp;
 	int	error;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in6_mapped_sockaddr: inp == NULL"));
 
 #ifdef INET
 	if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) == INP_IPV4) {
 		error = in_getsockaddr(so, nam);
 		if (error == 0)
 			in6_sin_2_v4mapsin6_in_sock(nam);
 	} else
 #endif
 	{
 		/* scope issues will be handled in in6_getsockaddr(). */
 		error = in6_getsockaddr(so, nam);
 	}
 
 	return error;
 }
 
 int
 in6_mapped_peeraddr(struct socket *so, struct sockaddr **nam)
 {
 	struct	inpcb *inp;
 	int	error;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in6_mapped_peeraddr: inp == NULL"));
 
 #ifdef INET
 	if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) == INP_IPV4) {
 		error = in_getpeeraddr(so, nam);
 		if (error == 0)
 			in6_sin_2_v4mapsin6_in_sock(nam);
 	} else
 #endif
 	/* scope issues will be handled in in6_getpeeraddr(). */
 	error = in6_getpeeraddr(so, nam);
 
 	return error;
 }
 
 /*
  * Pass some notification to all connections of a protocol
  * associated with address dst.  The local address and/or port numbers
  * may be specified to limit the search.  The "usual action" will be
  * taken, depending on the ctlinput cmd.  The caller must filter any
  * cmds that are uninteresting (e.g., no error in the map).
  * Call the protocol specific routine (if any) to report
  * any errors for each matching socket.
  */
 void
 in6_pcbnotify(struct inpcbinfo *pcbinfo, struct sockaddr *dst,
     u_int fport_arg, const struct sockaddr *src, u_int lport_arg,
     int cmd, void *cmdarg,
     struct inpcb *(*notify)(struct inpcb *, int))
 {
 	struct inpcb *inp, *inp_temp;
 	struct sockaddr_in6 sa6_src, *sa6_dst;
 	u_short	fport = fport_arg, lport = lport_arg;
 	u_int32_t flowinfo;
 	int errno;
 
 	if ((unsigned)cmd >= PRC_NCMDS || dst->sa_family != AF_INET6)
 		return;
 
 	sa6_dst = (struct sockaddr_in6 *)dst;
 	if (IN6_IS_ADDR_UNSPECIFIED(&sa6_dst->sin6_addr))
 		return;
 
 	/*
 	 * note that src can be NULL when we get notify by local fragmentation.
 	 */
 	sa6_src = (src == NULL) ? sa6_any : *(const struct sockaddr_in6 *)src;
 	flowinfo = sa6_src.sin6_flowinfo;
 
 	/*
 	 * Redirects go to all references to the destination,
 	 * and use in6_rtchange to invalidate the route cache.
 	 * Dead host indications: also use in6_rtchange to invalidate
 	 * the cache, and deliver the error to all the sockets.
 	 * Otherwise, if we have knowledge of the local port and address,
 	 * deliver only to that socket.
 	 */
 	if (PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) {
 		fport = 0;
 		lport = 0;
 		bzero((caddr_t)&sa6_src.sin6_addr, sizeof(sa6_src.sin6_addr));
 
 		if (cmd != PRC_HOSTDEAD)
 			notify = in6_rtchange;
 	}
 	errno = inet6ctlerrmap[cmd];
 	INP_INFO_WLOCK(pcbinfo);
 	CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
 		INP_WLOCK(inp);
 		if ((inp->inp_vflag & INP_IPV6) == 0) {
 			INP_WUNLOCK(inp);
 			continue;
 		}
 
 		/*
 		 * If the error designates a new path MTU for a destination
 		 * and the application (associated with this socket) wanted to
 		 * know the value, notify.
 		 * XXX: should we avoid to notify the value to TCP sockets?
 		 */
 		if (cmd == PRC_MSGSIZE && cmdarg != NULL)
 			ip6_notify_pmtu(inp, (struct sockaddr_in6 *)dst,
 					*(u_int32_t *)cmdarg);
 
 		/*
 		 * Detect if we should notify the error. If no source and
 		 * destination ports are specifed, but non-zero flowinfo and
 		 * local address match, notify the error. This is the case
 		 * when the error is delivered with an encrypted buffer
 		 * by ESP. Otherwise, just compare addresses and ports
 		 * as usual.
 		 */
 		if (lport == 0 && fport == 0 && flowinfo &&
 		    inp->inp_socket != NULL &&
 		    flowinfo == (inp->inp_flow & IPV6_FLOWLABEL_MASK) &&
 		    IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &sa6_src.sin6_addr))
 			goto do_notify;
 		else if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr,
 					     &sa6_dst->sin6_addr) ||
 			 inp->inp_socket == 0 ||
 			 (lport && inp->inp_lport != lport) ||
 			 (!IN6_IS_ADDR_UNSPECIFIED(&sa6_src.sin6_addr) &&
 			  !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr,
 					      &sa6_src.sin6_addr)) ||
 			 (fport && inp->inp_fport != fport)) {
 			INP_WUNLOCK(inp);
 			continue;
 		}
 
 	  do_notify:
 		if (notify) {
 			if ((*notify)(inp, errno))
 				INP_WUNLOCK(inp);
 		} else
 			INP_WUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(pcbinfo);
 }
 
 /*
  * Lookup a PCB based on the local address and port.  Caller must hold the
  * hash lock.  No inpcb locks or references are acquired.
  */
 struct inpcb *
 in6_pcblookup_local(struct inpcbinfo *pcbinfo, struct in6_addr *laddr,
     u_short lport, int lookupflags, struct ucred *cred)
 {
 	struct inpcb *inp;
 	int matchwild = 3, wildcard;
 
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
 		struct inpcbhead *head;
 		/*
 		 * Look for an unconnected (wildcard foreign addr) PCB that
 		 * matches the local address and port we're looking for.
 		 */
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(
 		    INP6_PCBHASHKEY(&in6addr_any), lport, 0,
 		    pcbinfo->ipi_hashmask)];
 		CK_LIST_FOREACH(inp, head, inp_hash) {
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV6) == 0)
 				continue;
 			if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
 			    IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) &&
 			    inp->inp_lport == lport) {
 				/* Found. */
 				if (cred == NULL ||
 				    prison_equal_ip6(cred->cr_prison,
 					inp->inp_cred->cr_prison))
 					return (inp);
 			}
 		}
 		/*
 		 * Not found.
 		 */
 		return (NULL);
 	} else {
 		struct inpcbporthead *porthash;
 		struct inpcbport *phd;
 		struct inpcb *match = NULL;
 		/*
 		 * Best fit PCB lookup.
 		 *
 		 * First see if this local port is in use by looking on the
 		 * port hash list.
 		 */
 		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
 		    pcbinfo->ipi_porthashmask)];
 		CK_LIST_FOREACH(phd, porthash, phd_hash) {
 			if (phd->phd_port == lport)
 				break;
 		}
 		if (phd != NULL) {
 			/*
 			 * Port is in use by one or more PCBs. Look for best
 			 * fit.
 			 */
 			CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
 				wildcard = 0;
 				if (cred != NULL &&
 				    !prison_equal_ip6(cred->cr_prison,
 					inp->inp_cred->cr_prison))
 					continue;
 				/* XXX inp locking */
 				if ((inp->inp_vflag & INP_IPV6) == 0)
 					continue;
 				if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
 					wildcard++;
 				if (!IN6_IS_ADDR_UNSPECIFIED(
 					&inp->in6p_laddr)) {
 					if (IN6_IS_ADDR_UNSPECIFIED(laddr))
 						wildcard++;
 					else if (!IN6_ARE_ADDR_EQUAL(
 					    &inp->in6p_laddr, laddr))
 						continue;
 				} else {
 					if (!IN6_IS_ADDR_UNSPECIFIED(laddr))
 						wildcard++;
 				}
 				if (wildcard < matchwild) {
 					match = inp;
 					matchwild = wildcard;
 					if (matchwild == 0)
 						break;
 				}
 			}
 		}
 		return (match);
 	}
 }
 
 void
 in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
 {
 	struct inpcb *in6p;
+	struct in6_multi *inm;
+	struct in6_mfilter *imf;
 	struct ip6_moptions *im6o;
-	int i, gap;
 
 	INP_INFO_WLOCK(pcbinfo);
 	CK_LIST_FOREACH(in6p, pcbinfo->ipi_listhead, inp_list) {
 		INP_WLOCK(in6p);
 		if (__predict_false(in6p->inp_flags2 & INP_FREED)) {
 			INP_WUNLOCK(in6p);
 			continue;
 		}
 		im6o = in6p->in6p_moptions;
 		if ((in6p->inp_vflag & INP_IPV6) && im6o != NULL) {
 			/*
 			 * Unselect the outgoing ifp for multicast if it
 			 * is being detached.
 			 */
 			if (im6o->im6o_multicast_ifp == ifp)
 				im6o->im6o_multicast_ifp = NULL;
 			/*
 			 * Drop multicast group membership if we joined
 			 * through the interface being detached.
 			 */
-			gap = 0;
-			for (i = 0; i < im6o->im6o_num_memberships; i++) {
-				if (im6o->im6o_membership[i]->in6m_ifp ==
-				    ifp) {
-					in6_leavegroup(im6o->im6o_membership[i], NULL);
-					gap++;
-				} else if (gap != 0) {
-					im6o->im6o_membership[i - gap] =
-					    im6o->im6o_membership[i];
-				}
+restart:
+			IP6_MFILTER_FOREACH(imf, &im6o->im6o_head) {
+				if ((inm = imf->im6f_in6m) == NULL)
+					continue;
+				if (inm->in6m_ifp != ifp)
+					continue;
+				ip6_mfilter_remove(&im6o->im6o_head, imf);
+				IN6_MULTI_LOCK_ASSERT();
+				in6_leavegroup_locked(inm, NULL);
+				ip6_mfilter_free(imf);
+				goto restart;
 			}
-			im6o->im6o_num_memberships -= gap;
 		}
 		INP_WUNLOCK(in6p);
 	}
 	INP_INFO_WUNLOCK(pcbinfo);
 }
 
 /*
  * Check for alternatives when higher level complains
  * about service problems.  For now, invalidate cached
  * routing information.  If the route was created dynamically
  * (by a redirect), time to try a default gateway again.
  */
 void
 in6_losing(struct inpcb *inp)
 {
 
 	RO_INVALIDATE_CACHE(&inp->inp_route6);
 }
 
 /*
  * After a routing change, flush old routing
  * and allocate a (hopefully) better one.
  */
 struct inpcb *
 in6_rtchange(struct inpcb *inp, int errno __unused)
 {
 
 	RO_INVALIDATE_CACHE(&inp->inp_route6);
 	return inp;
 }
 
 static struct inpcb *
 in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
     const struct in6_addr *laddr, uint16_t lport, const struct in6_addr *faddr,
     uint16_t fport, int lookupflags)
 {
 	struct inpcb *local_wild;
 	const struct inpcblbgrouphead *hdr;
 	struct inpcblbgroup *grp;
 	uint32_t idx;
 
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	hdr = &pcbinfo->ipi_lbgrouphashbase[
 	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
 
 	/*
 	 * Order of socket selection:
 	 * 1. non-wild.
 	 * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
 	 *
 	 * NOTE:
 	 * - Load balanced group does not contain jailed sockets.
 	 * - Load balanced does not contain IPv4 mapped INET6 wild sockets.
 	 */
 	local_wild = NULL;
 	CK_LIST_FOREACH(grp, hdr, il_list) {
 #ifdef INET
 		if (!(grp->il_vflag & INP_IPV6))
 			continue;
 #endif
 		if (grp->il_lport != lport)
 			continue;
 
 		idx = INP_PCBLBGROUP_PKTHASH(INP6_PCBHASHKEY(faddr), lport,
 		    fport) % grp->il_inpcnt;
 		if (IN6_ARE_ADDR_EQUAL(&grp->il6_laddr, laddr))
 			return (grp->il_inp[idx]);
 		if (IN6_IS_ADDR_UNSPECIFIED(&grp->il6_laddr) &&
 		    (lookupflags & INPLOOKUP_WILDCARD) != 0)
 			local_wild = grp->il_inp[idx];
 	}
 	return (local_wild);
 }
 
 #ifdef PCBGROUP
 /*
  * Lookup PCB in hash list, using pcbgroup tables.
  */
 static struct inpcb *
 in6_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
     struct in6_addr *faddr, u_int fport_arg, struct in6_addr *laddr,
     u_int lport_arg, int lookupflags, struct ifnet *ifp)
 {
 	struct inpcbhead *head;
 	struct inpcb *inp, *tmpinp;
 	u_short fport = fport_arg, lport = lport_arg;
 	bool locked;
 
 	/*
 	 * First look for an exact match.
 	 */
 	tmpinp = NULL;
 	INP_GROUP_LOCK(pcbgroup);
 	head = &pcbgroup->ipg_hashbase[INP_PCBHASH(
 	    INP6_PCBHASHKEY(faddr), lport, fport, pcbgroup->ipg_hashmask)];
 	CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) {
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV6) == 0)
 			continue;
 		if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) &&
 		    IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) &&
 		    inp->inp_fport == fport &&
 		    inp->inp_lport == lport) {
 			/*
 			 * XXX We should be able to directly return
 			 * the inp here, without any checks.
 			 * Well unless both bound with SO_REUSEPORT?
 			 */
 			if (prison_flag(inp->inp_cred, PR_IP6))
 				goto found;
 			if (tmpinp == NULL)
 				tmpinp = inp;
 		}
 	}
 	if (tmpinp != NULL) {
 		inp = tmpinp;
 		goto found;
 	}
 
 	/*
 	 * Then look for a wildcard match in the pcbgroup.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 		struct inpcb *jail_wild = NULL;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 		head = &pcbgroup->ipg_hashbase[
 		    INP_PCBHASH(INADDR_ANY, lport, 0, pcbgroup->ipg_hashmask)];
 		CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) {
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV6) == 0)
 				continue;
 
 			if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) ||
 			    inp->inp_lport != lport) {
 				continue;
 			}
 
 			injail = prison_flag(inp->inp_cred, PR_IP6);
 			if (injail) {
 				if (prison_check_ip6(inp->inp_cred,
 				    laddr) != 0)
 					continue;
 			} else {
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) {
 				if (injail)
 					goto found;
 				else
 					local_exact = inp;
 			} else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
 				if (injail)
 					jail_wild = inp;
 				else
 					local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 
 		inp = jail_wild;
 		if (inp == NULL)
 			inp = jail_wild;
 		if (inp == NULL)
 			inp = local_exact;
 		if (inp == NULL)
 			inp = local_wild;
 		if (inp != NULL)
 			goto found;
 	}
 
 	/*
 	 * Then look for a wildcard match, if requested.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 		struct inpcb *jail_wild = NULL;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 		head = &pcbinfo->ipi_wildbase[INP_PCBHASH(
 		    INP6_PCBHASHKEY(&in6addr_any), lport, 0,
 		    pcbinfo->ipi_wildmask)];
 		CK_LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV6) == 0)
 				continue;
 
 			if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) ||
 			    inp->inp_lport != lport) {
 				continue;
 			}
 
 			injail = prison_flag(inp->inp_cred, PR_IP6);
 			if (injail) {
 				if (prison_check_ip6(inp->inp_cred,
 				    laddr) != 0)
 					continue;
 			} else {
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) {
 				if (injail)
 					goto found;
 				else
 					local_exact = inp;
 			} else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
 				if (injail)
 					jail_wild = inp;
 				else
 					local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 
 		inp = jail_wild;
 		if (inp == NULL)
 			inp = jail_wild;
 		if (inp == NULL)
 			inp = local_exact;
 		if (inp == NULL)
 			inp = local_wild;
 		if (inp != NULL)
 			goto found;
 	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
 	INP_GROUP_UNLOCK(pcbgroup);
 	return (NULL);
 
 found:
 	if (lookupflags & INPLOOKUP_WLOCKPCB)
 		locked = INP_TRY_WLOCK(inp);
 	else if (lookupflags & INPLOOKUP_RLOCKPCB)
 		locked = INP_TRY_RLOCK(inp);
 	else
 		panic("%s: locking buf", __func__);
 	if (!locked)
 		in_pcbref(inp);
 	INP_GROUP_UNLOCK(pcbgroup);
 	if (!locked) {
 		if (lookupflags & INPLOOKUP_WLOCKPCB) {
 			INP_WLOCK(inp);
 			if (in_pcbrele_wlocked(inp))
 				return (NULL);
 		} else {
 			INP_RLOCK(inp);
 			if (in_pcbrele_rlocked(inp))
 				return (NULL);
 		}
 	}
 #ifdef INVARIANTS
 	if (lookupflags & INPLOOKUP_WLOCKPCB)
 		INP_WLOCK_ASSERT(inp);
 	else
 		INP_RLOCK_ASSERT(inp);
 #endif
 	return (inp);
 }
 #endif /* PCBGROUP */
 
 /*
  * Lookup PCB in hash list.
  */
 static struct inpcb *
 in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
     u_int fport_arg, struct in6_addr *laddr, u_int lport_arg,
     int lookupflags, struct ifnet *ifp)
 {
 	struct inpcbhead *head;
 	struct inpcb *inp, *tmpinp;
 	u_short fport = fport_arg, lport = lport_arg;
 
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	/*
 	 * First look for an exact match.
 	 */
 	tmpinp = NULL;
 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(
 	    INP6_PCBHASHKEY(faddr), lport, fport, pcbinfo->ipi_hashmask)];
 	CK_LIST_FOREACH(inp, head, inp_hash) {
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV6) == 0)
 			continue;
 		if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) &&
 		    IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) &&
 		    inp->inp_fport == fport &&
 		    inp->inp_lport == lport) {
 			/*
 			 * XXX We should be able to directly return
 			 * the inp here, without any checks.
 			 * Well unless both bound with SO_REUSEPORT?
 			 */
 			if (prison_flag(inp->inp_cred, PR_IP6))
 				return (inp);
 			if (tmpinp == NULL)
 				tmpinp = inp;
 		}
 	}
 	if (tmpinp != NULL)
 		return (tmpinp);
 
 	/*
 	 * Then look in lb group (for wildcard match).
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		inp = in6_pcblookup_lbgroup(pcbinfo, laddr, lport, faddr,
 		    fport, lookupflags);
 		if (inp != NULL)
 			return (inp);
 	}
 
 	/*
 	 * Then look for a wildcard match, if requested.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 		struct inpcb *jail_wild = NULL;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(
 		    INP6_PCBHASHKEY(&in6addr_any), lport, 0,
 		    pcbinfo->ipi_hashmask)];
 		CK_LIST_FOREACH(inp, head, inp_hash) {
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV6) == 0)
 				continue;
 
 			if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) ||
 			    inp->inp_lport != lport) {
 				continue;
 			}
 
 			injail = prison_flag(inp->inp_cred, PR_IP6);
 			if (injail) {
 				if (prison_check_ip6(inp->inp_cred,
 				    laddr) != 0)
 					continue;
 			} else {
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) {
 				if (injail)
 					return (inp);
 				else
 					local_exact = inp;
 			} else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
 				if (injail)
 					jail_wild = inp;
 				else
 					local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 
 		if (jail_wild != NULL)
 			return (jail_wild);
 		if (local_exact != NULL)
 			return (local_exact);
 		if (local_wild != NULL)
 			return (local_wild);
 	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
 
 	/*
 	 * Not found.
 	 */
 	return (NULL);
 }
 
 /*
  * Lookup PCB in hash list, using pcbinfo tables.  This variation locks the
  * hash list lock, and will return the inpcb locked (i.e., requires
  * INPLOOKUP_LOCKPCB).
  */
 static struct inpcb *
 in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
     u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags,
     struct ifnet *ifp)
 {
 	struct inpcb *inp;
 
 	INP_HASH_RLOCK(pcbinfo);
 	inp = in6_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
 	    (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp);
 	if (inp != NULL) {
 		if (lookupflags & INPLOOKUP_WLOCKPCB) {
 			INP_WLOCK(inp);
 			if (__predict_false(inp->inp_flags2 & INP_FREED)) {
 				INP_WUNLOCK(inp);
 				inp = NULL;
 			}
 		} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
 			INP_RLOCK(inp);
 			if (__predict_false(inp->inp_flags2 & INP_FREED)) {
 				INP_RUNLOCK(inp);
 				inp = NULL;
 			}
 		} else
 			panic("%s: locking bug", __func__);
 #ifdef INVARIANTS
 		if (inp != NULL) {
 			if (lookupflags & INPLOOKUP_WLOCKPCB)
 				INP_WLOCK_ASSERT(inp);
 			else
 				INP_RLOCK_ASSERT(inp);
 		}
 #endif
 	}
 	INP_HASH_RUNLOCK(pcbinfo);
 	return (inp);
 }
 
 /*
  * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
  * from which a pre-calculated hash value may be extracted.
  *
  * Possibly more of this logic should be in in6_pcbgroup.c.
  */
 struct inpcb *
 in6_pcblookup(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport,
     struct in6_addr *laddr, u_int lport, int lookupflags, struct ifnet *ifp)
 {
 #if defined(PCBGROUP) && !defined(RSS)
 	struct inpcbgroup *pcbgroup;
 #endif
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 	/*
 	 * When not using RSS, use connection groups in preference to the
 	 * reservation table when looking up 4-tuples.  When using RSS, just
 	 * use the reservation table, due to the cost of the Toeplitz hash
 	 * in software.
 	 *
 	 * XXXRW: This policy belongs in the pcbgroup code, as in principle
 	 * we could be doing RSS with a non-Toeplitz hash that is affordable
 	 * in software.
 	 */
 #if defined(PCBGROUP) && !defined(RSS)
 	if (in_pcbgroup_enabled(pcbinfo)) {
 		pcbgroup = in6_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
 		    fport);
 		return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
 		    laddr, lport, lookupflags, ifp));
 	}
 #endif
 	return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp));
 }
 
 struct inpcb *
 in6_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
     u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags,
     struct ifnet *ifp, struct mbuf *m)
 {
 #ifdef PCBGROUP
 	struct inpcbgroup *pcbgroup;
 #endif
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 #ifdef PCBGROUP
 	/*
 	 * If we can use a hardware-generated hash to look up the connection
 	 * group, use that connection group to find the inpcb.  Otherwise
 	 * fall back on a software hash -- or the reservation table if we're
 	 * using RSS.
 	 *
 	 * XXXRW: As above, that policy belongs in the pcbgroup code.
 	 */
 	if (in_pcbgroup_enabled(pcbinfo) &&
 	    M_HASHTYPE_TEST(m, M_HASHTYPE_NONE) == 0) {
 		pcbgroup = in6_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
 		    m->m_pkthdr.flowid);
 		if (pcbgroup != NULL)
 			return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr,
 			    fport, laddr, lport, lookupflags, ifp));
 #ifndef RSS
 		pcbgroup = in6_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
 		    fport);
 		return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
 		    laddr, lport, lookupflags, ifp));
 #endif
 	}
 #endif
 	return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp));
 }
 
 void
 init_sin6(struct sockaddr_in6 *sin6, struct mbuf *m, int srcordst)
 {
 	struct ip6_hdr *ip;
 
 	ip = mtod(m, struct ip6_hdr *);
 	bzero(sin6, sizeof(*sin6));
 	sin6->sin6_len = sizeof(*sin6);
 	sin6->sin6_family = AF_INET6;
 	sin6->sin6_addr = srcordst ? ip->ip6_dst : ip->ip6_src;
 
 	(void)sa6_recoverscope(sin6); /* XXX: should catch errors... */
 
 	return;
 }
Index: head/sys/netinet6/in6_var.h
===================================================================
--- head/sys/netinet6/in6_var.h	(revision 349368)
+++ head/sys/netinet6/in6_var.h	(revision 349369)
@@ -1,876 +1,928 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: in6_var.h,v 1.56 2001/03/29 05:34:31 itojun Exp $
  */
 
 /*-
  * Copyright (c) 1985, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_var.h	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #ifndef _NETINET6_IN6_VAR_H_
 #define _NETINET6_IN6_VAR_H_
 
 #include <sys/tree.h>
 #include <sys/counter.h>
 
 #ifdef _KERNEL
 #include <sys/fnv_hash.h>
 #include <sys/libkern.h>
 #endif
 
 /*
  * Interface address, Internet version.  One of these structures
  * is allocated for each interface with an Internet address.
  * The ifaddr structure contains the protocol-independent part
  * of the structure and is assumed to be first.
  */
 
 /*
  * pltime/vltime are just for future reference (required to implements 2
  * hour rule for hosts).  they should never be modified by nd6_timeout or
  * anywhere else.
  *	userland -> kernel: accept pltime/vltime
  *	kernel -> userland: throw up everything
  *	in kernel: modify preferred/expire only
  */
 struct in6_addrlifetime {
 	time_t ia6t_expire;	/* valid lifetime expiration time */
 	time_t ia6t_preferred;	/* preferred lifetime expiration time */
 	u_int32_t ia6t_vltime;	/* valid lifetime */
 	u_int32_t ia6t_pltime;	/* prefix lifetime */
 };
 
 struct nd_ifinfo;
 struct scope6_id;
 struct lltable;
 struct mld_ifsoftc;
 struct in6_multi;
 
 struct in6_ifextra {
 	counter_u64_t *in6_ifstat;
 	counter_u64_t *icmp6_ifstat;
 	struct nd_ifinfo *nd_ifinfo;
 	struct scope6_id *scope6_id;
 	struct lltable *lltable;
 	struct mld_ifsoftc *mld_ifinfo;
 };
 
 #define	LLTABLE6(ifp)	(((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->lltable)
 
 #ifdef _KERNEL
 
 SLIST_HEAD(in6_multi_head, in6_multi);
 MALLOC_DECLARE(M_IP6MADDR);
 
 struct	in6_ifaddr {
 	struct	ifaddr ia_ifa;		/* protocol-independent info */
 #define	ia_ifp		ia_ifa.ifa_ifp
 #define ia_flags	ia_ifa.ifa_flags
 	struct	sockaddr_in6 ia_addr;	/* interface address */
 	struct	sockaddr_in6 ia_net;	/* network number of interface */
 	struct	sockaddr_in6 ia_dstaddr; /* space for destination addr */
 	struct	sockaddr_in6 ia_prefixmask; /* prefix mask */
 	u_int32_t ia_plen;		/* prefix length */
 	CK_STAILQ_ENTRY(in6_ifaddr)	ia_link;	/* list of IPv6 addresses */
 	int	ia6_flags;
 
 	struct in6_addrlifetime ia6_lifetime;
 	time_t	ia6_createtime; /* the creation time of this address, which is
 				 * currently used for temporary addresses only.
 				 */
 	time_t	ia6_updatetime;
 
 	/* back pointer to the ND prefix (for autoconfigured addresses only) */
 	struct nd_prefix *ia6_ndpr;
 
 	/* multicast addresses joined from the kernel */
 	LIST_HEAD(, in6_multi_mship) ia6_memberships;
 	/* entry in bucket of inet6 addresses */
 	CK_LIST_ENTRY(in6_ifaddr) ia6_hash;
 };
 
 /* List of in6_ifaddr's. */
 CK_STAILQ_HEAD(in6_ifaddrhead, in6_ifaddr);
 CK_LIST_HEAD(in6_ifaddrlisthead, in6_ifaddr);
 #endif	/* _KERNEL */
 
 /* control structure to manage address selection policy */
 struct in6_addrpolicy {
 	struct sockaddr_in6 addr; /* prefix address */
 	struct sockaddr_in6 addrmask; /* prefix mask */
 	int preced;		/* precedence */
 	int label;		/* matching label */
 	u_quad_t use;		/* statistics */
 };
 
 /*
  * IPv6 interface statistics, as defined in RFC2465 Ipv6IfStatsEntry (p12).
  */
 struct in6_ifstat {
 	uint64_t ifs6_in_receive;	/* # of total input datagram */
 	uint64_t ifs6_in_hdrerr;	/* # of datagrams with invalid hdr */
 	uint64_t ifs6_in_toobig;	/* # of datagrams exceeded MTU */
 	uint64_t ifs6_in_noroute;	/* # of datagrams with no route */
 	uint64_t ifs6_in_addrerr;	/* # of datagrams with invalid dst */
 	uint64_t ifs6_in_protounknown;	/* # of datagrams with unknown proto */
 					/* NOTE: increment on final dst if */
 	uint64_t ifs6_in_truncated;	/* # of truncated datagrams */
 	uint64_t ifs6_in_discard;	/* # of discarded datagrams */
 					/* NOTE: fragment timeout is not here */
 	uint64_t ifs6_in_deliver;	/* # of datagrams delivered to ULP */
 					/* NOTE: increment on final dst if */
 	uint64_t ifs6_out_forward;	/* # of datagrams forwarded */
 					/* NOTE: increment on outgoing if */
 	uint64_t ifs6_out_request;	/* # of outgoing datagrams from ULP */
 					/* NOTE: does not include forwrads */
 	uint64_t ifs6_out_discard;	/* # of discarded datagrams */
 	uint64_t ifs6_out_fragok;	/* # of datagrams fragmented */
 	uint64_t ifs6_out_fragfail;	/* # of datagrams failed on fragment */
 	uint64_t ifs6_out_fragcreat;	/* # of fragment datagrams */
 					/* NOTE: this is # after fragment */
 	uint64_t ifs6_reass_reqd;	/* # of incoming fragmented packets */
 					/* NOTE: increment on final dst if */
 	uint64_t ifs6_reass_ok;		/* # of reassembled packets */
 					/* NOTE: this is # after reass */
 					/* NOTE: increment on final dst if */
 	uint64_t ifs6_reass_fail;	/* # of reass failures */
 					/* NOTE: may not be packet count */
 					/* NOTE: increment on final dst if */
 	uint64_t ifs6_in_mcast;		/* # of inbound multicast datagrams */
 	uint64_t ifs6_out_mcast;	/* # of outbound multicast datagrams */
 };
 
 /*
  * ICMPv6 interface statistics, as defined in RFC2466 Ipv6IfIcmpEntry.
  * XXX: I'm not sure if this file is the right place for this structure...
  */
 struct icmp6_ifstat {
 	/*
 	 * Input statistics
 	 */
 	/* ipv6IfIcmpInMsgs, total # of input messages */
 	uint64_t ifs6_in_msg;
 	/* ipv6IfIcmpInErrors, # of input error messages */
 	uint64_t ifs6_in_error;
 	/* ipv6IfIcmpInDestUnreachs, # of input dest unreach errors */
 	uint64_t ifs6_in_dstunreach;
 	/* ipv6IfIcmpInAdminProhibs, # of input administratively prohibited errs */
 	uint64_t ifs6_in_adminprohib;
 	/* ipv6IfIcmpInTimeExcds, # of input time exceeded errors */
 	uint64_t ifs6_in_timeexceed;
 	/* ipv6IfIcmpInParmProblems, # of input parameter problem errors */
 	uint64_t ifs6_in_paramprob;
 	/* ipv6IfIcmpInPktTooBigs, # of input packet too big errors */
 	uint64_t ifs6_in_pkttoobig;
 	/* ipv6IfIcmpInEchos, # of input echo requests */
 	uint64_t ifs6_in_echo;
 	/* ipv6IfIcmpInEchoReplies, # of input echo replies */
 	uint64_t ifs6_in_echoreply;
 	/* ipv6IfIcmpInRouterSolicits, # of input router solicitations */
 	uint64_t ifs6_in_routersolicit;
 	/* ipv6IfIcmpInRouterAdvertisements, # of input router advertisements */
 	uint64_t ifs6_in_routeradvert;
 	/* ipv6IfIcmpInNeighborSolicits, # of input neighbor solicitations */
 	uint64_t ifs6_in_neighborsolicit;
 	/* ipv6IfIcmpInNeighborAdvertisements, # of input neighbor advertisements */
 	uint64_t ifs6_in_neighboradvert;
 	/* ipv6IfIcmpInRedirects, # of input redirects */
 	uint64_t ifs6_in_redirect;
 	/* ipv6IfIcmpInGroupMembQueries, # of input MLD queries */
 	uint64_t ifs6_in_mldquery;
 	/* ipv6IfIcmpInGroupMembResponses, # of input MLD reports */
 	uint64_t ifs6_in_mldreport;
 	/* ipv6IfIcmpInGroupMembReductions, # of input MLD done */
 	uint64_t ifs6_in_mlddone;
 
 	/*
 	 * Output statistics. We should solve unresolved routing problem...
 	 */
 	/* ipv6IfIcmpOutMsgs, total # of output messages */
 	uint64_t ifs6_out_msg;
 	/* ipv6IfIcmpOutErrors, # of output error messages */
 	uint64_t ifs6_out_error;
 	/* ipv6IfIcmpOutDestUnreachs, # of output dest unreach errors */
 	uint64_t ifs6_out_dstunreach;
 	/* ipv6IfIcmpOutAdminProhibs, # of output administratively prohibited errs */
 	uint64_t ifs6_out_adminprohib;
 	/* ipv6IfIcmpOutTimeExcds, # of output time exceeded errors */
 	uint64_t ifs6_out_timeexceed;
 	/* ipv6IfIcmpOutParmProblems, # of output parameter problem errors */
 	uint64_t ifs6_out_paramprob;
 	/* ipv6IfIcmpOutPktTooBigs, # of output packet too big errors */
 	uint64_t ifs6_out_pkttoobig;
 	/* ipv6IfIcmpOutEchos, # of output echo requests */
 	uint64_t ifs6_out_echo;
 	/* ipv6IfIcmpOutEchoReplies, # of output echo replies */
 	uint64_t ifs6_out_echoreply;
 	/* ipv6IfIcmpOutRouterSolicits, # of output router solicitations */
 	uint64_t ifs6_out_routersolicit;
 	/* ipv6IfIcmpOutRouterAdvertisements, # of output router advertisements */
 	uint64_t ifs6_out_routeradvert;
 	/* ipv6IfIcmpOutNeighborSolicits, # of output neighbor solicitations */
 	uint64_t ifs6_out_neighborsolicit;
 	/* ipv6IfIcmpOutNeighborAdvertisements, # of output neighbor advertisements */
 	uint64_t ifs6_out_neighboradvert;
 	/* ipv6IfIcmpOutRedirects, # of output redirects */
 	uint64_t ifs6_out_redirect;
 	/* ipv6IfIcmpOutGroupMembQueries, # of output MLD queries */
 	uint64_t ifs6_out_mldquery;
 	/* ipv6IfIcmpOutGroupMembResponses, # of output MLD reports */
 	uint64_t ifs6_out_mldreport;
 	/* ipv6IfIcmpOutGroupMembReductions, # of output MLD done */
 	uint64_t ifs6_out_mlddone;
 };
 
 struct	in6_ifreq {
 	char	ifr_name[IFNAMSIZ];
 	union {
 		struct	sockaddr_in6 ifru_addr;
 		struct	sockaddr_in6 ifru_dstaddr;
 		int	ifru_flags;
 		int	ifru_flags6;
 		int	ifru_metric;
 		caddr_t	ifru_data;
 		struct in6_addrlifetime ifru_lifetime;
 		struct in6_ifstat ifru_stat;
 		struct icmp6_ifstat ifru_icmp6stat;
 		u_int32_t ifru_scope_id[16];
 	} ifr_ifru;
 };
 
 struct	in6_aliasreq {
 	char	ifra_name[IFNAMSIZ];
 	struct	sockaddr_in6 ifra_addr;
 	struct	sockaddr_in6 ifra_dstaddr;
 	struct	sockaddr_in6 ifra_prefixmask;
 	int	ifra_flags;
 	struct in6_addrlifetime ifra_lifetime;
 	int	ifra_vhid;
 };
 
 /* pre-10.x compat */
 struct	oin6_aliasreq {
 	char	ifra_name[IFNAMSIZ];
 	struct	sockaddr_in6 ifra_addr;
 	struct	sockaddr_in6 ifra_dstaddr;
 	struct	sockaddr_in6 ifra_prefixmask;
 	int	ifra_flags;
 	struct in6_addrlifetime ifra_lifetime;
 };
 
 /* prefix type macro */
 #define IN6_PREFIX_ND	1
 #define IN6_PREFIX_RR	2
 
 /*
  * prefix related flags passed between kernel(NDP related part) and
  * user land command(ifconfig) and daemon(rtadvd).
  */
 struct in6_prflags {
 	struct prf_ra {
 		u_char onlink : 1;
 		u_char autonomous : 1;
 		u_char reserved : 6;
 	} prf_ra;
 	u_char prf_reserved1;
 	u_short prf_reserved2;
 	/* want to put this on 4byte offset */
 	struct prf_rr {
 		u_char decrvalid : 1;
 		u_char decrprefd : 1;
 		u_char reserved : 6;
 	} prf_rr;
 	u_char prf_reserved3;
 	u_short prf_reserved4;
 };
 
 struct  in6_prefixreq {
 	char	ipr_name[IFNAMSIZ];
 	u_char	ipr_origin;
 	u_char	ipr_plen;
 	u_int32_t ipr_vltime;
 	u_int32_t ipr_pltime;
 	struct in6_prflags ipr_flags;
 	struct	sockaddr_in6 ipr_prefix;
 };
 
 #define PR_ORIG_RA	0
 #define PR_ORIG_RR	1
 #define PR_ORIG_STATIC	2
 #define PR_ORIG_KERNEL	3
 
 #define ipr_raf_onlink		ipr_flags.prf_ra.onlink
 #define ipr_raf_auto		ipr_flags.prf_ra.autonomous
 
 #define ipr_statef_onlink	ipr_flags.prf_state.onlink
 
 #define ipr_rrf_decrvalid	ipr_flags.prf_rr.decrvalid
 #define ipr_rrf_decrprefd	ipr_flags.prf_rr.decrprefd
 
 struct	in6_rrenumreq {
 	char	irr_name[IFNAMSIZ];
 	u_char	irr_origin;
 	u_char	irr_m_len;	/* match len for matchprefix */
 	u_char	irr_m_minlen;	/* minlen for matching prefix */
 	u_char	irr_m_maxlen;	/* maxlen for matching prefix */
 	u_char	irr_u_uselen;	/* uselen for adding prefix */
 	u_char	irr_u_keeplen;	/* keeplen from matching prefix */
 	struct irr_raflagmask {
 		u_char onlink : 1;
 		u_char autonomous : 1;
 		u_char reserved : 6;
 	} irr_raflagmask;
 	u_int32_t irr_vltime;
 	u_int32_t irr_pltime;
 	struct in6_prflags irr_flags;
 	struct	sockaddr_in6 irr_matchprefix;
 	struct	sockaddr_in6 irr_useprefix;
 };
 
 #define irr_raf_mask_onlink	irr_raflagmask.onlink
 #define irr_raf_mask_auto	irr_raflagmask.autonomous
 #define irr_raf_mask_reserved	irr_raflagmask.reserved
 
 #define irr_raf_onlink		irr_flags.prf_ra.onlink
 #define irr_raf_auto		irr_flags.prf_ra.autonomous
 
 #define irr_statef_onlink	irr_flags.prf_state.onlink
 
 #define irr_rrf			irr_flags.prf_rr
 #define irr_rrf_decrvalid	irr_flags.prf_rr.decrvalid
 #define irr_rrf_decrprefd	irr_flags.prf_rr.decrprefd
 
 /*
  * Given a pointer to an in6_ifaddr (ifaddr),
  * return a pointer to the addr as a sockaddr_in6
  */
 #define IA6_IN6(ia)	(&((ia)->ia_addr.sin6_addr))
 #define IA6_DSTIN6(ia)	(&((ia)->ia_dstaddr.sin6_addr))
 #define IA6_MASKIN6(ia)	(&((ia)->ia_prefixmask.sin6_addr))
 #define IA6_SIN6(ia)	(&((ia)->ia_addr))
 #define IA6_DSTSIN6(ia)	(&((ia)->ia_dstaddr))
 #define IFA_IN6(x)	(&((struct sockaddr_in6 *)((x)->ifa_addr))->sin6_addr)
 #define IFA_DSTIN6(x)	(&((struct sockaddr_in6 *)((x)->ifa_dstaddr))->sin6_addr)
 
 #define IFPR_IN6(x)	(&((struct sockaddr_in6 *)((x)->ifpr_prefix))->sin6_addr)
 
 #ifdef _KERNEL
 #define IN6_ARE_MASKED_ADDR_EQUAL(d, a, m)	(	\
 	(((d)->s6_addr32[0] ^ (a)->s6_addr32[0]) & (m)->s6_addr32[0]) == 0 && \
 	(((d)->s6_addr32[1] ^ (a)->s6_addr32[1]) & (m)->s6_addr32[1]) == 0 && \
 	(((d)->s6_addr32[2] ^ (a)->s6_addr32[2]) & (m)->s6_addr32[2]) == 0 && \
 	(((d)->s6_addr32[3] ^ (a)->s6_addr32[3]) & (m)->s6_addr32[3]) == 0 )
 #define IN6_MASK_ADDR(a, m)	do { \
 	(a)->s6_addr32[0] &= (m)->s6_addr32[0]; \
 	(a)->s6_addr32[1] &= (m)->s6_addr32[1]; \
 	(a)->s6_addr32[2] &= (m)->s6_addr32[2]; \
 	(a)->s6_addr32[3] &= (m)->s6_addr32[3]; \
 } while (0)
 #endif
 
 #define SIOCSIFADDR_IN6		 _IOW('i', 12, struct in6_ifreq)
 #define SIOCGIFADDR_IN6		_IOWR('i', 33, struct in6_ifreq)
 
 #ifdef _KERNEL
 /*
  * SIOCSxxx ioctls should be unused (see comments in in6.c), but
  * we do not shift numbers for binary compatibility.
  */
 #define SIOCSIFDSTADDR_IN6	 _IOW('i', 14, struct in6_ifreq)
 #define SIOCSIFNETMASK_IN6	 _IOW('i', 22, struct in6_ifreq)
 #endif
 
 #define SIOCGIFDSTADDR_IN6	_IOWR('i', 34, struct in6_ifreq)
 #define SIOCGIFNETMASK_IN6	_IOWR('i', 37, struct in6_ifreq)
 
 #define SIOCDIFADDR_IN6		 _IOW('i', 25, struct in6_ifreq)
 #define OSIOCAIFADDR_IN6	 _IOW('i', 26, struct oin6_aliasreq)
 #define SIOCAIFADDR_IN6		 _IOW('i', 27, struct in6_aliasreq)
 
 #define SIOCSIFPHYADDR_IN6       _IOW('i', 70, struct in6_aliasreq)
 #define	SIOCGIFPSRCADDR_IN6	_IOWR('i', 71, struct in6_ifreq)
 #define	SIOCGIFPDSTADDR_IN6	_IOWR('i', 72, struct in6_ifreq)
 
 #define SIOCGIFAFLAG_IN6	_IOWR('i', 73, struct in6_ifreq)
 
 #ifdef _KERNEL
 #define OSIOCGIFINFO_IN6	_IOWR('i', 76, struct in6_ondireq)
 #endif
 #define SIOCGIFINFO_IN6		_IOWR('i', 108, struct in6_ndireq)
 #define SIOCSIFINFO_IN6		_IOWR('i', 109, struct in6_ndireq)
 #define SIOCSNDFLUSH_IN6	_IOWR('i', 77, struct in6_ifreq)
 #define SIOCGNBRINFO_IN6	_IOWR('i', 78, struct in6_nbrinfo)
 #define SIOCSPFXFLUSH_IN6	_IOWR('i', 79, struct in6_ifreq)
 #define SIOCSRTRFLUSH_IN6	_IOWR('i', 80, struct in6_ifreq)
 
 #define SIOCGIFALIFETIME_IN6	_IOWR('i', 81, struct in6_ifreq)
 #define SIOCGIFSTAT_IN6		_IOWR('i', 83, struct in6_ifreq)
 #define SIOCGIFSTAT_ICMP6	_IOWR('i', 84, struct in6_ifreq)
 
 #define SIOCSDEFIFACE_IN6	_IOWR('i', 85, struct in6_ndifreq)
 #define SIOCGDEFIFACE_IN6	_IOWR('i', 86, struct in6_ndifreq)
 
 #define SIOCSIFINFO_FLAGS	_IOWR('i', 87, struct in6_ndireq) /* XXX */
 
 #define SIOCSSCOPE6		_IOW('i', 88, struct in6_ifreq)
 #define SIOCGSCOPE6		_IOWR('i', 89, struct in6_ifreq)
 #define SIOCGSCOPE6DEF		_IOWR('i', 90, struct in6_ifreq)
 
 #define SIOCSIFPREFIX_IN6	_IOW('i', 100, struct in6_prefixreq) /* set */
 #define SIOCGIFPREFIX_IN6	_IOWR('i', 101, struct in6_prefixreq) /* get */
 #define SIOCDIFPREFIX_IN6	_IOW('i', 102, struct in6_prefixreq) /* del */
 #define SIOCAIFPREFIX_IN6	_IOW('i', 103, struct in6_rrenumreq) /* add */
 #define SIOCCIFPREFIX_IN6	_IOW('i', 104, \
 				     struct in6_rrenumreq) /* change */
 #define SIOCSGIFPREFIX_IN6	_IOW('i', 105, \
 				     struct in6_rrenumreq) /* set global */
 
 #define SIOCGETSGCNT_IN6	_IOWR('u', 106, \
 				      struct sioc_sg_req6) /* get s,g pkt cnt */
 #define SIOCGETMIFCNT_IN6	_IOWR('u', 107, \
 				      struct sioc_mif_req6) /* get pkt cnt per if */
 
 #define SIOCAADDRCTL_POLICY	_IOW('u', 108, struct in6_addrpolicy)
 #define SIOCDADDRCTL_POLICY	_IOW('u', 109, struct in6_addrpolicy)
 
 #define IN6_IFF_ANYCAST		0x01	/* anycast address */
 #define IN6_IFF_TENTATIVE	0x02	/* tentative address */
 #define IN6_IFF_DUPLICATED	0x04	/* DAD detected duplicate */
 #define IN6_IFF_DETACHED	0x08	/* may be detached from the link */
 #define IN6_IFF_DEPRECATED	0x10	/* deprecated address */
 #define IN6_IFF_NODAD		0x20	/* don't perform DAD on this address
 					 * (obsolete)
 					 */
 #define IN6_IFF_AUTOCONF	0x40	/* autoconfigurable address. */
 #define IN6_IFF_TEMPORARY	0x80	/* temporary (anonymous) address. */
 #define	IN6_IFF_PREFER_SOURCE	0x0100	/* preferred address for SAS */
 
 /* do not input/output */
 #define IN6_IFF_NOTREADY (IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED)
 
 #ifdef _KERNEL
 #define IN6_ARE_SCOPE_CMP(a,b) ((a)-(b))
 #define IN6_ARE_SCOPE_EQUAL(a,b) ((a)==(b))
 #endif
 
 #ifdef _KERNEL
 VNET_DECLARE(struct in6_ifaddrhead, in6_ifaddrhead);
 VNET_DECLARE(struct in6_ifaddrlisthead *, in6_ifaddrhashtbl);
 VNET_DECLARE(u_long, in6_ifaddrhmask);
 #define	V_in6_ifaddrhead		VNET(in6_ifaddrhead)
 #define	V_in6_ifaddrhashtbl		VNET(in6_ifaddrhashtbl)
 #define	V_in6_ifaddrhmask		VNET(in6_ifaddrhmask)
 
 #define	IN6ADDR_NHASH_LOG2		8
 #define	IN6ADDR_NHASH			(1 << IN6ADDR_NHASH_LOG2)
 #define	IN6ADDR_HASHVAL(x)		(in6_addrhash(x))
 #define	IN6ADDR_HASH(x) \
     (&V_in6_ifaddrhashtbl[IN6ADDR_HASHVAL(x) & V_in6_ifaddrhmask])
 
 static __inline uint32_t
 in6_addrhash(const struct in6_addr *in6)
 {
 	uint32_t x;
 
 	x = in6->s6_addr32[0] ^ in6->s6_addr32[1] ^ in6->s6_addr32[2] ^
 	    in6->s6_addr32[3];
 	return (fnv_32_buf(&x, sizeof(x), FNV1_32_INIT));
 }
 
 extern struct rmlock in6_ifaddr_lock;
 #define	IN6_IFADDR_LOCK_ASSERT()	rm_assert(&in6_ifaddr_lock, RA_LOCKED)
 #define	IN6_IFADDR_RLOCK(t)		rm_rlock(&in6_ifaddr_lock, (t))
 #define	IN6_IFADDR_RLOCK_ASSERT()	rm_assert(&in6_ifaddr_lock, RA_RLOCKED)
 #define	IN6_IFADDR_RUNLOCK(t)		rm_runlock(&in6_ifaddr_lock, (t))
 #define	IN6_IFADDR_WLOCK()		rm_wlock(&in6_ifaddr_lock)
 #define	IN6_IFADDR_WLOCK_ASSERT()	rm_assert(&in6_ifaddr_lock, RA_WLOCKED)
 #define	IN6_IFADDR_WUNLOCK()		rm_wunlock(&in6_ifaddr_lock)
 
 #define in6_ifstat_inc(ifp, tag) \
 do {								\
 	if (ifp)						\
 		counter_u64_add(((struct in6_ifextra *)		\
 		    ((ifp)->if_afdata[AF_INET6]))->in6_ifstat[	\
 		    offsetof(struct in6_ifstat, tag) / sizeof(uint64_t)], 1);\
 } while (/*CONSTCOND*/ 0)
 
 extern u_char inet6ctlerrmap[];
 VNET_DECLARE(unsigned long, in6_maxmtu);
 #define	V_in6_maxmtu			VNET(in6_maxmtu)
 #endif /* _KERNEL */
 
 /*
  * IPv6 multicast MLD-layer source entry.
  */
 struct ip6_msource {
 	RB_ENTRY(ip6_msource)	im6s_link;	/* RB tree links */
 	struct in6_addr		im6s_addr;
 	struct im6s_st {
 		uint16_t	ex;		/* # of exclusive members */
 		uint16_t	in;		/* # of inclusive members */
 	}			im6s_st[2];	/* state at t0, t1 */
 	uint8_t			im6s_stp;	/* pending query */
 };
 RB_HEAD(ip6_msource_tree, ip6_msource);
 
 /*
  * IPv6 multicast PCB-layer source entry.
  *
  * NOTE: overlapping use of struct ip6_msource fields at start.
  */
 struct in6_msource {
 	RB_ENTRY(ip6_msource)	im6s_link;	/* Common field */
 	struct in6_addr		im6s_addr;	/* Common field */
 	uint8_t			im6sl_st[2];	/* state before/at commit */
 };
 
 #ifdef _KERNEL
 /*
  * IPv6 source tree comparison function.
  *
  * An ordered predicate is necessary; bcmp() is not documented to return
  * an indication of order, memcmp() is, and is an ISO C99 requirement.
  */
 static __inline int
 ip6_msource_cmp(const struct ip6_msource *a, const struct ip6_msource *b)
 {
 
 	return (memcmp(&a->im6s_addr, &b->im6s_addr, sizeof(struct in6_addr)));
 }
 RB_PROTOTYPE(ip6_msource_tree, ip6_msource, im6s_link, ip6_msource_cmp);
 
 /*
  * IPv6 multicast PCB-layer group filter descriptor.
  */
 struct in6_mfilter {
 	struct ip6_msource_tree	im6f_sources; /* source list for (S,G) */
 	u_long			im6f_nsrc;    /* # of source entries */
 	uint8_t			im6f_st[2];   /* state before/at commit */
+	struct in6_multi       *im6f_in6m;    /* associated multicast address */
+	STAILQ_ENTRY(in6_mfilter) im6f_entry; /* list entry */
 };
+
+/*
+ * Helper types and functions for IPv4 multicast filters.
+ */
+STAILQ_HEAD(ip6_mfilter_head, in6_mfilter);
+
+struct in6_mfilter *ip6_mfilter_alloc(int mflags, int st0, int st1);
+void ip6_mfilter_free(struct in6_mfilter *);
+
+static inline void
+ip6_mfilter_init(struct ip6_mfilter_head *head)
+{
+
+	STAILQ_INIT(head);
+}
+
+static inline struct in6_mfilter *
+ip6_mfilter_first(const struct ip6_mfilter_head *head)
+{
+
+	return (STAILQ_FIRST(head));
+}
+
+static inline void
+ip6_mfilter_insert(struct ip6_mfilter_head *head, struct in6_mfilter *imf)
+{
+
+	STAILQ_INSERT_TAIL(head, imf, im6f_entry);
+}
+
+static inline void
+ip6_mfilter_remove(struct ip6_mfilter_head *head, struct in6_mfilter *imf)
+{
+
+	STAILQ_REMOVE(head, imf, in6_mfilter, im6f_entry);
+}
+
+#define	IP6_MFILTER_FOREACH(imf, head) \
+	STAILQ_FOREACH(imf, head, im6f_entry)
+
+static inline size_t
+ip6_mfilter_count(struct ip6_mfilter_head *head)
+{
+	struct in6_mfilter *imf;
+	size_t num = 0;
+
+	STAILQ_FOREACH(imf, head, im6f_entry)
+		num++;
+	return (num);
+}
 
 /*
  * Legacy KAME IPv6 multicast membership descriptor.
  */
 struct in6_multi_mship {
 	struct	in6_multi *i6mm_maddr;
 	LIST_ENTRY(in6_multi_mship) i6mm_chain;
 };
 
 /*
  * IPv6 group descriptor.
  *
  * For every entry on an ifnet's if_multiaddrs list which represents
  * an IP multicast group, there is one of these structures.
  *
  * If any source filters are present, then a node will exist in the RB-tree
  * to permit fast lookup by source whenever an operation takes place.
  * This permits pre-order traversal when we issue reports.
  * Source filter trees are kept separately from the socket layer to
  * greatly simplify locking.
  *
  * When MLDv2 is active, in6m_timer is the response to group query timer.
  * The state-change timer in6m_sctimer is separate; whenever state changes
  * for the group the state change record is generated and transmitted,
  * and kept if retransmissions are necessary.
  *
  * FUTURE: in6m_link is now only used when groups are being purged
  * on a detaching ifnet. It could be demoted to a SLIST_ENTRY, but
  * because it is at the very start of the struct, we can't do this
  * w/o breaking the ABI for ifmcstat.
  */
 struct in6_multi {
 	struct	in6_addr in6m_addr;	/* IPv6 multicast address */
 	struct	ifnet *in6m_ifp;	/* back pointer to ifnet */
 	struct	ifmultiaddr *in6m_ifma;	/* back pointer to ifmultiaddr */
 	u_int	in6m_refcount;		/* reference count */
 	u_int	in6m_state;		/* state of the membership */
 	u_int	in6m_timer;		/* MLD6 listener report timer */
 
 	/* New fields for MLDv2 follow. */
 	struct mld_ifsoftc	*in6m_mli;	/* MLD info */
 	SLIST_ENTRY(in6_multi)	 in6m_nrele;	/* to-be-released by MLD */
 	SLIST_ENTRY(in6_multi)	 in6m_defer;	/* deferred MLDv1 */
 	struct ip6_msource_tree	 in6m_srcs;	/* tree of sources */
 	u_long			 in6m_nsrc;	/* # of tree entries */
 
 	struct mbufq		 in6m_scq;	/* queue of pending
 						 * state-change packets */
 	struct timeval		 in6m_lastgsrtv;	/* last G-S-R query */
 	uint16_t		 in6m_sctimer;	/* state-change timer */
 	uint16_t		 in6m_scrv;	/* state-change rexmit count */
 
 	/*
 	 * SSM state counters which track state at T0 (the time the last
 	 * state-change report's RV timer went to zero) and T1
 	 * (time of pending report, i.e. now).
 	 * Used for computing MLDv2 state-change reports. Several refcounts
 	 * are maintained here to optimize for common use-cases.
 	 */
 	struct in6m_st {
 		uint16_t	iss_fmode;	/* MLD filter mode */
 		uint16_t	iss_asm;	/* # of ASM listeners */
 		uint16_t	iss_ex;		/* # of exclusive members */
 		uint16_t	iss_in;		/* # of inclusive members */
 		uint16_t	iss_rec;	/* # of recorded sources */
 	}			in6m_st[2];	/* state at t0, t1 */
 };
 
 void in6m_disconnect_locked(struct in6_multi_head *inmh, struct in6_multi *inm);
 
 /*
  * Helper function to derive the filter mode on a source entry
  * from its internal counters. Predicates are:
  *  A source is only excluded if all listeners exclude it.
  *  A source is only included if no listeners exclude it,
  *  and at least one listener includes it.
  * May be used by ifmcstat(8).
  */
 static __inline uint8_t
 im6s_get_mode(const struct in6_multi *inm, const struct ip6_msource *ims,
     uint8_t t)
 {
 
 	t = !!t;
 	if (inm->in6m_st[t].iss_ex > 0 &&
 	    inm->in6m_st[t].iss_ex == ims->im6s_st[t].ex)
 		return (MCAST_EXCLUDE);
 	else if (ims->im6s_st[t].in > 0 && ims->im6s_st[t].ex == 0)
 		return (MCAST_INCLUDE);
 	return (MCAST_UNDEFINED);
 }
 
 /*
  * Lock macros for IPv6 layer multicast address lists.  IPv6 lock goes
  * before link layer multicast locks in the lock order.  In most cases,
  * consumers of IN_*_MULTI() macros should acquire the locks before
  * calling them; users of the in_{add,del}multi() functions should not.
  */
 extern struct mtx in6_multi_list_mtx;
 extern struct sx in6_multi_sx;
 
 #define	IN6_MULTI_LIST_LOCK()		mtx_lock(&in6_multi_list_mtx)
 #define	IN6_MULTI_LIST_UNLOCK()	mtx_unlock(&in6_multi_list_mtx)
 #define	IN6_MULTI_LIST_LOCK_ASSERT()	mtx_assert(&in6_multi_list_mtx, MA_OWNED)
 #define	IN6_MULTI_LIST_UNLOCK_ASSERT() mtx_assert(&in6_multi_list_mtx, MA_NOTOWNED)
 
 #define	IN6_MULTI_LOCK()		sx_xlock(&in6_multi_sx)
 #define	IN6_MULTI_UNLOCK()	sx_xunlock(&in6_multi_sx)
 #define	IN6_MULTI_LOCK_ASSERT()	sx_assert(&in6_multi_sx, SA_XLOCKED)
 #define	IN6_MULTI_UNLOCK_ASSERT() sx_assert(&in6_multi_sx, SA_XUNLOCKED)
 
 /*
  * Get the in6_multi pointer from a ifmultiaddr.
  * Returns NULL if ifmultiaddr is no longer valid.
  */
 static __inline struct in6_multi *
 in6m_ifmultiaddr_get_inm(struct ifmultiaddr *ifma)
 {
 
 	NET_EPOCH_ASSERT();
 
 	return ((ifma->ifma_addr->sa_family != AF_INET6 ||	
 	    (ifma->ifma_flags & IFMA_F_ENQUEUED) == 0) ? NULL :
 	    ifma->ifma_protospec);
 }
 
 /*
  * Look up an in6_multi record for an IPv6 multicast address
  * on the interface ifp.
  * If no record found, return NULL.
  */
 static __inline struct in6_multi *
 in6m_lookup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr)
 {
 	struct ifmultiaddr *ifma;
 	struct in6_multi *inm;
 
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		inm = in6m_ifmultiaddr_get_inm(ifma);
 		if (inm == NULL)
 			continue;
 		if (IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, mcaddr))
 			return (inm);
 	}
 	return (NULL);
 }
 
 /*
  * Wrapper for in6m_lookup_locked().
  *
  * SMPng: Assumes that neithr the IN6_MULTI_LOCK() or IF_ADDR_LOCK() are held.
  */
 static __inline struct in6_multi *
 in6m_lookup(struct ifnet *ifp, const struct in6_addr *mcaddr)
 {
 	struct epoch_tracker et;
 	struct in6_multi *inm;
 
 	IN6_MULTI_LIST_LOCK();
 	NET_EPOCH_ENTER(et);
 	inm = in6m_lookup_locked(ifp, mcaddr);
 	NET_EPOCH_EXIT(et);
 	IN6_MULTI_LIST_UNLOCK();
 
 	return (inm);
 }
 
 /* Acquire an in6_multi record. */
 static __inline void
 in6m_acquire_locked(struct in6_multi *inm)
 {
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 	++inm->in6m_refcount;
 }
 
 static __inline void
 in6m_acquire(struct in6_multi *inm)
 {
 	IN6_MULTI_LIST_LOCK();
 	in6m_acquire_locked(inm);
 	IN6_MULTI_LIST_UNLOCK();
 }
 
 static __inline void
 in6m_rele_locked(struct in6_multi_head *inmh, struct in6_multi *inm)
 {
 	KASSERT(inm->in6m_refcount > 0, ("refcount == %d inm: %p", inm->in6m_refcount, inm));
 	IN6_MULTI_LIST_LOCK_ASSERT();
 
 	if (--inm->in6m_refcount == 0) {
 		MPASS(inm->in6m_ifp == NULL);
 		inm->in6m_ifma->ifma_protospec = NULL;
 		MPASS(inm->in6m_ifma->ifma_llifma == NULL);
 		SLIST_INSERT_HEAD(inmh, inm, in6m_nrele);
 	}
 }
 
 struct ip6_moptions;
 struct sockopt;
 struct inpcbinfo;
 
 /* Multicast KPIs. */
 int	im6o_mc_filter(const struct ip6_moptions *, const struct ifnet *,
 	    const struct sockaddr *, const struct sockaddr *);
 int in6_joingroup(struct ifnet *, const struct in6_addr *,
 	    struct in6_mfilter *, struct in6_multi **, int);
 int	in6_joingroup_locked(struct ifnet *, const struct in6_addr *,
 	    struct in6_mfilter *, struct in6_multi **, int);
 int	in6_leavegroup(struct in6_multi *, struct in6_mfilter *);
 int	in6_leavegroup_locked(struct in6_multi *, struct in6_mfilter *);
 void	in6m_clear_recorded(struct in6_multi *);
 void	in6m_commit(struct in6_multi *);
 void	in6m_print(const struct in6_multi *);
 int	in6m_record_source(struct in6_multi *, const struct in6_addr *);
 void	in6m_release_list_deferred(struct in6_multi_head *);
 void	in6m_release_wait(void);
 void	ip6_freemoptions(struct ip6_moptions *);
 int	ip6_getmoptions(struct inpcb *, struct sockopt *);
 int	ip6_setmoptions(struct inpcb *, struct sockopt *);
 
 /* flags to in6_update_ifa */
 #define IN6_IFAUPDATE_DADDELAY	0x1 /* first time to configure an address */
 
 int	in6_mask2len(struct in6_addr *, u_char *);
 int	in6_control(struct socket *, u_long, caddr_t, struct ifnet *,
 	struct thread *);
 int	in6_update_ifa(struct ifnet *, struct in6_aliasreq *,
 	struct in6_ifaddr *, int);
 void	in6_prepare_ifra(struct in6_aliasreq *, const struct in6_addr *,
 	const struct in6_addr *);
 void	in6_purgeaddr(struct ifaddr *);
 int	in6if_do_dad(struct ifnet *);
 void	in6_savemkludge(struct in6_ifaddr *);
 void	*in6_domifattach(struct ifnet *);
 void	in6_domifdetach(struct ifnet *, void *);
 int	in6_domifmtu(struct ifnet *);
 void	in6_setmaxmtu(void);
 int	in6_if2idlen(struct ifnet *);
 struct in6_ifaddr *in6ifa_ifpforlinklocal(struct ifnet *, int);
 struct in6_ifaddr *in6ifa_ifpwithaddr(struct ifnet *, const struct in6_addr *);
 struct in6_ifaddr *in6ifa_ifwithaddr(const struct in6_addr *, uint32_t);
 struct in6_ifaddr *in6ifa_llaonifp(struct ifnet *);
 int	in6_addr2zoneid(struct ifnet *, struct in6_addr *, u_int32_t *);
 int	in6_matchlen(struct in6_addr *, struct in6_addr *);
 int	in6_are_prefix_equal(struct in6_addr *, struct in6_addr *, int);
 void	in6_prefixlen2mask(struct in6_addr *, int);
 int	in6_prefix_ioctl(struct socket *, u_long, caddr_t,
 	struct ifnet *);
 int	in6_prefix_add_ifid(int, struct in6_ifaddr *);
 void	in6_prefix_remove_ifid(int, struct in6_ifaddr *);
 void	in6_purgeprefix(struct ifnet *);
 
 int	in6_is_addr_deprecated(struct sockaddr_in6 *);
 int	in6_src_ioctl(u_long, caddr_t);
 
 void	in6_newaddrmsg(struct in6_ifaddr *, int);
 /*
  * Extended API for IPv6 FIB support.
  */
 struct mbuf *ip6_tryforward(struct mbuf *);
 void	in6_rtredirect(struct sockaddr *, struct sockaddr *, struct sockaddr *,
 	    int, struct sockaddr *, u_int);
 int	in6_rtrequest(int, struct sockaddr *, struct sockaddr *,
 	    struct sockaddr *, int, struct rtentry **, u_int);
 void	in6_rtalloc(struct route_in6 *, u_int);
 void	in6_rtalloc_ign(struct route_in6 *, u_long, u_int);
 struct rtentry *in6_rtalloc1(struct sockaddr *, int, u_long, u_int);
 #endif /* _KERNEL */
 
 #endif /* _NETINET6_IN6_VAR_H_ */
Index: head/sys/netinet6/ip6_var.h
===================================================================
--- head/sys/netinet6/ip6_var.h	(revision 349368)
+++ head/sys/netinet6/ip6_var.h	(revision 349369)
@@ -1,450 +1,449 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: ip6_var.h,v 1.62 2001/05/03 14:51:48 itojun Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_var.h	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #ifndef _NETINET6_IP6_VAR_H_
 #define _NETINET6_IP6_VAR_H_
 
 #include <sys/epoch.h>
 
 /*
  * IP6 reassembly queue structure.  Each fragment
  * being reassembled is attached to one of these structures.
  */
 struct	ip6q {
 	struct ip6asfrag *ip6q_down;
 	struct ip6asfrag *ip6q_up;
 	u_int32_t	ip6q_ident;
 	u_int8_t	ip6q_nxt;
 	u_int8_t	ip6q_ecn;
 	u_int8_t	ip6q_ttl;
 	struct in6_addr ip6q_src, ip6q_dst;
 	struct ip6q	*ip6q_next;
 	struct ip6q	*ip6q_prev;
 	int		ip6q_unfrglen;	/* len of unfragmentable part */
 #ifdef notyet
 	u_char		*ip6q_nxtp;
 #endif
 	int		ip6q_nfrag;	/* # of fragments */
 	struct label	*ip6q_label;
 };
 
 struct	ip6asfrag {
 	struct ip6asfrag *ip6af_down;
 	struct ip6asfrag *ip6af_up;
 	struct mbuf	*ip6af_m;
 	int		ip6af_offset;	/* offset in ip6af_m to next header */
 	int		ip6af_frglen;	/* fragmentable part length */
 	int		ip6af_off;	/* fragment offset */
 	u_int16_t	ip6af_mff;	/* more fragment bit in frag off */
 };
 
 #define IP6_REASS_MBUF(ip6af) (*(struct mbuf **)&((ip6af)->ip6af_m))
 
 /*
  * IP6 reinjecting structure.
  */
 struct ip6_direct_ctx {
 	uint32_t	ip6dc_nxt;	/* next header to process */
 	uint32_t	ip6dc_off;	/* offset to next header */
 };
 
+#if defined(_NETINET6_IN6_VAR_H_) && defined(_KERNEL)
 /*
  * Structure attached to inpcb.in6p_moptions and
  * passed to ip6_output when IPv6 multicast options are in use.
  * This structure is lazy-allocated.
  */
 struct ip6_moptions {
 	struct	ifnet *im6o_multicast_ifp; /* ifp for outgoing multicasts */
 	u_char	im6o_multicast_hlim;	/* hoplimit for outgoing multicasts */
 	u_char	im6o_multicast_loop;	/* 1 >= hear sends if a member */
-	u_short	im6o_num_memberships;	/* no. memberships this socket */
-	u_short	im6o_max_memberships;	/* max memberships this socket */
-	struct	in6_multi **im6o_membership;	/* group memberships */
-	struct	in6_mfilter *im6o_mfilters;	/* source filters */
-	struct	epoch_context imo6_epoch_ctx;
+	struct ip6_mfilter_head im6o_head; /* group membership list */
 };
-
+#else
+struct ip6_moptions;
+#endif
 /*
  * Control options for outgoing packets
  */
 
 /* Routing header related info */
 struct	ip6po_rhinfo {
 	struct	ip6_rthdr *ip6po_rhi_rthdr; /* Routing header */
 	struct	route_in6 ip6po_rhi_route; /* Route to the 1st hop */
 };
 #define ip6po_rthdr	ip6po_rhinfo.ip6po_rhi_rthdr
 #define ip6po_route	ip6po_rhinfo.ip6po_rhi_route
 
 /* Nexthop related info */
 struct	ip6po_nhinfo {
 	struct	sockaddr *ip6po_nhi_nexthop;
 	struct	route_in6 ip6po_nhi_route; /* Route to the nexthop */
 };
 #define ip6po_nexthop	ip6po_nhinfo.ip6po_nhi_nexthop
 #define ip6po_nextroute	ip6po_nhinfo.ip6po_nhi_route
 
 struct	ip6_pktopts {
 	struct	mbuf *ip6po_m;	/* Pointer to mbuf storing the data */
 	int	ip6po_hlim;	/* Hoplimit for outgoing packets */
 
 	/* Outgoing IF/address information */
 	struct	in6_pktinfo *ip6po_pktinfo;
 
 	/* Next-hop address information */
 	struct	ip6po_nhinfo ip6po_nhinfo;
 
 	struct	ip6_hbh *ip6po_hbh; /* Hop-by-Hop options header */
 
 	/* Destination options header (before a routing header) */
 	struct	ip6_dest *ip6po_dest1;
 
 	/* Routing header related info. */
 	struct	ip6po_rhinfo ip6po_rhinfo;
 
 	/* Destination options header (after a routing header) */
 	struct	ip6_dest *ip6po_dest2;
 
 	int	ip6po_tclass;	/* traffic class */
 
 	int	ip6po_minmtu;  /* fragment vs PMTU discovery policy */
 #define IP6PO_MINMTU_MCASTONLY	-1 /* default; send at min MTU for multicast*/
 #define IP6PO_MINMTU_DISABLE	 0 /* always perform pmtu disc */
 #define IP6PO_MINMTU_ALL	 1 /* always send at min MTU */
 
 	int	ip6po_prefer_tempaddr;  /* whether temporary addresses are
 					   preferred as source address */
 #define IP6PO_TEMPADDR_SYSTEM	-1 /* follow the system default */
 #define IP6PO_TEMPADDR_NOTPREFER 0 /* not prefer temporary address */
 #define IP6PO_TEMPADDR_PREFER	 1 /* prefer temporary address */
 
 	int ip6po_flags;
 #if 0	/* parameters in this block is obsolete. do not reuse the values. */
 #define IP6PO_REACHCONF	0x01	/* upper-layer reachability confirmation. */
 #define IP6PO_MINMTU	0x02	/* use minimum MTU (IPV6_USE_MIN_MTU) */
 #endif
 #define IP6PO_DONTFRAG	0x04	/* disable fragmentation (IPV6_DONTFRAG) */
 #define IP6PO_USECOA	0x08	/* use care of address */
 };
 
 /*
  * Control options for incoming packets
  */
 
 struct	ip6stat {
 	uint64_t ip6s_total;		/* total packets received */
 	uint64_t ip6s_tooshort;		/* packet too short */
 	uint64_t ip6s_toosmall;		/* not enough data */
 	uint64_t ip6s_fragments;	/* fragments received */
 	uint64_t ip6s_fragdropped;	/* frags dropped(dups, out of space) */
 	uint64_t ip6s_fragtimeout;	/* fragments timed out */
 	uint64_t ip6s_fragoverflow;	/* fragments that exceeded limit */
 	uint64_t ip6s_forward;		/* packets forwarded */
 	uint64_t ip6s_cantforward;	/* packets rcvd for unreachable dest */
 	uint64_t ip6s_redirectsent;	/* packets forwarded on same net */
 	uint64_t ip6s_delivered;	/* datagrams delivered to upper level*/
 	uint64_t ip6s_localout;		/* total ip packets generated here */
 	uint64_t ip6s_odropped;		/* lost packets due to nobufs, etc. */
 	uint64_t ip6s_reassembled;	/* total packets reassembled ok */
 	uint64_t ip6s_atomicfrags;	/* atomic fragments */
 	uint64_t ip6s_fragmented;	/* datagrams successfully fragmented */
 	uint64_t ip6s_ofragments;	/* output fragments created */
 	uint64_t ip6s_cantfrag;		/* don't fragment flag was set, etc. */
 	uint64_t ip6s_badoptions;	/* error in option processing */
 	uint64_t ip6s_noroute;		/* packets discarded due to no route */
 	uint64_t ip6s_badvers;		/* ip6 version != 6 */
 	uint64_t ip6s_rawout;		/* total raw ip packets generated */
 	uint64_t ip6s_badscope;		/* scope error */
 	uint64_t ip6s_notmember;	/* don't join this multicast group */
 #define	IP6S_HDRCNT		256	/* headers count */
 	uint64_t ip6s_nxthist[IP6S_HDRCNT]; /* next header history */
 	uint64_t ip6s_m1;		/* one mbuf */
 #define	IP6S_M2MMAX		32
 	uint64_t ip6s_m2m[IP6S_M2MMAX];	/* two or more mbuf */
 	uint64_t ip6s_mext1;		/* one ext mbuf */
 	uint64_t ip6s_mext2m;		/* two or more ext mbuf */
 	uint64_t ip6s_exthdrtoolong;	/* ext hdr are not contiguous */
 	uint64_t ip6s_nogif;		/* no match gif found */
 	uint64_t ip6s_toomanyhdr;	/* discarded due to too many headers */
 
 	/*
 	 * statistics for improvement of the source address selection
 	 * algorithm:
 	 * XXX: hardcoded 16 = # of ip6 multicast scope types + 1
 	 */
 #define	IP6S_RULESMAX		16
 #define	IP6S_SCOPECNT		16
 	/* number of times that address selection fails */
 	uint64_t ip6s_sources_none;
 	/* number of times that an address on the outgoing I/F is chosen */
 	uint64_t ip6s_sources_sameif[IP6S_SCOPECNT];
 	/* number of times that an address on a non-outgoing I/F is chosen */
 	uint64_t ip6s_sources_otherif[IP6S_SCOPECNT];
 	/*
 	 * number of times that an address that has the same scope
 	 * from the destination is chosen.
 	 */
 	uint64_t ip6s_sources_samescope[IP6S_SCOPECNT];
 	/*
 	 * number of times that an address that has a different scope
 	 * from the destination is chosen.
 	 */
 	uint64_t ip6s_sources_otherscope[IP6S_SCOPECNT];
 	/* number of times that a deprecated address is chosen */
 	uint64_t ip6s_sources_deprecated[IP6S_SCOPECNT];
 
 	/* number of times that each rule of source selection is applied. */
 	uint64_t ip6s_sources_rule[IP6S_RULESMAX];
 };
 
 #ifdef _KERNEL
 #include <sys/counter.h>
 
 VNET_PCPUSTAT_DECLARE(struct ip6stat, ip6stat);
 #define	IP6STAT_ADD(name, val)	\
     VNET_PCPUSTAT_ADD(struct ip6stat, ip6stat, name, (val))
 #define	IP6STAT_SUB(name, val)	IP6STAT_ADD(name, -(val))
 #define	IP6STAT_INC(name)	IP6STAT_ADD(name, 1)
 #define	IP6STAT_DEC(name)	IP6STAT_SUB(name, 1)
 #endif
 
 #ifdef _KERNEL
 /* flags passed to ip6_output as last parameter */
 #define	IPV6_UNSPECSRC		0x01	/* allow :: as the source address */
 #define	IPV6_FORWARDING		0x02	/* most of IPv6 header exists */
 #define	IPV6_MINMTU		0x04	/* use minimum MTU (IPV6_USE_MIN_MTU) */
 
 #ifdef __NO_STRICT_ALIGNMENT
 #define IP6_HDR_ALIGNED_P(ip)	1
 #else
 #define IP6_HDR_ALIGNED_P(ip)	((((intptr_t) (ip)) & 3) == 0)
 #endif
 
 VNET_DECLARE(int, ip6_defhlim);		/* default hop limit */
 VNET_DECLARE(int, ip6_defmcasthlim);	/* default multicast hop limit */
 VNET_DECLARE(int, ip6_forwarding);	/* act as router? */
 VNET_DECLARE(int, ip6_use_deprecated);	/* allow deprecated addr as source */
 VNET_DECLARE(int, ip6_rr_prune);	/* router renumbering prefix
 					 * walk list every 5 sec.    */
 VNET_DECLARE(int, ip6_mcast_pmtu);	/* enable pMTU discovery for multicast? */
 VNET_DECLARE(int, ip6_v6only);
 #define	V_ip6_defhlim			VNET(ip6_defhlim)
 #define	V_ip6_defmcasthlim		VNET(ip6_defmcasthlim)
 #define	V_ip6_forwarding		VNET(ip6_forwarding)
 #define	V_ip6_use_deprecated		VNET(ip6_use_deprecated)
 #define	V_ip6_rr_prune			VNET(ip6_rr_prune)
 #define	V_ip6_mcast_pmtu		VNET(ip6_mcast_pmtu)
 #define	V_ip6_v6only			VNET(ip6_v6only)
 
 VNET_DECLARE(struct socket *, ip6_mrouter);	/* multicast routing daemon */
 VNET_DECLARE(int, ip6_sendredirects);	/* send IP redirects when forwarding? */
 VNET_DECLARE(int, ip6_maxfragpackets);	/* Maximum packets in reassembly
 					 * queue */
 extern int ip6_maxfrags;		/* Maximum fragments in reassembly
 					 * queue */
 VNET_DECLARE(int, ip6_maxfragbucketsize); /* Maximum reassembly queues per bucket */
 VNET_DECLARE(int, ip6_maxfragsperpacket); /* Maximum fragments per packet */
 VNET_DECLARE(int, ip6_accept_rtadv);	/* Acts as a host not a router */
 VNET_DECLARE(int, ip6_no_radr);		/* No defroute from RA */
 VNET_DECLARE(int, ip6_norbit_raif);	/* Disable R-bit in NA on RA
 					 * receiving IF. */
 VNET_DECLARE(int, ip6_rfc6204w3);	/* Accept defroute from RA even when
 					   forwarding enabled */
 VNET_DECLARE(int, ip6_log_interval);
 VNET_DECLARE(time_t, ip6_log_time);
 VNET_DECLARE(int, ip6_hdrnestlimit);	/* upper limit of # of extension
 					 * headers */
 VNET_DECLARE(int, ip6_dad_count);	/* DupAddrDetectionTransmits */
 #define	V_ip6_mrouter			VNET(ip6_mrouter)
 #define	V_ip6_sendredirects		VNET(ip6_sendredirects)
 #define	V_ip6_maxfragpackets		VNET(ip6_maxfragpackets)
 #define	V_ip6_maxfragbucketsize		VNET(ip6_maxfragbucketsize)
 #define	V_ip6_maxfragsperpacket		VNET(ip6_maxfragsperpacket)
 #define	V_ip6_accept_rtadv		VNET(ip6_accept_rtadv)
 #define	V_ip6_no_radr			VNET(ip6_no_radr)
 #define	V_ip6_norbit_raif		VNET(ip6_norbit_raif)
 #define	V_ip6_rfc6204w3			VNET(ip6_rfc6204w3)
 #define	V_ip6_log_interval		VNET(ip6_log_interval)
 #define	V_ip6_log_time			VNET(ip6_log_time)
 #define	V_ip6_hdrnestlimit		VNET(ip6_hdrnestlimit)
 #define	V_ip6_dad_count			VNET(ip6_dad_count)
 
 VNET_DECLARE(int, ip6_auto_flowlabel);
 VNET_DECLARE(int, ip6_auto_linklocal);
 #define	V_ip6_auto_flowlabel		VNET(ip6_auto_flowlabel)
 #define	V_ip6_auto_linklocal		VNET(ip6_auto_linklocal)
 
 VNET_DECLARE(int, ip6_use_tempaddr);	/* Whether to use temporary addresses */
 VNET_DECLARE(int, ip6_prefer_tempaddr);	/* Whether to prefer temporary
 					 * addresses in the source address
 					 * selection */
 #define	V_ip6_use_tempaddr		VNET(ip6_use_tempaddr)
 #define	V_ip6_prefer_tempaddr		VNET(ip6_prefer_tempaddr)
 
 VNET_DECLARE(int, ip6_use_defzone);	/* Whether to use the default scope
 					 * zone when unspecified */
 #define	V_ip6_use_defzone		VNET(ip6_use_defzone)
 
 VNET_DECLARE(struct pfil_head *, inet6_pfil_head);
 #define	V_inet6_pfil_head	VNET(inet6_pfil_head)
 #define	PFIL_INET6_NAME		"inet6"
 
 #ifdef IPSTEALTH
 VNET_DECLARE(int, ip6stealth);
 #define	V_ip6stealth			VNET(ip6stealth)
 #endif
 
 #ifdef EXPERIMENTAL
 VNET_DECLARE(int, nd6_ignore_ipv6_only_ra);
 #define	V_nd6_ignore_ipv6_only_ra	VNET(nd6_ignore_ipv6_only_ra)
 #endif
 
 extern struct	pr_usrreqs rip6_usrreqs;
 struct sockopt;
 
 struct inpcb;
 
 int	icmp6_ctloutput(struct socket *, struct sockopt *sopt);
 
 struct in6_ifaddr;
 void	ip6_init(void);
 int	ip6proto_register(short);
 int	ip6proto_unregister(short);
 
 void	ip6_input(struct mbuf *);
 void	ip6_direct_input(struct mbuf *);
 void	ip6_freepcbopts(struct ip6_pktopts *);
 
 int	ip6_unknown_opt(u_int8_t *, struct mbuf *, int);
 int	ip6_get_prevhdr(const struct mbuf *, int);
 int	ip6_nexthdr(const struct mbuf *, int, int, int *);
 int	ip6_lasthdr(const struct mbuf *, int, int, int *);
 
 extern int	(*ip6_mforward)(struct ip6_hdr *, struct ifnet *,
     struct mbuf *);
 
 int	ip6_process_hopopts(struct mbuf *, u_int8_t *, int, u_int32_t *,
 				 u_int32_t *);
 struct mbuf	**ip6_savecontrol_v4(struct inpcb *, struct mbuf *,
 	    struct mbuf **, int *);
 void	ip6_savecontrol(struct inpcb *, struct mbuf *, struct mbuf **);
 void	ip6_notify_pmtu(struct inpcb *, struct sockaddr_in6 *, u_int32_t);
 int	ip6_sysctl(int *, u_int, void *, size_t *, void *, size_t);
 
 void	ip6_forward(struct mbuf *, int);
 
 void	ip6_mloopback(struct ifnet *, struct mbuf *);
 int	ip6_output(struct mbuf *, struct ip6_pktopts *,
 			struct route_in6 *,
 			int,
 			struct ip6_moptions *, struct ifnet **,
 			struct inpcb *);
 int	ip6_ctloutput(struct socket *, struct sockopt *);
 int	ip6_raw_ctloutput(struct socket *, struct sockopt *);
 void	ip6_initpktopts(struct ip6_pktopts *);
 int	ip6_setpktopts(struct mbuf *, struct ip6_pktopts *,
 	struct ip6_pktopts *, struct ucred *, int);
 void	ip6_clearpktopts(struct ip6_pktopts *, int);
 struct ip6_pktopts *ip6_copypktopts(struct ip6_pktopts *, int);
 int	ip6_optlen(struct inpcb *);
 int	ip6_deletefraghdr(struct mbuf *, int, int);
 int	ip6_fragment(struct ifnet *, struct mbuf *, int, u_char, int,
 			uint32_t);
 
 int	route6_input(struct mbuf **, int *, int);
 
 void	frag6_set_bucketsize(void);
 void	frag6_init(void);
 int	frag6_input(struct mbuf **, int *, int);
 void	frag6_slowtimo(void);
 void	frag6_drain(void);
 
 void	rip6_init(void);
 int	rip6_input(struct mbuf **, int *, int);
 void	rip6_ctlinput(int, struct sockaddr *, void *);
 int	rip6_ctloutput(struct socket *, struct sockopt *);
 int	rip6_output(struct mbuf *, struct socket *, ...);
 int	rip6_usrreq(struct socket *,
 	    int, struct mbuf *, struct mbuf *, struct mbuf *, struct thread *);
 
 int	dest6_input(struct mbuf **, int *, int);
 int	none_input(struct mbuf **, int *, int);
 
 int	in6_selectsrc_socket(struct sockaddr_in6 *, struct ip6_pktopts *,
     struct inpcb *, struct ucred *, int, struct in6_addr *, int *);
 int	in6_selectsrc_addr(uint32_t, const struct in6_addr *,
     uint32_t, struct ifnet *, struct in6_addr *, int *);
 int in6_selectroute(struct sockaddr_in6 *, struct ip6_pktopts *,
 	struct ip6_moptions *, struct route_in6 *, struct ifnet **,
 	struct rtentry **);
 int	in6_selectroute_fib(struct sockaddr_in6 *, struct ip6_pktopts *,
 	    struct ip6_moptions *, struct route_in6 *, struct ifnet **,
 	    struct rtentry **, u_int);
 u_int32_t ip6_randomid(void);
 u_int32_t ip6_randomflowlabel(void);
 void in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset);
 #endif /* _KERNEL */
 
 #endif /* !_NETINET6_IP6_VAR_H_ */
Index: head/sys/netpfil/pf/if_pfsync.c
===================================================================
--- head/sys/netpfil/pf/if_pfsync.c	(revision 349368)
+++ head/sys/netpfil/pf/if_pfsync.c	(revision 349369)
@@ -1,2552 +1,2549 @@
 /*-
  * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND ISC)
  *
  * Copyright (c) 2002 Michael Shalayeff
  * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*-
  * Copyright (c) 2009 David Gwynne <dlg@openbsd.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
  * copyright notice and this permission notice appear in all copies.
  *
  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
 /*
  * $OpenBSD: if_pfsync.c,v 1.110 2009/02/24 05:39:19 dlg Exp $
  *
  * Revisions picked from OpenBSD after revision 1.110 import:
  * 1.119 - don't m_copydata() beyond the len of mbuf in pfsync_input()
  * 1.118, 1.124, 1.148, 1.149, 1.151, 1.171 - fixes to bulk updates
  * 1.120, 1.175 - use monotonic time_uptime
  * 1.122 - reduce number of updates for non-TCP sessions
  * 1.125, 1.127 - rewrite merge or stale processing
  * 1.128 - cleanups
  * 1.146 - bzero() mbuf before sparsely filling it with data
  * 1.170 - SIOCSIFMTU checks
  * 1.126, 1.142 - deferred packets processing
  * 1.173 - correct expire time processing
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_pf.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/endian.h>
 #include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/protosw.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 
 #include <net/bpf.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_clone.h>
 #include <net/if_types.h>
 #include <net/vnet.h>
 #include <net/pfvar.h>
 #include <net/if_pfsync.h>
 
 #include <netinet/if_ether.h>
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_carp.h>
 #include <netinet/ip_var.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 
 #define PFSYNC_MINPKT ( \
 	sizeof(struct ip) + \
 	sizeof(struct pfsync_header) + \
 	sizeof(struct pfsync_subheader) )
 
 struct pfsync_bucket;
 
 struct pfsync_pkt {
 	struct ip *ip;
 	struct in_addr src;
 	u_int8_t flags;
 };
 
 static int	pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *,
 		    struct pfsync_state_peer *);
 static int	pfsync_in_clr(struct pfsync_pkt *, struct mbuf *, int, int);
 static int	pfsync_in_ins(struct pfsync_pkt *, struct mbuf *, int, int);
 static int	pfsync_in_iack(struct pfsync_pkt *, struct mbuf *, int, int);
 static int	pfsync_in_upd(struct pfsync_pkt *, struct mbuf *, int, int);
 static int	pfsync_in_upd_c(struct pfsync_pkt *, struct mbuf *, int, int);
 static int	pfsync_in_ureq(struct pfsync_pkt *, struct mbuf *, int, int);
 static int	pfsync_in_del(struct pfsync_pkt *, struct mbuf *, int, int);
 static int	pfsync_in_del_c(struct pfsync_pkt *, struct mbuf *, int, int);
 static int	pfsync_in_bus(struct pfsync_pkt *, struct mbuf *, int, int);
 static int	pfsync_in_tdb(struct pfsync_pkt *, struct mbuf *, int, int);
 static int	pfsync_in_eof(struct pfsync_pkt *, struct mbuf *, int, int);
 static int	pfsync_in_error(struct pfsync_pkt *, struct mbuf *, int, int);
 
 static int (*pfsync_acts[])(struct pfsync_pkt *, struct mbuf *, int, int) = {
 	pfsync_in_clr,			/* PFSYNC_ACT_CLR */
 	pfsync_in_ins,			/* PFSYNC_ACT_INS */
 	pfsync_in_iack,			/* PFSYNC_ACT_INS_ACK */
 	pfsync_in_upd,			/* PFSYNC_ACT_UPD */
 	pfsync_in_upd_c,		/* PFSYNC_ACT_UPD_C */
 	pfsync_in_ureq,			/* PFSYNC_ACT_UPD_REQ */
 	pfsync_in_del,			/* PFSYNC_ACT_DEL */
 	pfsync_in_del_c,		/* PFSYNC_ACT_DEL_C */
 	pfsync_in_error,		/* PFSYNC_ACT_INS_F */
 	pfsync_in_error,		/* PFSYNC_ACT_DEL_F */
 	pfsync_in_bus,			/* PFSYNC_ACT_BUS */
 	pfsync_in_tdb,			/* PFSYNC_ACT_TDB */
 	pfsync_in_eof			/* PFSYNC_ACT_EOF */
 };
 
 struct pfsync_q {
 	void		(*write)(struct pf_state *, void *);
 	size_t		len;
 	u_int8_t	action;
 };
 
 /* we have one of these for every PFSYNC_S_ */
 static void	pfsync_out_state(struct pf_state *, void *);
 static void	pfsync_out_iack(struct pf_state *, void *);
 static void	pfsync_out_upd_c(struct pf_state *, void *);
 static void	pfsync_out_del(struct pf_state *, void *);
 
 static struct pfsync_q pfsync_qs[] = {
 	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
 	{ pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
 	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD },
 	{ pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
 	{ pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C }
 };
 
 static void	pfsync_q_ins(struct pf_state *, int, bool);
 static void	pfsync_q_del(struct pf_state *, bool, struct pfsync_bucket *);
 
 static void	pfsync_update_state(struct pf_state *);
 
 struct pfsync_upd_req_item {
 	TAILQ_ENTRY(pfsync_upd_req_item)	ur_entry;
 	struct pfsync_upd_req			ur_msg;
 };
 
 struct pfsync_deferral {
 	struct pfsync_softc		*pd_sc;
 	TAILQ_ENTRY(pfsync_deferral)	pd_entry;
 	u_int				pd_refs;
 	struct callout			pd_tmo;
 
 	struct pf_state			*pd_st;
 	struct mbuf			*pd_m;
 };
 
 struct pfsync_sofct;
 
 struct pfsync_bucket
 {
 	int			b_id;
 	struct pfsync_softc	*b_sc;
 	struct mtx		b_mtx;
 	struct callout		b_tmo;
 	int			b_flags;
 #define	PFSYNCF_BUCKET_PUSH	0x00000001
 
 	size_t			b_len;
 	TAILQ_HEAD(, pf_state)			b_qs[PFSYNC_S_COUNT];
 	TAILQ_HEAD(, pfsync_upd_req_item)	b_upd_req_list;
 	TAILQ_HEAD(, pfsync_deferral)		b_deferrals;
 	u_int			b_deferred;
 	void			*b_plus;
 	size_t			b_pluslen;
 
 	struct  ifaltq b_snd;
 };
 
 struct pfsync_softc {
 	/* Configuration */
 	struct ifnet		*sc_ifp;
 	struct ifnet		*sc_sync_if;
 	struct ip_moptions	sc_imo;
 	struct in_addr		sc_sync_peer;
 	uint32_t		sc_flags;
 #define	PFSYNCF_OK		0x00000001
 #define	PFSYNCF_DEFER		0x00000002
 	uint8_t			sc_maxupdates;
 	struct ip		sc_template;
 	struct mtx		sc_mtx;
 
 	/* Queued data */
 	struct pfsync_bucket	*sc_buckets;
 
 	/* Bulk update info */
 	struct mtx		sc_bulk_mtx;
 	uint32_t		sc_ureq_sent;
 	int			sc_bulk_tries;
 	uint32_t		sc_ureq_received;
 	int			sc_bulk_hashid;
 	uint64_t		sc_bulk_stateid;
 	uint32_t		sc_bulk_creatorid;
 	struct callout		sc_bulk_tmo;
 	struct callout		sc_bulkfail_tmo;
 };
 
 #define	PFSYNC_LOCK(sc)		mtx_lock(&(sc)->sc_mtx)
 #define	PFSYNC_UNLOCK(sc)	mtx_unlock(&(sc)->sc_mtx)
 #define	PFSYNC_LOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_mtx, MA_OWNED)
 
 #define PFSYNC_BUCKET_LOCK(b)		mtx_lock(&(b)->b_mtx)
 #define PFSYNC_BUCKET_UNLOCK(b)		mtx_unlock(&(b)->b_mtx)
 #define PFSYNC_BUCKET_LOCK_ASSERT(b)	mtx_assert(&(b)->b_mtx, MA_OWNED)
 
 #define	PFSYNC_BLOCK(sc)	mtx_lock(&(sc)->sc_bulk_mtx)
 #define	PFSYNC_BUNLOCK(sc)	mtx_unlock(&(sc)->sc_bulk_mtx)
 #define	PFSYNC_BLOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_bulk_mtx, MA_OWNED)
 
 static const char pfsyncname[] = "pfsync";
 static MALLOC_DEFINE(M_PFSYNC, pfsyncname, "pfsync(4) data");
 VNET_DEFINE_STATIC(struct pfsync_softc	*, pfsyncif) = NULL;
 #define	V_pfsyncif		VNET(pfsyncif)
 VNET_DEFINE_STATIC(void *, pfsync_swi_cookie) = NULL;
 #define	V_pfsync_swi_cookie	VNET(pfsync_swi_cookie)
 VNET_DEFINE_STATIC(struct pfsyncstats, pfsyncstats);
 #define	V_pfsyncstats		VNET(pfsyncstats)
 VNET_DEFINE_STATIC(int, pfsync_carp_adj) = CARP_MAXSKEW;
 #define	V_pfsync_carp_adj	VNET(pfsync_carp_adj)
 
 static void	pfsync_timeout(void *);
 static void	pfsync_push(struct pfsync_bucket *);
 static void	pfsync_push_all(struct pfsync_softc *);
 static void	pfsyncintr(void *);
 static int	pfsync_multicast_setup(struct pfsync_softc *, struct ifnet *,
-		    void *);
+		    struct in_mfilter *imf);
 static void	pfsync_multicast_cleanup(struct pfsync_softc *);
 static void	pfsync_pointers_init(void);
 static void	pfsync_pointers_uninit(void);
 static int	pfsync_init(void);
 static void	pfsync_uninit(void);
 
 static unsigned long pfsync_buckets;
 
 SYSCTL_NODE(_net, OID_AUTO, pfsync, CTLFLAG_RW, 0, "PFSYNC");
 SYSCTL_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(pfsyncstats), pfsyncstats,
     "PFSYNC statistics (struct pfsyncstats, net/if_pfsync.h)");
 SYSCTL_INT(_net_pfsync, OID_AUTO, carp_demotion_factor, CTLFLAG_RW,
     &VNET_NAME(pfsync_carp_adj), 0, "pfsync's CARP demotion factor adjustment");
 SYSCTL_ULONG(_net_pfsync, OID_AUTO, pfsync_buckets, CTLFLAG_RDTUN,
     &pfsync_buckets, 0, "Number of pfsync hash buckets");
 
 static int	pfsync_clone_create(struct if_clone *, int, caddr_t);
 static void	pfsync_clone_destroy(struct ifnet *);
 static int	pfsync_alloc_scrub_memory(struct pfsync_state_peer *,
 		    struct pf_state_peer *);
 static int	pfsyncoutput(struct ifnet *, struct mbuf *,
 		    const struct sockaddr *, struct route *);
 static int	pfsyncioctl(struct ifnet *, u_long, caddr_t);
 
 static int	pfsync_defer(struct pf_state *, struct mbuf *);
 static void	pfsync_undefer(struct pfsync_deferral *, int);
 static void	pfsync_undefer_state(struct pf_state *, int);
 static void	pfsync_defer_tmo(void *);
 
 static void	pfsync_request_update(u_int32_t, u_int64_t);
 static bool	pfsync_update_state_req(struct pf_state *);
 
 static void	pfsync_drop(struct pfsync_softc *);
 static void	pfsync_sendout(int, int);
 static void	pfsync_send_plus(void *, size_t);
 
 static void	pfsync_bulk_start(void);
 static void	pfsync_bulk_status(u_int8_t);
 static void	pfsync_bulk_update(void *);
 static void	pfsync_bulk_fail(void *);
 
 static void	pfsync_detach_ifnet(struct ifnet *);
 #ifdef IPSEC
 static void	pfsync_update_net_tdb(struct pfsync_tdb *);
 #endif
 static struct pfsync_bucket	*pfsync_get_bucket(struct pfsync_softc *,
 		    struct pf_state *);
 
 
 #define PFSYNC_MAX_BULKTRIES	12
 
 VNET_DEFINE(struct if_clone *, pfsync_cloner);
 #define	V_pfsync_cloner	VNET(pfsync_cloner)
 
 static int
 pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param)
 {
 	struct pfsync_softc *sc;
 	struct ifnet *ifp;
 	struct pfsync_bucket *b;
 	int c, q;
 
 	if (unit != 0)
 		return (EINVAL);
 
 	if (! pfsync_buckets)
 		pfsync_buckets = mp_ncpus * 2;
 
 	sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO);
 	sc->sc_flags |= PFSYNCF_OK;
 	sc->sc_maxupdates = 128;
 
 	ifp = sc->sc_ifp = if_alloc(IFT_PFSYNC);
 	if (ifp == NULL) {
 		free(sc, M_PFSYNC);
 		return (ENOSPC);
 	}
 	if_initname(ifp, pfsyncname, unit);
 	ifp->if_softc = sc;
 	ifp->if_ioctl = pfsyncioctl;
 	ifp->if_output = pfsyncoutput;
 	ifp->if_type = IFT_PFSYNC;
 	ifp->if_hdrlen = sizeof(struct pfsync_header);
 	ifp->if_mtu = ETHERMTU;
 	mtx_init(&sc->sc_mtx, pfsyncname, NULL, MTX_DEF);
 	mtx_init(&sc->sc_bulk_mtx, "pfsync bulk", NULL, MTX_DEF);
 	callout_init_mtx(&sc->sc_bulk_tmo, &sc->sc_bulk_mtx, 0);
 	callout_init_mtx(&sc->sc_bulkfail_tmo, &sc->sc_bulk_mtx, 0);
 
 	if_attach(ifp);
 
 	bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
 
 	sc->sc_buckets = mallocarray(pfsync_buckets, sizeof(*sc->sc_buckets),
 	    M_PFSYNC, M_ZERO | M_WAITOK);
 	for (c = 0; c < pfsync_buckets; c++) {
 		b = &sc->sc_buckets[c];
 		mtx_init(&b->b_mtx, "pfsync bucket", NULL, MTX_DEF);
 
 		b->b_id = c;
 		b->b_sc = sc;
 		b->b_len = PFSYNC_MINPKT;
 
 		for (q = 0; q < PFSYNC_S_COUNT; q++)
 			TAILQ_INIT(&b->b_qs[q]);
 
 		TAILQ_INIT(&b->b_upd_req_list);
 		TAILQ_INIT(&b->b_deferrals);
 
 		callout_init(&b->b_tmo, 1);
 
 		b->b_snd.ifq_maxlen = ifqmaxlen;
 	}
 
 	V_pfsyncif = sc;
 
 	return (0);
 }
 
 static void
 pfsync_clone_destroy(struct ifnet *ifp)
 {
 	struct pfsync_softc *sc = ifp->if_softc;
 	struct pfsync_bucket *b;
 	int c;
 
 	for (c = 0; c < pfsync_buckets; c++) {
 		b = &sc->sc_buckets[c];
 		/*
 		 * At this stage, everything should have already been
 		 * cleared by pfsync_uninit(), and we have only to
 		 * drain callouts.
 		 */
 		while (b->b_deferred > 0) {
 			struct pfsync_deferral *pd =
 			    TAILQ_FIRST(&b->b_deferrals);
 
 			TAILQ_REMOVE(&b->b_deferrals, pd, pd_entry);
 			b->b_deferred--;
 			if (callout_stop(&pd->pd_tmo) > 0) {
 				pf_release_state(pd->pd_st);
 				m_freem(pd->pd_m);
 				free(pd, M_PFSYNC);
 			} else {
 				pd->pd_refs++;
 				callout_drain(&pd->pd_tmo);
 				free(pd, M_PFSYNC);
 			}
 		}
 
 		callout_drain(&b->b_tmo);
 	}
 
 	callout_drain(&sc->sc_bulkfail_tmo);
 	callout_drain(&sc->sc_bulk_tmo);
 
 	if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
 		(*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync destroy");
 	bpfdetach(ifp);
 	if_detach(ifp);
 
 	pfsync_drop(sc);
 
 	if_free(ifp);
-	if (sc->sc_imo.imo_membership)
-		pfsync_multicast_cleanup(sc);
+	pfsync_multicast_cleanup(sc);
 	mtx_destroy(&sc->sc_mtx);
 	mtx_destroy(&sc->sc_bulk_mtx);
 
 	free(sc->sc_buckets, M_PFSYNC);
 	free(sc, M_PFSYNC);
 
 	V_pfsyncif = NULL;
 }
 
 static int
 pfsync_alloc_scrub_memory(struct pfsync_state_peer *s,
     struct pf_state_peer *d)
 {
 	if (s->scrub.scrub_flag && d->scrub == NULL) {
 		d->scrub = uma_zalloc(V_pf_state_scrub_z, M_NOWAIT | M_ZERO);
 		if (d->scrub == NULL)
 			return (ENOMEM);
 	}
 
 	return (0);
 }
 
 
 static int
 pfsync_state_import(struct pfsync_state *sp, u_int8_t flags)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 #ifndef	__NO_STRICT_ALIGNMENT
 	struct pfsync_state_key key[2];
 #endif
 	struct pfsync_state_key *kw, *ks;
 	struct pf_state	*st = NULL;
 	struct pf_state_key *skw = NULL, *sks = NULL;
 	struct pf_rule *r = NULL;
 	struct pfi_kif	*kif;
 	int error;
 
 	PF_RULES_RASSERT();
 
 	if (sp->creatorid == 0) {
 		if (V_pf_status.debug >= PF_DEBUG_MISC)
 			printf("%s: invalid creator id: %08x\n", __func__,
 			    ntohl(sp->creatorid));
 		return (EINVAL);
 	}
 
 	if ((kif = pfi_kif_find(sp->ifname)) == NULL) {
 		if (V_pf_status.debug >= PF_DEBUG_MISC)
 			printf("%s: unknown interface: %s\n", __func__,
 			    sp->ifname);
 		if (flags & PFSYNC_SI_IOCTL)
 			return (EINVAL);
 		return (0);	/* skip this state */
 	}
 
 	/*
 	 * If the ruleset checksums match or the state is coming from the ioctl,
 	 * it's safe to associate the state with the rule of that number.
 	 */
 	if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) &&
 	    (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->rule) <
 	    pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount)
 		r = pf_main_ruleset.rules[
 		    PF_RULESET_FILTER].active.ptr_array[ntohl(sp->rule)];
 	else
 		r = &V_pf_default_rule;
 
 	if ((r->max_states &&
 	    counter_u64_fetch(r->states_cur) >= r->max_states))
 		goto cleanup;
 
 	/*
 	 * XXXGL: consider M_WAITOK in ioctl path after.
 	 */
 	if ((st = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO)) == NULL)
 		goto cleanup;
 
 	if ((skw = uma_zalloc(V_pf_state_key_z, M_NOWAIT)) == NULL)
 		goto cleanup;
 
 #ifndef	__NO_STRICT_ALIGNMENT
 	bcopy(&sp->key, key, sizeof(struct pfsync_state_key) * 2);
 	kw = &key[PF_SK_WIRE];
 	ks = &key[PF_SK_STACK];
 #else
 	kw = &sp->key[PF_SK_WIRE];
 	ks = &sp->key[PF_SK_STACK];
 #endif
 
 	if (PF_ANEQ(&kw->addr[0], &ks->addr[0], sp->af) ||
 	    PF_ANEQ(&kw->addr[1], &ks->addr[1], sp->af) ||
 	    kw->port[0] != ks->port[0] ||
 	    kw->port[1] != ks->port[1]) {
 		sks = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
 		if (sks == NULL)
 			goto cleanup;
 	} else
 		sks = skw;
 
 	/* allocate memory for scrub info */
 	if (pfsync_alloc_scrub_memory(&sp->src, &st->src) ||
 	    pfsync_alloc_scrub_memory(&sp->dst, &st->dst))
 		goto cleanup;
 
 	/* Copy to state key(s). */
 	skw->addr[0] = kw->addr[0];
 	skw->addr[1] = kw->addr[1];
 	skw->port[0] = kw->port[0];
 	skw->port[1] = kw->port[1];
 	skw->proto = sp->proto;
 	skw->af = sp->af;
 	if (sks != skw) {
 		sks->addr[0] = ks->addr[0];
 		sks->addr[1] = ks->addr[1];
 		sks->port[0] = ks->port[0];
 		sks->port[1] = ks->port[1];
 		sks->proto = sp->proto;
 		sks->af = sp->af;
 	}
 
 	/* copy to state */
 	bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr));
 	st->creation = time_uptime - ntohl(sp->creation);
 	st->expire = time_uptime;
 	if (sp->expire) {
 		uint32_t timeout;
 
 		timeout = r->timeout[sp->timeout];
 		if (!timeout)
 			timeout = V_pf_default_rule.timeout[sp->timeout];
 
 		/* sp->expire may have been adaptively scaled by export. */
 		st->expire -= timeout - ntohl(sp->expire);
 	}
 
 	st->direction = sp->direction;
 	st->log = sp->log;
 	st->timeout = sp->timeout;
 	st->state_flags = sp->state_flags;
 
 	st->id = sp->id;
 	st->creatorid = sp->creatorid;
 	pf_state_peer_ntoh(&sp->src, &st->src);
 	pf_state_peer_ntoh(&sp->dst, &st->dst);
 
 	st->rule.ptr = r;
 	st->nat_rule.ptr = NULL;
 	st->anchor.ptr = NULL;
 	st->rt_kif = NULL;
 
 	st->pfsync_time = time_uptime;
 	st->sync_state = PFSYNC_S_NONE;
 
 	if (!(flags & PFSYNC_SI_IOCTL))
 		st->state_flags |= PFSTATE_NOSYNC;
 
 	if ((error = pf_state_insert(kif, skw, sks, st)) != 0)
 		goto cleanup_state;
 
 	/* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */
 	counter_u64_add(r->states_cur, 1);
 	counter_u64_add(r->states_tot, 1);
 
 	if (!(flags & PFSYNC_SI_IOCTL)) {
 		st->state_flags &= ~PFSTATE_NOSYNC;
 		if (st->state_flags & PFSTATE_ACK) {
 			pfsync_q_ins(st, PFSYNC_S_IACK, true);
 			pfsync_push_all(sc);
 		}
 	}
 	st->state_flags &= ~PFSTATE_ACK;
 	PF_STATE_UNLOCK(st);
 
 	return (0);
 
 cleanup:
 	error = ENOMEM;
 	if (skw == sks)
 		sks = NULL;
 	if (skw != NULL)
 		uma_zfree(V_pf_state_key_z, skw);
 	if (sks != NULL)
 		uma_zfree(V_pf_state_key_z, sks);
 
 cleanup_state:	/* pf_state_insert() frees the state keys. */
 	if (st) {
 		if (st->dst.scrub)
 			uma_zfree(V_pf_state_scrub_z, st->dst.scrub);
 		if (st->src.scrub)
 			uma_zfree(V_pf_state_scrub_z, st->src.scrub);
 		uma_zfree(V_pf_state_z, st);
 	}
 	return (error);
 }
 
 static int
 pfsync_input(struct mbuf **mp, int *offp __unused, int proto __unused)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_pkt pkt;
 	struct mbuf *m = *mp;
 	struct ip *ip = mtod(m, struct ip *);
 	struct pfsync_header *ph;
 	struct pfsync_subheader subh;
 
 	int offset, len;
 	int rv;
 	uint16_t count;
 
 	PF_RULES_RLOCK_TRACKER;
 
 	*mp = NULL;
 	V_pfsyncstats.pfsyncs_ipackets++;
 
 	/* Verify that we have a sync interface configured. */
 	if (!sc || !sc->sc_sync_if || !V_pf_status.running ||
 	    (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 		goto done;
 
 	/* verify that the packet came in on the right interface */
 	if (sc->sc_sync_if != m->m_pkthdr.rcvif) {
 		V_pfsyncstats.pfsyncs_badif++;
 		goto done;
 	}
 
 	if_inc_counter(sc->sc_ifp, IFCOUNTER_IPACKETS, 1);
 	if_inc_counter(sc->sc_ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 	/* verify that the IP TTL is 255. */
 	if (ip->ip_ttl != PFSYNC_DFLTTL) {
 		V_pfsyncstats.pfsyncs_badttl++;
 		goto done;
 	}
 
 	offset = ip->ip_hl << 2;
 	if (m->m_pkthdr.len < offset + sizeof(*ph)) {
 		V_pfsyncstats.pfsyncs_hdrops++;
 		goto done;
 	}
 
 	if (offset + sizeof(*ph) > m->m_len) {
 		if (m_pullup(m, offset + sizeof(*ph)) == NULL) {
 			V_pfsyncstats.pfsyncs_hdrops++;
 			return (IPPROTO_DONE);
 		}
 		ip = mtod(m, struct ip *);
 	}
 	ph = (struct pfsync_header *)((char *)ip + offset);
 
 	/* verify the version */
 	if (ph->version != PFSYNC_VERSION) {
 		V_pfsyncstats.pfsyncs_badver++;
 		goto done;
 	}
 
 	len = ntohs(ph->len) + offset;
 	if (m->m_pkthdr.len < len) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		goto done;
 	}
 
 	/* Cheaper to grab this now than having to mess with mbufs later */
 	pkt.ip = ip;
 	pkt.src = ip->ip_src;
 	pkt.flags = 0;
 
 	/*
 	 * Trusting pf_chksum during packet processing, as well as seeking
 	 * in interface name tree, require holding PF_RULES_RLOCK().
 	 */
 	PF_RULES_RLOCK();
 	if (!bcmp(&ph->pfcksum, &V_pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
 		pkt.flags |= PFSYNC_SI_CKSUM;
 
 	offset += sizeof(*ph);
 	while (offset <= len - sizeof(subh)) {
 		m_copydata(m, offset, sizeof(subh), (caddr_t)&subh);
 		offset += sizeof(subh);
 
 		if (subh.action >= PFSYNC_ACT_MAX) {
 			V_pfsyncstats.pfsyncs_badact++;
 			PF_RULES_RUNLOCK();
 			goto done;
 		}
 
 		count = ntohs(subh.count);
 		V_pfsyncstats.pfsyncs_iacts[subh.action] += count;
 		rv = (*pfsync_acts[subh.action])(&pkt, m, offset, count);
 		if (rv == -1) {
 			PF_RULES_RUNLOCK();
 			return (IPPROTO_DONE);
 		}
 
 		offset += rv;
 	}
 	PF_RULES_RUNLOCK();
 
 done:
 	m_freem(m);
 	return (IPPROTO_DONE);
 }
 
 static int
 pfsync_in_clr(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	struct pfsync_clr *clr;
 	struct mbuf *mp;
 	int len = sizeof(*clr) * count;
 	int i, offp;
 	u_int32_t creatorid;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	clr = (struct pfsync_clr *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++) {
 		creatorid = clr[i].creatorid;
 
 		if (clr[i].ifname[0] != '\0' &&
 		    pfi_kif_find(clr[i].ifname) == NULL)
 			continue;
 
 		for (int i = 0; i <= pf_hashmask; i++) {
 			struct pf_idhash *ih = &V_pf_idhash[i];
 			struct pf_state *s;
 relock:
 			PF_HASHROW_LOCK(ih);
 			LIST_FOREACH(s, &ih->states, entry) {
 				if (s->creatorid == creatorid) {
 					s->state_flags |= PFSTATE_NOSYNC;
 					pf_unlink_state(s, PF_ENTER_LOCKED);
 					goto relock;
 				}
 			}
 			PF_HASHROW_UNLOCK(ih);
 		}
 	}
 
 	return (len);
 }
 
 static int
 pfsync_in_ins(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	struct mbuf *mp;
 	struct pfsync_state *sa, *sp;
 	int len = sizeof(*sp) * count;
 	int i, offp;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	sa = (struct pfsync_state *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++) {
 		sp = &sa[i];
 
 		/* Check for invalid values. */
 		if (sp->timeout >= PFTM_MAX ||
 		    sp->src.state > PF_TCPS_PROXY_DST ||
 		    sp->dst.state > PF_TCPS_PROXY_DST ||
 		    sp->direction > PF_OUT ||
 		    (sp->af != AF_INET && sp->af != AF_INET6)) {
 			if (V_pf_status.debug >= PF_DEBUG_MISC)
 				printf("%s: invalid value\n", __func__);
 			V_pfsyncstats.pfsyncs_badval++;
 			continue;
 		}
 
 		if (pfsync_state_import(sp, pkt->flags) == ENOMEM)
 			/* Drop out, but process the rest of the actions. */
 			break;
 	}
 
 	return (len);
 }
 
 static int
 pfsync_in_iack(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	struct pfsync_ins_ack *ia, *iaa;
 	struct pf_state *st;
 
 	struct mbuf *mp;
 	int len = count * sizeof(*ia);
 	int offp, i;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	iaa = (struct pfsync_ins_ack *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++) {
 		ia = &iaa[i];
 
 		st = pf_find_state_byid(ia->id, ia->creatorid);
 		if (st == NULL)
 			continue;
 
 		if (st->state_flags & PFSTATE_ACK) {
 			pfsync_undefer_state(st, 0);
 		}
 		PF_STATE_UNLOCK(st);
 	}
 	/*
 	 * XXX this is not yet implemented, but we know the size of the
 	 * message so we can skip it.
 	 */
 
 	return (count * sizeof(struct pfsync_ins_ack));
 }
 
 static int
 pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src,
     struct pfsync_state_peer *dst)
 {
 	int sync = 0;
 
 	PF_STATE_LOCK_ASSERT(st);
 
 	/*
 	 * The state should never go backwards except
 	 * for syn-proxy states.  Neither should the
 	 * sequence window slide backwards.
 	 */
 	if ((st->src.state > src->state &&
 	    (st->src.state < PF_TCPS_PROXY_SRC ||
 	    src->state >= PF_TCPS_PROXY_SRC)) ||
 
 	    (st->src.state == src->state &&
 	    SEQ_GT(st->src.seqlo, ntohl(src->seqlo))))
 		sync++;
 	else
 		pf_state_peer_ntoh(src, &st->src);
 
 	if ((st->dst.state > dst->state) ||
 
 	    (st->dst.state >= TCPS_SYN_SENT &&
 	    SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo))))
 		sync++;
 	else
 		pf_state_peer_ntoh(dst, &st->dst);
 
 	return (sync);
 }
 
 static int
 pfsync_in_upd(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_state *sa, *sp;
 	struct pf_state *st;
 	int sync;
 
 	struct mbuf *mp;
 	int len = count * sizeof(*sp);
 	int offp, i;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	sa = (struct pfsync_state *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++) {
 		sp = &sa[i];
 
 		/* check for invalid values */
 		if (sp->timeout >= PFTM_MAX ||
 		    sp->src.state > PF_TCPS_PROXY_DST ||
 		    sp->dst.state > PF_TCPS_PROXY_DST) {
 			if (V_pf_status.debug >= PF_DEBUG_MISC) {
 				printf("pfsync_input: PFSYNC_ACT_UPD: "
 				    "invalid value\n");
 			}
 			V_pfsyncstats.pfsyncs_badval++;
 			continue;
 		}
 
 		st = pf_find_state_byid(sp->id, sp->creatorid);
 		if (st == NULL) {
 			/* insert the update */
 			if (pfsync_state_import(sp, pkt->flags))
 				V_pfsyncstats.pfsyncs_badstate++;
 			continue;
 		}
 
 		if (st->state_flags & PFSTATE_ACK) {
 			pfsync_undefer_state(st, 1);
 		}
 
 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
 			sync = pfsync_upd_tcp(st, &sp->src, &sp->dst);
 		else {
 			sync = 0;
 
 			/*
 			 * Non-TCP protocol state machine always go
 			 * forwards
 			 */
 			if (st->src.state > sp->src.state)
 				sync++;
 			else
 				pf_state_peer_ntoh(&sp->src, &st->src);
 			if (st->dst.state > sp->dst.state)
 				sync++;
 			else
 				pf_state_peer_ntoh(&sp->dst, &st->dst);
 		}
 		if (sync < 2) {
 			pfsync_alloc_scrub_memory(&sp->dst, &st->dst);
 			pf_state_peer_ntoh(&sp->dst, &st->dst);
 			st->expire = time_uptime;
 			st->timeout = sp->timeout;
 		}
 		st->pfsync_time = time_uptime;
 
 		if (sync) {
 			V_pfsyncstats.pfsyncs_stale++;
 
 			pfsync_update_state(st);
 			PF_STATE_UNLOCK(st);
 			pfsync_push_all(sc);
 			continue;
 		}
 		PF_STATE_UNLOCK(st);
 	}
 
 	return (len);
 }
 
 static int
 pfsync_in_upd_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_upd_c *ua, *up;
 	struct pf_state *st;
 	int len = count * sizeof(*up);
 	int sync;
 	struct mbuf *mp;
 	int offp, i;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	ua = (struct pfsync_upd_c *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++) {
 		up = &ua[i];
 
 		/* check for invalid values */
 		if (up->timeout >= PFTM_MAX ||
 		    up->src.state > PF_TCPS_PROXY_DST ||
 		    up->dst.state > PF_TCPS_PROXY_DST) {
 			if (V_pf_status.debug >= PF_DEBUG_MISC) {
 				printf("pfsync_input: "
 				    "PFSYNC_ACT_UPD_C: "
 				    "invalid value\n");
 			}
 			V_pfsyncstats.pfsyncs_badval++;
 			continue;
 		}
 
 		st = pf_find_state_byid(up->id, up->creatorid);
 		if (st == NULL) {
 			/* We don't have this state. Ask for it. */
 			PFSYNC_BUCKET_LOCK(&sc->sc_buckets[0]);
 			pfsync_request_update(up->creatorid, up->id);
 			PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[0]);
 			continue;
 		}
 
 		if (st->state_flags & PFSTATE_ACK) {
 			pfsync_undefer_state(st, 1);
 		}
 
 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
 			sync = pfsync_upd_tcp(st, &up->src, &up->dst);
 		else {
 			sync = 0;
 
 			/*
 			 * Non-TCP protocol state machine always go
 			 * forwards
 			 */
 			if (st->src.state > up->src.state)
 				sync++;
 			else
 				pf_state_peer_ntoh(&up->src, &st->src);
 			if (st->dst.state > up->dst.state)
 				sync++;
 			else
 				pf_state_peer_ntoh(&up->dst, &st->dst);
 		}
 		if (sync < 2) {
 			pfsync_alloc_scrub_memory(&up->dst, &st->dst);
 			pf_state_peer_ntoh(&up->dst, &st->dst);
 			st->expire = time_uptime;
 			st->timeout = up->timeout;
 		}
 		st->pfsync_time = time_uptime;
 
 		if (sync) {
 			V_pfsyncstats.pfsyncs_stale++;
 
 			pfsync_update_state(st);
 			PF_STATE_UNLOCK(st);
 			pfsync_push_all(sc);
 			continue;
 		}
 		PF_STATE_UNLOCK(st);
 	}
 
 	return (len);
 }
 
 static int
 pfsync_in_ureq(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	struct pfsync_upd_req *ur, *ura;
 	struct mbuf *mp;
 	int len = count * sizeof(*ur);
 	int i, offp;
 
 	struct pf_state *st;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	ura = (struct pfsync_upd_req *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++) {
 		ur = &ura[i];
 
 		if (ur->id == 0 && ur->creatorid == 0)
 			pfsync_bulk_start();
 		else {
 			st = pf_find_state_byid(ur->id, ur->creatorid);
 			if (st == NULL) {
 				V_pfsyncstats.pfsyncs_badstate++;
 				continue;
 			}
 			if (st->state_flags & PFSTATE_NOSYNC) {
 				PF_STATE_UNLOCK(st);
 				continue;
 			}
 
 			pfsync_update_state_req(st);
 			PF_STATE_UNLOCK(st);
 		}
 	}
 
 	return (len);
 }
 
 static int
 pfsync_in_del(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	struct mbuf *mp;
 	struct pfsync_state *sa, *sp;
 	struct pf_state *st;
 	int len = count * sizeof(*sp);
 	int offp, i;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	sa = (struct pfsync_state *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++) {
 		sp = &sa[i];
 
 		st = pf_find_state_byid(sp->id, sp->creatorid);
 		if (st == NULL) {
 			V_pfsyncstats.pfsyncs_badstate++;
 			continue;
 		}
 		st->state_flags |= PFSTATE_NOSYNC;
 		pf_unlink_state(st, PF_ENTER_LOCKED);
 	}
 
 	return (len);
 }
 
 static int
 pfsync_in_del_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	struct mbuf *mp;
 	struct pfsync_del_c *sa, *sp;
 	struct pf_state *st;
 	int len = count * sizeof(*sp);
 	int offp, i;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	sa = (struct pfsync_del_c *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++) {
 		sp = &sa[i];
 
 		st = pf_find_state_byid(sp->id, sp->creatorid);
 		if (st == NULL) {
 			V_pfsyncstats.pfsyncs_badstate++;
 			continue;
 		}
 
 		st->state_flags |= PFSTATE_NOSYNC;
 		pf_unlink_state(st, PF_ENTER_LOCKED);
 	}
 
 	return (len);
 }
 
 static int
 pfsync_in_bus(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_bus *bus;
 	struct mbuf *mp;
 	int len = count * sizeof(*bus);
 	int offp;
 
 	PFSYNC_BLOCK(sc);
 
 	/* If we're not waiting for a bulk update, who cares. */
 	if (sc->sc_ureq_sent == 0) {
 		PFSYNC_BUNLOCK(sc);
 		return (len);
 	}
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		PFSYNC_BUNLOCK(sc);
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	bus = (struct pfsync_bus *)(mp->m_data + offp);
 
 	switch (bus->status) {
 	case PFSYNC_BUS_START:
 		callout_reset(&sc->sc_bulkfail_tmo, 4 * hz +
 		    V_pf_limits[PF_LIMIT_STATES].limit /
 		    ((sc->sc_ifp->if_mtu - PFSYNC_MINPKT) /
 		    sizeof(struct pfsync_state)),
 		    pfsync_bulk_fail, sc);
 		if (V_pf_status.debug >= PF_DEBUG_MISC)
 			printf("pfsync: received bulk update start\n");
 		break;
 
 	case PFSYNC_BUS_END:
 		if (time_uptime - ntohl(bus->endtime) >=
 		    sc->sc_ureq_sent) {
 			/* that's it, we're happy */
 			sc->sc_ureq_sent = 0;
 			sc->sc_bulk_tries = 0;
 			callout_stop(&sc->sc_bulkfail_tmo);
 			if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
 				(*carp_demote_adj_p)(-V_pfsync_carp_adj,
 				    "pfsync bulk done");
 			sc->sc_flags |= PFSYNCF_OK;
 			if (V_pf_status.debug >= PF_DEBUG_MISC)
 				printf("pfsync: received valid "
 				    "bulk update end\n");
 		} else {
 			if (V_pf_status.debug >= PF_DEBUG_MISC)
 				printf("pfsync: received invalid "
 				    "bulk update end: bad timestamp\n");
 		}
 		break;
 	}
 	PFSYNC_BUNLOCK(sc);
 
 	return (len);
 }
 
 static int
 pfsync_in_tdb(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	int len = count * sizeof(struct pfsync_tdb);
 
 #if defined(IPSEC)
 	struct pfsync_tdb *tp;
 	struct mbuf *mp;
 	int offp;
 	int i;
 	int s;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	tp = (struct pfsync_tdb *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++)
 		pfsync_update_net_tdb(&tp[i]);
 #endif
 
 	return (len);
 }
 
 #if defined(IPSEC)
 /* Update an in-kernel tdb. Silently fail if no tdb is found. */
 static void
 pfsync_update_net_tdb(struct pfsync_tdb *pt)
 {
 	struct tdb		*tdb;
 	int			 s;
 
 	/* check for invalid values */
 	if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
 	    (pt->dst.sa.sa_family != AF_INET &&
 	    pt->dst.sa.sa_family != AF_INET6))
 		goto bad;
 
 	tdb = gettdb(pt->spi, &pt->dst, pt->sproto);
 	if (tdb) {
 		pt->rpl = ntohl(pt->rpl);
 		pt->cur_bytes = (unsigned long long)be64toh(pt->cur_bytes);
 
 		/* Neither replay nor byte counter should ever decrease. */
 		if (pt->rpl < tdb->tdb_rpl ||
 		    pt->cur_bytes < tdb->tdb_cur_bytes) {
 			goto bad;
 		}
 
 		tdb->tdb_rpl = pt->rpl;
 		tdb->tdb_cur_bytes = pt->cur_bytes;
 	}
 	return;
 
 bad:
 	if (V_pf_status.debug >= PF_DEBUG_MISC)
 		printf("pfsync_insert: PFSYNC_ACT_TDB_UPD: "
 		    "invalid value\n");
 	V_pfsyncstats.pfsyncs_badstate++;
 	return;
 }
 #endif
 
 
 static int
 pfsync_in_eof(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	/* check if we are at the right place in the packet */
 	if (offset != m->m_pkthdr.len)
 		V_pfsyncstats.pfsyncs_badlen++;
 
 	/* we're done. free and let the caller return */
 	m_freem(m);
 	return (-1);
 }
 
 static int
 pfsync_in_error(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
 {
 	V_pfsyncstats.pfsyncs_badact++;
 
 	m_freem(m);
 	return (-1);
 }
 
 static int
 pfsyncoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
 	struct route *rt)
 {
 	m_freem(m);
 	return (0);
 }
 
 /* ARGSUSED */
 static int
 pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct pfsync_softc *sc = ifp->if_softc;
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct pfsyncreq pfsyncr;
 	int error;
 	int c;
 
 	switch (cmd) {
 	case SIOCSIFFLAGS:
 		PFSYNC_LOCK(sc);
 		if (ifp->if_flags & IFF_UP) {
 			ifp->if_drv_flags |= IFF_DRV_RUNNING;
 			PFSYNC_UNLOCK(sc);
 			pfsync_pointers_init();
 		} else {
 			ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 			PFSYNC_UNLOCK(sc);
 			pfsync_pointers_uninit();
 		}
 		break;
 	case SIOCSIFMTU:
 		if (!sc->sc_sync_if ||
 		    ifr->ifr_mtu <= PFSYNC_MINPKT ||
 		    ifr->ifr_mtu > sc->sc_sync_if->if_mtu)
 			return (EINVAL);
 		if (ifr->ifr_mtu < ifp->if_mtu) {
 			for (c = 0; c < pfsync_buckets; c++) {
 				PFSYNC_BUCKET_LOCK(&sc->sc_buckets[c]);
 				if (sc->sc_buckets[c].b_len > PFSYNC_MINPKT)
 					pfsync_sendout(1, c);
 				PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[c]);
 			}
 		}
 		ifp->if_mtu = ifr->ifr_mtu;
 		break;
 	case SIOCGETPFSYNC:
 		bzero(&pfsyncr, sizeof(pfsyncr));
 		PFSYNC_LOCK(sc);
 		if (sc->sc_sync_if) {
 			strlcpy(pfsyncr.pfsyncr_syncdev,
 			    sc->sc_sync_if->if_xname, IFNAMSIZ);
 		}
 		pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer;
 		pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
 		pfsyncr.pfsyncr_defer = (PFSYNCF_DEFER ==
 		    (sc->sc_flags & PFSYNCF_DEFER));
 		PFSYNC_UNLOCK(sc);
 		return (copyout(&pfsyncr, ifr_data_get_ptr(ifr),
 		    sizeof(pfsyncr)));
 
 	case SIOCSETPFSYNC:
 	    {
-		struct ip_moptions *imo = &sc->sc_imo;
+		struct in_mfilter *imf = NULL;
 		struct ifnet *sifp;
 		struct ip *ip;
-		void *mship = NULL;
 
 		if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0)
 			return (error);
 		if ((error = copyin(ifr_data_get_ptr(ifr), &pfsyncr,
 		    sizeof(pfsyncr))))
 			return (error);
 
 		if (pfsyncr.pfsyncr_maxupdates > 255)
 			return (EINVAL);
 
 		if (pfsyncr.pfsyncr_syncdev[0] == 0)
 			sifp = NULL;
 		else if ((sifp = ifunit_ref(pfsyncr.pfsyncr_syncdev)) == NULL)
 			return (EINVAL);
 
 		if (sifp != NULL && (
 		    pfsyncr.pfsyncr_syncpeer.s_addr == 0 ||
 		    pfsyncr.pfsyncr_syncpeer.s_addr ==
 		    htonl(INADDR_PFSYNC_GROUP)))
-			mship = malloc((sizeof(struct in_multi *) *
-			    IP_MIN_MEMBERSHIPS), M_PFSYNC, M_WAITOK | M_ZERO);
+			imf = ip_mfilter_alloc(M_WAITOK, 0, 0);
 
 		PFSYNC_LOCK(sc);
 		if (pfsyncr.pfsyncr_syncpeer.s_addr == 0)
 			sc->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP);
 		else
 			sc->sc_sync_peer.s_addr =
 			    pfsyncr.pfsyncr_syncpeer.s_addr;
 
 		sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
 		if (pfsyncr.pfsyncr_defer) {
 			sc->sc_flags |= PFSYNCF_DEFER;
 			V_pfsync_defer_ptr = pfsync_defer;
 		} else {
 			sc->sc_flags &= ~PFSYNCF_DEFER;
 			V_pfsync_defer_ptr = NULL;
 		}
 
 		if (sifp == NULL) {
 			if (sc->sc_sync_if)
 				if_rele(sc->sc_sync_if);
 			sc->sc_sync_if = NULL;
-			if (imo->imo_membership)
-				pfsync_multicast_cleanup(sc);
+			pfsync_multicast_cleanup(sc);
 			PFSYNC_UNLOCK(sc);
 			break;
 		}
 
 		for (c = 0; c < pfsync_buckets; c++) {
 			PFSYNC_BUCKET_LOCK(&sc->sc_buckets[c]);
 			if (sc->sc_buckets[c].b_len > PFSYNC_MINPKT &&
 			    (sifp->if_mtu < sc->sc_ifp->if_mtu ||
 			    (sc->sc_sync_if != NULL &&
 			    sifp->if_mtu < sc->sc_sync_if->if_mtu) ||
 			    sifp->if_mtu < MCLBYTES - sizeof(struct ip)))
 				pfsync_sendout(1, c);
 			PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[c]);
 		}
 
-		if (imo->imo_membership)
-			pfsync_multicast_cleanup(sc);
+		pfsync_multicast_cleanup(sc);
 
 		if (sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) {
-			error = pfsync_multicast_setup(sc, sifp, mship);
+			error = pfsync_multicast_setup(sc, sifp, imf);
 			if (error) {
 				if_rele(sifp);
-				free(mship, M_PFSYNC);
+				ip_mfilter_free(imf);
 				PFSYNC_UNLOCK(sc);
 				return (error);
 			}
 		}
 		if (sc->sc_sync_if)
 			if_rele(sc->sc_sync_if);
 		sc->sc_sync_if = sifp;
 
 		ip = &sc->sc_template;
 		bzero(ip, sizeof(*ip));
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = sizeof(sc->sc_template) >> 2;
 		ip->ip_tos = IPTOS_LOWDELAY;
 		/* len and id are set later. */
 		ip->ip_off = htons(IP_DF);
 		ip->ip_ttl = PFSYNC_DFLTTL;
 		ip->ip_p = IPPROTO_PFSYNC;
 		ip->ip_src.s_addr = INADDR_ANY;
 		ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr;
 
 		/* Request a full state table update. */
 		if ((sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
 			(*carp_demote_adj_p)(V_pfsync_carp_adj,
 			    "pfsync bulk start");
 		sc->sc_flags &= ~PFSYNCF_OK;
 		if (V_pf_status.debug >= PF_DEBUG_MISC)
 			printf("pfsync: requesting bulk update\n");
 		PFSYNC_UNLOCK(sc);
 		PFSYNC_BUCKET_LOCK(&sc->sc_buckets[0]);
 		pfsync_request_update(0, 0);
 		PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[0]);
 		PFSYNC_BLOCK(sc);
 		sc->sc_ureq_sent = time_uptime;
 		callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulk_fail,
 		    sc);
 		PFSYNC_BUNLOCK(sc);
 
 		break;
 	    }
 	default:
 		return (ENOTTY);
 	}
 
 	return (0);
 }
 
 static void
 pfsync_out_state(struct pf_state *st, void *buf)
 {
 	struct pfsync_state *sp = buf;
 
 	pfsync_state_export(sp, st);
 }
 
 static void
 pfsync_out_iack(struct pf_state *st, void *buf)
 {
 	struct pfsync_ins_ack *iack = buf;
 
 	iack->id = st->id;
 	iack->creatorid = st->creatorid;
 }
 
 static void
 pfsync_out_upd_c(struct pf_state *st, void *buf)
 {
 	struct pfsync_upd_c *up = buf;
 
 	bzero(up, sizeof(*up));
 	up->id = st->id;
 	pf_state_peer_hton(&st->src, &up->src);
 	pf_state_peer_hton(&st->dst, &up->dst);
 	up->creatorid = st->creatorid;
 	up->timeout = st->timeout;
 }
 
 static void
 pfsync_out_del(struct pf_state *st, void *buf)
 {
 	struct pfsync_del_c *dp = buf;
 
 	dp->id = st->id;
 	dp->creatorid = st->creatorid;
 	st->state_flags |= PFSTATE_NOSYNC;
 }
 
 static void
 pfsync_drop(struct pfsync_softc *sc)
 {
 	struct pf_state *st, *next;
 	struct pfsync_upd_req_item *ur;
 	struct pfsync_bucket *b;
 	int c, q;
 
 	for (c = 0; c < pfsync_buckets; c++) {
 		b = &sc->sc_buckets[c];
 		for (q = 0; q < PFSYNC_S_COUNT; q++) {
 			if (TAILQ_EMPTY(&b->b_qs[q]))
 				continue;
 
 			TAILQ_FOREACH_SAFE(st, &b->b_qs[q], sync_list, next) {
 				KASSERT(st->sync_state == q,
 					("%s: st->sync_state == q",
 						__func__));
 				st->sync_state = PFSYNC_S_NONE;
 				pf_release_state(st);
 			}
 			TAILQ_INIT(&b->b_qs[q]);
 		}
 
 		while ((ur = TAILQ_FIRST(&b->b_upd_req_list)) != NULL) {
 			TAILQ_REMOVE(&b->b_upd_req_list, ur, ur_entry);
 			free(ur, M_PFSYNC);
 		}
 
 		b->b_len = PFSYNC_MINPKT;
 		b->b_plus = NULL;
 	}
 }
 
 static void
 pfsync_sendout(int schedswi, int c)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct ifnet *ifp = sc->sc_ifp;
 	struct mbuf *m;
 	struct ip *ip;
 	struct pfsync_header *ph;
 	struct pfsync_subheader *subh;
 	struct pf_state *st, *st_next;
 	struct pfsync_upd_req_item *ur;
 	struct pfsync_bucket *b = &sc->sc_buckets[c];
 	int offset;
 	int q, count = 0;
 
 	KASSERT(sc != NULL, ("%s: null sc", __func__));
 	KASSERT(b->b_len > PFSYNC_MINPKT,
 	    ("%s: sc_len %zu", __func__, b->b_len));
 	PFSYNC_BUCKET_LOCK_ASSERT(b);
 
 	if (ifp->if_bpf == NULL && sc->sc_sync_if == NULL) {
 		pfsync_drop(sc);
 		return;
 	}
 
 	m = m_get2(max_linkhdr + b->b_len, M_NOWAIT, MT_DATA, M_PKTHDR);
 	if (m == NULL) {
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
 		V_pfsyncstats.pfsyncs_onomem++;
 		return;
 	}
 	m->m_data += max_linkhdr;
 	m->m_len = m->m_pkthdr.len = b->b_len;
 
 	/* build the ip header */
 	ip = (struct ip *)m->m_data;
 	bcopy(&sc->sc_template, ip, sizeof(*ip));
 	offset = sizeof(*ip);
 
 	ip->ip_len = htons(m->m_pkthdr.len);
 	ip_fillid(ip);
 
 	/* build the pfsync header */
 	ph = (struct pfsync_header *)(m->m_data + offset);
 	bzero(ph, sizeof(*ph));
 	offset += sizeof(*ph);
 
 	ph->version = PFSYNC_VERSION;
 	ph->len = htons(b->b_len - sizeof(*ip));
 	bcopy(V_pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);
 
 	/* walk the queues */
 	for (q = 0; q < PFSYNC_S_COUNT; q++) {
 		if (TAILQ_EMPTY(&b->b_qs[q]))
 			continue;
 
 		subh = (struct pfsync_subheader *)(m->m_data + offset);
 		offset += sizeof(*subh);
 
 		count = 0;
 		TAILQ_FOREACH_SAFE(st, &b->b_qs[q], sync_list, st_next) {
 			KASSERT(st->sync_state == q,
 				("%s: st->sync_state == q",
 					__func__));
 			/*
 			 * XXXGL: some of write methods do unlocked reads
 			 * of state data :(
 			 */
 			pfsync_qs[q].write(st, m->m_data + offset);
 			offset += pfsync_qs[q].len;
 			st->sync_state = PFSYNC_S_NONE;
 			pf_release_state(st);
 			count++;
 		}
 		TAILQ_INIT(&b->b_qs[q]);
 
 		bzero(subh, sizeof(*subh));
 		subh->action = pfsync_qs[q].action;
 		subh->count = htons(count);
 		V_pfsyncstats.pfsyncs_oacts[pfsync_qs[q].action] += count;
 	}
 
 	if (!TAILQ_EMPTY(&b->b_upd_req_list)) {
 		subh = (struct pfsync_subheader *)(m->m_data + offset);
 		offset += sizeof(*subh);
 
 		count = 0;
 		while ((ur = TAILQ_FIRST(&b->b_upd_req_list)) != NULL) {
 			TAILQ_REMOVE(&b->b_upd_req_list, ur, ur_entry);
 
 			bcopy(&ur->ur_msg, m->m_data + offset,
 			    sizeof(ur->ur_msg));
 			offset += sizeof(ur->ur_msg);
 			free(ur, M_PFSYNC);
 			count++;
 		}
 
 		bzero(subh, sizeof(*subh));
 		subh->action = PFSYNC_ACT_UPD_REQ;
 		subh->count = htons(count);
 		V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_UPD_REQ] += count;
 	}
 
 	/* has someone built a custom region for us to add? */
 	if (b->b_plus != NULL) {
 		bcopy(b->b_plus, m->m_data + offset, b->b_pluslen);
 		offset += b->b_pluslen;
 
 		b->b_plus = NULL;
 	}
 
 	subh = (struct pfsync_subheader *)(m->m_data + offset);
 	offset += sizeof(*subh);
 
 	bzero(subh, sizeof(*subh));
 	subh->action = PFSYNC_ACT_EOF;
 	subh->count = htons(1);
 	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_EOF]++;
 
 	/* we're done, let's put it on the wire */
 	if (ifp->if_bpf) {
 		m->m_data += sizeof(*ip);
 		m->m_len = m->m_pkthdr.len = b->b_len - sizeof(*ip);
 		BPF_MTAP(ifp, m);
 		m->m_data -= sizeof(*ip);
 		m->m_len = m->m_pkthdr.len = b->b_len;
 	}
 
 	if (sc->sc_sync_if == NULL) {
 		b->b_len = PFSYNC_MINPKT;
 		m_freem(m);
 		return;
 	}
 
 	if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1);
 	if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len);
 	b->b_len = PFSYNC_MINPKT;
 
 	if (!_IF_QFULL(&b->b_snd))
 		_IF_ENQUEUE(&b->b_snd, m);
 	else {
 		m_freem(m);
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OQDROPS, 1);
 	}
 	if (schedswi)
 		swi_sched(V_pfsync_swi_cookie, 0);
 }
 
 static void
 pfsync_insert_state(struct pf_state *st)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	if (st->state_flags & PFSTATE_NOSYNC)
 		return;
 
 	if ((st->rule.ptr->rule_flag & PFRULE_NOSYNC) ||
 	    st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) {
 		st->state_flags |= PFSTATE_NOSYNC;
 		return;
 	}
 
 	KASSERT(st->sync_state == PFSYNC_S_NONE,
 		("%s: st->sync_state %u", __func__, st->sync_state));
 
 	PFSYNC_BUCKET_LOCK(b);
 	if (b->b_len == PFSYNC_MINPKT)
 		callout_reset(&b->b_tmo, 1 * hz, pfsync_timeout, b);
 
 	pfsync_q_ins(st, PFSYNC_S_INS, true);
 	PFSYNC_BUCKET_UNLOCK(b);
 
 	st->sync_updates = 0;
 }
 
 static int
 pfsync_defer(struct pf_state *st, struct mbuf *m)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_deferral *pd;
 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	if (m->m_flags & (M_BCAST|M_MCAST))
 		return (0);
 
 	PFSYNC_LOCK(sc);
 
 	if (sc == NULL || !(sc->sc_ifp->if_flags & IFF_DRV_RUNNING) ||
 	    !(sc->sc_flags & PFSYNCF_DEFER)) {
 		PFSYNC_UNLOCK(sc);
 		return (0);
 	}
 
 	if (b->b_deferred >= 128)
 		pfsync_undefer(TAILQ_FIRST(&b->b_deferrals), 0);
 
 	pd = malloc(sizeof(*pd), M_PFSYNC, M_NOWAIT);
 	if (pd == NULL)
 		return (0);
 	b->b_deferred++;
 
 	m->m_flags |= M_SKIP_FIREWALL;
 	st->state_flags |= PFSTATE_ACK;
 
 	pd->pd_sc = sc;
 	pd->pd_refs = 0;
 	pd->pd_st = st;
 	pf_ref_state(st);
 	pd->pd_m = m;
 
 	TAILQ_INSERT_TAIL(&b->b_deferrals, pd, pd_entry);
 	callout_init_mtx(&pd->pd_tmo, &b->b_mtx, CALLOUT_RETURNUNLOCKED);
 	callout_reset(&pd->pd_tmo, 10, pfsync_defer_tmo, pd);
 
 	pfsync_push(b);
 
 	return (1);
 }
 
 static void
 pfsync_undefer(struct pfsync_deferral *pd, int drop)
 {
 	struct pfsync_softc *sc = pd->pd_sc;
 	struct mbuf *m = pd->pd_m;
 	struct pf_state *st = pd->pd_st;
 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	PFSYNC_BUCKET_LOCK_ASSERT(b);
 
 	TAILQ_REMOVE(&b->b_deferrals, pd, pd_entry);
 	b->b_deferred--;
 	pd->pd_st->state_flags &= ~PFSTATE_ACK;	/* XXX: locking! */
 	free(pd, M_PFSYNC);
 	pf_release_state(st);
 
 	if (drop)
 		m_freem(m);
 	else {
 		_IF_ENQUEUE(&b->b_snd, m);
 		pfsync_push(b);
 	}
 }
 
 static void
 pfsync_defer_tmo(void *arg)
 {
 	struct pfsync_deferral *pd = arg;
 	struct pfsync_softc *sc = pd->pd_sc;
 	struct mbuf *m = pd->pd_m;
 	struct pf_state *st = pd->pd_st;
 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	PFSYNC_BUCKET_LOCK_ASSERT(b);
 
 	CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);
 
 	TAILQ_REMOVE(&b->b_deferrals, pd, pd_entry);
 	b->b_deferred--;
 	pd->pd_st->state_flags &= ~PFSTATE_ACK;	/* XXX: locking! */
 	if (pd->pd_refs == 0)
 		free(pd, M_PFSYNC);
 	PFSYNC_UNLOCK(sc);
 
 	ip_output(m, NULL, NULL, 0, NULL, NULL);
 
 	pf_release_state(st);
 
 	CURVNET_RESTORE();
 }
 
 static void
 pfsync_undefer_state(struct pf_state *st, int drop)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_deferral *pd;
 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	PFSYNC_BUCKET_LOCK(b);
 
 	TAILQ_FOREACH(pd, &b->b_deferrals, pd_entry) {
 		 if (pd->pd_st == st) {
 			if (callout_stop(&pd->pd_tmo) > 0)
 				pfsync_undefer(pd, drop);
 
 			PFSYNC_BUCKET_UNLOCK(b);
 			return;
 		}
 	}
 	PFSYNC_BUCKET_UNLOCK(b);
 
 	panic("%s: unable to find deferred state", __func__);
 }
 
 static struct pfsync_bucket*
 pfsync_get_bucket(struct pfsync_softc *sc, struct pf_state *st)
 {
 	int c = PF_IDHASH(st) % pfsync_buckets;
 	return &sc->sc_buckets[c];
 }
 
 static void
 pfsync_update_state(struct pf_state *st)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	bool sync = false, ref = true;
 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	PF_STATE_LOCK_ASSERT(st);
 	PFSYNC_BUCKET_LOCK(b);
 
 	if (st->state_flags & PFSTATE_ACK)
 		pfsync_undefer_state(st, 0);
 	if (st->state_flags & PFSTATE_NOSYNC) {
 		if (st->sync_state != PFSYNC_S_NONE)
 			pfsync_q_del(st, true, b);
 		PFSYNC_BUCKET_UNLOCK(b);
 		return;
 	}
 
 	if (b->b_len == PFSYNC_MINPKT)
 		callout_reset(&b->b_tmo, 1 * hz, pfsync_timeout, b);
 
 	switch (st->sync_state) {
 	case PFSYNC_S_UPD_C:
 	case PFSYNC_S_UPD:
 	case PFSYNC_S_INS:
 		/* we're already handling it */
 
 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
 			st->sync_updates++;
 			if (st->sync_updates >= sc->sc_maxupdates)
 				sync = true;
 		}
 		break;
 
 	case PFSYNC_S_IACK:
 		pfsync_q_del(st, false, b);
 		ref = false;
 		/* FALLTHROUGH */
 
 	case PFSYNC_S_NONE:
 		pfsync_q_ins(st, PFSYNC_S_UPD_C, ref);
 		st->sync_updates = 0;
 		break;
 
 	default:
 		panic("%s: unexpected sync state %d", __func__, st->sync_state);
 	}
 
 	if (sync || (time_uptime - st->pfsync_time) < 2)
 		pfsync_push(b);
 
 	PFSYNC_BUCKET_UNLOCK(b);
 }
 
 static void
 pfsync_request_update(u_int32_t creatorid, u_int64_t id)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_bucket *b = &sc->sc_buckets[0];
 	struct pfsync_upd_req_item *item;
 	size_t nlen = sizeof(struct pfsync_upd_req);
 
 	PFSYNC_BUCKET_LOCK_ASSERT(b);
 
 	/*
 	 * This code does a bit to prevent multiple update requests for the
 	 * same state being generated. It searches current subheader queue,
 	 * but it doesn't lookup into queue of already packed datagrams.
 	 */
 	TAILQ_FOREACH(item, &b->b_upd_req_list, ur_entry)
 		if (item->ur_msg.id == id &&
 		    item->ur_msg.creatorid == creatorid)
 			return;
 
 	item = malloc(sizeof(*item), M_PFSYNC, M_NOWAIT);
 	if (item == NULL)
 		return; /* XXX stats */
 
 	item->ur_msg.id = id;
 	item->ur_msg.creatorid = creatorid;
 
 	if (TAILQ_EMPTY(&b->b_upd_req_list))
 		nlen += sizeof(struct pfsync_subheader);
 
 	if (b->b_len + nlen > sc->sc_ifp->if_mtu) {
 		pfsync_sendout(1, 0);
 
 		nlen = sizeof(struct pfsync_subheader) +
 		    sizeof(struct pfsync_upd_req);
 	}
 
 	TAILQ_INSERT_TAIL(&b->b_upd_req_list, item, ur_entry);
 	b->b_len += nlen;
 }
 
 static bool
 pfsync_update_state_req(struct pf_state *st)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	bool ref = true, full = false;
 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	PF_STATE_LOCK_ASSERT(st);
 	PFSYNC_BUCKET_LOCK(b);
 
 	if (st->state_flags & PFSTATE_NOSYNC) {
 		if (st->sync_state != PFSYNC_S_NONE)
 			pfsync_q_del(st, true, b);
 		PFSYNC_BUCKET_UNLOCK(b);
 		return (full);
 	}
 
 	switch (st->sync_state) {
 	case PFSYNC_S_UPD_C:
 	case PFSYNC_S_IACK:
 		pfsync_q_del(st, false, b);
 		ref = false;
 		/* FALLTHROUGH */
 
 	case PFSYNC_S_NONE:
 		pfsync_q_ins(st, PFSYNC_S_UPD, ref);
 		pfsync_push(b);
 		break;
 
 	case PFSYNC_S_INS:
 	case PFSYNC_S_UPD:
 	case PFSYNC_S_DEL:
 		/* we're already handling it */
 		break;
 
 	default:
 		panic("%s: unexpected sync state %d", __func__, st->sync_state);
 	}
 
 	if ((sc->sc_ifp->if_mtu - b->b_len) < sizeof(struct pfsync_state))
 		full = true;
 
 	PFSYNC_BUCKET_UNLOCK(b);
 
 	return (full);
 }
 
 static void
 pfsync_delete_state(struct pf_state *st)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 	bool ref = true;
 
 	PFSYNC_BUCKET_LOCK(b);
 	if (st->state_flags & PFSTATE_ACK)
 		pfsync_undefer_state(st, 1);
 	if (st->state_flags & PFSTATE_NOSYNC) {
 		if (st->sync_state != PFSYNC_S_NONE)
 			pfsync_q_del(st, true, b);
 		PFSYNC_BUCKET_UNLOCK(b);
 		return;
 	}
 
 	if (b->b_len == PFSYNC_MINPKT)
 		callout_reset(&b->b_tmo, 1 * hz, pfsync_timeout, b);
 
 	switch (st->sync_state) {
 	case PFSYNC_S_INS:
 		/* We never got to tell the world so just forget about it. */
 		pfsync_q_del(st, true, b);
 		break;
 
 	case PFSYNC_S_UPD_C:
 	case PFSYNC_S_UPD:
 	case PFSYNC_S_IACK:
 		pfsync_q_del(st, false, b);
 		ref = false;
 		/* FALLTHROUGH */
 
 	case PFSYNC_S_NONE:
 		pfsync_q_ins(st, PFSYNC_S_DEL, ref);
 		break;
 
 	default:
 		panic("%s: unexpected sync state %d", __func__, st->sync_state);
 	}
 
 	PFSYNC_BUCKET_UNLOCK(b);
 }
 
 static void
 pfsync_clear_states(u_int32_t creatorid, const char *ifname)
 {
 	struct {
 		struct pfsync_subheader subh;
 		struct pfsync_clr clr;
 	} __packed r;
 
 	bzero(&r, sizeof(r));
 
 	r.subh.action = PFSYNC_ACT_CLR;
 	r.subh.count = htons(1);
 	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_CLR]++;
 
 	strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname));
 	r.clr.creatorid = creatorid;
 
 	pfsync_send_plus(&r, sizeof(r));
 }
 
 static void
 pfsync_q_ins(struct pf_state *st, int q, bool ref)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	size_t nlen = pfsync_qs[q].len;
 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	PFSYNC_BUCKET_LOCK_ASSERT(b);
 
 	KASSERT(st->sync_state == PFSYNC_S_NONE,
 		("%s: st->sync_state %u", __func__, st->sync_state));
 	KASSERT(b->b_len >= PFSYNC_MINPKT, ("pfsync pkt len is too low %zu",
 	    b->b_len));
 
 	if (TAILQ_EMPTY(&b->b_qs[q]))
 		nlen += sizeof(struct pfsync_subheader);
 
 	if (b->b_len + nlen > sc->sc_ifp->if_mtu) {
 		pfsync_sendout(1, b->b_id);
 
 		nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
 	}
 
 	b->b_len += nlen;
 	TAILQ_INSERT_TAIL(&b->b_qs[q], st, sync_list);
 	st->sync_state = q;
 	if (ref)
 		pf_ref_state(st);
 }
 
 static void
 pfsync_q_del(struct pf_state *st, bool unref, struct pfsync_bucket *b)
 {
 	int q = st->sync_state;
 
 	PFSYNC_BUCKET_LOCK_ASSERT(b);
 	KASSERT(st->sync_state != PFSYNC_S_NONE,
 		("%s: st->sync_state != PFSYNC_S_NONE", __func__));
 
 	b->b_len -= pfsync_qs[q].len;
 	TAILQ_REMOVE(&b->b_qs[q], st, sync_list);
 	st->sync_state = PFSYNC_S_NONE;
 	if (unref)
 		pf_release_state(st);
 
 	if (TAILQ_EMPTY(&b->b_qs[q]))
 		b->b_len -= sizeof(struct pfsync_subheader);
 }
 
 static void
 pfsync_bulk_start(void)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 
 	if (V_pf_status.debug >= PF_DEBUG_MISC)
 		printf("pfsync: received bulk update request\n");
 
 	PFSYNC_BLOCK(sc);
 
 	sc->sc_ureq_received = time_uptime;
 	sc->sc_bulk_hashid = 0;
 	sc->sc_bulk_stateid = 0;
 	pfsync_bulk_status(PFSYNC_BUS_START);
 	callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, sc);
 	PFSYNC_BUNLOCK(sc);
 }
 
 static void
 pfsync_bulk_update(void *arg)
 {
 	struct pfsync_softc *sc = arg;
 	struct pf_state *s;
 	int i, sent = 0;
 
 	PFSYNC_BLOCK_ASSERT(sc);
 	CURVNET_SET(sc->sc_ifp->if_vnet);
 
 	/*
 	 * Start with last state from previous invocation.
 	 * It may had gone, in this case start from the
 	 * hash slot.
 	 */
 	s = pf_find_state_byid(sc->sc_bulk_stateid, sc->sc_bulk_creatorid);
 
 	if (s != NULL)
 		i = PF_IDHASH(s);
 	else
 		i = sc->sc_bulk_hashid;
 
 	for (; i <= pf_hashmask; i++) {
 		struct pf_idhash *ih = &V_pf_idhash[i];
 
 		if (s != NULL)
 			PF_HASHROW_ASSERT(ih);
 		else {
 			PF_HASHROW_LOCK(ih);
 			s = LIST_FIRST(&ih->states);
 		}
 
 		for (; s; s = LIST_NEXT(s, entry)) {
 			if (s->sync_state == PFSYNC_S_NONE &&
 			    s->timeout < PFTM_MAX &&
 			    s->pfsync_time <= sc->sc_ureq_received) {
 				if (pfsync_update_state_req(s)) {
 					/* We've filled a packet. */
 					sc->sc_bulk_hashid = i;
 					sc->sc_bulk_stateid = s->id;
 					sc->sc_bulk_creatorid = s->creatorid;
 					PF_HASHROW_UNLOCK(ih);
 					callout_reset(&sc->sc_bulk_tmo, 1,
 					    pfsync_bulk_update, sc);
 					goto full;
 				}
 				sent++;
 			}
 		}
 		PF_HASHROW_UNLOCK(ih);
 	}
 
 	/* We're done. */
 	pfsync_bulk_status(PFSYNC_BUS_END);
 full:
 	CURVNET_RESTORE();
 }
 
 static void
 pfsync_bulk_status(u_int8_t status)
 {
 	struct {
 		struct pfsync_subheader subh;
 		struct pfsync_bus bus;
 	} __packed r;
 
 	struct pfsync_softc *sc = V_pfsyncif;
 
 	bzero(&r, sizeof(r));
 
 	r.subh.action = PFSYNC_ACT_BUS;
 	r.subh.count = htons(1);
 	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_BUS]++;
 
 	r.bus.creatorid = V_pf_status.hostid;
 	r.bus.endtime = htonl(time_uptime - sc->sc_ureq_received);
 	r.bus.status = status;
 
 	pfsync_send_plus(&r, sizeof(r));
 }
 
 static void
 pfsync_bulk_fail(void *arg)
 {
 	struct pfsync_softc *sc = arg;
 	struct pfsync_bucket *b = &sc->sc_buckets[0];
 
 	CURVNET_SET(sc->sc_ifp->if_vnet);
 
 	PFSYNC_BLOCK_ASSERT(sc);
 
 	if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) {
 		/* Try again */
 		callout_reset(&sc->sc_bulkfail_tmo, 5 * hz,
 		    pfsync_bulk_fail, V_pfsyncif);
 		PFSYNC_BUCKET_LOCK(b);
 		pfsync_request_update(0, 0);
 		PFSYNC_BUCKET_UNLOCK(b);
 	} else {
 		/* Pretend like the transfer was ok. */
 		sc->sc_ureq_sent = 0;
 		sc->sc_bulk_tries = 0;
 		PFSYNC_LOCK(sc);
 		if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
 			(*carp_demote_adj_p)(-V_pfsync_carp_adj,
 			    "pfsync bulk fail");
 		sc->sc_flags |= PFSYNCF_OK;
 		PFSYNC_UNLOCK(sc);
 		if (V_pf_status.debug >= PF_DEBUG_MISC)
 			printf("pfsync: failed to receive bulk update\n");
 	}
 
 	CURVNET_RESTORE();
 }
 
 static void
 pfsync_send_plus(void *plus, size_t pluslen)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_bucket *b = &sc->sc_buckets[0];
 
 	PFSYNC_BUCKET_LOCK(b);
 
 	if (b->b_len + pluslen > sc->sc_ifp->if_mtu)
 		pfsync_sendout(1, b->b_id);
 
 	b->b_plus = plus;
 	b->b_len += (b->b_pluslen = pluslen);
 
 	pfsync_sendout(1, b->b_id);
 	PFSYNC_BUCKET_UNLOCK(b);
 }
 
 static void
 pfsync_timeout(void *arg)
 {
 	struct pfsync_bucket *b = arg;
 
 	CURVNET_SET(b->b_sc->sc_ifp->if_vnet);
 	PFSYNC_BUCKET_LOCK(b);
 	pfsync_push(b);
 	PFSYNC_BUCKET_UNLOCK(b);
 	CURVNET_RESTORE();
 }
 
 static void
 pfsync_push(struct pfsync_bucket *b)
 {
 
 	PFSYNC_BUCKET_LOCK_ASSERT(b);
 
 	b->b_flags |= PFSYNCF_BUCKET_PUSH;
 	swi_sched(V_pfsync_swi_cookie, 0);
 }
 
 static void
 pfsync_push_all(struct pfsync_softc *sc)
 {
 	int c;
 	struct pfsync_bucket *b;
 
 	for (c = 0; c < pfsync_buckets; c++) {
 		b = &sc->sc_buckets[c];
 
 		PFSYNC_BUCKET_LOCK(b);
 		pfsync_push(b);
 		PFSYNC_BUCKET_UNLOCK(b);
 	}
 }
 
 static void
 pfsyncintr(void *arg)
 {
 	struct pfsync_softc *sc = arg;
 	struct pfsync_bucket *b;
 	struct mbuf *m, *n;
 	int c;
 
 	CURVNET_SET(sc->sc_ifp->if_vnet);
 
 	for (c = 0; c < pfsync_buckets; c++) {
 		b = &sc->sc_buckets[c];
 
 		PFSYNC_BUCKET_LOCK(b);
 		if ((b->b_flags & PFSYNCF_BUCKET_PUSH) && b->b_len > PFSYNC_MINPKT) {
 			pfsync_sendout(0, b->b_id);
 			b->b_flags &= ~PFSYNCF_BUCKET_PUSH;
 		}
 		_IF_DEQUEUE_ALL(&b->b_snd, m);
 		PFSYNC_BUCKET_UNLOCK(b);
 
 		for (; m != NULL; m = n) {
 
 			n = m->m_nextpkt;
 			m->m_nextpkt = NULL;
 
 			/*
 			 * We distinguish between a deferral packet and our
 			 * own pfsync packet based on M_SKIP_FIREWALL
 			 * flag. This is XXX.
 			 */
 			if (m->m_flags & M_SKIP_FIREWALL)
 				ip_output(m, NULL, NULL, 0, NULL, NULL);
 			else if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo,
 			    NULL) == 0)
 				V_pfsyncstats.pfsyncs_opackets++;
 			else
 				V_pfsyncstats.pfsyncs_oerrors++;
 		}
 	}
 	CURVNET_RESTORE();
 }
 
 static int
-pfsync_multicast_setup(struct pfsync_softc *sc, struct ifnet *ifp, void *mship)
+pfsync_multicast_setup(struct pfsync_softc *sc, struct ifnet *ifp,
+    struct in_mfilter *imf)
 {
 	struct ip_moptions *imo = &sc->sc_imo;
 	int error;
 
 	if (!(ifp->if_flags & IFF_MULTICAST))
 		return (EADDRNOTAVAIL);
 
-	imo->imo_membership = (struct in_multi **)mship;
-	imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
 	imo->imo_multicast_vif = -1;
 
 	if ((error = in_joingroup(ifp, &sc->sc_sync_peer, NULL,
-	    &imo->imo_membership[0])) != 0) {
-		imo->imo_membership = NULL;
+	    &imf->imf_inm)) != 0)
 		return (error);
-	}
-	imo->imo_num_memberships++;
+
+	ip_mfilter_init(&imo->imo_head);
+	ip_mfilter_insert(&imo->imo_head, imf);
 	imo->imo_multicast_ifp = ifp;
 	imo->imo_multicast_ttl = PFSYNC_DFLTTL;
 	imo->imo_multicast_loop = 0;
 
 	return (0);
 }
 
 static void
 pfsync_multicast_cleanup(struct pfsync_softc *sc)
 {
 	struct ip_moptions *imo = &sc->sc_imo;
+	struct in_mfilter *imf;
 
-	in_leavegroup(imo->imo_membership[0], NULL);
-	free(imo->imo_membership, M_PFSYNC);
-	imo->imo_membership = NULL;
+	while ((imf = ip_mfilter_first(&imo->imo_head)) != NULL) {
+		ip_mfilter_remove(&imo->imo_head, imf);
+		in_leavegroup(imf->imf_inm, NULL);
+		ip_mfilter_free(imf);
+	}
 	imo->imo_multicast_ifp = NULL;
 }
 
 void
 pfsync_detach_ifnet(struct ifnet *ifp)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 
 	if (sc == NULL)
 		return;
 
 	PFSYNC_LOCK(sc);
 
 	if (sc->sc_sync_if == ifp) {
 		/* We don't need mutlicast cleanup here, because the interface
 		 * is going away. We do need to ensure we don't try to do
 		 * cleanup later.
 		 */
-		sc->sc_imo.imo_membership = NULL;
+		ip_mfilter_init(&sc->sc_imo.imo_head);
 		sc->sc_imo.imo_multicast_ifp = NULL;
 		sc->sc_sync_if = NULL;
 	}
 
 	PFSYNC_UNLOCK(sc);
 }
 
 #ifdef INET
 extern  struct domain inetdomain;
 static struct protosw in_pfsync_protosw = {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&inetdomain,
 	.pr_protocol =		IPPROTO_PFSYNC,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_input =		pfsync_input,
 	.pr_output =		rip_output,
 	.pr_ctloutput =		rip_ctloutput,
 	.pr_usrreqs =		&rip_usrreqs
 };
 #endif
 
 static void
 pfsync_pointers_init()
 {
 
 	PF_RULES_WLOCK();
 	V_pfsync_state_import_ptr = pfsync_state_import;
 	V_pfsync_insert_state_ptr = pfsync_insert_state;
 	V_pfsync_update_state_ptr = pfsync_update_state;
 	V_pfsync_delete_state_ptr = pfsync_delete_state;
 	V_pfsync_clear_states_ptr = pfsync_clear_states;
 	V_pfsync_defer_ptr = pfsync_defer;
 	PF_RULES_WUNLOCK();
 }
 
 static void
 pfsync_pointers_uninit()
 {
 
 	PF_RULES_WLOCK();
 	V_pfsync_state_import_ptr = NULL;
 	V_pfsync_insert_state_ptr = NULL;
 	V_pfsync_update_state_ptr = NULL;
 	V_pfsync_delete_state_ptr = NULL;
 	V_pfsync_clear_states_ptr = NULL;
 	V_pfsync_defer_ptr = NULL;
 	PF_RULES_WUNLOCK();
 }
 
 static void
 vnet_pfsync_init(const void *unused __unused)
 {
 	int error;
 
 	V_pfsync_cloner = if_clone_simple(pfsyncname,
 	    pfsync_clone_create, pfsync_clone_destroy, 1);
 	error = swi_add(NULL, pfsyncname, pfsyncintr, V_pfsyncif,
 	    SWI_NET, INTR_MPSAFE, &V_pfsync_swi_cookie);
 	if (error) {
 		if_clone_detach(V_pfsync_cloner);
 		log(LOG_INFO, "swi_add() failed in %s\n", __func__);
 	}
 
 	pfsync_pointers_init();
 }
 VNET_SYSINIT(vnet_pfsync_init, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY,
     vnet_pfsync_init, NULL);
 
 static void
 vnet_pfsync_uninit(const void *unused __unused)
 {
 
 	pfsync_pointers_uninit();
 
 	if_clone_detach(V_pfsync_cloner);
 	swi_remove(V_pfsync_swi_cookie);
 }
 
 VNET_SYSUNINIT(vnet_pfsync_uninit, SI_SUB_PROTO_FIREWALL, SI_ORDER_FOURTH,
     vnet_pfsync_uninit, NULL);
 
 static int
 pfsync_init()
 {
 #ifdef INET
 	int error;
 
 	pfsync_detach_ifnet_ptr = pfsync_detach_ifnet;
 
 	error = pf_proto_register(PF_INET, &in_pfsync_protosw);
 	if (error)
 		return (error);
 	error = ipproto_register(IPPROTO_PFSYNC);
 	if (error) {
 		pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
 		return (error);
 	}
 #endif
 
 	return (0);
 }
 
 static void
 pfsync_uninit()
 {
 	pfsync_detach_ifnet_ptr = NULL;
 
 #ifdef INET
 	ipproto_unregister(IPPROTO_PFSYNC);
 	pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
 #endif
 }
 
 static int
 pfsync_modevent(module_t mod, int type, void *data)
 {
 	int error = 0;
 
 	switch (type) {
 	case MOD_LOAD:
 		error = pfsync_init();
 		break;
 	case MOD_UNLOAD:
 		pfsync_uninit();
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	return (error);
 }
 
 static moduledata_t pfsync_mod = {
 	pfsyncname,
 	pfsync_modevent,
 	0
 };
 
 #define PFSYNC_MODVER 1
 
 /* Stay on FIREWALL as we depend on pf being initialized and on inetdomain. */
 DECLARE_MODULE(pfsync, pfsync_mod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY);
 MODULE_VERSION(pfsync, PFSYNC_MODVER);
 MODULE_DEPEND(pfsync, pf, PF_MODVER, PF_MODVER, PF_MODVER);