Index: head/sys/netinet/if_ether.c
===================================================================
--- head/sys/netinet/if_ether.c	(revision 292014)
+++ head/sys/netinet/if_ether.c	(revision 292015)
@@ -1,1235 +1,1227 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if_ether.c	8.1 (Berkeley) 6/10/93
  */
 
 /*
  * Ethernet address resolution protocol.
  * TODO:
  *	add "inuse/lock" bit (or ref. count) along with valid bit
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/queue.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/ethernet.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
+#include <netinet/in_fib.h>
 #include <netinet/in_var.h>
 #include <net/if_llatbl.h>
 #include <netinet/if_ether.h>
 #ifdef INET
 #include <netinet/ip_carp.h>
 #endif
 
 #include <security/mac/mac_framework.h>
 
 #define SIN(s) ((const struct sockaddr_in *)(s))
 
 static struct timeval arp_lastlog;
 static int arp_curpps;
 static int arp_maxpps = 1;
 
 /* Simple ARP state machine */
 enum arp_llinfo_state {
 	ARP_LLINFO_INCOMPLETE = 0, /* No LLE data */
 	ARP_LLINFO_REACHABLE,	/* LLE is valid */
 	ARP_LLINFO_VERIFY,	/* LLE is valid, need refresh */
 	ARP_LLINFO_DELETED,	/* LLE is deleted */
 };
 
 SYSCTL_DECL(_net_link_ether);
 static SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, "");
 static SYSCTL_NODE(_net_link_ether, PF_ARP, arp, CTLFLAG_RW, 0, "");
 
 /* timer values */
 static VNET_DEFINE(int, arpt_keep) = (20*60);	/* once resolved, good for 20
 						 * minutes */
 static VNET_DEFINE(int, arp_maxtries) = 5;
 static VNET_DEFINE(int, arp_proxyall) = 0;
 static VNET_DEFINE(int, arpt_down) = 20;	/* keep incomplete entries for
 						 * 20 seconds */
 static VNET_DEFINE(int, arpt_rexmit) = 1;	/* retransmit arp entries, sec*/
 VNET_PCPUSTAT_DEFINE(struct arpstat, arpstat);  /* ARP statistics, see if_arp.h */
 VNET_PCPUSTAT_SYSINIT(arpstat);
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(arpstat);
 #endif /* VIMAGE */
 
 static VNET_DEFINE(int, arp_maxhold) = 1;
 
 #define	V_arpt_keep		VNET(arpt_keep)
 #define	V_arpt_down		VNET(arpt_down)
 #define	V_arpt_rexmit		VNET(arpt_rexmit)
 #define	V_arp_maxtries		VNET(arp_maxtries)
 #define	V_arp_proxyall		VNET(arp_proxyall)
 #define	V_arp_maxhold		VNET(arp_maxhold)
 
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arpt_keep), 0,
 	"ARP entry lifetime in seconds");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arp_maxtries), 0,
 	"ARP resolution attempts before returning error");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arp_proxyall), 0,
 	"Enable proxy ARP for all suitable requests");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arpt_down), 0,
 	"Incomplete ARP entry lifetime in seconds");
 SYSCTL_VNET_PCPUSTAT(_net_link_ether_arp, OID_AUTO, stats, struct arpstat,
     arpstat, "ARP statistics (struct arpstat, net/if_arp.h)");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arp_maxhold), 0,
 	"Number of packets to hold per ARP entry");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_log_per_second,
 	CTLFLAG_RW, &arp_maxpps, 0,
 	"Maximum number of remotely triggered ARP messages that can be "
 	"logged per second");
 
 #define	ARP_LOG(pri, ...)	do {					\
 	if (ppsratecheck(&arp_lastlog, &arp_curpps, arp_maxpps))	\
 		log((pri), "arp: " __VA_ARGS__);			\
 } while (0)
 
 
 static void	arp_init(void);
 static void	arpintr(struct mbuf *);
 static void	arptimer(void *);
 #ifdef INET
 static void	in_arpinput(struct mbuf *);
 #endif
 
 static void arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr,
     struct ifnet *ifp, int bridged, struct llentry *la);
 static void arp_mark_lle_reachable(struct llentry *la);
 static void arp_iflladdr(void *arg __unused, struct ifnet *ifp);
 
 static eventhandler_tag iflladdr_tag;
 
 static const struct netisr_handler arp_nh = {
 	.nh_name = "arp",
 	.nh_handler = arpintr,
 	.nh_proto = NETISR_ARP,
 	.nh_policy = NETISR_POLICY_SOURCE,
 };
 
 /*
  * Timeout routine.  Age arp_tab entries periodically.
  */
 static void
 arptimer(void *arg)
 {
 	struct llentry *lle = (struct llentry *)arg;
 	struct ifnet *ifp;
 	int r_skip_req;
 
 	if (lle->la_flags & LLE_STATIC) {
 		return;
 	}
 	LLE_WLOCK(lle);
 	if (callout_pending(&lle->lle_timer)) {
 		/*
 		 * Here we are a bit odd here in the treatment of 
 		 * active/pending. If the pending bit is set, it got
 		 * rescheduled before I ran. The active
 		 * bit we ignore, since if it was stopped
 		 * in ll_tablefree() and was currently running
-		 * it would have return 0 so the code would
+		 * it would have returned 0 so the code would
 		 * not have deleted it since the callout could
 		 * not be stopped so we want to go through
 		 * with the delete here now. If the callout
 		 * was restarted, the pending bit will be back on and
 		 * we just want to bail since the callout_reset would
 		 * return 1 and our reference would have been removed
 		 * by arpresolve() below.
 		 */
 		LLE_WUNLOCK(lle);
  		return;
  	}
 	ifp = lle->lle_tbl->llt_ifp;
 	CURVNET_SET(ifp->if_vnet);
 
 	switch (lle->ln_state) {
 	case ARP_LLINFO_REACHABLE:
 
 		/*
 		 * Expiration time is approaching.
 		 * Let's try to refresh entry if it is still
 		 * in use.
 		 *
 		 * Set r_skip_req to get feedback from
 		 * fast path. Change state and re-schedule
 		 * ourselves.
 		 */
 		LLE_REQ_LOCK(lle);
 		lle->r_skip_req = 1;
 		LLE_REQ_UNLOCK(lle);
 		lle->ln_state = ARP_LLINFO_VERIFY;
 		callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
 		LLE_WUNLOCK(lle);
 		CURVNET_RESTORE();
 		return;
 	case ARP_LLINFO_VERIFY:
 		LLE_REQ_LOCK(lle);
 		r_skip_req = lle->r_skip_req;
 		LLE_REQ_UNLOCK(lle);
 
 		if (r_skip_req == 0 && lle->la_preempt > 0) {
 			/* Entry was used, issue refresh request */
 			struct in_addr dst;
 			dst = lle->r_l3addr.addr4;
 			lle->la_preempt--;
 			callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
 			LLE_WUNLOCK(lle);
 			arprequest(ifp, NULL, &dst, NULL);
 			CURVNET_RESTORE();
 			return;
 		}
 		/* Nothing happened. Reschedule if not too late */
 		if (lle->la_expire > time_uptime) {
 			callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
 			LLE_WUNLOCK(lle);
 			CURVNET_RESTORE();
 			return;
 		}
 		break;
 	case ARP_LLINFO_INCOMPLETE:
 	case ARP_LLINFO_DELETED:
 		break;
 	}
 
 	if ((lle->la_flags & LLE_DELETED) == 0) {
 		int evt;
 
 		if (lle->la_flags & LLE_VALID)
 			evt = LLENTRY_EXPIRED;
 		else
 			evt = LLENTRY_TIMEDOUT;
 		EVENTHANDLER_INVOKE(lle_event, lle, evt);
 	}
 
 	callout_stop(&lle->lle_timer);
 
 	/* XXX: LOR avoidance. We still have ref on lle. */
 	LLE_WUNLOCK(lle);
 	IF_AFDATA_LOCK(ifp);
 	LLE_WLOCK(lle);
 
 	/* Guard against race with other llentry_free(). */
 	if (lle->la_flags & LLE_LINKED) {
 		LLE_REMREF(lle);
 		lltable_unlink_entry(lle->lle_tbl, lle);
 	}
 	IF_AFDATA_UNLOCK(ifp);
 
 	size_t pkts_dropped = llentry_free(lle);
 
 	ARPSTAT_ADD(dropped, pkts_dropped);
 	ARPSTAT_INC(timeouts);
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Broadcast an ARP request. Caller specifies:
  *	- arp header source ip address
  *	- arp header target ip address
  *	- arp header source ethernet address
  */
 void
 arprequest(struct ifnet *ifp, const struct in_addr *sip,
     const struct in_addr *tip, u_char *enaddr)
 {
 	struct mbuf *m;
 	struct arphdr *ah;
 	struct sockaddr sa;
 	u_char *carpaddr = NULL;
 
 	if (sip == NULL) {
 		/*
 		 * The caller did not supply a source address, try to find
 		 * a compatible one among those assigned to this interface.
 		 */
 		struct ifaddr *ifa;
 
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 
 			if (ifa->ifa_carp) {
 				if ((*carp_iamatch_p)(ifa, &carpaddr) == 0)
 					continue;
 				sip = &IA_SIN(ifa)->sin_addr;
 			} else {
 				carpaddr = NULL;
 				sip = &IA_SIN(ifa)->sin_addr;
 			}
 
 			if (0 == ((sip->s_addr ^ tip->s_addr) &
 			    IA_MASKSIN(ifa)->sin_addr.s_addr))
 				break;  /* found it. */
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		if (sip == NULL) {
 			printf("%s: cannot find matching address\n", __func__);
 			return;
 		}
 	}
 	if (enaddr == NULL)
 		enaddr = carpaddr ? carpaddr : (u_char *)IF_LLADDR(ifp);
 
 	if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
 		return;
 	m->m_len = sizeof(*ah) + 2 * sizeof(struct in_addr) +
 		2 * ifp->if_addrlen;
 	m->m_pkthdr.len = m->m_len;
 	M_ALIGN(m, m->m_len);
 	ah = mtod(m, struct arphdr *);
 	bzero((caddr_t)ah, m->m_len);
 #ifdef MAC
 	mac_netinet_arp_send(ifp, m);
 #endif
 	ah->ar_pro = htons(ETHERTYPE_IP);
 	ah->ar_hln = ifp->if_addrlen;		/* hardware address length */
 	ah->ar_pln = sizeof(struct in_addr);	/* protocol address length */
 	ah->ar_op = htons(ARPOP_REQUEST);
 	bcopy(enaddr, ar_sha(ah), ah->ar_hln);
 	bcopy(sip, ar_spa(ah), ah->ar_pln);
 	bcopy(tip, ar_tpa(ah), ah->ar_pln);
 	sa.sa_family = AF_ARP;
 	sa.sa_len = 2;
 	m->m_flags |= M_BCAST;
 	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 	(*ifp->if_output)(ifp, m, &sa, NULL);
 	ARPSTAT_INC(txrequests);
 }
 
 /*
  * Resolve an IP address into an ethernet address - heavy version.
  * Used internally by arpresolve().
- * We have already checked than  we can't use existing lle without
+ * We have already checked that we can't use an existing lle without
  * modification so we have to acquire LLE_EXCLUSIVE lle lock.
  *
  * On success, desten and flags are filled in and the function returns 0;
  * If the packet must be held pending resolution, we return EWOULDBLOCK
  * On other errors, we return the corresponding error code.
  * Note that m_freem() handles NULL.
  */
 static int
 arpresolve_full(struct ifnet *ifp, int is_gw, int create, struct mbuf *m,
 	const struct sockaddr *dst, u_char *desten, uint32_t *pflags)
 {
 	struct llentry *la = NULL, *la_tmp;
 	struct mbuf *curr = NULL;
 	struct mbuf *next = NULL;
 	int error, renew;
 
 	if (pflags != NULL)
 		*pflags = 0;
 
 	if (create == 0) {
 		IF_AFDATA_RLOCK(ifp);
 		la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
 		IF_AFDATA_RUNLOCK(ifp);
 	}
 	if (la == NULL && (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) {
 		la = lltable_alloc_entry(LLTABLE(ifp), 0, dst);
 		if (la == NULL) {
 			log(LOG_DEBUG,
 			    "arpresolve: can't allocate llinfo for %s on %s\n",
 			    inet_ntoa(SIN(dst)->sin_addr), if_name(ifp));
 			m_freem(m);
 			return (EINVAL);
 		}
 
 		IF_AFDATA_WLOCK(ifp);
 		LLE_WLOCK(la);
 		la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
 		/* Prefer ANY existing lle over newly-created one */
 		if (la_tmp == NULL)
 			lltable_link_entry(LLTABLE(ifp), la);
 		IF_AFDATA_WUNLOCK(ifp);
 		if (la_tmp != NULL) {
 			lltable_free_entry(LLTABLE(ifp), la);
 			la = la_tmp;
 		}
 	}
 	if (la == NULL) {
 		m_freem(m);
 		return (EINVAL);
 	}
 
 	if ((la->la_flags & LLE_VALID) &&
 	    ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) {
 		bcopy(&la->ll_addr, desten, ifp->if_addrlen);
 
 		/* Check if we have feedback request from arptimer() */
 		if (la->r_skip_req != 0) {
 			LLE_REQ_LOCK(la);
 			la->r_skip_req = 0; /* Notify that entry was used */
 			LLE_REQ_UNLOCK(la);
 		}
 		if (pflags != NULL)
 			*pflags = la->la_flags & (LLE_VALID|LLE_IFADDR);
 		LLE_WUNLOCK(la);
 		return (0);
 	}
 
 	renew = (la->la_asked == 0 || la->la_expire != time_uptime);
 	/*
 	 * There is an arptab entry, but no ethernet address
 	 * response yet.  Add the mbuf to the list, dropping
 	 * the oldest packet if we have exceeded the system
 	 * setting.
 	 */
 	if (m != NULL) {
 		if (la->la_numheld >= V_arp_maxhold) {
 			if (la->la_hold != NULL) {
 				next = la->la_hold->m_nextpkt;
 				m_freem(la->la_hold);
 				la->la_hold = next;
 				la->la_numheld--;
 				ARPSTAT_INC(dropped);
 			}
 		}
 		if (la->la_hold != NULL) {
 			curr = la->la_hold;
 			while (curr->m_nextpkt != NULL)
 				curr = curr->m_nextpkt;
 			curr->m_nextpkt = m;
 		} else
 			la->la_hold = m;
 		la->la_numheld++;
 	}
 	/*
 	 * Return EWOULDBLOCK if we have tried less than arp_maxtries. It
 	 * will be masked by ether_output(). Return EHOSTDOWN/EHOSTUNREACH
 	 * if we have already sent arp_maxtries ARP requests. Retransmit the
 	 * ARP request, but not faster than one request per second.
 	 */
 	if (la->la_asked < V_arp_maxtries)
 		error = EWOULDBLOCK;	/* First request. */
 	else
 		error = is_gw != 0 ? EHOSTUNREACH : EHOSTDOWN;
 
 	if (renew) {
 		int canceled;
 
 		LLE_ADDREF(la);
 		la->la_expire = time_uptime;
 		canceled = callout_reset(&la->lle_timer, hz * V_arpt_down,
 		    arptimer, la);
 		if (canceled)
 			LLE_REMREF(la);
 		la->la_asked++;
 		LLE_WUNLOCK(la);
 		arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL);
 		return (error);
 	}
 
 	LLE_WUNLOCK(la);
 	return (error);
 }
 
 /*
  * Resolve an IP address into an ethernet address.
  * On input:
  *    ifp is the interface we use
  *    is_gw != 0 if @dst represents gateway to some destination
  *    m is the mbuf. May be NULL if we don't have a packet.
  *    dst is the next hop,
  *    desten is the storage to put LL address.
  *    flags returns subset of lle flags: LLE_VALID | LLE_IFADDR
  *
  * On success, desten and flags are filled in and the function returns 0;
  * If the packet must be held pending resolution, we return EWOULDBLOCK
  * On other errors, we return the corresponding error code.
  * Note that m_freem() handles NULL.
  */
 int
 arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m,
 	const struct sockaddr *dst, u_char *desten, uint32_t *pflags)
 {
 	struct llentry *la = 0;
 
 	if (pflags != NULL)
 		*pflags = 0;
 
 	if (m != NULL) {
 		if (m->m_flags & M_BCAST) {
 			/* broadcast */
 			(void)memcpy(desten,
 			    ifp->if_broadcastaddr, ifp->if_addrlen);
 			return (0);
 		}
 		if (m->m_flags & M_MCAST) {
 			/* multicast */
 			ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten);
 			return (0);
 		}
 	}
 
 	IF_AFDATA_RLOCK(ifp);
 	la = lla_lookup(LLTABLE(ifp), LLE_UNLOCKED, dst);
 	if (la != NULL && (la->r_flags & RLLE_VALID) != 0) {
 		/* Entry found, let's copy lle info */
 		bcopy(&la->ll_addr, desten, ifp->if_addrlen);
 		if (pflags != NULL)
 			*pflags = LLE_VALID | (la->r_flags & RLLE_IFADDR);
 		/* Check if we have feedback request from arptimer() */
 		if (la->r_skip_req != 0) {
 			LLE_REQ_LOCK(la);
 			la->r_skip_req = 0; /* Notify that entry was used */
 			LLE_REQ_UNLOCK(la);
 		}
 		IF_AFDATA_RUNLOCK(ifp);
 		return (0);
 	}
 	IF_AFDATA_RUNLOCK(ifp);
 
 	return (arpresolve_full(ifp, is_gw, 1, m, dst, desten, pflags));
 }
 
 /*
  * Common length and type checks are done here,
  * then the protocol-specific routine is called.
  */
 static void
 arpintr(struct mbuf *m)
 {
 	struct arphdr *ar;
 	struct ifnet *ifp;
 	char *layer;
 	int hlen;
 
 	ifp = m->m_pkthdr.rcvif;
 
 	if (m->m_len < sizeof(struct arphdr) &&
 	    ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) {
 		ARP_LOG(LOG_NOTICE, "packet with short header received on %s\n",
 		    if_name(ifp));
 		return;
 	}
 	ar = mtod(m, struct arphdr *);
 
 	/* Check if length is sufficient */
 	if (m->m_len <  arphdr_len(ar)) {
 		m = m_pullup(m, arphdr_len(ar));
 		if (m == NULL) {
 			ARP_LOG(LOG_NOTICE, "short packet received on %s\n",
 			    if_name(ifp));
 			return;
 		}
 		ar = mtod(m, struct arphdr *);
 	}
 
 	hlen = 0;
 	layer = "";
 	switch (ntohs(ar->ar_hrd)) {
 	case ARPHRD_ETHER:
 		hlen = ETHER_ADDR_LEN; /* RFC 826 */
 		layer = "ethernet";
 		break;
 	case ARPHRD_IEEE802:
 		hlen = 6; /* RFC 1390, FDDI_ADDR_LEN */
 		layer = "fddi";
 		break;
 	case ARPHRD_ARCNET:
 		hlen = 1; /* RFC 1201, ARC_ADDR_LEN */
 		layer = "arcnet";
 		break;
 	case ARPHRD_INFINIBAND:
 		hlen = 20;	/* RFC 4391, INFINIBAND_ALEN */ 
 		layer = "infiniband";
 		break;
 	case ARPHRD_IEEE1394:
 		hlen = 0; /* SHALL be 16 */ /* RFC 2734 */
 		layer = "firewire";
 
 		/*
-		 * Restrict too long harware addresses.
+		 * Restrict too long hardware addresses.
 		 * Currently we are capable of handling 20-byte
 		 * addresses ( sizeof(lle->ll_addr) )
 		 */
 		if (ar->ar_hln >= 20)
 			hlen = 16;
 		break;
 	default:
 		ARP_LOG(LOG_NOTICE,
 		    "packet with unknown harware format 0x%02d received on %s\n",
 		    ntohs(ar->ar_hrd), if_name(ifp));
 		m_freem(m);
 		return;
 	}
 
 	if (hlen != 0 && hlen != ar->ar_hln) {
 		ARP_LOG(LOG_NOTICE,
 		    "packet with invalid %s address length %d received on %s\n",
 		    layer, ar->ar_hln, if_name(ifp));
 		m_freem(m);
 		return;
 	}
 
 	ARPSTAT_INC(received);
 	switch (ntohs(ar->ar_pro)) {
 #ifdef INET
 	case ETHERTYPE_IP:
 		in_arpinput(m);
 		return;
 #endif
 	}
 	m_freem(m);
 }
 
 #ifdef INET
 /*
  * ARP for Internet protocols on 10 Mb/s Ethernet.
  * Algorithm is that given in RFC 826.
  * In addition, a sanity check is performed on the sender
  * protocol address, to catch impersonators.
  * We no longer handle negotiations for use of trailer protocol:
  * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent
  * along with IP replies if we wanted trailers sent to us,
  * and also sent them in response to IP replies.
  * This allowed either end to announce the desire to receive
  * trailer packets.
  * We no longer reply to requests for ETHERTYPE_TRAIL protocol either,
  * but formerly didn't normally send requests.
  */
 static int log_arp_wrong_iface = 1;
 static int log_arp_movements = 1;
 static int log_arp_permanent_modify = 1;
 static int allow_multicast = 0;
 
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW,
 	&log_arp_wrong_iface, 0,
 	"log arp packets arriving on the wrong interface");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW,
 	&log_arp_movements, 0,
 	"log arp replies from MACs different than the one in the cache");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW,
 	&log_arp_permanent_modify, 0,
 	"log arp replies from MACs different than the one in the permanent arp entry");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, allow_multicast, CTLFLAG_RW,
 	&allow_multicast, 0, "accept multicast addresses");
 
 static void
 in_arpinput(struct mbuf *m)
 {
 	struct rm_priotracker in_ifa_tracker;
 	struct arphdr *ah;
 	struct ifnet *ifp = m->m_pkthdr.rcvif;
 	struct llentry *la = NULL, *la_tmp;
-	struct rtentry *rt;
 	struct ifaddr *ifa;
 	struct in_ifaddr *ia;
 	struct sockaddr sa;
 	struct in_addr isaddr, itaddr, myaddr;
 	u_int8_t *enaddr = NULL;
 	int op;
 	int bridged = 0, is_bridge = 0;
 	int carped;
 	struct sockaddr_in sin;
 	struct sockaddr *dst;
+	struct nhop4_basic nh4;
+
 	sin.sin_len = sizeof(struct sockaddr_in);
 	sin.sin_family = AF_INET;
 	sin.sin_addr.s_addr = 0;
 
 	if (ifp->if_bridge)
 		bridged = 1;
 	if (ifp->if_type == IFT_BRIDGE)
 		is_bridge = 1;
 
 	/*
 	 * We already have checked that mbuf contains enough contiguous data
 	 * to hold entire arp message according to the arp header.
 	 */
 	ah = mtod(m, struct arphdr *);
 
 	/*
 	 * ARP is only for IPv4 so we can reject packets with
 	 * a protocol length not equal to an IPv4 address.
 	 */
 	if (ah->ar_pln != sizeof(struct in_addr)) {
 		ARP_LOG(LOG_NOTICE, "requested protocol length != %zu\n",
 		    sizeof(struct in_addr));
 		goto drop;
 	}
 
 	if (allow_multicast == 0 && ETHER_IS_MULTICAST(ar_sha(ah))) {
 		ARP_LOG(LOG_NOTICE, "%*D is multicast\n",
 		    ifp->if_addrlen, (u_char *)ar_sha(ah), ":");
 		goto drop;
 	}
 
 	op = ntohs(ah->ar_op);
 	(void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr));
 	(void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr));
 
 	if (op == ARPOP_REPLY)
 		ARPSTAT_INC(rxreplies);
 
 	/*
 	 * For a bridge, we want to check the address irrespective
 	 * of the receive interface. (This will change slightly
 	 * when we have clusters of interfaces).
 	 */
 	IN_IFADDR_RLOCK(&in_ifa_tracker);
 	LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
 		if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
 		    ia->ia_ifp == ifp) &&
 		    itaddr.s_addr == ia->ia_addr.sin_addr.s_addr &&
 		    (ia->ia_ifa.ifa_carp == NULL ||
 		    (*carp_iamatch_p)(&ia->ia_ifa, &enaddr))) {
 			ifa_ref(&ia->ia_ifa);
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 			goto match;
 		}
 	}
 	LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash)
 		if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
 		    ia->ia_ifp == ifp) &&
 		    isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
 			ifa_ref(&ia->ia_ifa);
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 			goto match;
 		}
 
 #define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia)				\
   (ia->ia_ifp->if_bridge == ifp->if_softc &&				\
   !bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) &&	\
   addr == ia->ia_addr.sin_addr.s_addr)
 	/*
 	 * Check the case when bridge shares its MAC address with
 	 * some of its children, so packets are claimed by bridge
 	 * itself (bridge_input() does it first), but they are really
 	 * meant to be destined to the bridge member.
 	 */
 	if (is_bridge) {
 		LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
 			if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) {
 				ifa_ref(&ia->ia_ifa);
 				ifp = ia->ia_ifp;
 				IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 				goto match;
 			}
 		}
 	}
 #undef BDG_MEMBER_MATCHES_ARP
 	IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 
 	/*
 	 * No match, use the first inet address on the receive interface
 	 * as a dummy address for the rest of the function.
 	 */
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 		if (ifa->ifa_addr->sa_family == AF_INET &&
 		    (ifa->ifa_carp == NULL ||
 		    (*carp_iamatch_p)(ifa, &enaddr))) {
 			ia = ifatoia(ifa);
 			ifa_ref(ifa);
 			IF_ADDR_RUNLOCK(ifp);
 			goto match;
 		}
 	IF_ADDR_RUNLOCK(ifp);
 
 	/*
 	 * If bridging, fall back to using any inet address.
 	 */
 	IN_IFADDR_RLOCK(&in_ifa_tracker);
 	if (!bridged || (ia = TAILQ_FIRST(&V_in_ifaddrhead)) == NULL) {
 		IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 		goto drop;
 	}
 	ifa_ref(&ia->ia_ifa);
 	IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 match:
 	if (!enaddr)
 		enaddr = (u_int8_t *)IF_LLADDR(ifp);
 	carped = (ia->ia_ifa.ifa_carp != NULL);
 	myaddr = ia->ia_addr.sin_addr;
 	ifa_free(&ia->ia_ifa);
 	if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen))
 		goto drop;	/* it's from me, ignore it. */
 	if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) {
 		ARP_LOG(LOG_NOTICE, "link address is broadcast for IP address "
 		    "%s!\n", inet_ntoa(isaddr));
 		goto drop;
 	}
 
 	if (ifp->if_addrlen != ah->ar_hln) {
 		ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, "
 		    "i/f %d (ignored)\n", ifp->if_addrlen,
 		    (u_char *) ar_sha(ah), ":", ah->ar_hln,
 		    ifp->if_addrlen);
 		goto drop;
 	}
 
 	/*
 	 * Warn if another host is using the same IP address, but only if the
 	 * IP address isn't 0.0.0.0, which is used for DHCP only, in which
 	 * case we suppress the warning to avoid false positive complaints of
 	 * potential misconfiguration.
 	 */
 	if (!bridged && !carped && isaddr.s_addr == myaddr.s_addr &&
 	    myaddr.s_addr != 0) {
 		ARP_LOG(LOG_ERR, "%*D is using my IP address %s on %s!\n",
 		   ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
 		   inet_ntoa(isaddr), ifp->if_xname);
 		itaddr = myaddr;
 		ARPSTAT_INC(dupips);
 		goto reply;
 	}
 	if (ifp->if_flags & IFF_STATICARP)
 		goto reply;
 
 	bzero(&sin, sizeof(sin));
 	sin.sin_len = sizeof(struct sockaddr_in);
 	sin.sin_family = AF_INET;
 	sin.sin_addr = isaddr;
 	dst = (struct sockaddr *)&sin;
 	IF_AFDATA_RLOCK(ifp);
 	la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
 	IF_AFDATA_RUNLOCK(ifp);
 	if (la != NULL)
 		arp_check_update_lle(ah, isaddr, ifp, bridged, la);
 	else if (itaddr.s_addr == myaddr.s_addr) {
 		/*
 		 * Reply to our address, but no lle exists yet.
 		 * do we really have to create an entry?
 		 */
 		la = lltable_alloc_entry(LLTABLE(ifp), 0, dst);
 		if (la == NULL)
 			goto drop;
 		lltable_set_entry_addr(ifp, la, ar_sha(ah));
 
 		IF_AFDATA_WLOCK(ifp);
 		LLE_WLOCK(la);
 		la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
 
 		/*
-		 * Check if lle still does not exists.
+		 * Check if lle still does not exist.
 		 * If it does, that means that we either
 		 * 1) have configured it explicitly, via
 		 * 1a) 'arp -s' static entry or
 		 * 1b) interface address static record
 		 * or
 		 * 2) it was the result of sending first packet to-host
 		 * or
 		 * 3) it was another arp reply packet we handled in
 		 * different thread.
 		 *
 		 * In all cases except 3) we definitely need to prefer
 		 * existing lle. For the sake of simplicity, prefer any
-		 * existing lle over newly-create one.
+		 * existing lle over newly-created one.
 		 */
 		if (la_tmp == NULL)
 			lltable_link_entry(LLTABLE(ifp), la);
 		IF_AFDATA_WUNLOCK(ifp);
 
 		if (la_tmp == NULL) {
 			arp_mark_lle_reachable(la);
 			LLE_WUNLOCK(la);
 		} else {
-			/* Free newly-create entry and handle packet */
+			/* Free newly-created entry and handle packet */
 			lltable_free_entry(LLTABLE(ifp), la);
 			la = la_tmp;
 			la_tmp = NULL;
 			arp_check_update_lle(ah, isaddr, ifp, bridged, la);
 			/* arp_check_update_lle() returns @la unlocked */
 		}
 		la = NULL;
 	}
 reply:
 	if (op != ARPOP_REQUEST)
 		goto drop;
 	ARPSTAT_INC(rxrequests);
 
 	if (itaddr.s_addr == myaddr.s_addr) {
 		/* Shortcut.. the receiving interface is the target. */
 		(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
 		(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
 	} else {
 		struct llentry *lle = NULL;
 
 		sin.sin_addr = itaddr;
 		IF_AFDATA_RLOCK(ifp);
 		lle = lla_lookup(LLTABLE(ifp), 0, (struct sockaddr *)&sin);
 		IF_AFDATA_RUNLOCK(ifp);
 
 		if ((lle != NULL) && (lle->la_flags & LLE_PUB)) {
 			(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
 			(void)memcpy(ar_sha(ah), &lle->ll_addr, ah->ar_hln);
 			LLE_RUNLOCK(lle);
 		} else {
 
 			if (lle != NULL)
 				LLE_RUNLOCK(lle);
 
 			if (!V_arp_proxyall)
 				goto drop;
 
-			sin.sin_addr = itaddr;
 			/* XXX MRT use table 0 for arp reply  */
-			rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0);
-			if (!rt)
+			if (fib4_lookup_nh_basic(0, itaddr, 0, 0, &nh4) != 0)
 				goto drop;
 
 			/*
 			 * Don't send proxies for nodes on the same interface
 			 * as this one came out of, or we'll get into a fight
 			 * over who claims what Ether address.
 			 */
-			if (!rt->rt_ifp || rt->rt_ifp == ifp) {
-				RTFREE_LOCKED(rt);
+			if (nh4.nh_ifp == ifp)
 				goto drop;
-			}
-			RTFREE_LOCKED(rt);
 
 			(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
 			(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
 
 			/*
 			 * Also check that the node which sent the ARP packet
 			 * is on the interface we expect it to be on. This
 			 * avoids ARP chaos if an interface is connected to the
 			 * wrong network.
 			 */
-			sin.sin_addr = isaddr;
 
 			/* XXX MRT use table 0 for arp checks */
-			rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0);
-			if (!rt)
+			if (fib4_lookup_nh_basic(0, isaddr, 0, 0, &nh4) != 0)
 				goto drop;
-			if (rt->rt_ifp != ifp) {
+			if (nh4.nh_ifp != ifp) {
 				ARP_LOG(LOG_INFO, "proxy: ignoring request"
-				    " from %s via %s, expecting %s\n",
-				    inet_ntoa(isaddr), ifp->if_xname,
-				    rt->rt_ifp->if_xname);
-				RTFREE_LOCKED(rt);
+				    " from %s via %s\n",
+				    inet_ntoa(isaddr), ifp->if_xname);
 				goto drop;
 			}
-			RTFREE_LOCKED(rt);
 
 #ifdef DEBUG_PROXY
 			printf("arp: proxying for %s\n", inet_ntoa(itaddr));
 #endif
 		}
 	}
 
 	if (itaddr.s_addr == myaddr.s_addr &&
 	    IN_LINKLOCAL(ntohl(itaddr.s_addr))) {
 		/* RFC 3927 link-local IPv4; always reply by broadcast. */
 #ifdef DEBUG_LINKLOCAL
 		printf("arp: sending reply for link-local addr %s\n",
 		    inet_ntoa(itaddr));
 #endif
 		m->m_flags |= M_BCAST;
 		m->m_flags &= ~M_MCAST;
 	} else {
 		/* default behaviour; never reply by broadcast. */
 		m->m_flags &= ~(M_BCAST|M_MCAST);
 	}
 	(void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln);
 	(void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln);
 	ah->ar_op = htons(ARPOP_REPLY);
 	ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */
 	m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln);
 	m->m_pkthdr.len = m->m_len;
 	m->m_pkthdr.rcvif = NULL;
 	sa.sa_family = AF_ARP;
 	sa.sa_len = 2;
 	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 	(*ifp->if_output)(ifp, m, &sa, NULL);
 	ARPSTAT_INC(txreplies);
 	return;
 
 drop:
 	m_freem(m);
 }
 #endif
 
 /*
  * Checks received arp data against existing @la.
  * Updates lle state/performs notification if necessary.
  */
 static void
 arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp,
     int bridged, struct llentry *la)
 {
 	struct sockaddr sa;
 	struct mbuf *m_hold, *m_hold_next;
 
 	LLE_WLOCK_ASSERT(la);
 
 	/* the following is not an error when doing bridging */
 	if (!bridged && la->lle_tbl->llt_ifp != ifp) {
 		if (log_arp_wrong_iface)
 			ARP_LOG(LOG_WARNING, "%s is on %s "
 			    "but got reply from %*D on %s\n",
 			    inet_ntoa(isaddr),
 			    la->lle_tbl->llt_ifp->if_xname,
 			    ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
 			    ifp->if_xname);
 		LLE_WUNLOCK(la);
 		return;
 	}
 	if ((la->la_flags & LLE_VALID) &&
 	    bcmp(ar_sha(ah), &la->ll_addr, ifp->if_addrlen)) {
 		if (la->la_flags & LLE_STATIC) {
 			LLE_WUNLOCK(la);
 			if (log_arp_permanent_modify)
 				ARP_LOG(LOG_ERR,
 				    "%*D attempts to modify "
 				    "permanent entry for %s on %s\n",
 				    ifp->if_addrlen,
 				    (u_char *)ar_sha(ah), ":",
 				    inet_ntoa(isaddr), ifp->if_xname);
 			return;
 		}
 		if (log_arp_movements) {
 			ARP_LOG(LOG_INFO, "%s moved from %*D "
 			    "to %*D on %s\n",
 			    inet_ntoa(isaddr),
 			    ifp->if_addrlen,
 			    (u_char *)&la->ll_addr, ":",
 			    ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
 			    ifp->if_xname);
 		}
 	}
 
 	/* Check if something has changed */
 	if (memcmp(&la->ll_addr, ar_sha(ah), ifp->if_addrlen) != 0 ||
 	    (la->la_flags & LLE_VALID) == 0) {
 		/* Perform real LLE update */
 		/* use afdata WLOCK to update fields */
 		LLE_ADDREF(la);
 		LLE_WUNLOCK(la);
 		IF_AFDATA_WLOCK(ifp);
 		LLE_WLOCK(la);
 
 		/*
-		 * Since we droppped LLE lock, other thread might have deleted
+		 * Since we dropped LLE lock, other thread might have deleted
 		 * this lle. Check and return
 		 */
 		if ((la->la_flags & LLE_DELETED) != 0) {
 			IF_AFDATA_WUNLOCK(ifp);
 			LLE_FREE_LOCKED(la);
 			return;
 		}
 
 		/* Update data */
 		lltable_set_entry_addr(ifp, la, ar_sha(ah));
 
 		IF_AFDATA_WUNLOCK(ifp);
 		LLE_REMREF(la);
 
 		/* Clear fast path feedback request if set */
 		la->r_skip_req = 0;
 	}
 
 	arp_mark_lle_reachable(la);
 
 	/*
 	 * The packets are all freed within the call to the output
 	 * routine.
 	 *
 	 * NB: The lock MUST be released before the call to the
 	 * output routine.
 	 */
 	if (la->la_hold != NULL) {
 		m_hold = la->la_hold;
 		la->la_hold = NULL;
 		la->la_numheld = 0;
 		lltable_fill_sa_entry(la, &sa);
 		LLE_WUNLOCK(la);
 		for (; m_hold != NULL; m_hold = m_hold_next) {
 			m_hold_next = m_hold->m_nextpkt;
 			m_hold->m_nextpkt = NULL;
 			/* Avoid confusing lower layers. */
 			m_clrprotoflags(m_hold);
 			(*ifp->if_output)(ifp, m_hold, &sa, NULL);
 		}
 	} else
 		LLE_WUNLOCK(la);
 }
 
 /*
  * Put @la into the REACHABLE state, notify lle_event listeners and, for
  * non-static entries, (re)arm the entry timer.
  *
  * The caller must hold the write lock on @la.
  */
 static void
 arp_mark_lle_reachable(struct llentry *la)
 {
 	int delay, was_pending;
 
 	LLE_WLOCK_ASSERT(la);
 
 	la->ln_state = ARP_LLINFO_REACHABLE;
 	EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED);
 
 	if ((la->la_flags & LLE_STATIC) == 0) {
 		/* Reference owned by the timer that is about to be armed. */
 		LLE_ADDREF(la);
 		la->la_expire = time_uptime + V_arpt_keep;
 
 		/*
 		 * Fire early enough to leave room for V_arp_maxtries
 		 * retransmits; fall back to the full keep time when that
 		 * would be negative.
 		 */
 		delay = V_arpt_keep - V_arp_maxtries * V_arpt_rexmit;
 		if (delay < 0)
 			delay = V_arpt_keep;
 
 		was_pending = callout_reset(&la->lle_timer,
 		    hz * delay, arptimer, la);
 		/*
 		 * A non-zero result is treated as "an earlier timer was
 		 * replaced", so its reference is given back.
 		 */
 		if (was_pending != 0)
 			LLE_REMREF(la);
 	}
 	la->la_asked = 0;
 	la->la_preempt = V_arp_maxtries;
 }
 
 /*
  * Add permanent link-layer record for given interface address.
  *
  * @ifp is the interface whose link-layer table is updated and @dst is
  * the address the new entry is created for.  Any entry already present
  * for @dst is unlinked and replaced.  Allocation failure is logged and
  * otherwise ignored.
  */
 static __noinline void
 arp_add_ifa_lle(struct ifnet *ifp, const struct sockaddr *dst)
 {
 	struct llentry *lle, *lle_tmp;
 
 	/*
 	 * Interface address LLE record is considered static
 	 * because kernel code relies on LLE_STATIC flag to check
 	 * if these entries can be rewritten by arp updates.
 	 */
 	lle = lltable_alloc_entry(LLTABLE(ifp), LLE_IFADDR | LLE_STATIC, dst);
 	if (lle == NULL) {
 		log(LOG_INFO, "arp_ifinit: cannot create arp "
 		    "entry for interface address\n");
 		return;
 	}
 
 	/* Table changes are made under the afdata write lock. */
 	IF_AFDATA_WLOCK(ifp);
 	LLE_WLOCK(lle);
 	/* Unlink any entry if exists */
 	/* NOTE(review): LLE_EXCLUSIVE presumably returns lle_tmp write-locked - confirm. */
 	lle_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
 	if (lle_tmp != NULL)
 		lltable_unlink_entry(LLTABLE(ifp), lle_tmp);
 
 	lltable_link_entry(LLTABLE(ifp), lle);
 	IF_AFDATA_WUNLOCK(ifp);
 
 	/* Event handlers run after the afdata lock has been released. */
 	if (lle_tmp != NULL)
 		EVENTHANDLER_INVOKE(lle_event, lle_tmp, LLENTRY_EXPIRED);
 
 	EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED);
 	LLE_WUNLOCK(lle);
 	/* The replaced entry is released last, once nothing refers to it here. */
 	if (lle_tmp != NULL)
 		lltable_free_entry(LLTABLE(ifp), lle_tmp);
 }
 
 void
 arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa)
 {
 	const struct sockaddr_in *dst_in;
 	const struct sockaddr *dst;
 
 	if (ifa->ifa_carp != NULL)
 		return;
 
 	dst = ifa->ifa_addr;
 	dst_in = (const struct sockaddr_in *)dst;
 
 	if (ntohl(dst_in->sin_addr.s_addr) == INADDR_ANY)
 		return;
 	arp_announce_ifaddr(ifp, dst_in->sin_addr, IF_LLADDR(ifp));
 
 	arp_add_ifa_lle(ifp, dst);
 }
 
 /*
  * Send an ARP request on @ifp in which @addr is both the source and the
  * target address, using @enaddr as the hardware address.  INADDR_ANY is
  * silently ignored.
  */
 void
 arp_announce_ifaddr(struct ifnet *ifp, struct in_addr addr, u_char *enaddr)
 {
 
 	if (ntohl(addr.s_addr) == INADDR_ANY)
 		return;
 
 	arprequest(ifp, &addr, &addr, enaddr);
 }
 
 /*
  * Re-run arp_ifinit() for every AF_INET address configured on @ifp so
  * that other nodes learn about the changed link-layer address via
  * gratuitous ARP.
  */
 static __noinline void
 arp_handle_ifllchange(struct ifnet *ifp)
 {
 	struct ifaddr *cur;
 
 	TAILQ_FOREACH(cur, &ifp->if_addrhead, ifa_link) {
 		/* Only IPv4 addresses are of interest to ARP. */
 		if (cur->ifa_addr->sa_family != AF_INET)
 			continue;
 		arp_ifinit(ifp, cur);
 	}
 }
 
 /*
  * Event handler invoked when an interface's link-layer address changes.
  * Interfaces that are not IFF_UP are ignored; @arg is unused.
  */
 static __noinline void
 arp_iflladdr(void *arg __unused, struct ifnet *ifp)
 {
 
 	if ((ifp->if_flags & IFF_UP) == 0)
 		return;
 
 	arp_handle_ifllchange(ifp);
 }
 
 /*
  * ARP start-up hook, run through the SYSINIT below.
  *
  * Registers the ARP netisr handler (arp_nh) and, only when running in
  * the default vnet, hooks arp_iflladdr() onto iflladdr_event.
  */
 static void
 arp_init(void)
 {
 
 	netisr_register(&arp_nh);
 	/* The event handler is registered once, not once per vnet. */
 	if (IS_DEFAULT_VNET(curvnet))
 		iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event,
 		    arp_iflladdr, NULL, EVENTHANDLER_PRI_ANY);
 }
 SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0);
Index: head/sys/netinet/in_mcast.c
===================================================================
--- head/sys/netinet/in_mcast.c	(revision 292014)
+++ head/sys/netinet/in_mcast.c	(revision 292015)
@@ -1,3012 +1,3009 @@
 /*-
  * Copyright (c) 2007-2009 Bruce Simpson.
  * Copyright (c) 2005 Robert N. M. Watson.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * IPv4 multicast socket, group, and socket option processing module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/sysctl.h>
 #include <sys/ktr.h>
 #include <sys/taskqueue.h>
 #include <sys/tree.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
+#include <netinet/in_fib.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/igmp_var.h>
 
 /* Trace class for this file's CTR*() calls; defaults to KTR_INET. */
 #ifndef KTR_IGMPV3
 #define KTR_IGMPV3 KTR_INET
 #endif
 
 /*
  * Overlay of the socket address layouts handled in this file, so a single
  * object can be viewed as generic storage, a generic sockaddr, a
  * link-layer address or an IPv4 address.  Guarded so that a definition
  * made elsewhere is not repeated.
  */
 #ifndef __SOCKUNION_DECLARED
 union sockunion {
 	struct sockaddr_storage	ss;
 	struct sockaddr		sa;
 	struct sockaddr_dl	sdl;
 	struct sockaddr_in	sin;
 };
 typedef union sockunion sockunion_t;
 #define __SOCKUNION_DECLARED
 #endif /* __SOCKUNION_DECLARED */
 
 /*
  * Kernel malloc types used by this file:
  *  M_INMFILTER - socket-layer filter arrays and their in_msource nodes
  *  M_IPMADDR   - struct in_multi group records
  *  M_IPMOPTS   - ip_moptions data such as the membership vector
  *  M_IPMSOURCE - IGMP-layer ip_msource nodes
  */
 static MALLOC_DEFINE(M_INMFILTER, "in_mfilter",
     "IPv4 multicast PCB-layer source filter");
 static MALLOC_DEFINE(M_IPMADDR, "in_multi", "IPv4 multicast group");
 static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "IPv4 multicast options");
 static MALLOC_DEFINE(M_IPMSOURCE, "ip_msource",
     "IPv4 multicast IGMP-layer source filter");
 
 /*
  * Locking:
  * - Lock order is: Giant, INP_WLOCK, IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK.
  * - The IF_ADDR_LOCK is implicitly taken by inm_lookup() earlier, however
  *   it can be taken by code in net/if.c also.
  * - ip_moptions and in_mfilter are covered by the INP_WLOCK.
  *
  * struct in_multi is covered by IN_MULTI_LOCK. There isn't strictly
  * any need for in_multi itself to be virtualized -- it is bound to an ifp
  * anyway no matter what happens.
  */
 /*
  * Default (MTX_DEF) mutex, initialised at boot via MTX_SYSINIT.
  * NOTE(review): presumably the mutex behind the IN_MULTI_LOCK macros -
  * confirm against in_var.h.
  */
 struct mtx in_multi_mtx;
 MTX_SYSINIT(in_multi_mtx, &in_multi_mtx, "in_multi_mtx", MTX_DEF);
 
 /*
  * Functions with non-static linkage defined in this file should be
  * declared in in_var.h:
  *  imo_multi_filter()
  *  in_addmulti()
  *  in_delmulti()
  *  in_joingroup()
  *  in_joingroup_locked()
  *  in_leavegroup()
  *  in_leavegroup_locked()
  * and ip_var.h:
  *  inp_freemoptions()
  *  inp_getmoptions()
  *  inp_setmoptions()
  *
  * XXX: Both carp and pf need to use the legacy (*,G) KPIs in_addmulti()
  * and in_delmulti().
  */
 static void	imf_commit(struct in_mfilter *);
 static int	imf_get_source(struct in_mfilter *imf,
 		    const struct sockaddr_in *psin,
 		    struct in_msource **);
 static struct in_msource *
 		imf_graft(struct in_mfilter *, const uint8_t,
 		    const struct sockaddr_in *);
 static void	imf_leave(struct in_mfilter *);
 static int	imf_prune(struct in_mfilter *, const struct sockaddr_in *);
 static void	imf_purge(struct in_mfilter *);
 static void	imf_rollback(struct in_mfilter *);
 static void	imf_reap(struct in_mfilter *);
 static int	imo_grow(struct ip_moptions *);
 static size_t	imo_match_group(const struct ip_moptions *,
 		    const struct ifnet *, const struct sockaddr *);
 static struct in_msource *
 		imo_match_source(const struct ip_moptions *, const size_t,
 		    const struct sockaddr *);
 static void	ims_merge(struct ip_msource *ims,
 		    const struct in_msource *lims, const int rollback);
 static int	in_getmulti(struct ifnet *, const struct in_addr *,
 		    struct in_multi **);
 static int	inm_get_source(struct in_multi *inm, const in_addr_t haddr,
 		    const int noalloc, struct ip_msource **pims);
 #ifdef KTR
 static int	inm_is_ifp_detached(const struct in_multi *);
 #endif
 static int	inm_merge(struct in_multi *, /*const*/ struct in_mfilter *);
 static void	inm_purge(struct in_multi *);
 static void	inm_reap(struct in_multi *);
 static struct ip_moptions *
 		inp_findmoptions(struct inpcb *);
 static void	inp_freemoptions_internal(struct ip_moptions *);
 static void	inp_gcmoptions(void *, int);
 static int	inp_get_source_filters(struct inpcb *, struct sockopt *);
 static int	inp_join_group(struct inpcb *, struct sockopt *);
 static int	inp_leave_group(struct inpcb *, struct sockopt *);
 static struct ifnet *
 		inp_lookup_mcast_ifp(const struct inpcb *,
 		    const struct sockaddr_in *, const struct in_addr);
 static int	inp_block_unblock_source(struct inpcb *, struct sockopt *);
 static int	inp_set_multicast_if(struct inpcb *, struct sockopt *);
 static int	inp_set_source_filters(struct inpcb *, struct sockopt *);
 static int	sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS);
 
 /* Root of the net.inet.ip.mcast sysctl subtree. */
 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW, 0,
     "IPv4 multicast");
 
 /*
  * Upper bound on in_multi source nodes per group; compared against
  * inm_nsrc before a new ip_msource is allocated.  Writable at run time.
  */
 static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER;
 SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc,
     CTLFLAG_RWTUN, &in_mcast_maxgrpsrc, 0,
     "Max source filters per group");
 
 /*
  * Upper bound on socket-layer source nodes per filter; compared against
  * imf_nsrc before a new in_msource is allocated.  Writable at run time.
  */
 static u_long in_mcast_maxsocksrc = IP_MAX_SOCK_SRC_FILTER;
 SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxsocksrc,
     CTLFLAG_RWTUN, &in_mcast_maxsocksrc, 0,
     "Max source filters per socket");
 
 /* Non-static: visible to other files. */
 int in_mcast_loop = IP_DEFAULT_MULTICAST_LOOP;
 SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RWTUN,
     &in_mcast_loop, 0, "Loopback multicast datagrams by default");
 
 /* Read-only node whose contents are produced by sysctl_ip_mcast_filters(). */
 static SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters,
     CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters,
     "Per-interface stack-wide source filters");
 
 /*
  * List of ip_moptions awaiting release and the task (inp_gcmoptions)
  * that processes it.
  */
 static STAILQ_HEAD(, ip_moptions) imo_gc_list =
     STAILQ_HEAD_INITIALIZER(imo_gc_list);
 static struct task imo_gc_task = TASK_INITIALIZER(0, inp_gcmoptions, NULL);
 
 #ifdef KTR
 /*
  * Tell whether the interface behind an in_multi record is gone.
  * The ifnet layer clears the ifma's ifp pointer on detach, so a NULL
  * ifma_ifp means "detached".  While attached, assert that netinet and
  * net agree on which ifp it is.
  *
  * Returns 1 if detached, 0 otherwise.
  */
 static int __inline
 inm_is_ifp_detached(const struct in_multi *inm)
 {
 	struct ifnet *net_ifp;
 
 	KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__));
 
 	net_ifp = inm->inm_ifma->ifma_ifp;
 	if (net_ifp == NULL)
 		return (1);
 
 	/* Still attached: both layers must reference the same ifnet. */
 	KASSERT(inm->inm_ifp == net_ifp, ("%s: bad ifp", __func__));
 	return (0);
 }
 #endif
 
 /*
  * Reset an in_mfilter: zero it, give it an empty source tree and set
  * its filter mode to st0 at t0 and st1 at t1.
  */
 static __inline void
 imf_init(struct in_mfilter *imf, const int st0, const int st1)
 {
 
 	memset(imf, 0, sizeof(*imf));
 	RB_INIT(&imf->imf_sources);
 	imf->imf_st[1] = st1;
 	imf->imf_st[0] = st0;
 }
 
 /*
  * Find the in_multi record for IPv4 group `ina' on `ifp'.
  * ifp must be valid.  Returns NULL when no AF_INET membership on the
  * interface matches.
  * Both the IN_MULTI_LOCK and the IF_ADDR_LOCK on ifp must be held.
  */
 struct in_multi *
 inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina)
 {
 	struct ifmultiaddr *ifma;
 	struct in_multi *cand;
 
 	IN_MULTI_LOCK_ASSERT();
 	IF_ADDR_LOCK_ASSERT(ifp);
 
 	TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) {
 		/* Skip link-layer memberships of other address families. */
 		if (ifma->ifma_addr->sa_family != AF_INET)
 			continue;
 		cand = (struct in_multi *)ifma->ifma_protospec;
 		if (cand->inm_addr.s_addr == ina.s_addr)
 			return (cand);
 	}
 	return (NULL);
 }
 
 /*
  * Convenience wrapper around inm_lookup_locked() that takes the
  * IF_ADDR read lock on ifp for the duration of the lookup.
  * The IN_MULTI_LOCK must already be held.
  */
 struct in_multi *
 inm_lookup(struct ifnet *ifp, const struct in_addr ina)
 {
 	struct in_multi *found;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	IF_ADDR_RLOCK(ifp);
 	found = inm_lookup_locked(ifp, ina);
 	IF_ADDR_RUNLOCK(ifp);
 	return (found);
 }
 
 /*
  * Resize the ip_moptions vectors to the next power-of-two minus 1.
  * May be called with locks held; do not sleep.
  *
  * Both the membership vector and the filter vector are grown.  Returns 0
  * on success, or ETOOMANYREFS if the new size would exceed
  * IP_MAX_MEMBERSHIPS or if either reallocation fails.
  *
  * On failure imo_max_memberships is left unchanged and imo stays fully
  * usable.  A vector whose reallocation did succeed is kept (it is merely
  * larger than required): realloc() may already have released the old
  * buffer, so freeing the new one would leave imo pointing at freed
  * memory.
  */
 static int
 imo_grow(struct ip_moptions *imo)
 {
 	struct in_multi		**nmships;
 	struct in_mfilter	 *nmfilters;
 	size_t			  idx;
 	size_t			  newmax;
 	size_t			  oldmax;
 
 	oldmax = imo->imo_max_memberships;
 	newmax = ((oldmax + 1) * 2) - 1;
 	if (newmax > IP_MAX_MEMBERSHIPS)
 		return (ETOOMANYREFS);
 
 	/*
 	 * Publish each successful reallocation immediately so that imo
 	 * never refers to a buffer that realloc() has given back.
 	 */
 	nmships = (struct in_multi **)realloc(imo->imo_membership,
 	    sizeof(struct in_multi *) * newmax, M_IPMOPTS, M_NOWAIT);
 	if (nmships != NULL)
 		imo->imo_membership = nmships;
 
 	nmfilters = (struct in_mfilter *)realloc(imo->imo_mfilters,
 	    sizeof(struct in_mfilter) * newmax, M_INMFILTER, M_NOWAIT);
 	if (nmfilters != NULL)
 		imo->imo_mfilters = nmfilters;
 
 	if (nmships == NULL || nmfilters == NULL)
 		return (ETOOMANYREFS);
 
 	/* Initialize newly allocated source filter heads. */
 	for (idx = oldmax; idx < newmax; idx++)
 		imf_init(&nmfilters[idx], MCAST_UNDEFINED, MCAST_EXCLUDE);
 
 	/* Only now may callers use the additional slots. */
 	imo->imo_max_memberships = newmax;
 
 	return (0);
 }
 
 /*
  * Search this ip_moptions instance for a membership of `group'.
  * When ifp is non-NULL the membership must also be on that interface.
  * Returns the index into imo_membership, or -1 (as a size_t) when
  * nothing matches.
  */
 static size_t
 imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp,
     const struct sockaddr *group)
 {
 	const struct sockaddr_in *gsin;
 	struct in_multi *inm;
 	int i;
 	int count;
 
 	/* The imo_membership array may be lazy allocated. */
 	if (imo->imo_membership == NULL || imo->imo_num_memberships == 0)
 		return (-1);
 
 	gsin = (const struct sockaddr_in *)group;
 	count = imo->imo_num_memberships;
 
 	for (i = 0; i < count; i++) {
 		inm = imo->imo_membership[i];
 		if (inm == NULL)
 			continue;
 		/* An interface, when given, has to match as well. */
 		if (ifp != NULL && inm->inm_ifp != ifp)
 			continue;
 		if (in_hosteq(inm->inm_addr, gsin->sin_addr))
 			return (i);
 	}
 
 	return (-1);
 }
 
 /*
  * Find an IPv4 multicast source entry for this imo which matches
  * the given group index for this socket, and source address.
  *
  * NOTE: This does not check if the entry is in-mode, merely if
  * it exists, which may not be the desired behaviour.
  */
 static struct in_msource *
 imo_match_source(const struct ip_moptions *imo, const size_t gidx,
     const struct sockaddr *src)
 {
 	struct ip_msource	 find;
 	struct in_mfilter	*imf;
 	struct ip_msource	*ims;
 	const sockunion_t	*psa;
 
 	KASSERT(src->sa_family == AF_INET, ("%s: !AF_INET", __func__));
 	KASSERT(gidx != -1 && gidx < imo->imo_num_memberships,
 	    ("%s: invalid index %d\n", __func__, (int)gidx));
 
 	/* The imo_mfilters array may be lazy allocated. */
 	if (imo->imo_mfilters == NULL)
 		return (NULL);
 	imf = &imo->imo_mfilters[gidx];
 
 	/* Source trees are keyed in host byte order. */
 	psa = (const sockunion_t *)src;
 	find.ims_haddr = ntohl(psa->sin.sin_addr.s_addr);
 	ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find);
 
 	return ((struct in_msource *)ims);
 }
 
 /*
  * Decide whether a multicast datagram for `group' from `src' received
  * on `ifp' may be delivered to the socket owning `imo'.
  *
  * Returns MCAST_PASS when it may, MCAST_NOTGMEMBER when the socket has
  * no membership of the group on ifp, and MCAST_NOTSMEMBER when the
  * source filter rejects the sender.
  */
 int
 imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp,
     const struct sockaddr *group, const struct sockaddr *src)
 {
 	struct in_msource *lims;
 	size_t gidx;
 	int fmode;
 
 	KASSERT(ifp != NULL, ("%s: null ifp", __func__));
 
 	gidx = imo_match_group(imo, ifp, group);
 	if (gidx == -1)
 		return (MCAST_NOTGMEMBER);
 
 	/*
 	 * Exclusive memberships admit a source unless an in-mode exclude
 	 * entry exists for it; inclusive memberships reject a source
 	 * unless an in-mode include entry exists for it.
 	 * NOTE: Group state at IGMP t1 (now) is compared with
 	 * socket-layer t0 (since last downcall).
 	 */
 	fmode = imo->imo_mfilters[gidx].imf_st[1];
 	lims = imo_match_source(imo, gidx, src);
 
 	if (lims == NULL) {
 		if (fmode == MCAST_INCLUDE)
 			return (MCAST_NOTSMEMBER);
 	} else if (lims->imsl_st[0] != fmode) {
 		return (MCAST_NOTSMEMBER);
 	}
 
 	return (MCAST_PASS);
 }
 
 /*
  * Find and return a reference to an in_multi record for (ifp, group),
  * and bump its reference count.
  * If one does not exist, try to allocate it, and update link-layer multicast
  * filters on ifp to listen for group.
  * Assumes the IN_MULTI lock is held across the call.
  * Return 0 if successful, otherwise return an appropriate error code.
  *
  * ifp is the interface to join on, group the IPv4 group address, and
  * *pinm receives the record on success.  Errors are whatever
  * if_addmulti() returns, or ENOMEM if the record cannot be allocated.
  */
 static int
 in_getmulti(struct ifnet *ifp, const struct in_addr *group,
     struct in_multi **pinm)
 {
 	struct sockaddr_in	 gsin;
 	struct ifmultiaddr	*ifma;
 	struct in_ifinfo	*ii;
 	struct in_multi		*inm;
 	int error;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	/* Per-interface IPv4 data; only ii_igmp is used, further down. */
 	ii = (struct in_ifinfo *)ifp->if_afdata[AF_INET];
 
 	inm = inm_lookup(ifp, *group);
 	if (inm != NULL) {
 		/*
 		 * If we already joined this group, just bump the
 		 * refcount and return it.
 		 */
 		KASSERT(inm->inm_refcount >= 1,
 		    ("%s: bad refcount %d", __func__, inm->inm_refcount));
 		++inm->inm_refcount;
 		*pinm = inm;
 		return (0);
 	}
 
 	/* Build the sockaddr form of the group for the link layer. */
 	memset(&gsin, 0, sizeof(gsin));
 	gsin.sin_family = AF_INET;
 	gsin.sin_len = sizeof(struct sockaddr_in);
 	gsin.sin_addr = *group;
 
 	/*
 	 * Check if a link-layer group is already associated
 	 * with this network-layer group on the given ifnet.
 	 */
 	error = if_addmulti(ifp, (struct sockaddr *)&gsin, &ifma);
 	if (error != 0)
 		return (error);
 
 	/* XXX ifma_protospec must be covered by IF_ADDR_LOCK */
 	IF_ADDR_WLOCK(ifp);
 
 	/*
 	 * If something other than netinet is occupying the link-layer
 	 * group, print a meaningful error message and back out of
 	 * the allocation.
 	 * Otherwise, bump the refcount on the existing network-layer
 	 * group association and return it.
 	 */
 	if (ifma->ifma_protospec != NULL) {
 		inm = (struct in_multi *)ifma->ifma_protospec;
 #ifdef INVARIANTS
 		KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr",
 		    __func__));
 		KASSERT(ifma->ifma_addr->sa_family == AF_INET,
 		    ("%s: ifma not AF_INET", __func__));
 		KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__));
 		if (inm->inm_ifma != ifma || inm->inm_ifp != ifp ||
 		    !in_hosteq(inm->inm_addr, *group))
 			panic("%s: ifma %p is inconsistent with %p (%s)",
 			    __func__, ifma, inm, inet_ntoa(*group));
 #endif
 		++inm->inm_refcount;
 		*pinm = inm;
 		IF_ADDR_WUNLOCK(ifp);
 		return (0);
 	}
 
 	IF_ADDR_WLOCK_ASSERT(ifp);
 
 	/*
 	 * A new in_multi record is needed; allocate and initialize it.
 	 * We DO NOT perform an IGMP join as the in_ layer may need to
 	 * push an initial source list down to IGMP to support SSM.
 	 *
 	 * The initial source filter state is INCLUDE, {} as per the RFC.
 	 */
 	/* M_NOWAIT: the IF_ADDR write lock is held here. */
 	inm = malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT | M_ZERO);
 	if (inm == NULL) {
 		/* Back out the link-layer membership obtained above. */
 		IF_ADDR_WUNLOCK(ifp);
 		if_delmulti_ifma(ifma);
 		return (ENOMEM);
 	}
 	inm->inm_addr = *group;
 	inm->inm_ifp = ifp;
 	inm->inm_igi = ii->ii_igmp;
 	inm->inm_ifma = ifma;
 	inm->inm_refcount = 1;
 	inm->inm_state = IGMP_NOT_MEMBER;
 	mbufq_init(&inm->inm_scq, IGMP_MAX_STATE_CHANGES);
 	inm->inm_st[0].iss_fmode = MCAST_UNDEFINED;
 	inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
 	RB_INIT(&inm->inm_srcs);
 
 	/* Tie the link-layer membership to the new record. */
 	ifma->ifma_protospec = inm;
 
 	*pinm = inm;
 
 	IF_ADDR_WUNLOCK(ifp);
 	return (0);
 }
 
 /*
  * Drop a reference to an in_multi record.
  *
  * If the refcount drops to 0, free the in_multi record and
  * delete the underlying link-layer membership.
  *
  * The IN_MULTI lock must be held (asserted below).  Once the last
  * reference has been dropped, inm must not be used by the caller.
  */
 void
 inm_release_locked(struct in_multi *inm)
 {
 	struct ifmultiaddr *ifma;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	CTR2(KTR_IGMPV3, "%s: refcount is %d", __func__, inm->inm_refcount);
 
 	/* Other references remain: nothing more to do. */
 	if (--inm->inm_refcount > 0) {
 		CTR2(KTR_IGMPV3, "%s: refcount is now %d", __func__,
 		    inm->inm_refcount);
 		return;
 	}
 
 	CTR2(KTR_IGMPV3, "%s: freeing inm %p", __func__, inm);
 
 	/* Remember the ifma; inm itself is freed before it is released. */
 	ifma = inm->inm_ifma;
 
 	/* XXX this access is not covered by IF_ADDR_LOCK */
 	CTR2(KTR_IGMPV3, "%s: purging ifma %p", __func__, ifma);
 	KASSERT(ifma->ifma_protospec == inm,
 	    ("%s: ifma_protospec != inm", __func__));
 	/* Break the back-pointer so the ifma no longer refers to inm. */
 	ifma->ifma_protospec = NULL;
 
 	inm_purge(inm);
 
 	free(inm, M_IPMADDR);
 
 	if_delmulti_ifma(ifma);
 }
 
 /*
  * Forget every source that has been marked as recorded for a group,
  * bringing the t1 recorded-source count back to zero.
  * Used by the IGMP code. Caller must hold the IN_MULTI lock.
  * FIXME: Should reap.
  */
 void
 inm_clear_recorded(struct in_multi *inm)
 {
 	struct ip_msource	*src;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	RB_FOREACH(src, ip_msource_tree, &inm->inm_srcs) {
 		if (src->ims_stp == 0)
 			continue;
 		src->ims_stp = 0;
 		--inm->inm_st[1].iss_rec;
 	}
 
 	KASSERT(inm->inm_st[1].iss_rec == 0,
 	    ("%s: iss_rec %d not 0", __func__, inm->inm_st[1].iss_rec));
 }
 
 /*
  * Record a source as pending for a Source-Group IGMPv3 query.
  * This lives here as it modifies the shared tree.
  *
  * inm is the group descriptor.
  * naddr is the address of the source to record in network-byte order.
  *
  * If the source has no node in the group's tree yet, one is allocated
  * (without sleeping) provided the group holds fewer than
  * net.inet.ip.mcast.maxgrpsrc sources.  The limit is compared with >=
  * because the sysctl is writable at run time and may have been lowered
  * below the current count.
  *
  * Historical note: a net.inet.igmp.sgalloc sysctl used to control this
  * lazy allocation.
  * VIMAGE: XXX: Currently the igmp_sgalloc feature has been removed.
  *
  * Return 0 if the source was already marked as recorded.
  * Return 1 if the source was marked as recorded by this function.
  * Return <0 if any error occurred (negated errno code): -ENOSPC when the
  * per-group source limit is reached, -ENOMEM when allocation fails.
  *
  * The IN_MULTI lock must be held.
  */
 int
 inm_record_source(struct in_multi *inm, const in_addr_t naddr)
 {
 	struct ip_msource	 find;
 	struct ip_msource	*ims, *nims;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	/* The source tree is keyed in host byte order. */
 	find.ims_haddr = ntohl(naddr);
 	ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find);
 	if (ims && ims->ims_stp)
 		return (0);
 	if (ims == NULL) {
 		if (inm->inm_nsrc >= in_mcast_maxgrpsrc)
 			return (-ENOSPC);
 		nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (-ENOMEM);
 		nims->ims_haddr = find.ims_haddr;
 		RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims);
 		++inm->inm_nsrc;
 		ims = nims;
 	}
 
 	/*
 	 * Mark the source as recorded and update the recorded
 	 * source count.
 	 */
 	++ims->ims_stp;
 	++inm->inm_st[1].iss_rec;
 
 	return (1);
 }
 
 /*
  * Return a pointer to an in_msource owned by an in_mfilter,
  * given its source address.
  * Lazy-allocate if needed. If this is a new entry its filter state is
  * undefined at t0.
  *
  * imf is the filter set being modified.
  * psin holds the source address in network byte order; it is converted
  * to host byte order to form the tree key.
  * *plims is set to the found or newly inserted entry on success and is
  * left untouched on failure.
  *
  * Returns 0 on success, ENOSPC when the filter already holds
  * net.inet.ip.mcast.maxsocksrc or more sources (>= because the sysctl
  * can be lowered at run time), or ENOMEM when allocation fails.
  *
  * SMPng: May be called with locks held; malloc must not block.
  */
 static int
 imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin,
     struct in_msource **plims)
 {
 	struct ip_msource	 find;
 	struct ip_msource	*nims;
 	struct in_msource	*lims;
 
 	/* key is host byte order */
 	find.ims_haddr = ntohl(psin->sin_addr.s_addr);
 	lims = (struct in_msource *)RB_FIND(ip_msource_tree,
 	    &imf->imf_sources, &find);
 	if (lims == NULL) {
 		if (imf->imf_nsrc >= in_mcast_maxsocksrc)
 			return (ENOSPC);
 		nims = malloc(sizeof(struct in_msource), M_INMFILTER,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (ENOMEM);
 		lims = (struct in_msource *)nims;
 		lims->ims_haddr = find.ims_haddr;
 		lims->imsl_st[0] = MCAST_UNDEFINED;
 		RB_INSERT(ip_msource_tree, &imf->imf_sources, nims);
 		++imf->imf_nsrc;
 	}
 
 	*plims = lims;
 
 	return (0);
 }
 
 /*
  * Graft a source entry into an existing socket-layer filter set,
  * maintaining any required invariants and checking allocations.
  *
  * The source is marked as being in the new filter mode at t1.
  *
  * Return the pointer to the new node, otherwise return NULL.
  */
 static struct in_msource *
 imf_graft(struct in_mfilter *imf, const uint8_t st1,
     const struct sockaddr_in *psin)
 {
 	struct ip_msource	*nims;
 	struct in_msource	*lims;
 
 	nims = malloc(sizeof(struct in_msource), M_INMFILTER,
 	    M_NOWAIT | M_ZERO);
 	if (nims == NULL)
 		return (NULL);
 	lims = (struct in_msource *)nims;
 	lims->ims_haddr = ntohl(psin->sin_addr.s_addr);
 	lims->imsl_st[0] = MCAST_UNDEFINED;
 	lims->imsl_st[1] = st1;
 	RB_INSERT(ip_msource_tree, &imf->imf_sources, nims);
 	++imf->imf_nsrc;
 
 	return (lims);
 }
 
 /*
  * Prune a source entry from an existing socket-layer filter set,
  * maintaining any required invariants and checking allocations.
  *
  * The source is marked as being left at t1, it is not freed.
  *
  * Return 0 if no error occurred, otherwise return an errno value.
  */
 static int
 imf_prune(struct in_mfilter *imf, const struct sockaddr_in *psin)
 {
 	struct ip_msource	 find;
 	struct ip_msource	*ims;
 	struct in_msource	*lims;
 
 	/* key is host byte order */
 	find.ims_haddr = ntohl(psin->sin_addr.s_addr);
 	ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find);
 	if (ims == NULL)
 		return (ENOENT);
 	lims = (struct in_msource *)ims;
 	lims->imsl_st[1] = MCAST_UNDEFINED;
 	return (0);
 }
 
 /*
  * Undo pending (t1) changes to a socket-layer filter set so that it
  * matches its t0 state again.  Sources that only exist at t1 are
  * removed and freed.
  */
 static void
 imf_rollback(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims, *next;
 	struct in_msource	*lims;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, next) {
 		lims = (struct in_msource *)ims;
 
 		/* no change at t1 */
 		if (lims->imsl_st[0] == lims->imsl_st[1])
 			continue;
 
 		if (lims->imsl_st[0] == MCAST_UNDEFINED) {
 			/* revert source added t1 */
 			CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
 			RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims);
 			free(ims, M_INMFILTER);
 			imf->imf_nsrc--;
 			continue;
 		}
 
 		/* revert change to existing source at t1 */
 		lims->imsl_st[1] = lims->imsl_st[0];
 	}
 
 	imf->imf_st[1] = imf->imf_st[0];
 }
 
 /*
  * Mark socket-layer filter set as INCLUDE {} at t1.
  */
 static void
 imf_leave(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims;
 	struct in_msource	*lims;
 
 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
 		lims = (struct in_msource *)ims;
 		lims->imsl_st[1] = MCAST_UNDEFINED;
 	}
 	imf->imf_st[1] = MCAST_INCLUDE;
 }
 
 /*
  * Commit pending socket-layer filter changes: the t1 state of every
  * source and of the filter mode is copied to t0.
  */
 static void
 imf_commit(struct in_mfilter *imf)
 {
 	struct ip_msource	*node;
 	struct in_msource	*entry;
 
 	RB_FOREACH(node, ip_msource_tree, &imf->imf_sources) {
 		entry = (struct in_msource *)node;
 		entry->imsl_st[0] = entry->imsl_st[1];
 	}
 
 	imf->imf_st[0] = imf->imf_st[1];
 }
 
 /*
  * Free the sources of a socket-layer filter set that are undefined at
  * both t0 and t1, i.e. no longer referenced by either state.
  */
 static void
 imf_reap(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims, *next;
 	struct in_msource	*lims;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, next) {
 		lims = (struct in_msource *)ims;
 
 		/* Still defined in at least one state: keep it. */
 		if (lims->imsl_st[0] != MCAST_UNDEFINED ||
 		    lims->imsl_st[1] != MCAST_UNDEFINED)
 			continue;
 
 		CTR2(KTR_IGMPV3, "%s: free lims %p", __func__, ims);
 		RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims);
 		free(ims, M_INMFILTER);
 		imf->imf_nsrc--;
 	}
 }
 
 /*
  * Purge socket-layer filter set.
  */
 static void
 imf_purge(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) {
 		CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims);
 		free(ims, M_INMFILTER);
 		imf->imf_nsrc--;
 	}
 	imf->imf_st[0] = imf->imf_st[1] = MCAST_UNDEFINED;
 	KASSERT(RB_EMPTY(&imf->imf_sources),
 	    ("%s: imf_sources not empty", __func__));
 }
 
 /*
  * Look up a source filter entry for a multicast group.
  *
  * inm is the group descriptor to work with.
  * haddr is the host-byte-order IPv4 address to look up.
  * noalloc may be non-zero to suppress allocation of sources.
  * *pims will be set to the address of the retrieved or allocated source;
  * it is set to NULL when the source does not exist and noalloc is set,
  * and is left untouched when an error is returned.
  *
  * SMPng: NOTE: may be called with locks held.
  * Return 0 if successful, otherwise return a non-zero error code:
  * ENOSPC when the group already holds net.inet.ip.mcast.maxgrpsrc or
  * more sources (>= because the sysctl can be lowered at run time), or
  * ENOMEM when allocation fails.
  */
 static int
 inm_get_source(struct in_multi *inm, const in_addr_t haddr,
     const int noalloc, struct ip_msource **pims)
 {
 	struct ip_msource	 find;
 	struct ip_msource	*ims, *nims;
 #ifdef KTR
 	struct in_addr ia;
 #endif
 
 	find.ims_haddr = haddr;
 	ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find);
 	if (ims == NULL && !noalloc) {
 		if (inm->inm_nsrc >= in_mcast_maxgrpsrc)
 			return (ENOSPC);
 		nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (ENOMEM);
 		nims->ims_haddr = haddr;
 		RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims);
 		++inm->inm_nsrc;
 		ims = nims;
 #ifdef KTR
 		/* Trace output wants the address in network byte order. */
 		ia.s_addr = htonl(haddr);
 		CTR3(KTR_IGMPV3, "%s: allocated %s as %p", __func__,
 		    inet_ntoa(ia), ims);
 #endif
 	}
 
 	*pims = ims;
 	return (0);
 }
 
 /*
  * Fold one socket-layer source (lims) into the matching IGMP-layer
  * source (ims): the t1 counter for the socket's t0 mode is decreased
  * and the t1 counter for the socket's t1 mode is increased.
  * If rollback is non-zero the signs are swapped, undoing a prior merge.
  */
 static void
 ims_merge(struct ip_msource *ims, const struct in_msource *lims,
     const int rollback)
 {
 	int delta;
 #ifdef KTR
 	struct in_addr ia;
 #endif
 
 	delta = rollback ? -1 : 1;
 #ifdef KTR
 	ia.s_addr = htonl(ims->ims_haddr);
 #endif
 
 	/* Remove the contribution of the socket's old (t0) mode. */
 	switch (lims->imsl_st[0]) {
 	case MCAST_EXCLUDE:
 		CTR3(KTR_IGMPV3, "%s: t1 ex -= %d on %s",
 		    __func__, delta, inet_ntoa(ia));
 		ims->ims_st[1].ex -= delta;
 		break;
 	case MCAST_INCLUDE:
 		CTR3(KTR_IGMPV3, "%s: t1 in -= %d on %s",
 		    __func__, delta, inet_ntoa(ia));
 		ims->ims_st[1].in -= delta;
 		break;
 	default:
 		break;
 	}
 
 	/* Add the contribution of the socket's new (t1) mode. */
 	switch (lims->imsl_st[1]) {
 	case MCAST_EXCLUDE:
 		CTR3(KTR_IGMPV3, "%s: t1 ex += %d on %s",
 		    __func__, delta, inet_ntoa(ia));
 		ims->ims_st[1].ex += delta;
 		break;
 	case MCAST_INCLUDE:
 		CTR3(KTR_IGMPV3, "%s: t1 in += %d on %s",
 		    __func__, delta, inet_ntoa(ia));
 		ims->ims_st[1].in += delta;
 		break;
 	default:
 		break;
 	}
 }
 
 /*
  * Atomically update the global in_multi state, when a membership's
  * filter list is being updated in any way.
  *
  * imf is the per-inpcb-membership group filter pointer.
  * A fake imf may be passed for in-kernel consumers.
  *
  * XXX This is a candidate for a set-symmetric-difference style loop
  * which would eliminate the repeated lookup from root of ims nodes,
  * as they share the same key space.
  *
  * If any error occurred this function will back out of refcounts
  * and return a non-zero value.
  */
 static int
 inm_merge(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
 {
 	struct ip_msource	*ims, *nims;
 	struct in_msource	*lims;
 	int			 schanged, error;
 	int			 nsrc0, nsrc1;
 
 	schanged = 0;
 	error = 0;
 	nsrc1 = nsrc0 = 0;
 
 	/*
 	 * Update the source filters first, as this may fail.
 	 * Maintain count of in-mode filters at t0, t1. These are
 	 * used to work out if we transition into ASM mode or not.
 	 * Maintain a count of source filters whose state was
 	 * actually modified by this operation.
 	 */
 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
 		lims = (struct in_msource *)ims;
 		if (lims->imsl_st[0] == imf->imf_st[0]) nsrc0++;
 		if (lims->imsl_st[1] == imf->imf_st[1]) nsrc1++;
 		if (lims->imsl_st[0] == lims->imsl_st[1]) continue;
 		error = inm_get_source(inm, lims->ims_haddr, 0, &nims);
 		++schanged;
 		if (error)
 			break;
 		ims_merge(nims, lims, 0);
 	}
 	if (error) {
 		struct ip_msource *bims;
 
 		RB_FOREACH_REVERSE_FROM(ims, ip_msource_tree, nims) {
 			lims = (struct in_msource *)ims;
 			if (lims->imsl_st[0] == lims->imsl_st[1])
 				continue;
 			(void)inm_get_source(inm, lims->ims_haddr, 1, &bims);
 			if (bims == NULL)
 				continue;
 			ims_merge(bims, lims, 1);
 		}
 		goto out_reap;
 	}
 
 	CTR3(KTR_IGMPV3, "%s: imf filters in-mode: %d at t0, %d at t1",
 	    __func__, nsrc0, nsrc1);
 
 	/* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */
 	if (imf->imf_st[0] == imf->imf_st[1] &&
 	    imf->imf_st[1] == MCAST_INCLUDE) {
 		if (nsrc1 == 0) {
 			CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__);
 			--inm->inm_st[1].iss_in;
 		}
 	}
 
 	/* Handle filter mode transition on socket. */
 	if (imf->imf_st[0] != imf->imf_st[1]) {
 		CTR3(KTR_IGMPV3, "%s: imf transition %d to %d",
 		    __func__, imf->imf_st[0], imf->imf_st[1]);
 
 		if (imf->imf_st[0] == MCAST_EXCLUDE) {
 			CTR1(KTR_IGMPV3, "%s: --ex on inm at t1", __func__);
 			--inm->inm_st[1].iss_ex;
 		} else if (imf->imf_st[0] == MCAST_INCLUDE) {
 			CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__);
 			--inm->inm_st[1].iss_in;
 		}
 
 		if (imf->imf_st[1] == MCAST_EXCLUDE) {
 			CTR1(KTR_IGMPV3, "%s: ex++ on inm at t1", __func__);
 			inm->inm_st[1].iss_ex++;
 		} else if (imf->imf_st[1] == MCAST_INCLUDE && nsrc1 > 0) {
 			CTR1(KTR_IGMPV3, "%s: in++ on inm at t1", __func__);
 			inm->inm_st[1].iss_in++;
 		}
 	}
 
 	/*
 	 * Track inm filter state in terms of listener counts.
 	 * If there are any exclusive listeners, stack-wide
 	 * membership is exclusive.
 	 * Otherwise, if only inclusive listeners, stack-wide is inclusive.
 	 * If no listeners remain, state is undefined at t1,
 	 * and the IGMP lifecycle for this group should finish.
 	 */
 	if (inm->inm_st[1].iss_ex > 0) {
 		CTR1(KTR_IGMPV3, "%s: transition to EX", __func__);
 		inm->inm_st[1].iss_fmode = MCAST_EXCLUDE;
 	} else if (inm->inm_st[1].iss_in > 0) {
 		CTR1(KTR_IGMPV3, "%s: transition to IN", __func__);
 		inm->inm_st[1].iss_fmode = MCAST_INCLUDE;
 	} else {
 		CTR1(KTR_IGMPV3, "%s: transition to UNDEF", __func__);
 		inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
 	}
 
 	/* Decrement ASM listener count on transition out of ASM mode. */
 	if (imf->imf_st[0] == MCAST_EXCLUDE && nsrc0 == 0) {
 		if ((imf->imf_st[1] != MCAST_EXCLUDE) ||
 		    (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 > 0))
 			CTR1(KTR_IGMPV3, "%s: --asm on inm at t1", __func__);
 			--inm->inm_st[1].iss_asm;
 	}
 
 	/* Increment ASM listener count on transition to ASM mode. */
 	if (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 == 0) {
 		CTR1(KTR_IGMPV3, "%s: asm++ on inm at t1", __func__);
 		inm->inm_st[1].iss_asm++;
 	}
 
 	CTR3(KTR_IGMPV3, "%s: merged imf %p to inm %p", __func__, imf, inm);
 	inm_print(inm);
 
 out_reap:
 	if (schanged > 0) {
 		CTR1(KTR_IGMPV3, "%s: sources changed; reaping", __func__);
 		inm_reap(inm);
 	}
 	return (error);
 }
 
 /*
  * Mark an in_multi's filter set deltas as committed.
  * Called by IGMP after a state change has been enqueued.
  */
 void
 inm_commit(struct in_multi *inm)
 {
 	struct ip_msource	*ims;
 
 	CTR2(KTR_IGMPV3, "%s: commit inm %p", __func__, inm);
 	CTR1(KTR_IGMPV3, "%s: pre commit:", __func__);
 	inm_print(inm);
 
 	RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) {
 		ims->ims_st[0] = ims->ims_st[1];
 	}
 	inm->inm_st[0] = inm->inm_st[1];
 }
 
 /*
  * Reap unreferenced nodes from an in_multi's filter set.
  */
 static void
 inm_reap(struct in_multi *inm)
 {
 	struct ip_msource	*cur, *next;
 
 	/* The _SAFE iterator lets us unlink and free the current node. */
 	RB_FOREACH_SAFE(cur, ip_msource_tree, &inm->inm_srcs, next) {
 		/*
 		 * A node is unreferenced only when every t0/t1 counter is
 		 * not positive and ims_stp is zero.
 		 */
 		if (!(cur->ims_st[0].ex > 0 || cur->ims_st[0].in > 0 ||
 		    cur->ims_st[1].ex > 0 || cur->ims_st[1].in > 0 ||
 		    cur->ims_stp != 0)) {
 			CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, cur);
 			RB_REMOVE(ip_msource_tree, &inm->inm_srcs, cur);
 			free(cur, M_IPMSOURCE);
 			inm->inm_nsrc--;
 		}
 	}
 }
 
 /*
  * Purge all source nodes from an in_multi's filter set.
  */
 static void
 inm_purge(struct in_multi *inm)
 {
 	struct ip_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) {
 		CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims);
 		free(ims, M_IPMSOURCE);
 		inm->inm_nsrc--;
 	}
 }
 
 /*
  * Join a multicast group; unlocked entry point.
  *
  * SMPng: XXX: in_joingroup() is called from in_control() when Giant
  * is not held. Fortunately, ifp is unlikely to have been detached
  * at this point, so we assume it's OK to recurse.
  */
 int
 in_joingroup(struct ifnet *ifp, const struct in_addr *gina,
     /*const*/ struct in_mfilter *imf, struct in_multi **pinm)
 {
 	int rc;
 
 	/* Take the multicast lock around the locked entry point. */
 	IN_MULTI_LOCK();
 	rc = in_joingroup_locked(ifp, gina, imf, pinm);
 	IN_MULTI_UNLOCK();
 
 	return (rc);
 }
 
 /*
  * Join a multicast group; real entry point.
  *
  * Only preserves atomicity at inm level.
  * NOTE: imf argument cannot be const due to sys/tree.h limitations.
  *
  * If the IGMP downcall fails, the group is not joined, and an error
  * code is returned.
  */
 int
 in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina,
     /*const*/ struct in_mfilter *imf, struct in_multi **pinm)
 {
 	struct in_mfilter	 timf;
 	struct in_multi		*inm;
 	int			 error;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	CTR4(KTR_IGMPV3, "%s: join %s on %p(%s))", __func__,
 	    inet_ntoa(*gina), ifp, ifp->if_xname);
 
 	error = 0;
 	inm = NULL;
 
 	/*
 	 * If no imf was specified (i.e. kernel consumer),
 	 * fake one up and assume it is an ASM join.
 	 */
 	if (imf == NULL) {
 		imf_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE);
 		imf = &timf;
 	}
 
 	error = in_getmulti(ifp, gina, &inm);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: in_getmulti() failure", __func__);
 		return (error);
 	}
 
 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 	error = inm_merge(inm, imf);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__);
 		goto out_inm_release;
 	}
 
 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 	error = igmp_change_state(inm);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: failed to update source", __func__);
 		goto out_inm_release;
 	}
 
 out_inm_release:
 	if (error) {
 		CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm);
 		inm_release_locked(inm);
 	} else {
 		*pinm = inm;
 	}
 
 	return (error);
 }
 
 /*
  * Leave a multicast group; unlocked entry point.
  */
 int
 in_leavegroup(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
 {
 	int rc;
 
 	/* Take the multicast lock around the locked entry point. */
 	IN_MULTI_LOCK();
 	rc = in_leavegroup_locked(inm, imf);
 	IN_MULTI_UNLOCK();
 
 	return (rc);
 }
 
 /*
  * Leave a multicast group; real entry point.
  * All source filters will be expunged.
  *
  * Only preserves atomicity at inm level.
  *
  * Holding the write lock for the INP which contains imf
  * is highly advisable. We can't assert for it as imf does not
  * contain a back-pointer to the owning inp.
  *
  * Note: This is not the same as inm_release(*) as this function also
  * makes a state change downcall into IGMP.
  */
 int
 in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
 {
 	struct in_mfilter	 timf;
 	int			 error;
 
 	error = 0;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	CTR5(KTR_IGMPV3, "%s: leave inm %p, %s/%s, imf %p", __func__,
 	    inm, inet_ntoa(inm->inm_addr),
 	    (inm_is_ifp_detached(inm) ? "null" : inm->inm_ifp->if_xname),
 	    imf);
 
 	/*
 	 * If no imf was specified (i.e. kernel consumer),
 	 * fake one up and assume it is an ASM leave: the filter
 	 * goes from EXCLUDE at t0 to UNDEFINED at t1.
 	 */
 	if (imf == NULL) {
 		imf_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED);
 		imf = &timf;
 	}
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 *
 	 * As this particular invocation should not cause any memory
 	 * to be allocated, and there is no opportunity to roll back
 	 * the transaction, it MUST NOT fail.
 	 */
 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 	error = inm_merge(inm, imf);
 	KASSERT(error == 0, ("%s: failed to merge inm state", __func__));
 
 	/*
 	 * The downcall runs with the vnet of the group's interface set as
 	 * the current vnet.
 	 * NOTE(review): the trace above allows for a detached ifp
 	 * (inm_is_ifp_detached), yet inm->inm_ifp is dereferenced
 	 * unconditionally here -- confirm callers never pass such an inm.
 	 */
 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 	CURVNET_SET(inm->inm_ifp->if_vnet);
 	error = igmp_change_state(inm);
 	CURVNET_RESTORE();
 	if (error)
 		CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
 
 	/* The reference is dropped whether or not the downcall succeeded. */
 	CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm);
 	inm_release_locked(inm);
 
 	return (error);
 }
 
 /*#ifndef BURN_BRIDGES*/
 /*
  * Join an IPv4 multicast group in (*,G) exclusive mode.
  * The group must be a 224.0.0.0/24 link-scope group.
  * This KPI is for legacy kernel consumers only.
  */
 struct in_multi *
 in_addmulti(struct in_addr *ap, struct ifnet *ifp)
 {
 	struct in_multi *inm;
 
 	KASSERT(IN_LOCAL_GROUP(ntohl(ap->s_addr)),
 	    ("%s: %s not in 224.0.0.0/24", __func__, inet_ntoa(*ap)));
 
 	/*
 	 * Join with no filter; any failure is reported to the caller as a
 	 * NULL membership and the error code itself is discarded.
 	 */
 	if (in_joingroup(ifp, ap, NULL, &inm) != 0)
 		return (NULL);
 
 	return (inm);
 }
 
 /*
  * Leave an IPv4 multicast group, assumed to be in exclusive (*,G) mode.
  * This KPI is for legacy kernel consumers only.
  */
 void
 in_delmulti(struct in_multi *inm)
 {
 
 	/* Leave with no filter; the result of the leave is ignored. */
 	(void)in_leavegroup(inm, NULL);
 }
 /*#endif*/
 
 /*
  * Block or unblock an ASM multicast source on an inpcb.
  * This implements the delta-based API described in RFC 3678.
  *
  * The delta-based API applies only to exclusive-mode memberships.
  * An IGMP downcall will be performed.
  *
  * SMPng: NOTE: Must take Giant as a join may create a new ifma.
  *
  * Return 0 if successful, otherwise return an appropriate error code.
  */
 static int
 inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct group_source_req		 gsr;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in_mfilter		*imf;
 	struct ip_moptions		*imo;
 	struct in_msource		*ims;
 	struct in_multi			*inm;
 	size_t				 idx;
 	uint16_t			 fmode;
 	int				 error, doblock;
 
 	ifp = NULL;
 	error = 0;
 	doblock = 0;
 
 	/*
 	 * Both option families are normalized into gsr: gsa aliases the
 	 * group address and ssa the source address inside it.
 	 */
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 
 	switch (sopt->sopt_name) {
 	case IP_BLOCK_SOURCE:
 	case IP_UNBLOCK_SOURCE: {
 		struct ip_mreq_source	 mreqs;
 
 		/* IPv4-specific request: exactly one ip_mreq_source. */
 		error = sooptcopyin(sopt, &mreqs,
 		    sizeof(struct ip_mreq_source),
 		    sizeof(struct ip_mreq_source));
 		if (error)
 			return (error);
 
 		gsa->sin.sin_family = AF_INET;
 		gsa->sin.sin_len = sizeof(struct sockaddr_in);
 		gsa->sin.sin_addr = mreqs.imr_multiaddr;
 
 		ssa->sin.sin_family = AF_INET;
 		ssa->sin.sin_len = sizeof(struct sockaddr_in);
 		ssa->sin.sin_addr = mreqs.imr_sourceaddr;
 
 		/* A null interface address leaves ifp as NULL. */
 		if (!in_nullhost(mreqs.imr_interface))
 			INADDR_TO_IFP(mreqs.imr_interface, ifp);
 
 		if (sopt->sopt_name == IP_BLOCK_SOURCE)
 			doblock = 1;
 
 		CTR3(KTR_IGMPV3, "%s: imr_interface = %s, ifp = %p",
 		    __func__, inet_ntoa(mreqs.imr_interface), ifp);
 		break;
 	    }
 
 	case MCAST_BLOCK_SOURCE:
 	case MCAST_UNBLOCK_SOURCE:
 		/* Protocol-independent request: copied straight into gsr. */
 		error = sooptcopyin(sopt, &gsr,
 		    sizeof(struct group_source_req),
 		    sizeof(struct group_source_req));
 		if (error)
 			return (error);
 
 		/* Both addresses must be well-formed AF_INET sockaddrs. */
 		if (gsa->sin.sin_family != AF_INET ||
 		    gsa->sin.sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 
 		if (ssa->sin.sin_family != AF_INET ||
 		    ssa->sin.sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 
 		/* Interface index must be non-zero and within V_if_index. */
 		if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
 			return (EADDRNOTAVAIL);
 
 		ifp = ifnet_byindex(gsr.gsr_interface);
 
 		if (sopt->sopt_name == MCAST_BLOCK_SOURCE)
 			doblock = 1;
 		break;
 
 	default:
 		CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 		return (EINVAL);
 
 	/*
 	 * Check if we are actually a member of this group.
 	 * inp_findmoptions() returns with the INP write lock held, so
 	 * every exit from here on goes through out_inp_locked.
 	 * idx is a size_t, so the test below compares against (size_t)-1.
 	 */
 	imo = inp_findmoptions(inp);
 	idx = imo_match_group(imo, ifp, &gsa->sa);
 	if (idx == -1 || imo->imo_mfilters == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_inp_locked;
 	}
 
 	KASSERT(imo->imo_mfilters != NULL,
 	    ("%s: imo_mfilters not allocated", __func__));
 	imf = &imo->imo_mfilters[idx];
 	inm = imo->imo_membership[idx];
 
 	/*
 	 * Attempting to use the delta-based API on a
 	 * non-exclusive-mode membership is an error.
 	 */
 	fmode = imf->imf_st[0];
 	if (fmode != MCAST_EXCLUDE) {
 		error = EINVAL;
 		goto out_inp_locked;
 	}
 
 	/*
 	 * Deal with error cases up-front:
 	 *  Asked to block, but already blocked; or
 	 *  Asked to unblock, but nothing to unblock.
 	 * If adding a new block entry, allocate it.
 	 */
 	ims = imo_match_source(imo, idx, &ssa->sa);
 	if ((ims != NULL && doblock) || (ims == NULL && !doblock)) {
 		CTR3(KTR_IGMPV3, "%s: source %s %spresent", __func__,
 		    inet_ntoa(ssa->sin.sin_addr), doblock ? "" : "not ");
 		error = EADDRNOTAVAIL;
 		goto out_inp_locked;
 	}
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	if (doblock) {
 		CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block");
 		ims = imf_graft(imf, fmode, &ssa->sin);
 		if (ims == NULL)
 			error = ENOMEM;
 	} else {
 		CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow");
 		error = imf_prune(imf, &ssa->sin);
 	}
 
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__);
 		goto out_imf_rollback;
 	}
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 */
 	IN_MULTI_LOCK();
 
 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 	error = inm_merge(inm, imf);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__);
 		goto out_in_multi_locked;
 	}
 
 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 	error = igmp_change_state(inm);
 	if (error)
 		CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
 
 out_in_multi_locked:
 
 	IN_MULTI_UNLOCK();
 
 out_imf_rollback:
 	/* Socket-layer filter: undo on any error, otherwise commit. */
 	if (error)
 		imf_rollback(imf);
 	else
 		imf_commit(imf);
 
 	imf_reap(imf);
 
 out_inp_locked:
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Given an inpcb, return its multicast options structure pointer.  Accepts
  * an unlocked inpcb pointer, but will return it locked.  May sleep.
  *
  * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
  * SMPng: NOTE: Returns with the INP write lock held.
  */
 static struct ip_moptions *
 inp_findmoptions(struct inpcb *inp)
 {
 	struct ip_moptions	 *imo;
 	struct in_multi		**memberships;
 	struct in_mfilter	 *filters;
 	size_t			  i;
 
 	/* Fast path: options already exist; return with the lock held. */
 	INP_WLOCK(inp);
 	if (inp->inp_moptions != NULL)
 		return (inp->inp_moptions);
 
 	/* Drop the lock while performing the M_WAITOK allocations. */
 	INP_WUNLOCK(inp);
 
 	imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK);
 	memberships = malloc(sizeof(*memberships) * IP_MIN_MEMBERSHIPS,
 	    M_IPMOPTS, M_WAITOK | M_ZERO);
 	filters = malloc(sizeof(struct in_mfilter) * IP_MIN_MEMBERSHIPS,
 	    M_INMFILTER, M_WAITOK);
 
 	/* Every slot's filter starts as UNDEFINED -> EXCLUDE. */
 	for (i = 0; i < IP_MIN_MEMBERSHIPS; i++)
 		imf_init(&filters[i], MCAST_UNDEFINED, MCAST_EXCLUDE);
 
 	/* Fill in the defaults for a fresh options block. */
 	imo->imo_multicast_ifp = NULL;
 	imo->imo_multicast_addr.s_addr = INADDR_ANY;
 	imo->imo_multicast_vif = -1;
 	imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
 	imo->imo_multicast_loop = in_mcast_loop;
 	imo->imo_num_memberships = 0;
 	imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
 	imo->imo_membership = memberships;
 	imo->imo_mfilters = filters;
 
 	/* Retake the lock and install, unless someone beat us to it. */
 	INP_WLOCK(inp);
 	if (inp->inp_moptions == NULL) {
 		inp->inp_moptions = imo;
 		return (imo);
 	}
 
 	/* Lost the race: discard our copy and use the installed one. */
 	free(filters, M_INMFILTER);
 	free(memberships, M_IPMOPTS);
 	free(imo, M_IPMOPTS);
 	return (inp->inp_moptions);
 }
 
 /*
  * Discard the IP multicast options (and source filters).  To minimize
  * the amount of work done while holding locks such as the INP's
  * pcbinfo lock (which is used in the receive path), the free
  * operation is performed asynchronously in a separate task.
  *
  * SMPng: NOTE: assumes INP write lock is held.
  */
 void
 inp_freemoptions(struct ip_moptions *imo)
 {
 
 	KASSERT(imo != NULL, ("%s: ip_moptions is NULL", __func__));
 	/* imo_gc_list is modified only while holding the multicast lock. */
 	IN_MULTI_LOCK();
 	STAILQ_INSERT_TAIL(&imo_gc_list, imo, imo_link);
 	IN_MULTI_UNLOCK();
 	/* The actual teardown happens later, from the queued task. */
 	taskqueue_enqueue(taskqueue_thread, &imo_gc_task);
 }
 
 static void
 inp_freemoptions_internal(struct ip_moptions *imo)
 {
 	struct in_mfilter	*imf;
 	size_t			 i, count;
 
 	/* Leave every group this options block is still a member of. */
 	count = imo->imo_num_memberships;
 	for (i = 0; i < count; i++) {
 		if (imo->imo_mfilters != NULL) {
 			/* With a filter: leave, notify, then purge it. */
 			imf = &imo->imo_mfilters[i];
 			imf_leave(imf);
 			(void)in_leavegroup(imo->imo_membership[i], imf);
 			imf_purge(imf);
 		} else {
 			/* No filter vector: leave with a NULL filter. */
 			(void)in_leavegroup(imo->imo_membership[i], NULL);
 		}
 	}
 
 	/* Release the vectors, then the options block itself. */
 	if (imo->imo_mfilters != NULL)
 		free(imo->imo_mfilters, M_INMFILTER);
 	free(imo->imo_membership, M_IPMOPTS);
 	free(imo, M_IPMOPTS);
 }
 
 static void
 inp_gcmoptions(void *context, int pending)
 {
 	struct ip_moptions *imo;
 
 	IN_MULTI_LOCK();
 	while (!STAILQ_EMPTY(&imo_gc_list)) {
 		imo = STAILQ_FIRST(&imo_gc_list);
 		STAILQ_REMOVE_HEAD(&imo_gc_list, imo_link);
 		IN_MULTI_UNLOCK();
 		inp_freemoptions_internal(imo);
 		IN_MULTI_LOCK();
 	}
 	IN_MULTI_UNLOCK();
 }
 
 /*
  * Atomically get source filters on a socket for an IPv4 multicast group.
  * Called with INP lock held; returns with lock released.
  */
 static int
 inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct __msfilterreq	 msfr;
 	sockunion_t		*gsa;
 	struct ifnet		*ifp;
 	struct ip_moptions	*imo;
 	struct in_mfilter	*imf;
 	struct ip_msource	*ims;
 	struct in_msource	*lims;
 	struct sockaddr_in	*psin;
 	struct sockaddr_storage	*ptss;
 	struct sockaddr_storage	*tss;
 	int			 error;
 	size_t			 idx, nsrcs, ncsrcs;
 
 	INP_WLOCK_ASSERT(inp);
 
 	imo = inp->inp_moptions;
 	KASSERT(imo != NULL, ("%s: null ip_moptions", __func__));
 
 	/*
 	 * The lock is dropped around the copyin and retaken further down.
 	 * NOTE(review): imo was read before the unlock and is reused
 	 * after the relock -- confirm it cannot be freed in between.
 	 */
 	INP_WUNLOCK(inp);
 
 	error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
 	    sizeof(struct __msfilterreq));
 	if (error)
 		return (error);
 
 	/* Interface index must be non-zero and within V_if_index. */
 	if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
 		return (EINVAL);
 
 	ifp = ifnet_byindex(msfr.msfr_ifindex);
 	if (ifp == NULL)
 		return (EINVAL);
 
 	INP_WLOCK(inp);
 
 	/*
 	 * Lookup group on the socket.
 	 * idx is a size_t, so the test below compares against (size_t)-1.
 	 */
 	gsa = (sockunion_t *)&msfr.msfr_group;
 	idx = imo_match_group(imo, ifp, &gsa->sa);
 	if (idx == -1 || imo->imo_mfilters == NULL) {
 		INP_WUNLOCK(inp);
 		return (EADDRNOTAVAIL);
 	}
 	imf = &imo->imo_mfilters[idx];
 
 	/*
 	 * Ignore memberships which are in limbo.
 	 */
 	if (imf->imf_st[1] == MCAST_UNDEFINED) {
 		INP_WUNLOCK(inp);
 		return (EAGAIN);
 	}
 	msfr.msfr_fmode = imf->imf_st[1];
 
 	/*
 	 * If the user specified a buffer, copy out the source filter
 	 * entries to userland gracefully.
 	 * We only copy out the number of entries which userland
 	 * has asked for, but we always tell userland how big the
 	 * buffer really needs to be.
 	 * The requested count is clamped to in_mcast_maxsocksrc, and the
 	 * staging buffer is zero-filled and allocated without sleeping
 	 * since the INP lock is held here.
 	 */
 	if (msfr.msfr_nsrcs > in_mcast_maxsocksrc)
 		msfr.msfr_nsrcs = in_mcast_maxsocksrc;
 	tss = NULL;
 	if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) {
 		tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
 		    M_TEMP, M_NOWAIT | M_ZERO);
 		if (tss == NULL) {
 			INP_WUNLOCK(inp);
 			return (ENOBUFS);
 		}
 	}
 
 	/*
 	 * Count number of sources in-mode at t0.
 	 * If buffer space exists and remains, copy out source entries.
 	 * nsrcs counts down the remaining buffer slots; ncsrcs counts
 	 * every in-mode source, whether or not it fit.
 	 */
 	nsrcs = msfr.msfr_nsrcs;
 	ncsrcs = 0;
 	ptss = tss;
 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
 		lims = (struct in_msource *)ims;
 		if (lims->imsl_st[0] == MCAST_UNDEFINED ||
 		    lims->imsl_st[0] != imf->imf_st[0])
 			continue;
 		++ncsrcs;
 		if (tss != NULL && nsrcs > 0) {
 			psin = (struct sockaddr_in *)ptss;
 			psin->sin_family = AF_INET;
 			psin->sin_len = sizeof(struct sockaddr_in);
 			psin->sin_addr.s_addr = htonl(lims->ims_haddr);
 			psin->sin_port = 0;
 			++ptss;
 			--nsrcs;
 		}
 	}
 
 	INP_WUNLOCK(inp);
 
 	/*
 	 * The whole staging buffer (msfr_nsrcs entries) is copied out;
 	 * slots that were not filled above remain zero.
 	 */
 	if (tss != NULL) {
 		error = copyout(tss, msfr.msfr_srcs,
 		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
 		free(tss, M_TEMP);
 		if (error)
 			return (error);
 	}
 
 	/* Report the full in-mode source count back to the caller. */
 	msfr.msfr_nsrcs = ncsrcs;
 	error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq));
 
 	return (error);
 }
 
 /*
  * Return the IP multicast options in response to user getsockopt().
  */
 int
 inp_getmoptions(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct rm_priotracker	 in_ifa_tracker;
 	struct ip_mreqn		 mreqn;
 	struct ip_moptions	*imo;
 	struct ifnet		*ifp;
 	struct in_ifaddr	*ia;
 	int			 error, optval;
 	u_char			 coptval;
 
 	INP_WLOCK(inp);
 	imo = inp->inp_moptions;
 
 	/*
 	 * Only SOCK_RAW and SOCK_DGRAM sockets are accepted, and divert
 	 * sockets are refused regardless of type.
 	 */
 	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
 	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
 	    inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) {
 		INP_WUNLOCK(inp);
 		return (EOPNOTSUPP);
 	}
 
 	/*
 	 * Each case below is responsible for releasing the INP lock
 	 * before copying anything out to the caller.
 	 */
 	error = 0;
 	switch (sopt->sopt_name) {
 	case IP_MULTICAST_VIF:
 		/* Without options the vif is reported as -1. */
 		optval = -1;
 		if (imo != NULL)
 			optval = imo->imo_multicast_vif;
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &optval, sizeof(int));
 		break;
 
 	case IP_MULTICAST_IF:
 		memset(&mreqn, 0, sizeof(struct ip_mreqn));
 		if (imo != NULL) {
 			ifp = imo->imo_multicast_ifp;
 			if (!in_nullhost(imo->imo_multicast_addr)) {
 				/* An explicit address takes precedence. */
 				mreqn.imr_address = imo->imo_multicast_addr;
 			} else if (ifp != NULL) {
 				/* Otherwise report the ifp and its address. */
 				mreqn.imr_ifindex = ifp->if_index;
 				IFP_TO_IA(ifp, ia, &in_ifa_tracker);
 				if (ia != NULL) {
 					mreqn.imr_address =
 					    IA_SIN(ia)->sin_addr;
 					ifa_free(&ia->ia_ifa);
 				}
 			}
 		}
 		INP_WUNLOCK(inp);
 		/* The caller's buffer size selects the reply format. */
 		if (sopt->sopt_valsize != sizeof(struct ip_mreqn)) {
 			error = sooptcopyout(sopt, &mreqn.imr_address,
 			    sizeof(struct in_addr));
 		} else {
 			error = sooptcopyout(sopt, &mreqn,
 			    sizeof(struct ip_mreqn));
 		}
 		break;
 
 	case IP_MULTICAST_TTL:
 		if (imo != NULL)
 			coptval = imo->imo_multicast_ttl;
 		else
 			coptval = IP_DEFAULT_MULTICAST_TTL;
 		optval = coptval;
 		INP_WUNLOCK(inp);
 		/* Reply as an int unless exactly one byte was asked for. */
 		if (sopt->sopt_valsize != sizeof(u_char))
 			error = sooptcopyout(sopt, &optval, sizeof(int));
 		else
 			error = sooptcopyout(sopt, &coptval, sizeof(u_char));
 		break;
 
 	case IP_MULTICAST_LOOP:
 		if (imo != NULL)
 			coptval = imo->imo_multicast_loop;
 		else
 			coptval = IP_DEFAULT_MULTICAST_LOOP;
 		optval = coptval;
 		INP_WUNLOCK(inp);
 		/* Reply as an int unless exactly one byte was asked for. */
 		if (sopt->sopt_valsize != sizeof(u_char))
 			error = sooptcopyout(sopt, &optval, sizeof(int));
 		else
 			error = sooptcopyout(sopt, &coptval, sizeof(u_char));
 		break;
 
 	case IP_MSFILTER:
 		if (imo != NULL) {
 			/* Called locked; returns with the lock released. */
 			error = inp_get_source_filters(inp, sopt);
 		} else {
 			error = EADDRNOTAVAIL;
 			INP_WUNLOCK(inp);
 		}
 		break;
 
 	default:
 		INP_WUNLOCK(inp);
 		error = ENOPROTOOPT;
 		break;
 	}
 
 	INP_UNLOCK_ASSERT(inp);
 
 	return (error);
 }
 
 /*
  * Look up the ifnet to use for a multicast group membership,
  * given the IPv4 address of an interface, and the IPv4 group address.
  *
  * This routine exists to support legacy multicast applications
  * which do not understand that multicast memberships are scoped to
  * specific physical links in the networking stack, or which need
  * to join link-scope groups before IPv4 addresses are configured.
  *
  * If inp is non-NULL, use this socket's current FIB number for any
  * required FIB lookup.
  * If ina is INADDR_ANY, look up the group address in the unicast FIB,
  * and use its ifp; usually, this points to the default next-hop.
  *
  * If the FIB lookup fails, attempt to use the first non-loopback
  * interface with multicast capability in the system as a
  * last resort. The legacy IPv4 ASM API requires that we do
  * this in order to allow groups to be joined when the routing
  * table has not yet been populated during boot.
  *
  * Returns NULL if no ifp could be found.
  *
  * SMPng: TODO: Acquire the appropriate locks for INADDR_TO_IFP.
  * FUTURE: Implement IPv4 source-address selection.
  */
 static struct ifnet *
 inp_lookup_mcast_ifp(const struct inpcb *inp,
     const struct sockaddr_in *gsin, const struct in_addr ina)
 {
 	struct rm_priotracker in_ifa_tracker;
 	struct ifnet *ifp;
+	struct nhop4_basic nh4;
+	uint32_t fibnum;
 
 	KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__));
 	KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)),
 	    ("%s: not multicast", __func__));
 
 	/*
 	 * Three sources for the ifp, in order: the interface owning the
 	 * given address; a FIB lookup of the group address; and finally
 	 * the first non-loopback, multicast-capable interface found on
 	 * the IPv4 address list.
 	 */
 	ifp = NULL;
 	if (!in_nullhost(ina)) {
 		INADDR_TO_IFP(ina, ifp);
 	} else {
-		struct route ro;
-
-		ro.ro_rt = NULL;
-		memcpy(&ro.ro_dst, gsin, sizeof(struct sockaddr_in));
-		in_rtalloc_ign(&ro, 0, inp ? inp->inp_inc.inc_fibnum : 0);
-		if (ro.ro_rt != NULL) {
-			ifp = ro.ro_rt->rt_ifp;
-			KASSERT(ifp != NULL, ("%s: null ifp", __func__));
-			RTFREE(ro.ro_rt);
-		} else {
+		fibnum = inp ? inp->inp_inc.inc_fibnum : 0;
+		if (fib4_lookup_nh_basic(fibnum, gsin->sin_addr, 0, 0, &nh4)==0)
+			ifp = nh4.nh_ifp;
+		else {
 			struct in_ifaddr *ia;
 			struct ifnet *mifp;
 
 			/* Last resort: scan the address list under its lock. */
 			mifp = NULL;
 			IN_IFADDR_RLOCK(&in_ifa_tracker);
 			TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
 				mifp = ia->ia_ifp;
 				if (!(mifp->if_flags & IFF_LOOPBACK) &&
 				     (mifp->if_flags & IFF_MULTICAST)) {
 					ifp = mifp;
 					break;
 				}
 			}
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 		}
 	}
 
 	/* NULL here means no candidate interface was found. */
 	return (ifp);
 }
 
 /*
  * Join an IPv4 multicast group, possibly with a source.
  */
 static int
 inp_join_group(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct group_source_req		 gsr;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in_mfilter		*imf;
 	struct ip_moptions		*imo;
 	struct in_multi			*inm;
 	struct in_msource		*lims;
 	size_t				 idx;
 	int				 error, is_new;
 
 	ifp = NULL;
 	imf = NULL;
 	lims = NULL;
 	error = 0;
 	is_new = 0;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	gsa->ss.ss_family = AF_UNSPEC;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 	ssa->ss.ss_family = AF_UNSPEC;
 
 	switch (sopt->sopt_name) {
 	case IP_ADD_MEMBERSHIP:
 	case IP_ADD_SOURCE_MEMBERSHIP: {
 		struct ip_mreq_source	 mreqs;
 
 		if (sopt->sopt_name == IP_ADD_MEMBERSHIP) {
 			error = sooptcopyin(sopt, &mreqs,
 			    sizeof(struct ip_mreq),
 			    sizeof(struct ip_mreq));
 			/*
 			 * Do argument switcharoo from ip_mreq into
 			 * ip_mreq_source to avoid using two instances.
 			 */
 			mreqs.imr_interface = mreqs.imr_sourceaddr;
 			mreqs.imr_sourceaddr.s_addr = INADDR_ANY;
 		} else if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) {
 			error = sooptcopyin(sopt, &mreqs,
 			    sizeof(struct ip_mreq_source),
 			    sizeof(struct ip_mreq_source));
 		}
 		if (error)
 			return (error);
 
 		gsa->sin.sin_family = AF_INET;
 		gsa->sin.sin_len = sizeof(struct sockaddr_in);
 		gsa->sin.sin_addr = mreqs.imr_multiaddr;
 
 		if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) {
 			ssa->sin.sin_family = AF_INET;
 			ssa->sin.sin_len = sizeof(struct sockaddr_in);
 			ssa->sin.sin_addr = mreqs.imr_sourceaddr;
 		}
 
 		if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 			return (EINVAL);
 
 		ifp = inp_lookup_mcast_ifp(inp, &gsa->sin,
 		    mreqs.imr_interface);
 		CTR3(KTR_IGMPV3, "%s: imr_interface = %s, ifp = %p",
 		    __func__, inet_ntoa(mreqs.imr_interface), ifp);
 		break;
 	}
 
 	case MCAST_JOIN_GROUP:
 	case MCAST_JOIN_SOURCE_GROUP:
 		if (sopt->sopt_name == MCAST_JOIN_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_req),
 			    sizeof(struct group_req));
 		} else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_source_req),
 			    sizeof(struct group_source_req));
 		}
 		if (error)
 			return (error);
 
 		if (gsa->sin.sin_family != AF_INET ||
 		    gsa->sin.sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 
 		/*
 		 * Overwrite the port field if present, as the sockaddr
 		 * being copied in may be matched with a binary comparison.
 		 */
 		gsa->sin.sin_port = 0;
 		if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
 			if (ssa->sin.sin_family != AF_INET ||
 			    ssa->sin.sin_len != sizeof(struct sockaddr_in))
 				return (EINVAL);
 			ssa->sin.sin_port = 0;
 		}
 
 		if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 			return (EINVAL);
 
 		if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
 			return (EADDRNOTAVAIL);
 		ifp = ifnet_byindex(gsr.gsr_interface);
 		break;
 
 	default:
 		CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0)
 		return (EADDRNOTAVAIL);
 
 	imo = inp_findmoptions(inp);
 	idx = imo_match_group(imo, ifp, &gsa->sa);
 	if (idx == -1) {
 		is_new = 1;
 	} else {
 		inm = imo->imo_membership[idx];
 		imf = &imo->imo_mfilters[idx];
 		if (ssa->ss.ss_family != AF_UNSPEC) {
 			/*
 			 * MCAST_JOIN_SOURCE_GROUP on an exclusive membership
 			 * is an error. On an existing inclusive membership,
 			 * it just adds the source to the filter list.
 			 */
 			if (imf->imf_st[1] != MCAST_INCLUDE) {
 				error = EINVAL;
 				goto out_inp_locked;
 			}
 			/*
 			 * Throw out duplicates.
 			 *
 			 * XXX FIXME: This makes a naive assumption that
 			 * even if entries exist for *ssa in this imf,
 			 * they will be rejected as dupes, even if they
 			 * are not valid in the current mode (in-mode).
 			 *
 			 * in_msource is transactioned just as for anything
 			 * else in SSM -- but note naive use of inm_graft()
 			 * below for allocating new filter entries.
 			 *
 			 * This is only an issue if someone mixes the
 			 * full-state SSM API with the delta-based API,
 			 * which is discouraged in the relevant RFCs.
 			 */
 			lims = imo_match_source(imo, idx, &ssa->sa);
 			if (lims != NULL /*&&
 			    lims->imsl_st[1] == MCAST_INCLUDE*/) {
 				error = EADDRNOTAVAIL;
 				goto out_inp_locked;
 			}
 		} else {
 			/*
 			 * MCAST_JOIN_GROUP on an existing exclusive
 			 * membership is an error; return EADDRINUSE
 			 * to preserve 4.4BSD API idempotence, and
 			 * avoid tedious detour to code below.
 			 * NOTE: This is bending RFC 3678 a bit.
 			 *
 			 * On an existing inclusive membership, this is also
 			 * an error; if you want to change filter mode,
 			 * you must use the userland API setsourcefilter().
 			 * XXX We don't reject this for imf in UNDEFINED
 			 * state at t1, because allocation of a filter
 			 * is atomic with allocation of a membership.
 			 */
 			error = EINVAL;
 			if (imf->imf_st[1] == MCAST_EXCLUDE)
 				error = EADDRINUSE;
 			goto out_inp_locked;
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	if (is_new) {
 		if (imo->imo_num_memberships == imo->imo_max_memberships) {
 			error = imo_grow(imo);
 			if (error)
 				goto out_inp_locked;
 		}
 		/*
 		 * Allocate the new slot upfront so we can deal with
 		 * grafting the new source filter in same code path
 		 * as for join-source on existing membership.
 		 */
 		idx = imo->imo_num_memberships;
 		imo->imo_membership[idx] = NULL;
 		imo->imo_num_memberships++;
 		KASSERT(imo->imo_mfilters != NULL,
 		    ("%s: imf_mfilters vector was not allocated", __func__));
 		imf = &imo->imo_mfilters[idx];
 		KASSERT(RB_EMPTY(&imf->imf_sources),
 		    ("%s: imf_sources not empty", __func__));
 	}
 
 	/*
 	 * Graft new source into filter list for this inpcb's
 	 * membership of the group. The in_multi may not have
 	 * been allocated yet if this is a new membership, however,
 	 * the in_mfilter slot will be allocated and must be initialized.
 	 *
 	 * Note: Grafting of exclusive mode filters doesn't happen
 	 * in this path.
 	 * XXX: Should check for non-NULL lims (node exists but may
 	 * not be in-mode) for interop with full-state API.
 	 */
 	if (ssa->ss.ss_family != AF_UNSPEC) {
 		/* Membership starts in IN mode */
 		if (is_new) {
 			CTR1(KTR_IGMPV3, "%s: new join w/source", __func__);
 			imf_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE);
 		} else {
 			CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow");
 		}
 		lims = imf_graft(imf, MCAST_INCLUDE, &ssa->sin);
 		if (lims == NULL) {
 			CTR1(KTR_IGMPV3, "%s: merge imf state failed",
 			    __func__);
 			error = ENOMEM;
 			goto out_imo_free;
 		}
 	} else {
 		/* No address specified; Membership starts in EX mode */
 		if (is_new) {
 			CTR1(KTR_IGMPV3, "%s: new join w/o source", __func__);
 			imf_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE);
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 */
 	IN_MULTI_LOCK();
 
 	if (is_new) {
 		error = in_joingroup_locked(ifp, &gsa->sin.sin_addr, imf,
 		    &inm);
 		if (error) {
                         CTR1(KTR_IGMPV3, "%s: in_joingroup_locked failed", 
                             __func__);
                         IN_MULTI_UNLOCK();
 			goto out_imo_free;
                 }
 		imo->imo_membership[idx] = inm;
 	} else {
 		CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 		error = inm_merge(inm, imf);
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: failed to merge inm state",
 			    __func__);
 			goto out_in_multi_locked;
 		}
 		CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 		error = igmp_change_state(inm);
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: failed igmp downcall",
 			    __func__);
 			goto out_in_multi_locked;
 		}
 	}
 
 out_in_multi_locked:
 
 	IN_MULTI_UNLOCK();
 
 	INP_WLOCK_ASSERT(inp);
 	if (error) {
 		imf_rollback(imf);
 		if (is_new)
 			imf_purge(imf);
 		else
 			imf_reap(imf);
 	} else {
 		imf_commit(imf);
 	}
 
 out_imo_free:
 	if (error && is_new) {
 		imo->imo_membership[idx] = NULL;
 		--imo->imo_num_memberships;
 	}
 
 out_inp_locked:
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Leave an IPv4 multicast group on an inpcb, possibly with a source.
  *
  * Handles IP_DROP_MEMBERSHIP, IP_DROP_SOURCE_MEMBERSHIP, MCAST_LEAVE_GROUP
  * and MCAST_LEAVE_SOURCE_GROUP.  When no source is supplied the whole
  * membership is dropped ("final" leave); otherwise only the given source
  * is pruned from the socket's filter for that group.
  *
  * Returns 0 on success or an errno value:
  *  EOPNOTSUPP	unknown socket option name;
  *  EINVAL	malformed group/source sockaddr, or group is not multicast;
  *  EADDRNOTAVAIL	bad interface index, socket is not a member of the
  *		group, source leave attempted on an exclusive membership,
  *		or the source is not present in the filter;
  *  or whatever sooptcopyin(), imf_prune(), inm_merge() or
  *  igmp_change_state() report.
  *
  * The inpcb write lock is held from inp_findmoptions() until return.
  */
 static int
 inp_leave_group(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct group_source_req		 gsr;
 	struct ip_mreq_source		 mreqs;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in_mfilter		*imf;
 	struct ip_moptions		*imo;
 	struct in_msource		*ims;
 	struct in_multi			*inm;
 	size_t				 idx;
 	int				 error, is_final;
 
 	ifp = NULL;
 	error = 0;
 	is_final = 1;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	gsa->ss.ss_family = AF_UNSPEC;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 	ssa->ss.ss_family = AF_UNSPEC;
 
 	switch (sopt->sopt_name) {
 	case IP_DROP_MEMBERSHIP:
 	case IP_DROP_SOURCE_MEMBERSHIP:
 		if (sopt->sopt_name == IP_DROP_MEMBERSHIP) {
 			error = sooptcopyin(sopt, &mreqs,
 			    sizeof(struct ip_mreq),
 			    sizeof(struct ip_mreq));
 			/*
 			 * Bail out before touching mreqs: on failure its
 			 * contents were never filled in.
 			 */
 			if (error)
 				return (error);
 			/*
 			 * Swap interface and sourceaddr arguments,
 			 * as ip_mreq and ip_mreq_source are laid
 			 * out differently.
 			 */
 			mreqs.imr_interface = mreqs.imr_sourceaddr;
 			mreqs.imr_sourceaddr.s_addr = INADDR_ANY;
 		} else if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) {
 			error = sooptcopyin(sopt, &mreqs,
 			    sizeof(struct ip_mreq_source),
 			    sizeof(struct ip_mreq_source));
 		}
 		if (error)
 			return (error);
 
 		gsa->sin.sin_family = AF_INET;
 		gsa->sin.sin_len = sizeof(struct sockaddr_in);
 		gsa->sin.sin_addr = mreqs.imr_multiaddr;
 
 		if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) {
 			ssa->sin.sin_family = AF_INET;
 			ssa->sin.sin_len = sizeof(struct sockaddr_in);
 			ssa->sin.sin_addr = mreqs.imr_sourceaddr;
 		}
 
 		/*
 		 * Attempt to look up hinted ifp from interface address.
 		 * Fallthrough with null ifp iff lookup fails, to
 		 * preserve 4.4BSD mcast API idempotence.
 		 * XXX NOTE WELL: The RFC 3678 API is preferred because
 		 * using an IPv4 address as a key is racy.
 		 */
 		if (!in_nullhost(mreqs.imr_interface))
 			INADDR_TO_IFP(mreqs.imr_interface, ifp);
 
 		CTR3(KTR_IGMPV3, "%s: imr_interface = %s, ifp = %p",
 		    __func__, inet_ntoa(mreqs.imr_interface), ifp);
 
 		break;
 
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 		if (sopt->sopt_name == MCAST_LEAVE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_req),
 			    sizeof(struct group_req));
 		} else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_source_req),
 			    sizeof(struct group_source_req));
 		}
 		if (error)
 			return (error);
 
 		if (gsa->sin.sin_family != AF_INET ||
 		    gsa->sin.sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 
 		if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
 			if (ssa->sin.sin_family != AF_INET ||
 			    ssa->sin.sin_len != sizeof(struct sockaddr_in))
 				return (EINVAL);
 		}
 
 		if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
 			return (EADDRNOTAVAIL);
 
 		ifp = ifnet_byindex(gsr.gsr_interface);
 
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 		break;
 
 	default:
 		CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 	}
 
 	if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 		return (EINVAL);
 
 	/*
 	 * Find the membership in the membership array.
 	 * Every exit from here on goes through out_inp_locked, which
 	 * releases the inpcb write lock.
 	 */
 	imo = inp_findmoptions(inp);
 	idx = imo_match_group(imo, ifp, &gsa->sa);
 	if (idx == -1) {
 		error = EADDRNOTAVAIL;
 		goto out_inp_locked;
 	}
 	inm = imo->imo_membership[idx];
 	imf = &imo->imo_mfilters[idx];
 
 	/* A source was supplied: only that source leaves, not the group. */
 	if (ssa->ss.ss_family != AF_UNSPEC)
 		is_final = 0;
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * If we were instructed only to leave a given source, do so.
 	 * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships.
 	 */
 	if (is_final) {
 		imf_leave(imf);
 	} else {
 		if (imf->imf_st[0] == MCAST_EXCLUDE) {
 			error = EADDRNOTAVAIL;
 			goto out_inp_locked;
 		}
 		ims = imo_match_source(imo, idx, &ssa->sa);
 		if (ims == NULL) {
 			CTR3(KTR_IGMPV3, "%s: source %s %spresent", __func__,
 			    inet_ntoa(ssa->sin.sin_addr), "not ");
 			error = EADDRNOTAVAIL;
 			goto out_inp_locked;
 		}
 		CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block");
 		error = imf_prune(imf, &ssa->sin);
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: merge imf state failed",
 			    __func__);
 			goto out_inp_locked;
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 */
 	IN_MULTI_LOCK();
 
 	if (is_final) {
 		/*
 		 * Give up the multicast address record to which
 		 * the membership points.  The result is deliberately
 		 * ignored; the membership is removed regardless.
 		 */
 		(void)in_leavegroup_locked(inm, imf);
 	} else {
 		CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 		error = inm_merge(inm, imf);
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: failed to merge inm state",
 			    __func__);
 			goto out_in_multi_locked;
 		}
 
 		CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 		error = igmp_change_state(inm);
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: failed igmp downcall",
 			    __func__);
 		}
 	}
 
 out_in_multi_locked:
 
 	IN_MULTI_UNLOCK();
 
 	/* Finish the socket-layer transaction: undo on error, else commit. */
 	if (error)
 		imf_rollback(imf);
 	else
 		imf_commit(imf);
 
 	imf_reap(imf);
 
 	if (is_final) {
 		/* Remove the gap in the membership and filter array. */
 		for (++idx; idx < imo->imo_num_memberships; ++idx) {
 			imo->imo_membership[idx-1] = imo->imo_membership[idx];
 			imo->imo_mfilters[idx-1] = imo->imo_mfilters[idx];
 		}
 		/*
 		 * idx - 1 is now the slot that falls off the end of the
 		 * in-use range.  After the structure copies above it still
 		 * carries a copy of the moved filter's source-tree root,
 		 * and the join path asserts that a freshly claimed slot has
 		 * an empty source tree.  Put it back into its pristine state
 		 * so it neither trips that assertion nor aliases a live
 		 * filter's sources.
 		 */
 		imf_init(&imo->imo_mfilters[idx - 1], MCAST_UNDEFINED,
 		    MCAST_EXCLUDE);
 		imo->imo_num_memberships--;
 	}
 
 out_inp_locked:
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Choose the outgoing interface for IPv4 multicast datagrams sent
  * from this inpcb.
  *
  * The option value is either a struct ip_mreqn (interface named by
  * index) or a struct in_addr (interface named by one of its addresses);
  * the two are told apart by the size of the option value.  An index of 0
  * or an address of INADDR_ANY clears any previous selection, in which
  * case an interface is picked for every send.
  *
  * Returns 0 on success, EINVAL for an out-of-range index, EADDRNOTAVAIL
  * if no interface matches, EOPNOTSUPP if the interface is not
  * multicast-capable, or the error from sooptcopyin().
  */
 static int
 inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ip_moptions	*imo;
 	struct ifnet		*ifp;
 	struct ip_mreqn		 mreqn;
 	struct in_addr		 addr;
 	int			 error;
 
 	/* NULL means "no interface selected" unless a lookup says otherwise. */
 	ifp = NULL;
 
 	if (sopt->sopt_valsize != sizeof(struct ip_mreqn)) {
 		/*
 		 * Traditional BSD usage: the interface is identified
 		 * by an IPv4 address.
 		 */
 		error = sooptcopyin(sopt, &addr, sizeof(struct in_addr),
 		    sizeof(struct in_addr));
 		if (error)
 			return (error);
 		if (!in_nullhost(addr)) {
 			INADDR_TO_IFP(addr, ifp);
 			if (ifp == NULL)
 				return (EADDRNOTAVAIL);
 		}
 		CTR3(KTR_IGMPV3, "%s: ifp = %p, addr = %s", __func__, ifp,
 		    inet_ntoa(addr));
 	} else {
 		/*
 		 * Linux-derived usage: the interface is identified by
 		 * index inside a struct ip_mreqn.
 		 */
 		error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn),
 		    sizeof(struct ip_mreqn));
 		if (error)
 			return (error);
 
 		if (mreqn.imr_ifindex < 0 || mreqn.imr_ifindex > V_if_index)
 			return (EINVAL);
 
 		if (mreqn.imr_ifindex != 0) {
 			ifp = ifnet_byindex(mreqn.imr_ifindex);
 			if (ifp == NULL)
 				return (EADDRNOTAVAIL);
 		}
 	}
 
 	/* An interface without multicast support cannot be selected. */
 	if (ifp != NULL && !(ifp->if_flags & IFF_MULTICAST))
 		return (EOPNOTSUPP);
 
 	/* Record the choice; the inpcb is unlocked again before returning. */
 	imo = inp_findmoptions(inp);
 	imo->imo_multicast_ifp = ifp;
 	imo->imo_multicast_addr.s_addr = INADDR_ANY;
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 /*
  * Atomically set source filters on a socket for an IPv4 multicast group.
  *
  * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
  */
 static int
 inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct __msfilterreq	 msfr;
 	sockunion_t		*gsa;
 	struct ifnet		*ifp;
 	struct in_mfilter	*imf;
 	struct ip_moptions	*imo;
 	struct in_multi		*inm;
 	size_t			 idx;
 	int			 error;
 
 	error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
 	    sizeof(struct __msfilterreq));
 	if (error)
 		return (error);
 
 	if (msfr.msfr_nsrcs > in_mcast_maxsocksrc)
 		return (ENOBUFS);
 
 	if ((msfr.msfr_fmode != MCAST_EXCLUDE &&
 	     msfr.msfr_fmode != MCAST_INCLUDE))
 		return (EINVAL);
 
 	if (msfr.msfr_group.ss_family != AF_INET ||
 	    msfr.msfr_group.ss_len != sizeof(struct sockaddr_in))
 		return (EINVAL);
 
 	gsa = (sockunion_t *)&msfr.msfr_group;
 	if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 		return (EINVAL);
 
 	gsa->sin.sin_port = 0;	/* ignore port */
 
 	if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
 		return (EADDRNOTAVAIL);
 
 	ifp = ifnet_byindex(msfr.msfr_ifindex);
 	if (ifp == NULL)
 		return (EADDRNOTAVAIL);
 
 	/*
 	 * Take the INP write lock.
 	 * Check if this socket is a member of this group.
 	 */
 	imo = inp_findmoptions(inp);
 	idx = imo_match_group(imo, ifp, &gsa->sa);
 	if (idx == -1 || imo->imo_mfilters == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_inp_locked;
 	}
 	inm = imo->imo_membership[idx];
 	imf = &imo->imo_mfilters[idx];
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	imf->imf_st[1] = msfr.msfr_fmode;
 
 	/*
 	 * Apply any new source filters, if present.
 	 * Make a copy of the user-space source vector so
 	 * that we may copy them with a single copyin. This
 	 * allows us to deal with page faults up-front.
 	 */
 	if (msfr.msfr_nsrcs > 0) {
 		struct in_msource	*lims;
 		struct sockaddr_in	*psin;
 		struct sockaddr_storage	*kss, *pkss;
 		int			 i;
 
 		INP_WUNLOCK(inp);
  
 		CTR2(KTR_IGMPV3, "%s: loading %lu source list entries",
 		    __func__, (unsigned long)msfr.msfr_nsrcs);
 		kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
 		    M_TEMP, M_WAITOK);
 		error = copyin(msfr.msfr_srcs, kss,
 		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
 		if (error) {
 			free(kss, M_TEMP);
 			return (error);
 		}
 
 		INP_WLOCK(inp);
 
 		/*
 		 * Mark all source filters as UNDEFINED at t1.
 		 * Restore new group filter mode, as imf_leave()
 		 * will set it to INCLUDE.
 		 */
 		imf_leave(imf);
 		imf->imf_st[1] = msfr.msfr_fmode;
 
 		/*
 		 * Update socket layer filters at t1, lazy-allocating
 		 * new entries. This saves a bunch of memory at the
 		 * cost of one RB_FIND() per source entry; duplicate
 		 * entries in the msfr_nsrcs vector are ignored.
 		 * If we encounter an error, rollback transaction.
 		 *
 		 * XXX This too could be replaced with a set-symmetric
 		 * difference like loop to avoid walking from root
 		 * every time, as the key space is common.
 		 */
 		for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) {
 			psin = (struct sockaddr_in *)pkss;
 			if (psin->sin_family != AF_INET) {
 				error = EAFNOSUPPORT;
 				break;
 			}
 			if (psin->sin_len != sizeof(struct sockaddr_in)) {
 				error = EINVAL;
 				break;
 			}
 			error = imf_get_source(imf, psin, &lims);
 			if (error)
 				break;
 			lims->imsl_st[1] = imf->imf_st[1];
 		}
 		free(kss, M_TEMP);
 	}
 
 	if (error)
 		goto out_imf_rollback;
 
 	INP_WLOCK_ASSERT(inp);
 	IN_MULTI_LOCK();
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 */
 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 	error = inm_merge(inm, imf);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__);
 		goto out_in_multi_locked;
 	}
 
 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 	error = igmp_change_state(inm);
 	if (error)
 		CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
 
 out_in_multi_locked:
 
 	IN_MULTI_UNLOCK();
 
 out_imf_rollback:
 	if (error)
 		imf_rollback(imf);
 	else
 		imf_commit(imf);
 
 	imf_reap(imf);
 
 out_inp_locked:
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Set the IP multicast options in response to user setsockopt().
  *
  * Many of the socket options handled in this function duplicate the
  * functionality of socket options in the regular unicast API. However,
  * it is not possible to merge the duplicate code, because the idempotence
  * of the IPv4 multicast part of the BSD Sockets API must be preserved;
  * the effects of these options must be treated as separate and distinct.
  *
  * Only SOCK_RAW and SOCK_DGRAM sockets that are not divert sockets are
  * accepted; anything else, and any unknown option name, yields EOPNOTSUPP.
  * Otherwise the return value is 0 or the errno produced by the handler
  * for the specific option.  The inpcb is unlocked on return.
  *
  * SMPng: XXX: Unlocked read of inp_socket believed OK.
  * FUTURE: The IP_MULTICAST_VIF option may be eliminated if MROUTING
  * is refactored to no longer use vifs.
  */
 int
 inp_setmoptions(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ip_moptions	*imo;
 	int			 error;
 
 	error = 0;
 
 	/*
 	 * If socket is neither of type SOCK_RAW or SOCK_DGRAM,
 	 * or is a divert socket, reject it.
 	 */
 	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
 	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
 	     inp->inp_socket->so_proto->pr_type != SOCK_DGRAM))
 		return (EOPNOTSUPP);
 
 	switch (sopt->sopt_name) {
 	case IP_MULTICAST_VIF: {
 		int vifi;
 		/*
 		 * Select a multicast VIF for transmission.
 		 * Only useful if multicast forwarding is active.
 		 */
 		if (legal_vif_num == NULL) {
 			error = EOPNOTSUPP;
 			break;
 		}
 		error = sooptcopyin(sopt, &vifi, sizeof(int), sizeof(int));
 		if (error)
 			break;
 		/* -1 is accepted in addition to any legal vif number. */
 		if (!legal_vif_num(vifi) && (vifi != -1)) {
 			error = EINVAL;
 			break;
 		}
 		imo = inp_findmoptions(inp);
 		imo->imo_multicast_vif = vifi;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IP_MULTICAST_IF:
 		error = inp_set_multicast_if(inp, sopt);
 		break;
 
 	case IP_MULTICAST_TTL: {
 		u_char ttl;
 
 		/*
 		 * Set the IP time-to-live for outgoing multicast packets.
 		 * The original multicast API required a char argument,
 		 * which is inconsistent with the rest of the socket API.
 		 * We allow either a char or an int.
 		 */
 		if (sopt->sopt_valsize == sizeof(u_char)) {
 			error = sooptcopyin(sopt, &ttl, sizeof(u_char),
 			    sizeof(u_char));
 			if (error)
 				break;
 		} else {
 			u_int ittl;
 
 			error = sooptcopyin(sopt, &ittl, sizeof(u_int),
 			    sizeof(u_int));
 			if (error)
 				break;
 			if (ittl > 255) {
 				error = EINVAL;
 				break;
 			}
 			ttl = (u_char)ittl;
 		}
 		imo = inp_findmoptions(inp);
 		imo->imo_multicast_ttl = ttl;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IP_MULTICAST_LOOP: {
 		u_char loop;
 
 		/*
 		 * Set the loopback flag for outgoing multicast packets.
 		 * Must be zero or one.  The original multicast API required a
 		 * char argument, which is inconsistent with the rest
 		 * of the socket API.  We allow either a char or an int.
 		 */
 		if (sopt->sopt_valsize == sizeof(u_char)) {
 			error = sooptcopyin(sopt, &loop, sizeof(u_char),
 			    sizeof(u_char));
 			if (error)
 				break;
 		} else {
 			u_int iloop;
 
 			error = sooptcopyin(sopt, &iloop, sizeof(u_int),
 					    sizeof(u_int));
 			if (error)
 				break;
 			/*
 			 * Collapse to 0/1 on the full-width value.  A plain
 			 * cast to u_char would keep only the low byte and
 			 * turn a nonzero request such as 0x100 into "off".
 			 */
 			loop = (iloop != 0);
 		}
 		imo = inp_findmoptions(inp);
 		imo->imo_multicast_loop = !!loop;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IP_ADD_MEMBERSHIP:
 	case IP_ADD_SOURCE_MEMBERSHIP:
 	case MCAST_JOIN_GROUP:
 	case MCAST_JOIN_SOURCE_GROUP:
 		error = inp_join_group(inp, sopt);
 		break;
 
 	case IP_DROP_MEMBERSHIP:
 	case IP_DROP_SOURCE_MEMBERSHIP:
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 		error = inp_leave_group(inp, sopt);
 		break;
 
 	case IP_BLOCK_SOURCE:
 	case IP_UNBLOCK_SOURCE:
 	case MCAST_BLOCK_SOURCE:
 	case MCAST_UNBLOCK_SOURCE:
 		error = inp_block_unblock_source(inp, sopt);
 		break;
 
 	case IP_MSFILTER:
 		error = inp_set_source_filters(inp, sopt);
 		break;
 
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 
 	/* Every handler above must have released the inpcb lock. */
 	INP_UNLOCK_ASSERT(inp);
 
 	return (error);
 }
 
 /*
  * Expose IGMP's multicast filter mode and source list(s) to userland,
  * keyed by (ifindex, group).
  * The filter mode is written out as a uint32_t, followed by
  * 0..n of struct in_addr.
  * For use by ifmcstat(8).
  *
  * The node is read-only (EPERM on write) and expects exactly two name
  * components: the interface index and the group address.  Returns
  * EINVAL for a wrong name length or a non-multicast group, ENOENT for
  * an unknown interface index, or the error from wiring/copying out the
  * user buffer.
  *
  * SMPng: NOTE: unlocked read of ifindex space.
  */
 static int
 sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS)
 {
 	struct in_addr			 src, group;
 	struct ifnet			*ifp;
 	struct ifmultiaddr		*ifma;
 	struct in_multi			*inm;
 	struct ip_msource		*ims;
 	int				*name;
 	int				 retval;
 	u_int				 namelen;
 	uint32_t			 fmode, ifindex;
 
 	name = (int *)arg1;
 	namelen = arg2;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	if (namelen != 2)
 		return (EINVAL);
 
 	ifindex = name[0];
 	if (ifindex <= 0 || ifindex > V_if_index) {
 		CTR2(KTR_IGMPV3, "%s: ifindex %u out of range",
 		    __func__, ifindex);
 		return (ENOENT);
 	}
 
 	group.s_addr = name[1];
 	if (!IN_MULTICAST(ntohl(group.s_addr))) {
 		CTR2(KTR_IGMPV3, "%s: group %s is not multicast",
 		    __func__, inet_ntoa(group));
 		return (EINVAL);
 	}
 
 	ifp = ifnet_byindex(ifindex);
 	if (ifp == NULL) {
 		CTR2(KTR_IGMPV3, "%s: no ifp for ifindex %u",
 		    __func__, ifindex);
 		return (ENOENT);
 	}
 
 	/* Wire enough of the output buffer for the mode plus every source. */
 	retval = sysctl_wire_old_buffer(req,
 	    sizeof(uint32_t) + (in_mcast_maxgrpsrc * sizeof(struct in_addr)));
 	if (retval)
 		return (retval);
 
 	IN_MULTI_LOCK();
 
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_INET ||
 		    ifma->ifma_protospec == NULL)
 			continue;
 		inm = (struct in_multi *)ifma->ifma_protospec;
 		if (!in_hosteq(inm->inm_addr, group))
 			continue;
 		fmode = inm->inm_st[1].iss_fmode;
 		retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t));
 		if (retval != 0)
 			break;
 		RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) {
 #ifdef KTR
 			struct in_addr ina;
 			ina.s_addr = htonl(ims->ims_haddr);
 			CTR2(KTR_IGMPV3, "%s: visit node %s", __func__,
 			    inet_ntoa(ina));
 #endif
 			/*
 			 * Only copy-out sources which are in-mode.
 			 */
 			if (fmode != ims_get_mode(inm, ims, 1)) {
 				CTR1(KTR_IGMPV3, "%s: skip non-in-mode",
 				    __func__);
 				continue;
 			}
 			src.s_addr = htonl(ims->ims_haddr);
 			retval = SYSCTL_OUT(req, &src, sizeof(struct in_addr));
 			if (retval != 0)
 				break;
 		}
 		/*
 		 * The break above only leaves the source walk.  Stop the
 		 * address walk as well so that a copy-out failure is what
 		 * gets returned, rather than being overwritten by a later
 		 * iteration.
 		 */
 		if (retval != 0)
 			break;
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	IN_MULTI_UNLOCK();
 
 	return (retval);
 }
 
 #if defined(KTR) && (KTR_COMPILE & KTR_IGMPV3)
 
 /* Short printable names for filter modes, indexed by the mode value. */
 static const char *inm_modestrs[] = {
 	"un",
 	"in",
 	"ex"
 };
 
 /*
  * Map a filter mode to its short name; anything outside
  * MCAST_UNDEFINED..MCAST_EXCLUDE yields "??".
  */
 static const char *
 inm_mode_str(const int mode)
 {
 
 	if (mode < MCAST_UNDEFINED || mode > MCAST_EXCLUDE)
 		return ("??");
 	return (inm_modestrs[mode]);
 }
 
 /* Printable names for membership states, indexed by the state value. */
 static const char *inm_statestrs[] = {
 	"not-member", "silent", "idle",
 	"lazy", "sleeping", "awakening",
 	"query-pending", "sg-query-pending", "leaving"
 };
 
 /*
  * Map a membership state to its name; anything outside
  * IGMP_NOT_MEMBER..IGMP_LEAVING_MEMBER yields "??".
  */
 static const char *
 inm_state_str(const int state)
 {
 
 	if (state < IGMP_NOT_MEMBER || state > IGMP_LEAVING_MEMBER)
 		return ("??");
 	return (inm_statestrs[state]);
 }
 
 /*
  * Dump the contents of an in_multi to the console.
  * Does nothing unless KTR_IGMPV3 is set in ktr_mask.
  */
 void
 inm_print(const struct in_multi *inm)
 {
 	int slot;
 
 	if (!(ktr_mask & KTR_IGMPV3))
 		return;
 
 	printf("%s: --- begin inm %p ---\n", __func__, inm);
 
 	/* Identity: group address, interface and link-layer record. */
 	printf("addr %s ifp %p(%s) ifma %p\n",
 	    inet_ntoa(inm->inm_addr), inm->inm_ifp,
 	    inm->inm_ifp->if_xname, inm->inm_ifma);
 
 	/* Timer, state, reference count and queue length. */
 	printf("timer %u state %s refcount %u scq.len %u\n",
 	    inm->inm_timer, inm_state_str(inm->inm_state),
 	    inm->inm_refcount, inm->inm_scq.mq_len);
 
 	printf("igi %p nsrc %lu sctimer %u scrv %u\n",
 	    inm->inm_igi, inm->inm_nsrc,
 	    inm->inm_sctimer, inm->inm_scrv);
 
 	/* One line for each of the two inm_st[] entries. */
 	slot = 0;
 	while (slot < 2) {
 		printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", slot,
 		    inm_mode_str(inm->inm_st[slot].iss_fmode),
 		    inm->inm_st[slot].iss_asm,
 		    inm->inm_st[slot].iss_ex,
 		    inm->inm_st[slot].iss_in,
 		    inm->inm_st[slot].iss_rec);
 		slot++;
 	}
 
 	printf("%s: --- end inm %p ---\n", __func__, inm);
 }
 
 #else /* !KTR || !(KTR_COMPILE & KTR_IGMPV3) */
 
 /* No-op variant used when the IGMPv3 KTR class is not compiled in. */
 void
 inm_print(const struct in_multi *inm)
 {
 
 	(void)inm;
 }
 
 #endif /* KTR && (KTR_COMPILE & KTR_IGMPV3) */
 
 /*
  * Emit the red-black tree routines for ip_msource_tree: nodes are
  * struct ip_msource linked through ims_link and ordered by ip_msource_cmp.
  */
 RB_GENERATE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp);
Index: head/sys/netinet/ip_icmp.c
===================================================================
--- head/sys/netinet/ip_icmp.c	(revision 292014)
+++ head/sys/netinet/ip_icmp.c	(revision 292015)
@@ -1,969 +1,969 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_icmp.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
+#include <netinet/in_fib.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
 #include <netinet/icmp_var.h>
 
 #ifdef INET
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 #endif /* INET */
 
 /*
  * ICMP routines: error generation, receive packet processing, and
  * routines to turnaround packets back to the originator, and
  * host table maintenance routines.
  */
 /* net.inet.icmp.icmplim: per-second cap on ICMP responses (default 200). */
 static VNET_DEFINE(int, icmplim) = 200;
 #define	V_icmplim			VNET(icmplim)
 SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmplim), 0,
 	"Maximum number of ICMP responses per second");
 
 /* net.inet.icmp.icmplim_output: log rate-limiting events (default on). */
 static VNET_DEFINE(int, icmplim_output) = 1;
 #define	V_icmplim_output		VNET(icmplim_output)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmplim_output), 0,
 	"Enable logging of ICMP response rate limiting");
 
 #ifdef INET
 /* Per-VNET, per-CPU ICMP statistics, exported as net.inet.icmp.stats. */
 VNET_PCPUSTAT_DEFINE(struct icmpstat, icmpstat);
 VNET_PCPUSTAT_SYSINIT(icmpstat);
 SYSCTL_VNET_PCPUSTAT(_net_inet_icmp, ICMPCTL_STATS, stats, struct icmpstat,
     icmpstat, "ICMP statistics (struct icmpstat, netinet/icmp_var.h)");
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(icmpstat);
 #endif /* VIMAGE */
 
 /* net.inet.icmp.maskrepl: answer Address Mask Requests (default off). */
 static VNET_DEFINE(int, icmpmaskrepl) = 0;
 #define	V_icmpmaskrepl			VNET(icmpmaskrepl)
 SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmpmaskrepl), 0,
 	"Reply to ICMP Address Mask Request packets.");
 
 /* net.inet.icmp.maskfake: value for faked mask replies (default 0). */
 static VNET_DEFINE(u_int, icmpmaskfake) = 0;
 #define	V_icmpmaskfake			VNET(icmpmaskfake)
 SYSCTL_UINT(_net_inet_icmp, OID_AUTO, maskfake, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmpmaskfake), 0,
 	"Fake reply to ICMP Address Mask Request packets.");
 
 /*
  * net.inet.icmp.drop_redirect: ignore ICMP redirects (default off).
  * Not static, unlike its neighbours, so it is visible outside this file.
  */
 VNET_DEFINE(int, drop_redirect) = 0;
 #define	V_drop_redirect			VNET(drop_redirect)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(drop_redirect), 0, "Ignore ICMP redirects");
 
 /* net.inet.icmp.log_redirect: log ICMP redirects (default off). */
 static VNET_DEFINE(int, log_redirect) = 0;
 #define	V_log_redirect			VNET(log_redirect)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(log_redirect), 0,
 	"Log ICMP redirects to the console");
 
 /*
  * net.inet.icmp.reply_src: string of at most IFNAMSIZ bytes.
  * NOTE(review): the buffer size suggests an interface name -- confirm
  * against the code that reads V_reply_src.
  */
 static VNET_DEFINE(char, reply_src[IFNAMSIZ]);
 #define	V_reply_src			VNET(reply_src)
 SYSCTL_STRING(_net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(reply_src), IFNAMSIZ,
 	"icmp reply source for non-local packets.");
 
 /* net.inet.icmp.reply_from_interface (default off). */
 static VNET_DEFINE(int, icmp_rfi) = 0;
 #define	V_icmp_rfi			VNET(icmp_rfi)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, reply_from_interface, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmp_rfi), 0,
 	"ICMP reply from incoming interface for non-local packets");
 
 /*
  * net.inet.icmp.quotelen: bytes of the offending packet quoted in an
  * ICMP error (default 8); consumed by icmp_error() below.
  */
 static VNET_DEFINE(int, icmp_quotelen) = 8;
 #define	V_icmp_quotelen			VNET(icmp_quotelen)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, quotelen, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmp_quotelen), 0,
 	"Number of bytes from original packet to quote in ICMP reply");
 
 /*
  * ICMP broadcast echo sysctl
  * (net.inet.icmp.bmcastecho, default off; registered with an empty
  * description string).
  */
 static VNET_DEFINE(int, icmpbmcastecho) = 0;
 #define	V_icmpbmcastecho		VNET(icmpbmcastecho)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmpbmcastecho), 0,
 	"");
 
 static VNET_DEFINE(int, icmptstamprepl) = 1;
 #define	V_icmptstamprepl		VNET(icmptstamprepl)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, tstamprepl, CTLFLAG_RW,
 	&VNET_NAME(icmptstamprepl), 0, "Respond to ICMP Timestamp packets");
 
 #ifdef ICMPPRINTFS
 int	icmpprintfs = 0;
 #endif
 
 static void	icmp_reflect(struct mbuf *);
 static void	icmp_send(struct mbuf *, struct mbuf *);
 
 extern	struct protosw inetsw[];
 
 /*
  * Kernel module interface for updating icmpstat.  The argument is an index
  * into the current VNET's icmpstat, treated as an array of counters; the
  * selected element is incremented by one via counter_u64_add().  While this
  * encodes the general layout of icmpstat into the caller, it doesn't encode
  * its location, so that changes to how the statistics are stored won't
  * cause binary compatibility problems for kernel modules.
  *
  * statnum is not range-checked here; callers must pass a valid index.
  */
 void
 kmod_icmpstat_inc(int statnum)
 {
 
 	counter_u64_add(VNET(icmpstat)[statnum], 1);
 }
 
 /*
  * Generate an error packet of type error
  * in response to bad packet ip.
  */
 void
 icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu)
 {
 	register struct ip *oip = mtod(n, struct ip *), *nip;
 	register unsigned oiphlen = oip->ip_hl << 2;
 	register struct icmp *icp;
 	register struct mbuf *m;
 	unsigned icmplen, icmpelen, nlen;
 
 	KASSERT((u_int)type <= ICMP_MAXTYPE, ("%s: illegal ICMP type", __func__));
 #ifdef ICMPPRINTFS
 	if (icmpprintfs)
 		printf("icmp_error(%p, %x, %d)\n", oip, type, code);
 #endif
 	if (type != ICMP_REDIRECT)
 		ICMPSTAT_INC(icps_error);
 	/*
 	 * Don't send error:
 	 *  if the original packet was encrypted.
 	 *  if not the first fragment of message.
 	 *  in response to a multicast or broadcast packet.
 	 *  if the old packet protocol was an ICMP error message.
 	 */
 	if (n->m_flags & M_DECRYPTED)
 		goto freeit;
 	if (oip->ip_off & htons(~(IP_MF|IP_DF)))
 		goto freeit;
 	if (n->m_flags & (M_BCAST|M_MCAST))
 		goto freeit;
 	if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT &&
 	  n->m_len >= oiphlen + ICMP_MINLEN &&
 	  !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip + oiphlen))->icmp_type)) {
 		ICMPSTAT_INC(icps_oldicmp);
 		goto freeit;
 	}
 	/* Drop if IP header plus 8 bytes is not contignous in first mbuf. */
 	if (oiphlen + 8 > n->m_len)
 		goto freeit;
 	/*
 	 * Calculate length to quote from original packet and
 	 * prevent the ICMP mbuf from overflowing.
 	 * Unfortunatly this is non-trivial since ip_forward()
 	 * sends us truncated packets.
 	 */
 	nlen = m_length(n, NULL);
 	if (oip->ip_p == IPPROTO_TCP) {
 		struct tcphdr *th;
 		int tcphlen;
 
 		if (oiphlen + sizeof(struct tcphdr) > n->m_len &&
 		    n->m_next == NULL)
 			goto stdreply;
 		if (n->m_len < oiphlen + sizeof(struct tcphdr) &&
 		    ((n = m_pullup(n, oiphlen + sizeof(struct tcphdr))) == NULL))
 			goto freeit;
 		th = (struct tcphdr *)((caddr_t)oip + oiphlen);
 		tcphlen = th->th_off << 2;
 		if (tcphlen < sizeof(struct tcphdr))
 			goto freeit;
 		if (ntohs(oip->ip_len) < oiphlen + tcphlen)
 			goto freeit;
 		if (oiphlen + tcphlen > n->m_len && n->m_next == NULL)
 			goto stdreply;
 		if (n->m_len < oiphlen + tcphlen && 
 		    ((n = m_pullup(n, oiphlen + tcphlen)) == NULL))
 			goto freeit;
 		icmpelen = max(tcphlen, min(V_icmp_quotelen,
 		    ntohs(oip->ip_len) - oiphlen));
 	} else
 stdreply:	icmpelen = max(8, min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen));
 
 	icmplen = min(oiphlen + icmpelen, nlen);
 	if (icmplen < sizeof(struct ip))
 		goto freeit;
 
 	if (MHLEN > sizeof(struct ip) + ICMP_MINLEN + icmplen)
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 	else
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 	if (m == NULL)
 		goto freeit;
 #ifdef MAC
 	mac_netinet_icmp_reply(n, m);
 #endif
 	icmplen = min(icmplen, M_TRAILINGSPACE(m) - sizeof(struct ip) - ICMP_MINLEN);
 	m_align(m, ICMP_MINLEN + icmplen);
 	m->m_len = ICMP_MINLEN + icmplen;
 
 	/* XXX MRT  make the outgoing packet use the same FIB
 	 * that was associated with the incoming packet
 	 */
 	M_SETFIB(m, M_GETFIB(n));
 	icp = mtod(m, struct icmp *);
 	ICMPSTAT_INC(icps_outhist[type]);
 	icp->icmp_type = type;
 	if (type == ICMP_REDIRECT)
 		icp->icmp_gwaddr.s_addr = dest;
 	else {
 		icp->icmp_void = 0;
 		/*
 		 * The following assignments assume an overlay with the
 		 * just zeroed icmp_void field.
 		 */
 		if (type == ICMP_PARAMPROB) {
 			icp->icmp_pptr = code;
 			code = 0;
 		} else if (type == ICMP_UNREACH &&
 			code == ICMP_UNREACH_NEEDFRAG && mtu) {
 			icp->icmp_nextmtu = htons(mtu);
 		}
 	}
 	icp->icmp_code = code;
 
 	/*
 	 * Copy the quotation into ICMP message and
 	 * convert quoted IP header back to network representation.
 	 */
 	m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip);
 	nip = &icp->icmp_ip;
 
 	/*
 	 * Set up ICMP message mbuf and copy old IP header (without options)
 	 * in front of ICMP message.
 	 * If the original mbuf was meant to bypass the firewall, the error
 	 * reply should bypass as well.
 	 */
 	m->m_flags |= n->m_flags & M_SKIP_FIREWALL;
 	m->m_data -= sizeof(struct ip);
 	m->m_len += sizeof(struct ip);
 	m->m_pkthdr.len = m->m_len;
 	m->m_pkthdr.rcvif = n->m_pkthdr.rcvif;
 	nip = mtod(m, struct ip *);
 	bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip));
 	nip->ip_len = htons(m->m_len);
 	nip->ip_v = IPVERSION;
 	nip->ip_hl = 5;
 	nip->ip_p = IPPROTO_ICMP;
 	nip->ip_tos = 0;
 	nip->ip_off = 0;
 	icmp_reflect(m);
 
 freeit:
 	m_freem(n);
 }
 
 /*
  * Process a received ICMP message.
  *
  * *mp is the received packet (it is taken over here and *mp is cleared on
  * entry), *offp is used as the IP header length, and proto is only passed
  * through to rip_input().
  *
  * The message is length- and checksum-checked, then dispatched on type:
  *  - UNREACH/TIMXCEED/PARAMPROB are translated to a PRC_* code and handed
  *    to the pr_ctlinput routine of the quoted packet's protocol;
  *  - ECHO/TSTAMP/MASKREQ are rewritten in place into the matching reply
  *    and sent back through icmp_reflect();
  *  - REDIRECT is applied with in_rtredirect() on every FIB unless
  *    redirects are dropped or IP forwarding is enabled.
  * Anything that is neither reflected nor freed is finally passed to
  * rip_input().
  *
  * Always returns IPPROTO_DONE.
  */
 int
 icmp_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct icmp *icp;
 	struct in_ifaddr *ia;
 	struct mbuf *m = *mp;
 	struct ip *ip = mtod(m, struct ip *);
 	struct sockaddr_in icmpsrc, icmpdst, icmpgw;
 	int hlen = *offp;
 	int icmplen = ntohs(ip->ip_len) - *offp;
 	int i, code;
 	void (*ctlfunc)(int, struct sockaddr *, void *);
 	int fibnum;
 
 	*mp = NULL;
 
 	/*
 	 * Locate icmp structure in mbuf, and check
 	 * that not corrupted and of at least minimum length.
 	 */
 #ifdef ICMPPRINTFS
 	if (icmpprintfs) {
 		char buf[4 * sizeof "123"];
 		strcpy(buf, inet_ntoa(ip->ip_src));
 		printf("icmp_input from %s to %s, len %d\n",
 		       buf, inet_ntoa(ip->ip_dst), icmplen);
 	}
 #endif
 	if (icmplen < ICMP_MINLEN) {
 		ICMPSTAT_INC(icps_tooshort);
 		goto freeit;
 	}
 	/*
 	 * Make the IP header plus the first min(icmplen, ICMP_ADVLENMIN)
 	 * bytes of the ICMP message contiguous in the first mbuf.
 	 */
 	i = hlen + min(icmplen, ICMP_ADVLENMIN);
 	if (m->m_len < i && (m = m_pullup(m, i)) == NULL)  {
 		ICMPSTAT_INC(icps_tooshort);
 		/* m is NULL here, so there is nothing to hand to freeit. */
 		return (IPPROTO_DONE);
 	}
 	ip = mtod(m, struct ip *);
 	/*
 	 * Hide the IP header while checksumming the ICMP part; the
 	 * adjustment is undone right after the check.
 	 */
 	m->m_len -= hlen;
 	m->m_data += hlen;
 	icp = mtod(m, struct icmp *);
 	if (in_cksum(m, icmplen)) {
 		ICMPSTAT_INC(icps_checksum);
 		goto freeit;
 	}
 	m->m_len += hlen;
 	m->m_data -= hlen;
 
 #ifdef ICMPPRINTFS
 	if (icmpprintfs)
 		printf("icmp_input, type %d code %d\n", icp->icmp_type,
 		    icp->icmp_code);
 #endif
 
 	/*
 	 * Message type specific processing.
 	 */
 	if (icp->icmp_type > ICMP_MAXTYPE)
 		goto raw;
 
 	/* Initialize */
 	bzero(&icmpsrc, sizeof(icmpsrc));
 	icmpsrc.sin_len = sizeof(struct sockaddr_in);
 	icmpsrc.sin_family = AF_INET;
 	bzero(&icmpdst, sizeof(icmpdst));
 	icmpdst.sin_len = sizeof(struct sockaddr_in);
 	icmpdst.sin_family = AF_INET;
 	bzero(&icmpgw, sizeof(icmpgw));
 	icmpgw.sin_len = sizeof(struct sockaddr_in);
 	icmpgw.sin_family = AF_INET;
 
 	ICMPSTAT_INC(icps_inhist[icp->icmp_type]);
 	code = icp->icmp_code;
 	switch (icp->icmp_type) {
 
 	case ICMP_UNREACH:
 		/* Map the ICMP unreachable subcode onto a PRC_* code. */
 		switch (code) {
 			case ICMP_UNREACH_NET:
 			case ICMP_UNREACH_HOST:
 			case ICMP_UNREACH_SRCFAIL:
 			case ICMP_UNREACH_NET_UNKNOWN:
 			case ICMP_UNREACH_HOST_UNKNOWN:
 			case ICMP_UNREACH_ISOLATED:
 			case ICMP_UNREACH_TOSNET:
 			case ICMP_UNREACH_TOSHOST:
 			case ICMP_UNREACH_HOST_PRECEDENCE:
 			case ICMP_UNREACH_PRECEDENCE_CUTOFF:
 				code = PRC_UNREACH_NET;
 				break;
 
 			case ICMP_UNREACH_NEEDFRAG:
 				code = PRC_MSGSIZE;
 				break;
 
 			/*
 			 * RFC 1122, Sections 3.2.2.1 and 4.2.3.9.
 			 * Treat subcodes 2,3 as immediate RST
 			 */
 			case ICMP_UNREACH_PROTOCOL:
 			case ICMP_UNREACH_PORT:
 				code = PRC_UNREACH_PORT;
 				break;
 
 			case ICMP_UNREACH_NET_PROHIB:
 			case ICMP_UNREACH_HOST_PROHIB:
 			case ICMP_UNREACH_FILTER_PROHIB:
 				code = PRC_UNREACH_ADMIN_PROHIB;
 				break;
 
 			default:
 				goto badcode;
 		}
 		goto deliver;
 
 	case ICMP_TIMXCEED:
 		if (code > 1)
 			goto badcode;
 		code += PRC_TIMXCEED_INTRANS;
 		goto deliver;
 
 	case ICMP_PARAMPROB:
 		if (code > 1)
 			goto badcode;
 		code = PRC_PARAMPROB;
 	deliver:
 		/*
 		 * Problem with datagram; advise higher level routines.
 		 */
 		if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) ||
 		    icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) {
 			ICMPSTAT_INC(icps_badlen);
 			goto freeit;
 		}
 		/* Discard ICMP's in response to multicast packets */
 		if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr)))
 			goto badcode;
 #ifdef ICMPPRINTFS
 		if (icmpprintfs)
 			printf("deliver to protocol %d\n", icp->icmp_ip.ip_p);
 #endif
 		icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
 		/*
 		 * XXX if the packet contains [IPv4 AH TCP], we can't make a
 		 * notification to TCP layer.
 		 */
 		ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput;
 		if (ctlfunc)
 			(*ctlfunc)(code, (struct sockaddr *)&icmpsrc,
 				   (void *)&icp->icmp_ip);
 		/* Leaving the switch still hands the packet to rip_input(). */
 		break;
 
 	badcode:
 		ICMPSTAT_INC(icps_badcode);
 		break;
 
 	case ICMP_ECHO:
 		if (!V_icmpbmcastecho
 		    && (m->m_flags & (M_MCAST | M_BCAST)) != 0) {
 			ICMPSTAT_INC(icps_bmcastecho);
 			break;
 		}
 		/* The request is rewritten in place into the reply. */
 		icp->icmp_type = ICMP_ECHOREPLY;
 		if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0)
 			goto freeit;
 		else
 			goto reflect;
 
 	case ICMP_TSTAMP:
 		if (V_icmptstamprepl == 0)
 			break;
 		if (!V_icmpbmcastecho
 		    && (m->m_flags & (M_MCAST | M_BCAST)) != 0) {
 			ICMPSTAT_INC(icps_bmcasttstamp);
 			break;
 		}
 		if (icmplen < ICMP_TSLEN) {
 			ICMPSTAT_INC(icps_badlen);
 			break;
 		}
 		icp->icmp_type = ICMP_TSTAMPREPLY;
 		icp->icmp_rtime = iptime();
 		icp->icmp_ttime = icp->icmp_rtime;	/* bogus, do later! */
 		if (badport_bandlim(BANDLIM_ICMP_TSTAMP) < 0)
 			goto freeit;
 		else
 			goto reflect;
 
 	case ICMP_MASKREQ:
 		if (V_icmpmaskrepl == 0)
 			break;
 		/*
 		 * We are not able to respond with all ones broadcast
 		 * unless we receive it over a point-to-point interface.
 		 */
 		if (icmplen < ICMP_MASKLEN)
 			break;
 		switch (ip->ip_dst.s_addr) {
 
 		case INADDR_BROADCAST:
 		case INADDR_ANY:
 			icmpdst.sin_addr = ip->ip_src;
 			break;
 
 		default:
 			icmpdst.sin_addr = ip->ip_dst;
 		}
 		ia = (struct in_ifaddr *)ifaof_ifpforaddr(
 			    (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif);
 		if (ia == NULL)
 			break;
 		/* Every exit below this point releases ia with ifa_free(). */
 		if (ia->ia_ifp == NULL) {
 			ifa_free(&ia->ia_ifa);
 			break;
 		}
 		icp->icmp_type = ICMP_MASKREPLY;
 		if (V_icmpmaskfake == 0)
 			icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr;
 		else
 			icp->icmp_mask = V_icmpmaskfake;
 		if (ip->ip_src.s_addr == 0) {
 			if (ia->ia_ifp->if_flags & IFF_BROADCAST)
 			    ip->ip_src = satosin(&ia->ia_broadaddr)->sin_addr;
 			else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT)
 			    ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr;
 		}
 		ifa_free(&ia->ia_ifa);
 reflect:
 		/*
 		 * Shared tail for ECHO, TSTAMP and MASKREQ: m is handed to
 		 * icmp_reflect() and not touched again here.
 		 */
 		ICMPSTAT_INC(icps_reflect);
 		ICMPSTAT_INC(icps_outhist[icp->icmp_type]);
 		icmp_reflect(m);
 		return (IPPROTO_DONE);
 
 	case ICMP_REDIRECT:
 		if (V_log_redirect) {
 			u_long src, dst, gw;
 
 			src = ntohl(ip->ip_src.s_addr);
 			dst = ntohl(icp->icmp_ip.ip_dst.s_addr);
 			gw = ntohl(icp->icmp_gwaddr.s_addr);
 			printf("icmp redirect from %d.%d.%d.%d: "
 			       "%d.%d.%d.%d => %d.%d.%d.%d\n",
 			       (int)(src >> 24), (int)((src >> 16) & 0xff),
 			       (int)((src >> 8) & 0xff), (int)(src & 0xff),
 			       (int)(dst >> 24), (int)((dst >> 16) & 0xff),
 			       (int)((dst >> 8) & 0xff), (int)(dst & 0xff),
 			       (int)(gw >> 24), (int)((gw >> 16) & 0xff),
 			       (int)((gw >> 8) & 0xff), (int)(gw & 0xff));
 		}
 		/*
 		 * RFC1812 says we must ignore ICMP redirects if we
 		 * are acting as router.
 		 */
 		if (V_drop_redirect || V_ipforwarding)
 			break;
 		if (code > 3)
 			goto badcode;
 		if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) ||
 		    icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) {
 			ICMPSTAT_INC(icps_badlen);
 			break;
 		}
 		/*
 		 * Short circuit routing redirects to force
 		 * immediate change in the kernel's routing
 		 * tables.  The message is also handed to anyone
 		 * listening on a raw socket (e.g. the routing
 		 * daemon for use in updating its tables).
 		 */
 		icmpgw.sin_addr = ip->ip_src;
 		icmpdst.sin_addr = icp->icmp_gwaddr;
 #ifdef	ICMPPRINTFS
 		if (icmpprintfs) {
 			char buf[4 * sizeof "123"];
 			strcpy(buf, inet_ntoa(icp->icmp_ip.ip_dst));
 
 			printf("redirect dst %s to %s\n",
 			       buf, inet_ntoa(icp->icmp_gwaddr));
 		}
 #endif
 		icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
 		/* Apply the redirect to every FIB, not only the packet's. */
 		for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) {
 			in_rtredirect((struct sockaddr *)&icmpsrc,
 			  (struct sockaddr *)&icmpdst,
 			  (struct sockaddr *)0, RTF_GATEWAY | RTF_HOST,
 			  (struct sockaddr *)&icmpgw, fibnum);
 		}
 		pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc);
 		break;
 
 	/*
 	 * No kernel processing for the following;
 	 * just fall through to send to raw listener.
 	 */
 	case ICMP_ECHOREPLY:
 	case ICMP_ROUTERADVERT:
 	case ICMP_ROUTERSOLICIT:
 	case ICMP_TSTAMPREPLY:
 	case ICMP_IREQREPLY:
 	case ICMP_MASKREPLY:
 	case ICMP_SOURCEQUENCH:
 	default:
 		break;
 	}
 
 raw:
 	/* Give the packet back through *mp and pass it to rip_input(). */
 	*mp = m;
 	rip_input(mp, offp, proto);
 	return (IPPROTO_DONE);
 
 freeit:
 	m_freem(m);
 	return (IPPROTO_DONE);
 }
 
 /*
  * Reflect the ip packet back to the source.
  *
  * The packet's source becomes its destination.  The new source address is
  * chosen in this order: the original destination if it is one of our own
  * addresses; an address on the receive interface whose broadcast address
  * equals the original destination; the first AF_INET address of the
  * receive interface (only when V_icmp_rfi is set); the first AF_INET
  * address of the interface named by V_reply_src; finally the source
  * address of the route back to the new destination.
  *
  * Record-route, timestamp and security options are copied into a separate
  * options mbuf, all options are stripped from m, and the result is passed
  * to icmp_send().
  *
  * m is freed here when the original source is a multicast, experimental
  * or zeronet address, or when the route lookup fails.
  */
 static void
 icmp_reflect(struct mbuf *m)
 {
 	struct rm_priotracker in_ifa_tracker;
 	struct ip *ip = mtod(m, struct ip *);
 	struct ifaddr *ifa;
 	struct ifnet *ifp;
 	struct in_ifaddr *ia;
 	struct in_addr t;
+	struct nhop4_extended nh_ext;
 	struct mbuf *opts = 0;
 	int optlen = (ip->ip_hl << 2) - sizeof(struct ip);
 
 	if (IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
 	    IN_EXPERIMENTAL(ntohl(ip->ip_src.s_addr)) ||
 	    IN_ZERONET(ntohl(ip->ip_src.s_addr)) ) {
 		m_freem(m);	/* Bad return address */
 		ICMPSTAT_INC(icps_badaddr);
 		goto done;	/* Ip_output() will check for broadcast */
 	}
 
 	/* t holds the original destination until a source is selected. */
 	t = ip->ip_dst;
 	ip->ip_dst = ip->ip_src;
 
 	/*
 	 * Source selection for ICMP replies:
 	 *
 	 * If the incoming packet was addressed directly to one of our
 	 * own addresses, use dst as the src for the reply.
 	 */
 	IN_IFADDR_RLOCK(&in_ifa_tracker);
 	LIST_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash) {
 		if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) {
 			t = IA_SIN(ia)->sin_addr;
 			/* Drop the lock before jumping out of the loop. */
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 			goto match;
 		}
 	}
 	IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 
 	/*
 	 * If the incoming packet was addressed to one of our broadcast
 	 * addresses, use the first non-broadcast address which corresponds
 	 * to the incoming interface.
 	 */
 	ifp = m->m_pkthdr.rcvif;
 	if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) {
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			ia = ifatoia(ifa);
 			if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
 			    t.s_addr) {
 				t = IA_SIN(ia)->sin_addr;
 				IF_ADDR_RUNLOCK(ifp);
 				goto match;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 	/*
 	 * If the packet was transiting through us, use the address of
 	 * the interface the packet came through in.  If that interface
 	 * doesn't have a suitable IP address, the normal selection
 	 * criteria apply.
 	 */
 	if (V_icmp_rfi && ifp != NULL) {
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			/* The first AF_INET address on the interface wins. */
 			ia = ifatoia(ifa);
 			t = IA_SIN(ia)->sin_addr;
 			IF_ADDR_RUNLOCK(ifp);
 			goto match;
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 	/*
 	 * If the incoming packet was not addressed directly to us, use
 	 * designated interface for icmp replies specified by sysctl
 	 * net.inet.icmp.reply_src (default not set). Otherwise continue
 	 * with normal source selection.
 	 */
 	if (V_reply_src[0] != '\0' && (ifp = ifunit(V_reply_src))) {
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			ia = ifatoia(ifa);
 			t = IA_SIN(ia)->sin_addr;
 			IF_ADDR_RUNLOCK(ifp);
 			goto match;
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 	/*
 	 * If the packet was transiting through us, use the address of
 	 * the interface that is the closest to the packet source.
 	 * When we don't have a route back to the packet source, stop here
 	 * and drop the packet.
 	 */
-	ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m));
-	if (ia == NULL) {
+	if (fib4_lookup_nh_ext(M_GETFIB(m), ip->ip_dst, 0, 0, &nh_ext) != 0) {
 		m_freem(m);
 		ICMPSTAT_INC(icps_noroute);
 		goto done;
 	}
-	t = IA_SIN(ia)->sin_addr;
-	ifa_free(&ia->ia_ifa);
+	t = nh_ext.nh_src;
 match:
 	/* t now holds the source address selected for the reply. */
 #ifdef MAC
 	mac_netinet_icmp_replyinplace(m);
 #endif
 	ip->ip_src = t;
 	ip->ip_ttl = V_ip_defttl;
 
 	if (optlen > 0) {
 		register u_char *cp;
 		int opt, cnt;
 		u_int len;
 
 		/*
 		 * Retrieve any source routing from the incoming packet;
 		 * add on any record-route or timestamp options.
 		 */
 		cp = (u_char *) (ip + 1);
 		/*
 		 * Without a saved source route, start the options mbuf with
 		 * a zeroed in_addr-sized slot.
 		 */
 		if ((opts = ip_srcroute(m)) == 0 &&
 		    (opts = m_gethdr(M_NOWAIT, MT_DATA))) {
 			opts->m_len = sizeof(struct in_addr);
 			mtod(opts, struct in_addr *)->s_addr = 0;
 		}
 		if (opts) {
 #ifdef ICMPPRINTFS
 		    if (icmpprintfs)
 			    printf("icmp_reflect optlen %d rt %d => ",
 				optlen, opts->m_len);
 #endif
 		    /* Walk the option list, stopping at EOL or a bad length. */
 		    for (cnt = optlen; cnt > 0; cnt -= len, cp += len) {
 			    opt = cp[IPOPT_OPTVAL];
 			    if (opt == IPOPT_EOL)
 				    break;
 			    if (opt == IPOPT_NOP)
 				    len = 1;
 			    else {
 				    if (cnt < IPOPT_OLEN + sizeof(*cp))
 					    break;
 				    len = cp[IPOPT_OLEN];
 				    if (len < IPOPT_OLEN + sizeof(*cp) ||
 				        len > cnt)
 					    break;
 			    }
 			    /*
 			     * Should check for overflow, but it "can't happen"
 			     */
 			    if (opt == IPOPT_RR || opt == IPOPT_TS ||
 				opt == IPOPT_SECURITY) {
 				    bcopy((caddr_t)cp,
 					mtod(opts, caddr_t) + opts->m_len, len);
 				    opts->m_len += len;
 			    }
 		    }
 		    /* Terminate & pad, if necessary */
 		    cnt = opts->m_len % 4;
 		    if (cnt) {
 			    for (; cnt < 4; cnt++) {
 				    *(mtod(opts, caddr_t) + opts->m_len) =
 					IPOPT_EOL;
 				    opts->m_len++;
 			    }
 		    }
 #ifdef ICMPPRINTFS
 		    if (icmpprintfs)
 			    printf("%d\n", opts->m_len);
 #endif
 		}
 		ip_stripoptions(m);
 	}
 	m_tag_delete_nonpersistent(m);
 	m->m_flags &= ~(M_BCAST|M_MCAST);
 	icmp_send(m, opts);
 done:
 	/* The options mbuf is released here on every path, sent or not. */
 	if (opts)
 		(void)m_free(opts);
 }
 
 /*
  * Finish an outgoing ICMP packet and pass it to ip_output().
  *
  * The ICMP checksum is computed over the part of the packet that follows
  * the IP header, the receive interface is cleared, and the packet is sent
  * together with the optional IP options mbuf in opts.
  */
 static void
 icmp_send(struct mbuf *m, struct mbuf *opts)
 {
 	struct ip *iph;
 	struct icmp *icmph;
 	int iphlen;
 
 	iph = mtod(m, struct ip *);
 	iphlen = iph->ip_hl << 2;
 
 	/* Step over the IP header so that only ICMP is checksummed. */
 	m->m_data += iphlen;
 	m->m_len -= iphlen;
 	icmph = mtod(m, struct icmp *);
 	icmph->icmp_cksum = 0;
 	icmph->icmp_cksum = in_cksum(m, ntohs(iph->ip_len) - iphlen);
 
 	/* Expose the IP header again. */
 	m->m_data -= iphlen;
 	m->m_len += iphlen;
 
 	m->m_pkthdr.rcvif = NULL;
 #ifdef ICMPPRINTFS
 	if (icmpprintfs) {
 		char buf[4 * sizeof "123"];
 
 		strcpy(buf, inet_ntoa(iph->ip_dst));
 		printf("icmp_send dst %s src %s\n",
 		       buf, inet_ntoa(iph->ip_src));
 	}
 #endif
 	(void) ip_output(m, opts, NULL, 0, NULL, NULL);
 }
 
 /*
  * Return the number of milliseconds elapsed since 00:00 UTC of the
  * current day, converted to network byte order.
  */
 uint32_t
 iptime(void)
 {
 	struct timeval now;
 	u_long secs_today, msecs;
 
 	getmicrotime(&now);
 	secs_today = now.tv_sec % (24 * 60 * 60);
 	msecs = secs_today * 1000 + now.tv_usec / 1000;
 	return (htonl(msecs));
 }
 
 /*
  * Step through the RFC 1191 MTU plateau table relative to MTU.  A negative
  * DIR asks for the next larger plateau; any other DIR asks for the next
  * smaller one.  Returns 0 when the table holds no plateau in the requested
  * direction, except that the largest plateau maps to itself when DIR is
  * negative.
  */
 int
 ip_next_mtu(int mtu, int dir)
 {
 	static int plateaus[] = {
 		65535, 32000, 17914, 8166, 4352, 2002, 1492, 1280, 1006, 508,
 		296, 68, 0
 	};
 	int idx, nplateaus;
 
 	nplateaus = (sizeof plateaus) / (sizeof plateaus[0]);
 
 	if (dir < 0) {
 		/* The table is descending: scan from the small end upwards. */
 		idx = nplateaus;
 		while (idx-- > 0) {
 			if (plateaus[idx] > mtu)
 				return (plateaus[idx]);
 		}
 		/* Nothing larger exists; the top plateau maps to itself. */
 		return (mtu == plateaus[0] ? plateaus[0] : 0);
 	}
 
 	for (idx = 0; idx < nplateaus; idx++) {
 		if (plateaus[idx] < mtu)
 			return (plateaus[idx]);
 	}
 	return (0);
 }
 #endif /* INET */
 
 
 /*
  * badport_bandlim() - check for ICMP bandwidth limit
  *
  *	Return 0 if it is ok to send an ICMP error response, -1 if we have
  *	hit our bandwidth limit and it is not ok.
  *
  *	If icmplim is <= 0, the feature is disabled and 0 is returned.
  *
  *	For now we separate the TCP and UDP subsystems w/ different 'which'
  *	values.  We may eventually remove this separation (and simplify the
  *	code further).
  *
  *	Note that the printing of the error message is delayed so we can
  *	properly print the icmp error rate that the system was trying to do
  *	(i.e. 22000/100 pps, etc...).  This can cause long delays in printing
  *	the 'final' error, but it doesn't make sense to solve the printing
  *	delay with more complex code.
  */
 
 int
 badport_bandlim(int which)
 {
 
 #define	N(a)	(sizeof (a) / sizeof (a[0]))
 	/* Per-'which' rate state; the table is indexed by the BANDLIM_* value. */
 	static struct rate {
 		const char	*type;
 		struct timeval	lasttime;
 		int		curpps;
 	} rates[BANDLIM_MAX+1] = {
 		{ "icmp unreach response" },
 		{ "icmp ping response" },
 		{ "icmp tstamp response" },
 		{ "closed port RST response" },
 		{ "open port RST response" },
 		{ "icmp6 unreach response" },
 		{ "sctp ootb response" }
 	};
 	struct rate *slot;
 	int prev_pps;
 
 	/*
 	 * Feature disabled or argument out of range: always allow.
 	 */
 	if (V_icmplim <= 0 || (u_int) which >= N(rates))
 		return (0);
 
 	slot = &rates[which];
 	prev_pps = slot->curpps;	/* rate seen before this call */
 
 	if (ppsratecheck(&slot->lasttime, &slot->curpps, V_icmplim) == 0)
 		return (-1);		/* discard packet */
 
 	/*
 	 * The check passed.  If the previous rate was above the limit we
 	 * had been rate-limiting, so report it now, if logging is enabled.
 	 */
 	if (V_icmplim_output && prev_pps > V_icmplim) {
 		log(LOG_NOTICE, "Limiting %s from %d to %d packets/sec\n",
 		    slot->type, prev_pps, V_icmplim);
 	}
 	return (0);			/* okay to send packet */
 #undef N
 }
Index: head/sys/netinet/ip_input.c
===================================================================
--- head/sys/netinet/ip_input.c	(revision 292014)
+++ head/sys/netinet/ip_input.c	(revision 292015)
@@ -1,1371 +1,1344 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_bootp.h"
 #include "opt_ipfw.h"
 #include "opt_ipstealth.h"
 #include "opt_ipsec.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/hhook.h>
 #include <sys/mbuf.h>
 #include <sys/malloc.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/rwlock.h>
 #include <sys/sdt.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 
 #include <net/pfil.h>
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/netisr.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_fw.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_options.h>
 #include <machine/in_cksum.h>
 #include <netinet/ip_carp.h>
 #ifdef IPSEC
 #include <netinet/ip_ipsec.h>
 #include <netipsec/ipsec.h>
 #include <netipsec/key.h>
 #endif /* IPSEC */
 #include <netinet/in_rss.h>
 
 #include <sys/socketvar.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifdef CTASSERT
 CTASSERT(sizeof(struct ip) == 20);
 #endif
 
 /* IP reassembly functions are defined in ip_reass.c. */
 extern void ipreass_init(void);
 extern void ipreass_drain(void);
 extern void ipreass_slowtimo(void);
 #ifdef VIMAGE
 extern void ipreass_destroy(void);
 #endif
 
 struct rmlock in_ifaddr_lock;
 RM_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock");
 
 VNET_DEFINE(int, rsvp_on);
 
 VNET_DEFINE(int, ipforwarding);
 SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ipforwarding), 0,
     "Enable IP forwarding between interfaces");
 
 static VNET_DEFINE(int, ipsendredirects) = 1;	/* XXX */
 #define	V_ipsendredirects	VNET(ipsendredirects)
 SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ipsendredirects), 0,
     "Enable sending IP redirects");
 
 /*
  * XXX - Setting ip_checkinterface mostly implements the receive side of
  * the Strong ES model described in RFC 1122, but since the routing table
  * and transmit implementation do not implement the Strong ES model,
  * setting this to 1 results in an odd hybrid.
  *
  * XXX - ip_checkinterface currently must be disabled if you use ipnat
  * to translate the destination address to another local interface.
  *
  * XXX - ip_checkinterface must be disabled if you add IP aliases
  * to the loopback interface instead of the interface where the
  * packets for those addresses are received.
  */
 static VNET_DEFINE(int, ip_checkinterface);
 #define	V_ip_checkinterface	VNET(ip_checkinterface)
 SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ip_checkinterface), 0,
     "Verify packet arrives on correct interface");
 
 VNET_DEFINE(struct pfil_head, inet_pfil_hook);	/* Packet filter hooks */
 
 static struct netisr_handler ip_nh = {
 	.nh_name = "ip",
 	.nh_handler = ip_input,
 	.nh_proto = NETISR_IP,
 #ifdef	RSS
 	.nh_m2cpuid = rss_soft_m2cpuid_v4,
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_HYBRID,
 #else
 	.nh_policy = NETISR_POLICY_FLOW,
 #endif
 };
 
 #ifdef	RSS
 /*
  * Directly dispatched frames are currently assumed
  * to have a flowid already calculated.
  *
  * It should likely have something that asserts it
  * actually has valid flow details.
  */
 static struct netisr_handler ip_direct_nh = {
 	.nh_name = "ip_direct",
 	.nh_handler = ip_direct_input,
 	.nh_proto = NETISR_IP_DIRECT,
 	.nh_m2cpuid = rss_soft_m2cpuid_v4,
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_HYBRID,
 };
 #endif
 
 extern	struct domain inetdomain;
 extern	struct protosw inetsw[];
 u_char	ip_protox[IPPROTO_MAX];
 VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead);  /* first inet address */
 VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table  */
 VNET_DEFINE(u_long, in_ifaddrhmask);		/* mask for hash table */
 
 #ifdef IPCTL_DEFMTU
 SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW,
     &ip_mtu, 0, "Default MTU");
 #endif
 
 #ifdef IPSTEALTH
 VNET_DEFINE(int, ipstealth);
 SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ipstealth), 0,
     "IP stealth mode, no TTL decrementation on forwarding");
 #endif
 
 /*
  * IP statistics are stored in the "array" of counter(9)s.
  */
 VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat);
 VNET_PCPUSTAT_SYSINIT(ipstat);
 SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat,
     "IP statistics (struct ipstat, netinet/ip_var.h)");
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(ipstat);
 #endif /* VIMAGE */
 
 /*
  * Kernel module interface for updating ipstat.  The argument is an index
  * into ipstat treated as an array.
  *
  * kmod_ipstat_inc() adds 1 to the counter at index statnum of the current
  * VNET's ipstat.
  */
 void
 kmod_ipstat_inc(int statnum)
 {
 
 	counter_u64_add(VNET(ipstat)[statnum], 1);
 }
 
 /*
  * Subtract 1 from the ipstat counter at index statnum (ipstat treated as
  * an array of counters).
  */
 void
 kmod_ipstat_dec(int statnum)
 {
 
 	counter_u64_add(VNET(ipstat)[statnum], -1);
 }
 
 /*
  * Sysctl handler for the queue limit of the "ip" netisr handler.  Reports
  * the current limit; when a new value is supplied it must be at least 1
  * (EINVAL otherwise) and is installed with netisr_setqlimit().
  */
 static int
 sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS)
 {
 	int newlimit, rc;
 
 	netisr_getqlimit(&ip_nh, &newlimit);
 	rc = sysctl_handle_int(oidp, &newlimit, 0, req);
 	if (rc != 0)
 		return (rc);
 	if (req->newptr == NULL)
 		return (0);		/* read-only request */
 	if (newlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&ip_nh, newlimit));
 }
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen,
     CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_queue_maxlen, "I",
     "Maximum size of the IP input queue");
 
 /*
  * Sysctl handler for the drop counter of the "ip" netisr handler.  Reports
  * the counter (narrowed to int); when a new value is supplied it must be 0
  * (EINVAL otherwise), in which case the counter is cleared.
  */
 static int
 sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS)
 {
 	u_int64_t drops64;
 	int drops, rc;
 
 	netisr_getqdrops(&ip_nh, &drops64);
 	drops = drops64;
 	rc = sysctl_handle_int(oidp, &drops, 0, req);
 	if (rc != 0)
 		return (rc);
 	if (req->newptr == NULL)
 		return (0);		/* read-only request */
 	if (drops != 0)
 		return (EINVAL);
 
 	netisr_clearqdrops(&ip_nh);
 	return (0);
 }
 
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops,
     CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I",
     "Number of packets dropped from the IP input queue");
 
 #ifdef	RSS
 /*
  * Sysctl handler for the queue limit of the "ip_direct" netisr handler.
  * Reports the current limit; when a new value is supplied it must be at
  * least 1 (EINVAL otherwise) and is installed with netisr_setqlimit().
  */
 static int
 sysctl_netinet_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS)
 {
 	int newlimit, rc;
 
 	netisr_getqlimit(&ip_direct_nh, &newlimit);
 	rc = sysctl_handle_int(oidp, &newlimit, 0, req);
 	if (rc != 0)
 		return (rc);
 	if (req->newptr == NULL)
 		return (0);		/* read-only request */
 	if (newlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&ip_direct_nh, newlimit));
 }
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_direct_queue_maxlen,
     CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_direct_queue_maxlen, "I",
     "Maximum size of the IP direct input queue");
 
 /*
  * Sysctl handler for the drop counter of the "ip_direct" netisr handler.
  * Reports the counter (narrowed to int); when a new value is supplied it
  * must be 0 (EINVAL otherwise), in which case the counter is cleared.
  */
 static int
 sysctl_netinet_intr_direct_queue_drops(SYSCTL_HANDLER_ARGS)
 {
 	u_int64_t drops64;
 	int drops, rc;
 
 	netisr_getqdrops(&ip_direct_nh, &drops64);
 	drops = drops64;
 	rc = sysctl_handle_int(oidp, &drops, 0, req);
 	if (rc != 0)
 		return (rc);
 	if (req->newptr == NULL)
 		return (0);		/* read-only request */
 	if (drops != 0)
 		return (EINVAL);
 
 	netisr_clearqdrops(&ip_direct_nh);
 	return (0);
 }
 
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_direct_queue_drops,
     CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_direct_queue_drops, "I",
     "Number of packets dropped from the IP direct input queue");
 #endif	/* RSS */
 
 /*
  * IP initialization.
  *
  * For every VNET: set up the interface address list and hash table, the
  * reassembly queue, the pfil head and the IPsec helper hooks.
  * For the default VNET only: fill in the IP protocol switch table, mapping
  * every protocol not implemented in the kernel to the raw IP protocol
  * handler, and register the netisr handler(s).
  */
 void
 ip_init(void)
 {
 	struct protosw *psw;
 	int err, proto;
 
 	TAILQ_INIT(&V_in_ifaddrhead);
 	V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask);
 
 	/* Initialize IP reassembly queue. */
 	ipreass_init();
 
 	/* Initialize packet filter hooks. */
 	V_inet_pfil_hook.ph_type = PFIL_TYPE_AF;
 	V_inet_pfil_hook.ph_af = AF_INET;
 	err = pfil_head_register(&V_inet_pfil_hook);
 	if (err != 0) {
 		printf("%s: WARNING: unable to register pfil hook, "
 			"error %d\n", __func__, err);
 	}
 
 	if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET,
 	    &V_ipsec_hhh_in[HHOOK_IPSEC_INET],
 	    HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) {
 		printf("%s: WARNING: unable to register input helper hook\n",
 		    __func__);
 	}
 	if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET,
 	    &V_ipsec_hhh_out[HHOOK_IPSEC_INET],
 	    HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) {
 		printf("%s: WARNING: unable to register output helper hook\n",
 		    __func__);
 	}
 
 	/* Skip initialization of globals for non-default instances. */
 	if (!IS_DEFAULT_VNET(curvnet))
 		return;
 
 	psw = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
 	if (psw == NULL)
 		panic("ip_init: PF_INET not found");
 
 	/* Initialize the entire ip_protox[] array to IPPROTO_RAW. */
 	for (proto = 0; proto < IPPROTO_MAX; proto++)
 		ip_protox[proto] = psw - inetsw;
 
 	/*
 	 * Cycle through IP protocols and put them into the appropriate place
 	 * in ip_protox[].
 	 */
 	for (psw = inetdomain.dom_protosw;
 	    psw < inetdomain.dom_protoswNPROTOSW; psw++) {
 		if (psw->pr_domain->dom_family != PF_INET)
 			continue;
 		if (psw->pr_protocol == 0 || psw->pr_protocol == IPPROTO_RAW)
 			continue;
 		/* Be careful to only index valid IP protocols. */
 		if (psw->pr_protocol >= IPPROTO_MAX)
 			continue;
 		ip_protox[psw->pr_protocol] = psw - inetsw;
 	}
 
 	netisr_register(&ip_nh);
 #ifdef	RSS
 	netisr_register(&ip_direct_nh);
 #endif
 }
 
 #ifdef VIMAGE
 /*
  * Tear down the per-VNET state set up by ip_init(): the pfil head, the
  * IPsec helper hooks, the interface address hash table and the
  * reassembly queue.  Failures are reported with printf() and otherwise
  * ignored.
  */
 void
 ip_destroy(void)
 {
 	int rc;
 
 	rc = pfil_head_unregister(&V_inet_pfil_hook);
 	if (rc != 0) {
 		printf("%s: WARNING: unable to unregister pfil hook, "
 		    "error %d\n", __func__, rc);
 	}
 
 	rc = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET]);
 	if (rc != 0)
 		printf("%s: WARNING: unable to deregister input helper hook "
 		    "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET: "
 		    "error %d returned\n", __func__, rc);
 
 	rc = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET]);
 	if (rc != 0)
 		printf("%s: WARNING: unable to deregister output helper hook "
 		    "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET: "
 		    "error %d returned\n", __func__, rc);
 
 	/* Cleanup in_ifaddr hash table; should be empty. */
 	hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask);
 
 	/* Destroy IP reassembly queue. */
 	ipreass_destroy();
 }
 #endif
 
 #ifdef	RSS
 /*
  * IP direct input routine.
  *
  * Used when reinjecting completed fragments for which all of the
  * previous checking and book-keeping has already been done: the packet is
  * counted as delivered and handed straight to the input routine of its
  * protocol.
  */
 void
 ip_direct_input(struct mbuf *m)
 {
 	struct ip *iph = mtod(m, struct ip *);
 	int iphlen = iph->ip_hl << 2;
 
 	IPSTAT_INC(ips_delivered);
 	(*inetsw[ip_protox[iph->ip_p]].pr_input)(&m, &iphlen, iph->ip_p);
 }
 #endif
 
 /*
  * Ip input routine.  Checksum and byte swap header.  If fragmented
  * try to reassemble.  Process options.  Pass to next level.
  */
 void
 ip_input(struct mbuf *m)
 {
 	struct ip *ip = NULL;
 	struct in_ifaddr *ia = NULL;
 	struct ifaddr *ifa;
 	struct ifnet *ifp;
 	int    checkif, hlen = 0;
 	uint16_t sum, ip_len;
 	int dchg = 0;				/* dest changed after fw */
 	struct in_addr odst;			/* original dst address */
 
 	M_ASSERTPKTHDR(m);
 
 	if (m->m_flags & M_FASTFWD_OURS) {
 		m->m_flags &= ~M_FASTFWD_OURS;
 		/* Set up some basics that will be used later. */
 		ip = mtod(m, struct ip *);
 		hlen = ip->ip_hl << 2;
 		ip_len = ntohs(ip->ip_len);
 		goto ours;
 	}
 
 	IPSTAT_INC(ips_total);
 
 	if (m->m_pkthdr.len < sizeof(struct ip))
 		goto tooshort;
 
 	if (m->m_len < sizeof (struct ip) &&
 	    (m = m_pullup(m, sizeof (struct ip))) == NULL) {
 		IPSTAT_INC(ips_toosmall);
 		return;
 	}
 	ip = mtod(m, struct ip *);
 
 	if (ip->ip_v != IPVERSION) {
 		IPSTAT_INC(ips_badvers);
 		goto bad;
 	}
 
 	hlen = ip->ip_hl << 2;
 	if (hlen < sizeof(struct ip)) {	/* minimum header length */
 		IPSTAT_INC(ips_badhlen);
 		goto bad;
 	}
 	if (hlen > m->m_len) {
 		if ((m = m_pullup(m, hlen)) == NULL) {
 			IPSTAT_INC(ips_badhlen);
 			return;
 		}
 		ip = mtod(m, struct ip *);
 	}
 
 	IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL);
 
 	/* 127/8 must not appear on wire - RFC1122 */
 	ifp = m->m_pkthdr.rcvif;
 	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
 	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
 		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
 			IPSTAT_INC(ips_badaddr);
 			goto bad;
 		}
 	}
 
 	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
 		sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
 	} else {
 		if (hlen == sizeof(struct ip)) {
 			sum = in_cksum_hdr(ip);
 		} else {
 			sum = in_cksum(m, hlen);
 		}
 	}
 	if (sum) {
 		IPSTAT_INC(ips_badsum);
 		goto bad;
 	}
 
 #ifdef ALTQ
 	if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0)
 		/* packet is dropped by traffic conditioner */
 		return;
 #endif
 
 	ip_len = ntohs(ip->ip_len);
 	if (ip_len < hlen) {
 		IPSTAT_INC(ips_badlen);
 		goto bad;
 	}
 
 	/*
 	 * Check that the amount of data in the buffers
 	 * is as at least much as the IP header would have us expect.
 	 * Trim mbufs if longer than we expect.
 	 * Drop packet if shorter than we expect.
 	 */
 	if (m->m_pkthdr.len < ip_len) {
 tooshort:
 		IPSTAT_INC(ips_tooshort);
 		goto bad;
 	}
 	if (m->m_pkthdr.len > ip_len) {
 		if (m->m_len == m->m_pkthdr.len) {
 			m->m_len = ip_len;
 			m->m_pkthdr.len = ip_len;
 		} else
 			m_adj(m, ip_len - m->m_pkthdr.len);
 	}
 
 	/* Try to forward the packet, but if we fail continue */
 #ifdef IPSEC
 	/* For now we do not handle IPSEC in tryforward. */
 	if (!key_havesp(IPSEC_DIR_INBOUND) && !key_havesp(IPSEC_DIR_OUTBOUND) &&
 	    (V_ipforwarding == 1))
 		if (ip_tryforward(m) == NULL)
 			return;
 	/*
 	 * Bypass packet filtering for packets previously handled by IPsec.
 	 */
 	if (ip_ipsec_filtertunnel(m))
 		goto passin;
 #else
 	if (V_ipforwarding == 1)
 		if (ip_tryforward(m) == NULL)
 			return;
 #endif /* IPSEC */
 
 	/*
 	 * Run through list of hooks for input packets.
 	 *
 	 * NB: Beware of the destination address changing (e.g.
 	 *     by NAT rewriting).  When this happens, tell
 	 *     ip_forward to do the right thing.
 	 */
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (!PFIL_HOOKED(&V_inet_pfil_hook))
 		goto passin;
 
 	odst = ip->ip_dst;
 	if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_IN, NULL) != 0)
 		return;
 	if (m == NULL)			/* consumed by filter */
 		return;
 
 	ip = mtod(m, struct ip *);
 	dchg = (odst.s_addr != ip->ip_dst.s_addr);
 	ifp = m->m_pkthdr.rcvif;
 
 	if (m->m_flags & M_FASTFWD_OURS) {
 		m->m_flags &= ~M_FASTFWD_OURS;
 		goto ours;
 	}
 	if (m->m_flags & M_IP_NEXTHOP) {
 		dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL);
 		if (dchg != 0) {
 			/*
 			 * Directly ship the packet on.  This allows
 			 * forwarding packets originally destined to us
 			 * to some other directly connected host.
 			 */
 			ip_forward(m, 1);
 			return;
 		}
 	}
 passin:
 
 	/*
 	 * Process options and, if not destined for us,
 	 * ship it on.  ip_dooptions returns 1 when an
 	 * error was detected (causing an icmp message
 	 * to be sent and the original packet to be freed).
 	 */
 	if (hlen > sizeof (struct ip) && ip_dooptions(m, 0))
 		return;
 
         /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no
          * matter if it is destined to another node, or whether it is 
          * a multicast one, RSVP wants it! and prevents it from being forwarded
          * anywhere else. Also checks if the rsvp daemon is running before
 	 * grabbing the packet.
          */
 	if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) 
 		goto ours;
 
 	/*
 	 * Check our list of addresses, to see if the packet is for us.
 	 * If we don't have any addresses, assume any unicast packet
 	 * we receive might be for us (and let the upper layers deal
 	 * with it).
 	 */
 	if (TAILQ_EMPTY(&V_in_ifaddrhead) &&
 	    (m->m_flags & (M_MCAST|M_BCAST)) == 0)
 		goto ours;
 
 	/*
 	 * Enable a consistency check between the destination address
 	 * and the arrival interface for a unicast packet (the RFC 1122
 	 * strong ES model) if IP forwarding is disabled and the packet
 	 * is not locally generated and the packet is not subject to
 	 * 'ipfw fwd'.
 	 *
 	 * XXX - Checking also should be disabled if the destination
 	 * address is ipnat'ed to a different interface.
 	 *
 	 * XXX - Checking is incompatible with IP aliases added
 	 * to the loopback interface instead of the interface where
 	 * the packets are received.
 	 *
 	 * XXX - This is the case for carp vhost IPs as well so we
 	 * insert a workaround. If the packet got here, we already
 	 * checked with carp_iamatch() and carp_forus().
 	 */
 	checkif = V_ip_checkinterface && (V_ipforwarding == 0) && 
 	    ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) &&
 	    ifp->if_carp == NULL && (dchg == 0);
 
 	/*
 	 * Check for exact addresses in the hash bucket.
 	 */
 	/* IN_IFADDR_RLOCK(); */
 	LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) {
 		/*
 		 * If the address matches, verify that the packet
 		 * arrived via the correct interface if checking is
 		 * enabled.
 		 */
 		if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && 
 		    (!checkif || ia->ia_ifp == ifp)) {
 			counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 			counter_u64_add(ia->ia_ifa.ifa_ibytes,
 			    m->m_pkthdr.len);
 			/* IN_IFADDR_RUNLOCK(); */
 			goto ours;
 		}
 	}
 	/* IN_IFADDR_RUNLOCK(); */
 
 	/*
 	 * Check for broadcast addresses.
 	 *
 	 * Only accept broadcast packets that arrive via the matching
 	 * interface.  Reception of forwarded directed broadcasts would
 	 * be handled via ip_forward() and ether_output() with the loopback
 	 * into the stack for SIMPLEX interfaces handled by ether_output().
 	 */
 	if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) {
 		IF_ADDR_RLOCK(ifp);
 	        TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			ia = ifatoia(ifa);
 			if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
 			    ip->ip_dst.s_addr) {
 				counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_ibytes,
 				    m->m_pkthdr.len);
 				IF_ADDR_RUNLOCK(ifp);
 				goto ours;
 			}
 #ifdef BOOTP_COMPAT
 			if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) {
 				counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_ibytes,
 				    m->m_pkthdr.len);
 				IF_ADDR_RUNLOCK(ifp);
 				goto ours;
 			}
 #endif
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		ia = NULL;
 	}
 	/* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */
 	if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 		return;
 	}
 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
 		if (V_ip_mrouter) {
 			/*
 			 * If we are acting as a multicast router, all
 			 * incoming multicast packets are passed to the
 			 * kernel-level multicast forwarding function.
 			 * The packet is returned (relatively) intact; if
 			 * ip_mforward() returns a non-zero value, the packet
 			 * must be discarded, else it may be accepted below.
 			 */
 			if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) {
 				IPSTAT_INC(ips_cantforward);
 				m_freem(m);
 				return;
 			}
 
 			/*
 			 * The process-level routing daemon needs to receive
 			 * all multicast IGMP packets, whether or not this
 			 * host belongs to their destination groups.
 			 */
 			if (ip->ip_p == IPPROTO_IGMP)
 				goto ours;
 			IPSTAT_INC(ips_forward);
 		}
 		/*
 		 * Assume the packet is for us, to avoid prematurely taking
 		 * a lock on the in_multi hash. Protocols must perform
 		 * their own filtering and update statistics accordingly.
 		 */
 		goto ours;
 	}
 	if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST)
 		goto ours;
 	if (ip->ip_dst.s_addr == INADDR_ANY)
 		goto ours;
 
 	/*
 	 * Not for us; forward if possible and desirable.
 	 */
 	if (V_ipforwarding == 0) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 	} else {
 		ip_forward(m, dchg);
 	}
 	return;
 
 ours:
 #ifdef IPSTEALTH
 	/*
 	 * IPSTEALTH: Process non-routing options only
 	 * if the packet is destined for us.
 	 */
 	if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1))
 		return;
 #endif /* IPSTEALTH */
 
 	/*
 	 * Attempt reassembly; if it succeeds, proceed.
 	 * ip_reass() will return a different mbuf.
 	 */
 	if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) {
 		/* XXXGL: shouldn't we save & set m_flags? */
 		m = ip_reass(m);
 		if (m == NULL)
 			return;
 		ip = mtod(m, struct ip *);
 		/* Get the header length of the reassembled packet */
 		hlen = ip->ip_hl << 2;
 	}
 
 #ifdef IPSEC
 	/*
 	 * enforce IPsec policy checking if we are seeing last header.
 	 * note that we do not visit this with protocols with pcb layer
 	 * code - like udp/tcp/raw ip.
 	 */
 	if (ip_ipsec_input(m, ip->ip_p) != 0)
 		goto bad;
 #endif /* IPSEC */
 
 	/*
 	 * Switch out to protocol's input routine.
 	 */
 	IPSTAT_INC(ips_delivered);
 
 	(*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p);
 	return;
 bad:
 	m_freem(m);
 }
 
 /*
  * IP timer processing;
  * if a timer expires on a reassembly
  * queue, discard it.
  */
 void
 ip_slowtimo(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		ipreass_slowtimo();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 void
 ip_drain(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		ipreass_drain();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * The protocol to be inserted into ip_protox[] must be already registered
  * in inetsw[], either statically or through pf_proto_register().
  */
 int
 ipproto_register(short ipproto)
 {
 	struct protosw *pr;
 
 	/* Sanity checks. */
 	if (ipproto <= 0 || ipproto >= IPPROTO_MAX)
 		return (EPROTONOSUPPORT);
 
 	/*
 	 * The protocol slot must not be occupied by another protocol
 	 * already.  An index pointing to IPPROTO_RAW is unused.
 	 */
 	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		return (EPFNOSUPPORT);
 	if (ip_protox[ipproto] != pr - inetsw)	/* IPPROTO_RAW */
 		return (EEXIST);
 
 	/* Find the protocol position in inetsw[] and set the index. */
 	for (pr = inetdomain.dom_protosw;
 	     pr < inetdomain.dom_protoswNPROTOSW; pr++) {
 		if (pr->pr_domain->dom_family == PF_INET &&
 		    pr->pr_protocol && pr->pr_protocol == ipproto) {
 			ip_protox[pr->pr_protocol] = pr - inetsw;
 			return (0);
 		}
 	}
 	return (EPROTONOSUPPORT);
 }
 
 int
 ipproto_unregister(short ipproto)
 {
 	struct protosw *pr;
 
 	/* Sanity checks. */
 	if (ipproto <= 0 || ipproto >= IPPROTO_MAX)
 		return (EPROTONOSUPPORT);
 
 	/* Check if the protocol was indeed registered. */
 	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
 	if (pr == NULL)
 		return (EPFNOSUPPORT);
 	if (ip_protox[ipproto] == pr - inetsw)  /* IPPROTO_RAW */
 		return (ENOENT);
 
 	/* Reset the protocol slot to IPPROTO_RAW. */
 	ip_protox[ipproto] = pr - inetsw;
 	return (0);
 }
 
-/*
- * Given address of next destination (final or next hop), return (referenced)
- * internet address info of interface to be used to get there.
- */
-struct in_ifaddr *
-ip_rtaddr(struct in_addr dst, u_int fibnum)
-{
-	struct route sro;
-	struct sockaddr_in *sin;
-	struct in_ifaddr *ia;
-
-	bzero(&sro, sizeof(sro));
-	sin = (struct sockaddr_in *)&sro.ro_dst;
-	sin->sin_family = AF_INET;
-	sin->sin_len = sizeof(*sin);
-	sin->sin_addr = dst;
-	in_rtalloc_ign(&sro, 0, fibnum);
-
-	if (sro.ro_rt == NULL)
-		return (NULL);
-
-	ia = ifatoia(sro.ro_rt->rt_ifa);
-	ifa_ref(&ia->ia_ifa);
-	RTFREE(sro.ro_rt);
-	return (ia);
-}
-
 u_char inetctlerrmap[PRC_NCMDS] = {
 	0,		0,		0,		0,
 	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
 	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
 	EMSGSIZE,	EHOSTUNREACH,	0,		0,
 	0,		0,		EHOSTUNREACH,	0,
 	ENOPROTOOPT,	ECONNREFUSED
 };
 
 /*
  * Forward a packet.  If some error occurs return the sender
  * an icmp packet.  Note we can't always generate a meaningful
  * icmp message because icmp doesn't have a large enough repertoire
  * of codes and types.
  *
  * If not forwarding, just drop the packet.  This could be confusing
  * if ipforwarding was zero but some routing protocol was advancing
  * us as a gateway to somewhere.  However, we must let the routing
  * protocol deal with that.
  *
  * The srcrt parameter indicates whether the packet is being forwarded
  * via a source route.
  */
 void
 ip_forward(struct mbuf *m, int srcrt)
 {
 	struct ip *ip = mtod(m, struct ip *);
 	struct in_ifaddr *ia;
 	struct mbuf *mcopy;
 	struct sockaddr_in *sin;
 	struct in_addr dest;
 	struct route ro;
 	int error, type = 0, code = 0, mtu = 0;
 
 	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 		return;
 	}
 #ifdef IPSEC
 	if (ip_ipsec_fwd(m) != 0) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 		return;
 	}
 #endif /* IPSEC */
 #ifdef IPSTEALTH
 	if (!V_ipstealth) {
 #endif
 		if (ip->ip_ttl <= IPTTLDEC) {
 			icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
 			    0, 0);
 			return;
 		}
 #ifdef IPSTEALTH
 	}
 #endif
 
 	bzero(&ro, sizeof(ro));
 	sin = (struct sockaddr_in *)&ro.ro_dst;
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(*sin);
 	sin->sin_addr = ip->ip_dst;
 #ifdef RADIX_MPATH
 	rtalloc_mpath_fib(&ro,
 	    ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
 	    M_GETFIB(m));
 #else
 	in_rtalloc_ign(&ro, 0, M_GETFIB(m));
 #endif
 	if (ro.ro_rt != NULL) {
 		ia = ifatoia(ro.ro_rt->rt_ifa);
 		ifa_ref(&ia->ia_ifa);
 	} else
 		ia = NULL;
 #ifndef IPSEC
 	/*
 	 * 'ia' may be NULL if there is no route for this destination.
 	 * In case of IPsec, Don't discard it just yet, but pass it to
 	 * ip_output in case of outgoing IPsec policy.
 	 */
 	if (!srcrt && ia == NULL) {
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
 		RO_RTFREE(&ro);
 		return;
 	}
 #endif
 
 	/*
 	 * Save the IP header and at most 8 bytes of the payload,
 	 * in case we need to generate an ICMP message to the src.
 	 *
 	 * XXX this can be optimized a lot by saving the data in a local
 	 * buffer on the stack (72 bytes at most), and only allocating the
 	 * mbuf if really necessary. The vast majority of the packets
 	 * are forwarded without having to send an ICMP back (either
 	 * because unnecessary, or because rate limited), so we are
 	 * really we are wasting a lot of work here.
 	 *
 	 * We don't use m_copy() because it might return a reference
 	 * to a shared cluster. Both this function and ip_output()
 	 * assume exclusive access to the IP header in `m', so any
 	 * data in a cluster may change before we reach icmp_error().
 	 */
 	mcopy = m_gethdr(M_NOWAIT, m->m_type);
 	if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_NOWAIT)) {
 		/*
 		 * It's probably ok if the pkthdr dup fails (because
 		 * the deep copy of the tag chain failed), but for now
 		 * be conservative and just discard the copy since
 		 * code below may some day want the tags.
 		 */
 		m_free(mcopy);
 		mcopy = NULL;
 	}
 	if (mcopy != NULL) {
 		mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy));
 		mcopy->m_pkthdr.len = mcopy->m_len;
 		m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t));
 	}
 
 #ifdef IPSTEALTH
 	if (!V_ipstealth) {
 #endif
 		ip->ip_ttl -= IPTTLDEC;
 #ifdef IPSTEALTH
 	}
 #endif
 
 	/*
 	 * If forwarding packet using same interface that it came in on,
 	 * perhaps should send a redirect to sender to shortcut a hop.
 	 * Only send redirect if source is sending directly to us,
 	 * and if packet was not source routed (or has any options).
 	 * Also, don't send redirect if forwarding using a default route
 	 * or a route modified by a redirect.
 	 */
 	dest.s_addr = 0;
 	if (!srcrt && V_ipsendredirects &&
 	    ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) {
 		struct rtentry *rt;
 
 		rt = ro.ro_rt;
 
 		if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
 		    satosin(rt_key(rt))->sin_addr.s_addr != 0) {
 #define	RTA(rt)	((struct in_ifaddr *)(rt->rt_ifa))
 			u_long src = ntohl(ip->ip_src.s_addr);
 
 			if (RTA(rt) &&
 			    (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) {
 				if (rt->rt_flags & RTF_GATEWAY)
 					dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr;
 				else
 					dest.s_addr = ip->ip_dst.s_addr;
 				/* Router requirements says to only send host redirects */
 				type = ICMP_REDIRECT;
 				code = ICMP_REDIRECT_HOST;
 			}
 		}
 	}
 
 	error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL);
 
 	if (error == EMSGSIZE && ro.ro_rt)
 		mtu = ro.ro_rt->rt_mtu;
 	RO_RTFREE(&ro);
 
 	if (error)
 		IPSTAT_INC(ips_cantforward);
 	else {
 		IPSTAT_INC(ips_forward);
 		if (type)
 			IPSTAT_INC(ips_redirectsent);
 		else {
 			if (mcopy)
 				m_freem(mcopy);
 			if (ia != NULL)
 				ifa_free(&ia->ia_ifa);
 			return;
 		}
 	}
 	if (mcopy == NULL) {
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		return;
 	}
 
 	switch (error) {
 
 	case 0:				/* forwarded, but need redirect */
 		/* type, code set above */
 		break;
 
 	case ENETUNREACH:
 	case EHOSTUNREACH:
 	case ENETDOWN:
 	case EHOSTDOWN:
 	default:
 		type = ICMP_UNREACH;
 		code = ICMP_UNREACH_HOST;
 		break;
 
 	case EMSGSIZE:
 		type = ICMP_UNREACH;
 		code = ICMP_UNREACH_NEEDFRAG;
 
 #ifdef IPSEC
 		/* 
 		 * If IPsec is configured for this path,
 		 * override any possibly mtu value set by ip_output.
 		 */ 
 		mtu = ip_ipsec_mtu(mcopy, mtu);
 #endif /* IPSEC */
 		/*
 		 * If the MTU was set before make sure we are below the
 		 * interface MTU.
 		 * If the MTU wasn't set before use the interface mtu or
 		 * fall back to the next smaller mtu step compared to the
 		 * current packet size.
 		 */
 		if (mtu != 0) {
 			if (ia != NULL)
 				mtu = min(mtu, ia->ia_ifp->if_mtu);
 		} else {
 			if (ia != NULL)
 				mtu = ia->ia_ifp->if_mtu;
 			else
 				mtu = ip_next_mtu(ntohs(ip->ip_len), 0);
 		}
 		IPSTAT_INC(ips_cantfrag);
 		break;
 
 	case ENOBUFS:
 	case EACCES:			/* ipfw denied packet */
 		m_freem(mcopy);
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		return;
 	}
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 	icmp_error(mcopy, type, code, dest.s_addr, mtu);
 }
 
 void
 ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
     struct mbuf *m)
 {
 
 	if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) {
 		struct bintime bt;
 
 		bintime(&bt);
 		if (inp->inp_socket->so_options & SO_BINTIME) {
 			*mp = sbcreatecontrol((caddr_t)&bt, sizeof(bt),
 			    SCM_BINTIME, SOL_SOCKET);
 			if (*mp)
 				mp = &(*mp)->m_next;
 		}
 		if (inp->inp_socket->so_options & SO_TIMESTAMP) {
 			struct timeval tv;
 
 			bintime2timeval(&bt, &tv);
 			*mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv),
 			    SCM_TIMESTAMP, SOL_SOCKET);
 			if (*mp)
 				mp = &(*mp)->m_next;
 		}
 	}
 	if (inp->inp_flags & INP_RECVDSTADDR) {
 		*mp = sbcreatecontrol((caddr_t)&ip->ip_dst,
 		    sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 	if (inp->inp_flags & INP_RECVTTL) {
 		*mp = sbcreatecontrol((caddr_t)&ip->ip_ttl,
 		    sizeof(u_char), IP_RECVTTL, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 #ifdef notyet
 	/* XXX
 	 * Moving these out of udp_input() made them even more broken
 	 * than they already were.
 	 */
 	/* options were tossed already */
 	if (inp->inp_flags & INP_RECVOPTS) {
 		*mp = sbcreatecontrol((caddr_t)opts_deleted_above,
 		    sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 	/* ip_srcroute doesn't do what we want here, need to fix */
 	if (inp->inp_flags & INP_RECVRETOPTS) {
 		*mp = sbcreatecontrol((caddr_t)ip_srcroute(m),
 		    sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 #endif
 	if (inp->inp_flags & INP_RECVIF) {
 		struct ifnet *ifp;
 		struct sdlbuf {
 			struct sockaddr_dl sdl;
 			u_char	pad[32];
 		} sdlbuf;
 		struct sockaddr_dl *sdp;
 		struct sockaddr_dl *sdl2 = &sdlbuf.sdl;
 
 		if ((ifp = m->m_pkthdr.rcvif) &&
 		    ifp->if_index && ifp->if_index <= V_if_index) {
 			sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr;
 			/*
 			 * Change our mind and don't try copy.
 			 */
 			if (sdp->sdl_family != AF_LINK ||
 			    sdp->sdl_len > sizeof(sdlbuf)) {
 				goto makedummy;
 			}
 			bcopy(sdp, sdl2, sdp->sdl_len);
 		} else {
 makedummy:	
 			sdl2->sdl_len =
 			    offsetof(struct sockaddr_dl, sdl_data[0]);
 			sdl2->sdl_family = AF_LINK;
 			sdl2->sdl_index = 0;
 			sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0;
 		}
 		*mp = sbcreatecontrol((caddr_t)sdl2, sdl2->sdl_len,
 		    IP_RECVIF, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 	if (inp->inp_flags & INP_RECVTOS) {
 		*mp = sbcreatecontrol((caddr_t)&ip->ip_tos,
 		    sizeof(u_char), IP_RECVTOS, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 	if (inp->inp_flags2 & INP_RECVFLOWID) {
 		uint32_t flowid, flow_type;
 
 		flowid = m->m_pkthdr.flowid;
 		flow_type = M_HASHTYPE_GET(m);
 
 		/*
 		 * XXX should handle the failure of one or the
 		 * other - don't populate both?
 		 */
 		*mp = sbcreatecontrol((caddr_t) &flowid,
 		    sizeof(uint32_t), IP_FLOWID, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 		*mp = sbcreatecontrol((caddr_t) &flow_type,
 		    sizeof(uint32_t), IP_FLOWTYPE, IPPROTO_IP);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 #ifdef	RSS
 	if (inp->inp_flags2 & INP_RECVRSSBUCKETID) {
 		uint32_t flowid, flow_type;
 		uint32_t rss_bucketid;
 
 		flowid = m->m_pkthdr.flowid;
 		flow_type = M_HASHTYPE_GET(m);
 
 		if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) {
 			*mp = sbcreatecontrol((caddr_t) &rss_bucketid,
 			   sizeof(uint32_t), IP_RSSBUCKETID, IPPROTO_IP);
 			if (*mp)
 				mp = &(*mp)->m_next;
 		}
 	}
 #endif
 }
 
 /*
  * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the
  * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on
  * locking.  This code remains in ip_input.c as ip_mroute.c is optionally
  * compiled.
  */
 static VNET_DEFINE(int, ip_rsvp_on);
 VNET_DEFINE(struct socket *, ip_rsvpd);
 
 #define	V_ip_rsvp_on		VNET(ip_rsvp_on)
 
 int
 ip_rsvp_init(struct socket *so)
 {
 
 	if (so->so_type != SOCK_RAW ||
 	    so->so_proto->pr_protocol != IPPROTO_RSVP)
 		return EOPNOTSUPP;
 
 	if (V_ip_rsvpd != NULL)
 		return EADDRINUSE;
 
 	V_ip_rsvpd = so;
 	/*
 	 * This may seem silly, but we need to be sure we don't over-increment
 	 * the RSVP counter, in case something slips up.
 	 */
 	if (!V_ip_rsvp_on) {
 		V_ip_rsvp_on = 1;
 		V_rsvp_on++;
 	}
 
 	return 0;
 }
 
 int
 ip_rsvp_done(void)
 {
 
 	V_ip_rsvpd = NULL;
 	/*
 	 * This may seem silly, but we need to be sure we don't over-decrement
 	 * the RSVP counter, in case something slips up.
 	 */
 	if (V_ip_rsvp_on) {
 		V_ip_rsvp_on = 0;
 		V_rsvp_on--;
 	}
 	return 0;
 }
 
 int
 rsvp_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m;
 
 	m = *mp;
 	*mp = NULL;
 
 	if (rsvp_input_p) { /* call the real one if loaded */
 		*mp = m;
 		rsvp_input_p(mp, offp, proto);
 		return (IPPROTO_DONE);
 	}
 
 	/* Can still get packets with rsvp_on = 0 if there is a local member
 	 * of the group to which the RSVP packet is addressed.  But in this
 	 * case we want to throw the packet away.
 	 */
 	
 	if (!V_rsvp_on) {
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	if (V_ip_rsvpd != NULL) { 
 		*mp = m;
 		rip_input(mp, offp, proto);
 		return (IPPROTO_DONE);
 	}
 	/* Drop the packet */
 	m_freem(m);
 	return (IPPROTO_DONE);
 }
Index: head/sys/netinet/ip_options.c
===================================================================
--- head/sys/netinet/ip_options.c	(revision 292014)
+++ head/sys/netinet/ip_options.c	(revision 292015)
@@ -1,753 +1,757 @@
 /*
  * Copyright (c) 1982, 1986, 1988, 1993
  *      The Regents of the University of California.
  * Copyright (c) 2005 Andre Oppermann, Internet Business Solutions AG.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ipstealth.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/netisr.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/ip_icmp.h>
 #include <machine/in_cksum.h>
 
 #include <sys/socketvar.h>
 
 /*
  * Consulted by ip_dooptions(): when zero, an [LS]SRR packet that would have
  * to be sent on to a further hop is refused instead of forwarded.
  */
 static VNET_DEFINE(int, ip_dosourceroute);
 SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_dosourceroute), 0,
     "Enable forwarding source routed IP packets");
 #define	V_ip_dosourceroute	VNET(ip_dosourceroute)
 
 /*
  * Consulted by ip_dooptions(): when zero, a source-routed packet whose route
  * ends at this host is refused instead of accepted.
  */
 static VNET_DEFINE(int,	ip_acceptsourceroute);
 SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute, 
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_acceptsourceroute), 0, 
     "Enable accepting source routed IP packets");
 #define	V_ip_acceptsourceroute	VNET(ip_acceptsourceroute)
 
 /* Global policy checked first thing in ip_dooptions(); defaults to 1. */
 VNET_DEFINE(int, ip_doopts) = 1; /* 0 = ignore, 1 = process, 2 = reject */
 SYSCTL_INT(_net_inet_ip, OID_AUTO, process_options, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ip_doopts), 0, "Enable IP options processing ([LS]SRR, RR, TS)");
 
 /* Forward declaration; the definition follows ip_dooptions(). */
 static void	save_rte(struct mbuf *m, u_char *, struct in_addr);
 
 /*
  * Do option processing on a datagram, possibly discarding it if bad options
  * are encountered, or forwarding it if source-routed.
  *
  * The pass argument is used when operating in the IPSTEALTH mode to tell
  * what options to process: [LS]SRR (pass 0) or the others (pass 1).  The
  * reason for as many as two passes is that when doing IPSTEALTH, non-routing
  * options should be processed only if the packet is for us.
  *
  * On every error path the locals 'type' and 'code' select the ICMP message
  * handed to icmp_error() at the 'bad' label.  While type keeps its default
  * of ICMP_PARAMPROB, code is the byte offset of the offending option field
  * measured from the start of the IP header.
  *
  * Returns 1 if packet has been forwarded/freed, 0 if the packet should be
  * processed further.
  */
 int
 ip_dooptions(struct mbuf *m, int pass)
 {
 	struct ip *ip = mtod(m, struct ip *);
 	u_char *cp;
 	struct in_ifaddr *ia;
 	int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
 	struct in_addr *sin, dst;
 	uint32_t ntime;
 	struct nhop4_extended nh_ext;
 	struct	sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET };
 
 	/* Ignore or reject packets with IP options. */
 	if (V_ip_doopts == 0)
 		return 0;
 	else if (V_ip_doopts == 2) {
 		type = ICMP_UNREACH;
 		code = ICMP_UNREACH_FILTER_PROHIB;
 		goto bad;
 	}
 
 	/*
 	 * Remember the destination the packet arrived with: source-route
 	 * handling below may overwrite ip->ip_dst, while the timestamp
 	 * option still needs the original value.
 	 */
 	dst = ip->ip_dst;
 	cp = (u_char *)(ip + 1);
 	/* Number of option bytes that follow the fixed-size header. */
 	cnt = (ip->ip_hl << 2) - sizeof (struct ip);
 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
 		opt = cp[IPOPT_OPTVAL];
 		if (opt == IPOPT_EOL)
 			break;
 		if (opt == IPOPT_NOP)
 			optlen = 1;
 		else {
 			/*
 			 * Any other option carries a length byte.  It must be
 			 * present, cover at least the type and length bytes,
 			 * and not run past the end of the header.
 			 */
 			if (cnt < IPOPT_OLEN + sizeof(*cp)) {
 				code = &cp[IPOPT_OLEN] - (u_char *)ip;
 				goto bad;
 			}
 			optlen = cp[IPOPT_OLEN];
 			if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
 				code = &cp[IPOPT_OLEN] - (u_char *)ip;
 				goto bad;
 			}
 		}
 		switch (opt) {
 
 		default:
 			break;
 
 		/*
 		 * Source routing with record.  Find interface with current
 		 * destination address.  If none on this machine then drop if
 		 * strictly routed, or do nothing if loosely routed.  Record
 		 * interface address and bring up next address component.  If
 		 * strictly routed make sure next address is on directly
 		 * accessible net.
 		 */
 		case IPOPT_LSRR:
 		case IPOPT_SSRR:
 #ifdef IPSTEALTH
 			if (V_ipstealth && pass > 0)
 				break;
 #endif
 			if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
 				code = &cp[IPOPT_OLEN] - (u_char *)ip;
 				goto bad;
 			}
 			if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
 				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
 				goto bad;
 			}
 			ipaddr.sin_addr = ip->ip_dst;
 			if (ifa_ifwithaddr_check((struct sockaddr *)&ipaddr)
 			    == 0) {
 				if (opt == IPOPT_SSRR) {
 					type = ICMP_UNREACH;
 					code = ICMP_UNREACH_SRCFAIL;
 					goto bad;
 				}
 				if (!V_ip_dosourceroute)
 					goto nosourcerouting;
 				/*
 				 * Loose routing, and not at next destination
 				 * yet; nothing to do except forward.
 				 */
 				break;
 			}
 			off--;			/* 0 origin */
 			if (off > optlen - (int)sizeof(struct in_addr)) {
 				/*
 				 * End of source route.  Should be for us.
 				 */
 				if (!V_ip_acceptsourceroute)
 					goto nosourcerouting;
 				save_rte(m, cp, ip->ip_src);
 				break;
 			}
 #ifdef IPSTEALTH
 			if (V_ipstealth)
 				goto dropit;
 #endif
 			/*
 			 * NOTE: the nosourcerouting and dropit labels live
 			 * inside the branches below and are also reached by
 			 * goto from the checks above.
 			 */
 			if (!V_ip_dosourceroute) {
 				if (V_ipforwarding) {
 					char buf[16]; /* aaa.bbb.ccc.ddd\0 */
 					/*
 					 * Acting as a router, so generate
 					 * ICMP
 					 */
 nosourcerouting:
 					strcpy(buf, inet_ntoa(ip->ip_dst));
 					log(LOG_WARNING, 
 					    "attempted source route from %s to %s\n",
 					    inet_ntoa(ip->ip_src), buf);
 					type = ICMP_UNREACH;
 					code = ICMP_UNREACH_SRCFAIL;
 					goto bad;
 				} else {
 					/*
 					 * Not acting as a router, so
 					 * silently drop.
 					 */
 #ifdef IPSTEALTH
 dropit:
 #endif
 					IPSTAT_INC(ips_cantforward);
 					m_freem(m);
 					return (1);
 				}
 			}
 
 			/*
 			 * locate outgoing interface
 			 */
 			(void)memcpy(&ipaddr.sin_addr, cp + off,
 			    sizeof(ipaddr.sin_addr));
 
 			/* Error reported if no way to the next hop is found. */
 			type = ICMP_UNREACH;
 			code = ICMP_UNREACH_SRCFAIL;
 
 			/*
 			 * Strict route: look for an interface whose peer or
 			 * network matches the next hop.  Loose route: a FIB
 			 * lookup is enough.  Either way the slot that held
 			 * the next hop is overwritten with a local address.
 			 */
 			if (opt == IPOPT_SSRR) {
 #define	INA	struct in_ifaddr *
 #define	SA	struct sockaddr *
 			    ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr,
 					    RT_ALL_FIBS);
 			    if (ia == NULL)
 				    ia = (INA)ifa_ifwithnet((SA)&ipaddr, 0,
 						    RT_ALL_FIBS);
 				if (ia == NULL)
 					goto bad;
 
 				memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
 				    sizeof(struct in_addr));
 				ifa_free(&ia->ia_ifa);
 			} else {
 				/* XXX MRT 0 for routing */
 				if (fib4_lookup_nh_ext(M_GETFIB(m),
 				    ipaddr.sin_addr, 0, 0, &nh_ext) != 0)
 					goto bad;
 
 				memcpy(cp + off, &nh_ext.nh_src,
 				    sizeof(struct in_addr));
 			}
 
 			/* The next hop becomes the new destination. */
 			ip->ip_dst = ipaddr.sin_addr;
 			cp[IPOPT_OFFSET] += sizeof(struct in_addr);
 			/*
 			 * Let ip_intr's mcast routing check handle mcast pkts
 			 */
 			forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr));
 			break;
 
 		case IPOPT_RR:
 #ifdef IPSTEALTH
 			if (V_ipstealth && pass == 0)
 				break;
 #endif
 			if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
 				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
 				goto bad;
 			}
 			if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
 				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
 				goto bad;
 			}
 			/*
 			 * If no space remains, ignore.
 			 */
 			off--;			/* 0 origin */
 			if (off > optlen - (int)sizeof(struct in_addr))
 				break;
 			(void)memcpy(&ipaddr.sin_addr, &ip->ip_dst,
 			    sizeof(ipaddr.sin_addr));
 			/*
 			 * Locate outgoing interface; if we're the
 			 * destination, use the incoming interface (should be
 			 * same).
 			 */
-			if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == NULL &&
-			    (ia = ip_rtaddr(ipaddr.sin_addr, M_GETFIB(m))) == NULL) {
+			if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) != NULL) {
+				memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
+				    sizeof(struct in_addr));
+				ifa_free(&ia->ia_ifa);
+			} else if (fib4_lookup_nh_ext(M_GETFIB(m),
+			    ipaddr.sin_addr, 0, 0, &nh_ext) == 0) {
+				memcpy(cp + off, &nh_ext.nh_src,
+				    sizeof(struct in_addr));
+			} else {
 				type = ICMP_UNREACH;
 				code = ICMP_UNREACH_HOST;
 				goto bad;
 			}
-			(void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
-			    sizeof(struct in_addr));
-			ifa_free(&ia->ia_ifa);
 			/* Step the option pointer past the slot just filled. */
 			cp[IPOPT_OFFSET] += sizeof(struct in_addr);
 			break;
 
 		case IPOPT_TS:
 #ifdef IPSTEALTH
 			if (V_ipstealth && pass == 0)
 				break;
 #endif
 			code = cp - (u_char *)ip;
 			if (optlen < 4 || optlen > 40) {
 				code = &cp[IPOPT_OLEN] - (u_char *)ip;
 				goto bad;
 			}
 			if ((off = cp[IPOPT_OFFSET]) < 5) {
 				code = &cp[IPOPT_OLEN] - (u_char *)ip;
 				goto bad;
 			}
 			/*
 			 * No room for another timestamp: bump the count kept
 			 * in the upper four bits of the flags byte (the
 			 * overflow field of RFC 791).  A result of zero means
 			 * that count itself wrapped, which is an error.
 			 */
 			if (off > optlen - (int)sizeof(int32_t)) {
 				cp[IPOPT_OFFSET + 1] += (1 << 4);
 				if ((cp[IPOPT_OFFSET + 1] & 0xf0) == 0) {
 					code = &cp[IPOPT_OFFSET] - (u_char *)ip;
 					goto bad;
 				}
 				break;
 			}
 			off--;				/* 0 origin */
 			sin = (struct in_addr *)(cp + off);
 			/* The low four bits of the flags byte pick the format. */
 			switch (cp[IPOPT_OFFSET + 1] & 0x0f) {
 
 			case IPOPT_TS_TSONLY:
 				break;
 
 			case IPOPT_TS_TSANDADDR:
 				if (off + sizeof(uint32_t) +
 				    sizeof(struct in_addr) > optlen) {
 					code = &cp[IPOPT_OFFSET] - (u_char *)ip;
 					goto bad;
 				}
 				ipaddr.sin_addr = dst;
 				ia = (INA)ifaof_ifpforaddr((SA)&ipaddr,
 							    m->m_pkthdr.rcvif);
 				/*
 				 * 'continue' applies to the enclosing for
 				 * loop: no timestamp is written and the loop
 				 * step moves on to the next option.
 				 */
 				if (ia == NULL)
 					continue;
 				(void)memcpy(sin, &IA_SIN(ia)->sin_addr,
 				    sizeof(struct in_addr));
 				ifa_free(&ia->ia_ifa);
 				cp[IPOPT_OFFSET] += sizeof(struct in_addr);
 				off += sizeof(struct in_addr);
 				break;
 
 			case IPOPT_TS_PRESPEC:
 				if (off + sizeof(uint32_t) +
 				    sizeof(struct in_addr) > optlen) {
 					code = &cp[IPOPT_OFFSET] - (u_char *)ip;
 					goto bad;
 				}
 				(void)memcpy(&ipaddr.sin_addr, sin,
 				    sizeof(struct in_addr));
 				/* Slot names another host: skip this option. */
 				if (ifa_ifwithaddr_check((SA)&ipaddr) == 0)
 					continue;
 				cp[IPOPT_OFFSET] += sizeof(struct in_addr);
 				off += sizeof(struct in_addr);
 				break;
 
 			default:
 				code = &cp[IPOPT_OFFSET + 1] - (u_char *)ip;
 				goto bad;
 			}
 			/*
 			 * Store the timestamp at 'off', which by now lies
 			 * past any address slot handled above.
 			 */
 			ntime = iptime();
 			(void)memcpy(cp + off, &ntime, sizeof(uint32_t));
 			cp[IPOPT_OFFSET] += sizeof(uint32_t);
 		}
 	}
 	/*
 	 * 'forward' is set only after a source route advanced to a
 	 * non-multicast next hop.
 	 */
 	if (forward && V_ipforwarding) {
 		ip_forward(m, 1);
 		return (1);
 	}
 	return (0);
 bad:
 	/* Report the problem to the sender and count the bad packet. */
 	icmp_error(m, type, code, 0, 0);
 	IPSTAT_INC(ips_badoptions);
 	return (1);
 }
 
 /*
  * Save incoming source route for use in replies, to be picked up later by
  * ip_srcroute if the receiver is interested.
  */
 static void
 save_rte(struct mbuf *m, u_char *option, struct in_addr dst)
 {
 	struct ipopt_tag *tag;
 	unsigned optlen;
 
 	/* Best effort only: silently give up if no tag can be had. */
 	tag = (struct ipopt_tag *)m_tag_get(PACKET_TAG_IPOPTIONS,
 	    sizeof(*tag), M_NOWAIT);
 	if (tag == NULL)
 		return;
 
 	optlen = option[IPOPT_OLEN];
 	if (optlen <= sizeof(tag->ip_srcrt) - (1 + sizeof(dst))) {
 		/*
 		 * The option fits: keep a verbatim copy, note how many
 		 * addresses follow its three-byte header, and attach the
 		 * tag to the packet.
 		 */
 		memcpy(tag->ip_srcrt.srcopt, option, optlen);
 		tag->ip_nhops = (optlen - IPOPT_OFFSET - 1) /
 		    sizeof(struct in_addr);
 		tag->ip_srcrt.dst = dst;
 		m_tag_prepend(m, (struct m_tag *)tag);
 		return;
 	}
 
 	/* Too long to store; throw the unused tag away. */
 	m_tag_free((struct m_tag *)tag);
 }
 
 /*
  * Retrieve incoming source route for use in replies, in the same form used
  * by setsockopt.  The first hop is placed before the options, will be
  * removed later.
  */
 struct mbuf *
 ip_srcroute(struct mbuf *m0)
 {
 	struct in_addr *p, *q;
 	struct mbuf *m;
 	struct ipopt_tag *opts;
 
 	opts = (struct ipopt_tag *)m_tag_find(m0, PACKET_TAG_IPOPTIONS, NULL);
 	if (opts == NULL)
 		return (NULL);
 
 	if (opts->ip_nhops == 0)
 		return (NULL);
 	m = m_get(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (NULL);
 
 #define OPTSIZ	(sizeof(opts->ip_srcrt.nop) + sizeof(opts->ip_srcrt.srcopt))
 
 	/* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */
 	m->m_len = opts->ip_nhops * sizeof(struct in_addr) +
 	    sizeof(struct in_addr) + OPTSIZ;
 
 	/*
 	 * First, save first hop for return route.
 	 */
 	p = &(opts->ip_srcrt.route[opts->ip_nhops - 1]);
 	*(mtod(m, struct in_addr *)) = *p--;
 
 	/*
 	 * Copy option fields and padding (nop) to mbuf.
 	 */
 	opts->ip_srcrt.nop = IPOPT_NOP;
 	opts->ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF;
 	(void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr),
 	    &(opts->ip_srcrt.nop), OPTSIZ);
 	q = (struct in_addr *)(mtod(m, caddr_t) +
 	    sizeof(struct in_addr) + OPTSIZ);
 #undef OPTSIZ
 	/*
 	 * Record return path as an IP source route, reversing the path
 	 * (pointers are now aligned).
 	 */
 	while (p >= opts->ip_srcrt.route) {
 		*q++ = *p--;
 	}
 	/*
 	 * Last hop goes to final destination.
 	 */
 	*q = opts->ip_srcrt.dst;
 	m_tag_delete(m0, (struct m_tag *)opts);
 	return (m);
 }
 
 /*
  * Strip out IP options, at higher level protocol in the kernel.
  */
 void
 ip_stripoptions(struct mbuf *m)
 {
 	struct ip *hdr;
 	int optbytes;
 
 	hdr = mtod(m, struct ip *);
 	optbytes = (hdr->ip_hl << 2) - sizeof(struct ip);
 
 	/* Take the option bytes out of every length that counts them. */
 	m->m_len -= optbytes;
 	if (m->m_flags & M_PKTHDR)
 		m->m_pkthdr.len -= optbytes;
 	hdr->ip_len = htons(ntohs(hdr->ip_len) - optbytes);
 	hdr->ip_hl = sizeof(struct ip) >> 2;
 
 	/*
 	 * Slide whatever followed the options in this mbuf up against the
 	 * fixed header; m_len has already been reduced above.
 	 */
 	bcopy((char *)hdr + sizeof(struct ip) + optbytes, (hdr + 1),
 	    (size_t )(m->m_len - sizeof(struct ip)));
 }
 
 /*
  * Insert IP options into preformed packet.  Adjust IP destination as
  * required for IP source routing, as indicated by a non-zero in_addr at the
  * start of the options.
  *
  * XXX This routine assumes that the packet has no options in place.
  */
 struct mbuf *
 ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
 {
 	struct ipoption *ipo;
 	struct ip *iph;
 	struct mbuf *hdr;
 	unsigned optlen;
 
 	ipo = mtod(opt, struct ipoption *);
 	iph = mtod(m, struct ip *);
 
 	/* Bytes of real options, not counting the leading first-hop slot. */
 	optlen = opt->m_len - sizeof(ipo->ipopt_dst);
 	if (optlen + ntohs(iph->ip_len) > IP_MAXPACKET) {
 		*phlen = 0;
 		return (m);		/* XXX should fail */
 	}
 
 	/* A non-zero first hop replaces the packet's destination. */
 	if (ipo->ipopt_dst.s_addr)
 		iph->ip_dst = ipo->ipopt_dst;
 
 	if (M_WRITABLE(m) && M_LEADINGSPACE(m) >= optlen) {
 		/* Enough writable room in front: grow the mbuf backwards. */
 		m->m_data -= optlen;
 		m->m_len += optlen;
 		m->m_pkthdr.len += optlen;
 	} else {
 		/*
 		 * Otherwise put header and options in a fresh mbuf placed
 		 * ahead of the old one, which keeps only the payload.
 		 */
 		hdr = m_gethdr(M_NOWAIT, MT_DATA);
 		if (hdr == NULL) {
 			*phlen = 0;
 			return (m);
 		}
 		m_move_pkthdr(hdr, m);
 		hdr->m_pkthdr.rcvif = NULL;
 		hdr->m_pkthdr.len += optlen;
 		m->m_len -= sizeof(struct ip);
 		m->m_data += sizeof(struct ip);
 		hdr->m_next = m;
 		m = hdr;
 		m->m_len = optlen + sizeof(struct ip);
 		m->m_data += max_linkhdr;
 	}
 
 	/* Move the fixed header to its new place, then add the options. */
 	bcopy(iph, mtod(m, void *), sizeof(struct ip));
 	iph = mtod(m, struct ip *);
 	bcopy(ipo->ipopt_list, iph + 1, optlen);
 
 	*phlen = sizeof(struct ip) + optlen;
 	iph->ip_v = IPVERSION;
 	iph->ip_hl = *phlen >> 2;
 	iph->ip_len = htons(ntohs(iph->ip_len) + optlen);
 	return (m);
 }
 
 /*
  * Copy options from ip to jp, omitting those not copied during
  * fragmentation.
  */
 int
 ip_optcopy(struct ip *ip, struct ip *jp)
 {
 	u_char *src, *dst;
 	int left, len, type;
 
 	src = (u_char *)(ip + 1);
 	dst = (u_char *)(jp + 1);
 	left = (ip->ip_hl << 2) - sizeof (struct ip);
 
 	while (left > 0) {
 		type = src[0];
 		if (type == IPOPT_EOL)
 			break;
 
 		if (type == IPOPT_NOP) {
 			/* Preserve for IP mcast tunnel's LSRR alignment. */
 			*dst++ = IPOPT_NOP;
 			len = 1;
 		} else {
 			KASSERT(left >= IPOPT_OLEN + sizeof(*src),
 			    ("ip_optcopy: malformed ipv4 option"));
 			len = src[IPOPT_OLEN];
 			KASSERT(len >= IPOPT_OLEN + sizeof(*src) &&
 			    len <= left,
 			    ("ip_optcopy: malformed ipv4 option"));
 
 			/*
 			 * Bogus lengths should have been caught by
 			 * ip_dooptions.
 			 */
 			if (len > left)
 				len = left;
 			if (IPOPT_COPIED(type)) {
 				bcopy(src, dst, len);
 				dst += len;
 			}
 		}
 
 		/* Step over the option just examined. */
 		left -= len;
 		src += len;
 	}
 
 	/* Pad the copied options with EOL up to a multiple of four bytes. */
 	for (len = dst - (u_char *)(jp+1); len & 0x3; len++)
 		*dst++ = IPOPT_EOL;
 	return (len);
 }
 
 /*
  * Set up IP options in pcb for insertion in output packets.  Store in mbuf
  * with pointer in pcbopt, adding pseudo-option with destination address if
  * source routed.
  */
 /*
  * Takes over 'm' in every case: it is either stored in inp->inp_options or
  * freed.  Returns 0 on success (including the "just clear the options" case
  * of a NULL or empty mbuf) and EINVAL if the option list is malformed.
  *
  * NOTE: the options previously attached to the pcb are released before the
  * new list is validated, so they are gone even when EINVAL is returned.
  *
  * NOTE(review): optname is not referenced anywhere in the body.
  */
 int
 ip_pcbopts(struct inpcb *inp, int optname, struct mbuf *m)
 {
 	int cnt, optlen;
 	u_char *cp;
 	struct mbuf **pcbopt;
 	u_char opt;
 
 	INP_WLOCK_ASSERT(inp);
 
 	pcbopt = &inp->inp_options;
 
 	/* turn off any old options */
 	if (*pcbopt)
 		(void)m_free(*pcbopt);
 	*pcbopt = 0;
 	if (m == NULL || m->m_len == 0) {
 		/*
 		 * Only turning off any previous options.
 		 */
 		if (m != NULL)
 			(void)m_free(m);
 		return (0);
 	}
 
 	/* The option list must be a whole number of 32-bit words. */
 	if (m->m_len % sizeof(int32_t))
 		goto bad;
 	/*
 	 * IP first-hop destination address will be stored before actual
 	 * options; move other options back and clear it when none present.
 	 */
 	/* Make sure m_dat has room for the data plus one more in_addr. */
 	if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN])
 		goto bad;
 	cnt = m->m_len;
 	m->m_len += sizeof(struct in_addr);
 	cp = mtod(m, u_char *) + sizeof(struct in_addr);
 	/* Shift the options up by one in_addr and zero the freed slot. */
 	bcopy(mtod(m, void *), cp, (unsigned)cnt);
 	bzero(mtod(m, void *), sizeof(struct in_addr));
 
 	/* Walk the shifted options, checking each option's length. */
 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
 		opt = cp[IPOPT_OPTVAL];
 		if (opt == IPOPT_EOL)
 			break;
 		if (opt == IPOPT_NOP)
 			optlen = 1;
 		else {
 			if (cnt < IPOPT_OLEN + sizeof(*cp))
 				goto bad;
 			optlen = cp[IPOPT_OLEN];
 			if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
 				goto bad;
 		}
 		switch (opt) {
 
 		default:
 			break;
 
 		case IPOPT_LSRR:
 		case IPOPT_SSRR:
 			/*
 			 * User process specifies route as:
 			 *
 			 *	->A->B->C->D
 			 *
 			 * D must be our final destination (but we can't
 			 * check that since we may not have connected yet).
 			 * A is first hop destination, which doesn't appear
 			 * in actual IP option, but is stored before the
 			 * options.
 			 */
 			/* XXX-BZ PRIV_NETINET_SETHDROPTS? */
 			/* The option must hold at least one address. */
 			if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr))
 				goto bad;
 			/*
 			 * The first address leaves the option, so the mbuf,
 			 * the remaining count and the option's own length
 			 * byte all shrink by one in_addr.  The loop step
 			 * afterwards uses the reduced optlen.
 			 */
 			m->m_len -= sizeof(struct in_addr);
 			cnt -= sizeof(struct in_addr);
 			optlen -= sizeof(struct in_addr);
 			cp[IPOPT_OLEN] = optlen;
 			/*
 			 * Move first hop before start of options.
 			 */
 			bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t),
 			    sizeof(struct in_addr));
 			/*
 			 * Then copy rest of options back
 			 * to close up the deleted entry.
 			 */
 			bcopy((&cp[IPOPT_OFFSET+1] + sizeof(struct in_addr)),
 			    &cp[IPOPT_OFFSET+1],
 			    (unsigned)cnt - (IPOPT_MINOFF - 1));
 			break;
 		}
 	}
 	/* Options plus the first-hop slot must fit in a struct ipoption. */
 	if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr))
 		goto bad;
 	*pcbopt = m;
 	return (0);
 
 bad:
 	(void)m_free(m);
 	return (EINVAL);
 }
 
 /*
  * Check for the presence of the IP Router Alert option [RFC2113]
  * in the header of an IPv4 datagram.
  *
  * This call is not intended for use from the forwarding path; it is here
  * so that protocol domains may check for the presence of the option.
  * Given how FreeBSD's IPv4 stack is currently structured, the Router Alert
  * option does not have much relevance to the implementation, though this
  * may change in future.
  * Router alert options SHOULD be passed if running in IPSTEALTH mode and
  * we are not the endpoint.
  * Length checks on individual options should already have been performed
  * by ip_dooptions() therefore they are folded under INVARIANTS here.
  *
  * Return zero if not present or options are invalid, non-zero if present.
  */
 int
 ip_checkrouteralert(struct mbuf *m)
 {
 	struct ip *iph;
 	u_char *optp;
 	int found, left, len, type;
 
 	iph = mtod(m, struct ip *);
 	optp = (u_char *)(iph + 1);
 	left = (iph->ip_hl << 2) - sizeof (struct ip);
 	found = 0;
 
 	while (left > 0) {
 		type = optp[IPOPT_OPTVAL];
 		if (type == IPOPT_EOL)
 			break;
 
 		if (type == IPOPT_NOP) {
 			len = 1;
 		} else {
 			/* Length sanity checks exist only under INVARIANTS. */
 #ifdef INVARIANTS
 			if (left < IPOPT_OLEN + sizeof(*optp))
 				break;
 #endif
 			len = optp[IPOPT_OLEN];
 #ifdef INVARIANTS
 			if (len < IPOPT_OLEN + sizeof(*optp) || len > left)
 				break;
 #endif
 		}
 
 		if (type == IPOPT_RA) {
 			/*
 			 * Under INVARIANTS the option only counts when it
 			 * has the expected length and a zero value.
 			 */
 #ifdef INVARIANTS
 			if (len == IPOPT_OFFSET + sizeof(uint16_t) &&
 			    (*((uint16_t *)&optp[IPOPT_OFFSET]) == 0))
 #endif
 			found = 1;
 		}
 
 		/* Move on to the next option. */
 		left -= len;
 		optp += len;
 	}
 
 	return (found);
 }
Index: head/sys/netinet/ip_var.h
===================================================================
--- head/sys/netinet/ip_var.h	(revision 292014)
+++ head/sys/netinet/ip_var.h	(revision 292015)
@@ -1,306 +1,304 @@
 /*-
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_var.h	8.2 (Berkeley) 1/9/95
  * $FreeBSD$
  */
 
 #ifndef _NETINET_IP_VAR_H_
 #define	_NETINET_IP_VAR_H_
 
 #include <sys/queue.h>
 
 /*
  * Overlay for ip header used by other protocols (tcp, udp).
  */
 /*
  * NOTE(review): with a 4-byte struct in_addr the members add up to
  * 9 + 1 + 2 + 4 + 4 = 20 bytes; presumably this is meant to line up with
  * the option-less IPv4 header -- confirm against <netinet/ip.h>.
  */
 struct ipovly {
 	u_char	ih_x1[9];		/* (unused) */
 	u_char	ih_pr;			/* protocol */
 	u_short	ih_len;			/* protocol length */
 	struct	in_addr ih_src;		/* source internet address */
 	struct	in_addr ih_dst;		/* destination internet address */
 };
 
 #ifdef _KERNEL
 /*
  * Ip reassembly queue structure.  Each fragment
  * being reassembled is attached to one of these structures.
  * They are timed out after ipq_ttl drops to 0, and may also
  * be reclaimed if memory becomes tight.
  */
 struct ipq {
 	TAILQ_ENTRY(ipq) ipq_list;	/* to other reass headers */
 	u_char	ipq_ttl;		/* time for reass q to live */
 	u_char	ipq_p;			/* protocol of this fragment */
 	u_short	ipq_id;			/* sequence id for reassembly */
 	struct mbuf *ipq_frags;		/* to ip headers of fragments */
 	/* presumably src/dst of the datagram being reassembled -- confirm */
 	struct	in_addr ipq_src,ipq_dst;
 	u_char	ipq_nfrags;		/* # frags in this packet */
 	struct label *ipq_label;	/* MAC label */
 };
 #endif /* _KERNEL */
 
 /*
  * Structure stored in mbuf in inpcb.inp_options
  * and passed to ip_output when ip options are in use.
  * The actual length of the options (including ipopt_dst)
  * is in m_len.
  */
 #define MAX_IPOPTLEN	40
 
 /*
  * ip_pcbopts() zeroes ipopt_dst and fills it only when the option list
  * contains an LSRR/SSRR option; ip_insertoptions() overrides the packet's
  * destination with it only when it is non-zero.
  */
 struct ipoption {
 	struct	in_addr ipopt_dst;	/* first-hop dst if source routed */
 	char	ipopt_list[MAX_IPOPTLEN];	/* options proper */
 };
 
 /*
  * Structure attached to inpcb.ip_moptions and
  * passed to ip_output when IP multicast options are in use.
  * This structure is lazy-allocated.
  */
 struct ip_moptions {
 	struct	ifnet *imo_multicast_ifp; /* ifp for outgoing multicasts */
 	struct in_addr imo_multicast_addr; /* ifindex/addr on MULTICAST_IF */
 	u_long	imo_multicast_vif;	/* vif num outgoing multicasts */
 	u_char	imo_multicast_ttl;	/* TTL for outgoing multicasts */
 	u_char	imo_multicast_loop;	/* 1 => hear sends if a member */
 	u_short	imo_num_memberships;	/* no. memberships this socket */
 	u_short	imo_max_memberships;	/* max memberships this socket */
 	struct	in_multi **imo_membership;	/* group memberships */
 	struct	in_mfilter *imo_mfilters;	/* source filters */
 	/* <sys/queue.h> STAILQ linkage to other ip_moptions structures */
 	STAILQ_ENTRY(ip_moptions) imo_link;
 };
 
 /*
  * IPv4 statistics counters.
  *
  * Every member must stay a uint64_t: KMOD_IPSTAT_INC() and KMOD_IPSTAT_DEC()
  * below turn a member name into a counter index by dividing its offsetof()
  * by sizeof(uint64_t).
  */
 struct	ipstat {
 	uint64_t ips_total;		/* total packets received */
 	uint64_t ips_badsum;		/* checksum bad */
 	uint64_t ips_tooshort;		/* packet too short */
 	uint64_t ips_toosmall;		/* not enough data */
 	uint64_t ips_badhlen;		/* ip header length < data size */
 	uint64_t ips_badlen;		/* ip length < ip header length */
 	uint64_t ips_fragments;		/* fragments received */
 	uint64_t ips_fragdropped;	/* frags dropped (dups, out of space) */
 	uint64_t ips_fragtimeout;	/* fragments timed out */
 	uint64_t ips_forward;		/* packets forwarded */
 	uint64_t ips_fastforward;	/* packets fast forwarded */
 	uint64_t ips_cantforward;	/* packets rcvd for unreachable dest */
 	uint64_t ips_redirectsent;	/* packets forwarded on same net */
 	uint64_t ips_noproto;		/* unknown or unsupported protocol */
 	uint64_t ips_delivered;		/* datagrams delivered to upper level*/
 	uint64_t ips_localout;		/* total ip packets generated here */
 	uint64_t ips_odropped;		/* lost packets due to nobufs, etc. */
 	uint64_t ips_reassembled;	/* total packets reassembled ok */
 	uint64_t ips_fragmented;	/* datagrams successfully fragmented */
 	uint64_t ips_ofragments;	/* output fragments created */
 	uint64_t ips_cantfrag;		/* don't fragment flag was set, etc. */
 	uint64_t ips_badoptions;		/* error in option processing */
 	uint64_t ips_noroute;		/* packets discarded due to no route */
 	uint64_t ips_badvers;		/* ip version != 4 */
 	uint64_t ips_rawout;		/* total raw ip packets generated */
 	uint64_t ips_toolong;		/* ip length > max ip packet size */
 	uint64_t ips_notmember;		/* multicasts for unregistered grps */
 	uint64_t ips_nogif;		/* no match gif found */
 	uint64_t ips_badaddr;		/* invalid address on header */
 };
 
 #ifdef _KERNEL
 
 #include <sys/counter.h>
 #include <net/vnet.h>
 
 VNET_PCPUSTAT_DECLARE(struct ipstat, ipstat);
 /*
  * In-kernel consumers can use these accessor macros directly to update
  * stats.
  */
 #define	IPSTAT_ADD(name, val)	\
     VNET_PCPUSTAT_ADD(struct ipstat, ipstat, name, (val))
 #define	IPSTAT_SUB(name, val)	IPSTAT_ADD(name, -(val))
 #define	IPSTAT_INC(name)	IPSTAT_ADD(name, 1)
 #define	IPSTAT_DEC(name)	IPSTAT_SUB(name, 1)
 
 /*
  * Kernel module consumers must use this accessor macro.
  */
 void	kmod_ipstat_inc(int statnum);
 #define	KMOD_IPSTAT_INC(name)	\
     kmod_ipstat_inc(offsetof(struct ipstat, name) / sizeof(uint64_t))
 void	kmod_ipstat_dec(int statnum);
 #define	KMOD_IPSTAT_DEC(name)	\
     kmod_ipstat_dec(offsetof(struct ipstat, name) / sizeof(uint64_t))
 
 /* flags passed to ip_output as last parameter */
 #define	IP_FORWARDING		0x1		/* most of ip header exists */
 #define	IP_RAWOUTPUT		0x2		/* raw ip header exists */
 #define	IP_SENDONES		0x4		/* send all-ones broadcast */
 #define	IP_SENDTOIF		0x8		/* send on specific ifnet */
 #define IP_ROUTETOIF		SO_DONTROUTE	/* 0x10 bypass routing tables */
 #define IP_ALLOWBROADCAST	SO_BROADCAST	/* 0x20 can send broadcast packets */
 #define	IP_NODEFAULTFLOWID	0x40		/* Don't set the flowid from inp */
 
 #ifdef __NO_STRICT_ALIGNMENT
 #define IP_HDR_ALIGNED_P(ip)	1
 #else
 #define IP_HDR_ALIGNED_P(ip)	((((intptr_t) (ip)) & 3) == 0)
 #endif
 
 struct ip;
 struct inpcb;
 struct route;
 struct sockopt;
 
 VNET_DECLARE(int, ip_defttl);			/* default IP ttl */
 VNET_DECLARE(int, ipforwarding);		/* ip forwarding */
 #ifdef IPSTEALTH
 VNET_DECLARE(int, ipstealth);			/* stealth forwarding */
 #endif
 extern u_char	ip_protox[];
 VNET_DECLARE(struct socket *, ip_rsvpd);	/* reservation protocol daemon*/
 VNET_DECLARE(struct socket *, ip_mrouter);	/* multicast routing daemon */
 extern int	(*legal_vif_num)(int);
 extern u_long	(*ip_mcast_src)(int);
 VNET_DECLARE(int, rsvp_on);
 VNET_DECLARE(int, drop_redirect);
 extern struct	pr_usrreqs rip_usrreqs;
 
 #define	V_ip_id			VNET(ip_id)
 #define	V_ip_defttl		VNET(ip_defttl)
 #define	V_ipforwarding		VNET(ipforwarding)
 #ifdef IPSTEALTH
 #define	V_ipstealth		VNET(ipstealth)
 #endif
 #define	V_ip_rsvpd		VNET(ip_rsvpd)
 #define	V_ip_mrouter		VNET(ip_mrouter)
 #define	V_rsvp_on		VNET(rsvp_on)
 #define	V_drop_redirect		VNET(drop_redirect)
 
 void	inp_freemoptions(struct ip_moptions *);
 int	inp_getmoptions(struct inpcb *, struct sockopt *);
 int	inp_setmoptions(struct inpcb *, struct sockopt *);
 
 int	ip_ctloutput(struct socket *, struct sockopt *sopt);
 void	ip_drain(void);
 int	ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
 	    u_long if_hwassist_flags);
 void	ip_forward(struct mbuf *m, int srcrt);
 void	ip_init(void);
 #ifdef VIMAGE
 void	ip_destroy(void);
 #endif
 extern int
 	(*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
 	    struct ip_moptions *);
 int	ip_output(struct mbuf *,
 	    struct mbuf *, struct route *, int, struct ip_moptions *,
 	    struct inpcb *);
 int	ipproto_register(short);
 int	ipproto_unregister(short);
 struct mbuf *
 	ip_reass(struct mbuf *);
-struct in_ifaddr *
-	ip_rtaddr(struct in_addr, u_int fibnum);
 void	ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *,
 	    struct mbuf *);
 void	ip_slowtimo(void);
 void	ip_fillid(struct ip *);
 int	rip_ctloutput(struct socket *, struct sockopt *);
 void	rip_ctlinput(int, struct sockaddr *, void *);
 void	rip_init(void);
 #ifdef VIMAGE
 void	rip_destroy(void);
 #endif
 int	rip_input(struct mbuf **, int *, int);
 int	rip_output(struct mbuf *, struct socket *, ...);
 int	ipip_input(struct mbuf **, int *, int);
 int	rsvp_input(struct mbuf **, int *, int);
 int	ip_rsvp_init(struct socket *);
 int	ip_rsvp_done(void);
 extern int	(*ip_rsvp_vif)(struct socket *, struct sockopt *);
 extern void	(*ip_rsvp_force_done)(struct socket *);
 extern int	(*rsvp_input_p)(struct mbuf **, int *, int);
 
 VNET_DECLARE(struct pfil_head, inet_pfil_hook);	/* packet filter hooks */
 #define	V_inet_pfil_hook	VNET(inet_pfil_hook)
 
 void	in_delayed_cksum(struct mbuf *m);
 
 /* Hooks for ipfw, dummynet, divert etc. Most are declared in raw_ip.c */
 /*
  * Reference to an ipfw or packet filter rule that can be carried
  * outside critical sections.
  * A rule is identified by rulenum:rule_id which is ordered.
  * In version chain_id the rule can be found in slot 'slot', so
  * we don't need a lookup if chain_id == chain->id.
  *
  * On exit from the firewall this structure refers to the rule after
  * the matching one (slot points to the new rule; rulenum:rule_id-1
  * is the matching rule), and additional info (e.g. info often contains
  * the insn argument or tablearg in the low 16 bits, in host format).
  * On entry, the structure is valid if slot>0, and refers to the starting
  * rules. 'info' contains the reason for reinject, e.g. divert port,
  * divert direction, and so on.
  */
 struct ipfw_rule_ref {
 	uint32_t	slot;		/* slot for matching rule	*/
 	uint32_t	rulenum;	/* matching rule number		*/
 	uint32_t	rule_id;	/* matching rule id		*/
 	uint32_t	chain_id;	/* ruleset id			*/
 	uint32_t	info;		/* see below			*/
 };
 
 /*
  * Masks and flag bits; presumably the layout of ipfw_rule_ref.info that the
  * "see below" on that member refers to -- confirm against the users.
  * IPFW_INFO_MASK covers the low 16 bits, the remaining constants are
  * single bits or bit groups in the upper half of the word.
  */
 enum {
 	IPFW_INFO_MASK	= 0x0000ffff,
 	IPFW_INFO_OUT	= 0x00000000,	/* outgoing, just for convenience */
 	IPFW_INFO_IN	= 0x80000000,	/* incoming, overloads dir */
 	IPFW_ONEPASS	= 0x40000000,	/* One-pass, do not reinject */
 	IPFW_IS_MASK	= 0x30000000,	/* which source ? */
 	IPFW_IS_DIVERT	= 0x20000000,
 	IPFW_IS_DUMMYNET =0x10000000,
 	IPFW_IS_PIPE	= 0x08000000,	/* pipe=1, queue = 0 */
 };
 #define MTAG_IPFW	1148380143	/* IPFW-tagged cookie */
 #define MTAG_IPFW_RULE	1262273568	/* rule reference */
 #define	MTAG_IPFW_CALL	1308397630	/* call stack */
 
 struct ip_fw_args;
 typedef int	(*ip_fw_chk_ptr_t)(struct ip_fw_args *args);
 typedef int	(*ip_fw_ctl_ptr_t)(struct sockopt *);
 VNET_DECLARE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr);
 #define	V_ip_fw_ctl_ptr		VNET(ip_fw_ctl_ptr)
 
 /* Divert hooks. */
 extern void	(*ip_divert_ptr)(struct mbuf *m, int incoming);
 /* ng_ipfw hooks -- XXX make it the same as divert and dummynet */
 extern int	(*ng_ipfw_input_p)(struct mbuf **, int,
 			struct ip_fw_args *, int);
 
 extern int	(*ip_dn_ctl_ptr)(struct sockopt *);
 extern int	(*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *);
 #endif /* _KERNEL */
 
 #endif /* !_NETINET_IP_VAR_H_ */
Index: head/sys/netinet6/icmp6.c
===================================================================
--- head/sys/netinet6/icmp6.c	(revision 292014)
+++ head/sys/netinet6/icmp6.c	(revision 292015)
@@ -1,2882 +1,2873 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: icmp6.c,v 1.211 2001/04/04 05:56:20 itojun Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_icmp.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #define	MBUF_PRIVATE	/* XXXRW: Optimisation tries to avoid M_EXT mbufs */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/domain.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 #include <sys/time.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_llatbl.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet/tcp_var.h>
 
+#include <netinet6/in6_fib.h>
 #include <netinet6/in6_ifattach.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6protosw.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/mld6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/send.h>
 
 extern struct domain inet6domain;
 
 /* ICMPv6 statistics storage and its init (and, under VIMAGE, uninit) hooks. */
 VNET_PCPUSTAT_DEFINE(struct icmp6stat, icmp6stat);
 VNET_PCPUSTAT_SYSINIT(icmp6stat);
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(icmp6stat);
 #endif /* VIMAGE */
 
 /*
  * VNET variables: those introduced with VNET_DECLARE are defined elsewhere;
  * the two static VNET_DEFINE entries are private to this file.
  */
 VNET_DECLARE(struct inpcbinfo, ripcbinfo);
 VNET_DECLARE(struct inpcbhead, ripcb);
 VNET_DECLARE(int, icmp6errppslim);
 static VNET_DEFINE(int, icmp6errpps_count) = 0;
 static VNET_DEFINE(struct timeval, icmp6errppslim_last);
 VNET_DECLARE(int, icmp6_nodeinfo);
 
 /* Shorthand accessors for the VNET variables above. */
 #define	V_ripcbinfo			VNET(ripcbinfo)
 #define	V_ripcb				VNET(ripcb)
 #define	V_icmp6errppslim		VNET(icmp6errppslim)
 #define	V_icmp6errpps_count		VNET(icmp6errpps_count)
 #define	V_icmp6errppslim_last		VNET(icmp6errppslim_last)
 #define	V_icmp6_nodeinfo		VNET(icmp6_nodeinfo)
 
 /* Prototypes of the file-local (static) helpers. */
 static void icmp6_errcount(int, int);
 static int icmp6_rip6_input(struct mbuf **, int);
 static int icmp6_ratelimit(const struct in6_addr *, const int, const int);
 static const char *icmp6_redirect_diag(struct in6_addr *,
 	struct in6_addr *, struct in6_addr *);
 static struct mbuf *ni6_input(struct mbuf *, int);
 static struct mbuf *ni6_nametodns(const char *, int, int);
 static int ni6_dnsmatch(const char *, int, const char *, int);
 static int ni6_addrs(struct icmp6_nodeinfo *, struct mbuf *,
 			  struct ifnet **, struct in6_addr *);
 static int ni6_store_addrs(struct icmp6_nodeinfo *, struct icmp6_nodeinfo *,
 				struct ifnet *, int);
 static int icmp6_notify_error(struct mbuf **, int, int, int);
 
 /*
  * Kernel module interface for updating icmp6stat.  The argument is an index
  * into icmp6stat treated as an array of u_quad_t.  While this encodes the
  * general layout of icmp6stat into the caller, it doesn't encode its
  * location, so that future changes to add, for example, per-CPU stats
  * support won't cause binary compatibility problems for kernel modules.
  *
  * The selected counter is incremented by one.  statnum is used as the array
  * index without any range check, so the caller must pass a valid index.
  */
 void
 kmod_icmp6stat_inc(int statnum)
 {
 
 	counter_u64_add(VNET(icmp6stat)[statnum], 1);
 }
 
 /*
  * Bump the per-type/per-code statistics counter for an ICMPv6 error that is
  * about to be generated.  Exactly one counter is incremented per call; any
  * (type, code) pair without a dedicated counter is accounted under
  * icp6s_ounknown.
  */
 static void
 icmp6_errcount(int type, int code)
 {
 	switch (type) {
 	case ICMP6_PACKET_TOO_BIG:
 		ICMP6STAT_INC(icp6s_opacket_too_big);
 		break;
 
 	case ND_REDIRECT:
 		ICMP6STAT_INC(icp6s_oredirect);
 		break;
 
 	case ICMP6_DST_UNREACH:
 		switch (code) {
 		case ICMP6_DST_UNREACH_NOROUTE:
 			ICMP6STAT_INC(icp6s_odst_unreach_noroute);
 			break;
 		case ICMP6_DST_UNREACH_ADMIN:
 			ICMP6STAT_INC(icp6s_odst_unreach_admin);
 			break;
 		case ICMP6_DST_UNREACH_BEYONDSCOPE:
 			ICMP6STAT_INC(icp6s_odst_unreach_beyondscope);
 			break;
 		case ICMP6_DST_UNREACH_ADDR:
 			ICMP6STAT_INC(icp6s_odst_unreach_addr);
 			break;
 		case ICMP6_DST_UNREACH_NOPORT:
 			ICMP6STAT_INC(icp6s_odst_unreach_noport);
 			break;
 		default:
 			/* known type, code without its own counter */
 			ICMP6STAT_INC(icp6s_ounknown);
 			break;
 		}
 		break;
 
 	case ICMP6_TIME_EXCEEDED:
 		switch (code) {
 		case ICMP6_TIME_EXCEED_TRANSIT:
 			ICMP6STAT_INC(icp6s_otime_exceed_transit);
 			break;
 		case ICMP6_TIME_EXCEED_REASSEMBLY:
 			ICMP6STAT_INC(icp6s_otime_exceed_reassembly);
 			break;
 		default:
 			ICMP6STAT_INC(icp6s_ounknown);
 			break;
 		}
 		break;
 
 	case ICMP6_PARAM_PROB:
 		switch (code) {
 		case ICMP6_PARAMPROB_HEADER:
 			ICMP6STAT_INC(icp6s_oparamprob_header);
 			break;
 		case ICMP6_PARAMPROB_NEXTHEADER:
 			ICMP6STAT_INC(icp6s_oparamprob_nextheader);
 			break;
 		case ICMP6_PARAMPROB_OPTION:
 			ICMP6STAT_INC(icp6s_oparamprob_option);
 			break;
 		default:
 			ICMP6STAT_INC(icp6s_ounknown);
 			break;
 		}
 		break;
 
 	default:
 		/* type without any dedicated counter */
 		ICMP6STAT_INC(icp6s_ounknown);
 		break;
 	}
 }
 
 /*
  * Variant of icmp6_error() for callers whose offending packet may lack
  * scope zone information: the zone of ifp is embedded into the source and
  * destination addresses of the packet's IPv6 header before the error is
  * generated.  Nothing is sent when ifp is NULL, when the IPv6 header cannot
  * be made available, or when either address cannot be scoped to ifp.
  *
  * NOTE(review): the early returns taken directly in this function (ifp ==
  * NULL, in6_setscope() failure) do not free m, whereas icmp6_error() frees
  * the mbuf on its own failure paths -- confirm what callers expect.
  */
 void
 icmp6_error2(struct mbuf *m, int type, int code, int param,
     struct ifnet *ifp)
 {
 	struct ip6_hdr *hdr;
 
 	if (ifp == NULL)
 		return;
 
 	/* Make sure the fixed IPv6 header can be accessed through mtod(). */
 #ifdef PULLDOWN_TEST
 	if (m->m_len < sizeof(struct ip6_hdr) &&
 	    (m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL)
 		return;
 #else
 	IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), );
 #endif
 
 	hdr = mtod(m, struct ip6_hdr *);
 
 	/* Source first, then destination; give up on the first failure. */
 	if (in6_setscope(&hdr->ip6_src, ifp, NULL) == 0 &&
 	    in6_setscope(&hdr->ip6_dst, ifp, NULL) == 0)
 		icmp6_error(m, type, code, param);
 }
 
 /*
  * Generate an ICMPv6 error message of the given type and code in response
  * to the offending IPv6 packet m.
  *
  *	m	packet that triggered the error.  On success it is reused (new
  *		IPv6 and ICMPv6 headers are prepended) and handed to
  *		icmp6_reflect(); on the "freeit" paths it is released with
  *		m_freem().  The bare returns that follow IP6_EXTHDR_CHECK,
  *		m_pullup, IP6_EXTHDR_GET and M_PREPEND presumably rely on
  *		those helpers having disposed of the mbuf -- TODO confirm.
  *	type	ICMPv6 type to send; also indexes the icp6s_outhist counter.
  *	code	ICMPv6 code to send.
  *	param	stored, in network byte order, in the icmp6_pptr field.
  *
  * The packet is dropped without generating an error when it was received
  * as link-layer broadcast/multicast or addressed to a multicast destination
  * (Packet Too Big and Parameter Problem/option excepted), when its source
  * is unspecified or multicast, when it is itself an ICMPv6 error or
  * redirect, or when icmp6_ratelimit() refuses.
  */
 void
 icmp6_error(struct mbuf *m, int type, int code, int param)
 {
 	struct ip6_hdr *oip6, *nip6;
 	struct icmp6_hdr *icmp6;
 	u_int preplen;
 	int off;
 	int nxt;
 
 	/* Counted for every call, including those that end up sending nothing. */
 	ICMP6STAT_INC(icp6s_error);
 
 	/* count per-type-code statistics */
 	icmp6_errcount(type, code);
 
 #ifdef M_DECRYPTED	/*not openbsd*/
 	if (m->m_flags & M_DECRYPTED) {
 		ICMP6STAT_INC(icp6s_canterror);
 		goto freeit;
 	}
 #endif
 
 #ifndef PULLDOWN_TEST
 	IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), );
 #else
 	if (m->m_len < sizeof(struct ip6_hdr)) {
 		m = m_pullup(m, sizeof(struct ip6_hdr));
 		if (m == NULL)
 			return;
 	}
 #endif
 	oip6 = mtod(m, struct ip6_hdr *);
 
 	/*
 	 * If the destination address of the erroneous packet is a multicast
 	 * address, or the packet was sent using link-layer multicast,
 	 * we should basically suppress sending an error (RFC 2463, Section
 	 * 2.4).
 	 * We have two exceptions (the item e.2 in that section):
 	 * - the Packet Too Big message can be sent for path MTU discovery.
 	 * - the Parameter Problem message with the "unrecognized option"
 	 *   code.  Whether the option type permits an error has already
 	 *   been checked in ip6_unknown_opt(), so we can just check the
 	 *   type and code.
 	 */
 	if ((m->m_flags & (M_BCAST|M_MCAST) ||
 	     IN6_IS_ADDR_MULTICAST(&oip6->ip6_dst)) &&
 	    (type != ICMP6_PACKET_TOO_BIG &&
 	     (type != ICMP6_PARAM_PROB ||
 	      code != ICMP6_PARAMPROB_OPTION)))
 		goto freeit;
 
 	/*
 	 * RFC 2463, 2.4 (e.5): source address check.
 	 * XXX: the case of anycast source?
 	 */
 	if (IN6_IS_ADDR_UNSPECIFIED(&oip6->ip6_src) ||
 	    IN6_IS_ADDR_MULTICAST(&oip6->ip6_src))
 		goto freeit;
 
 	/*
 	 * If we are about to send ICMPv6 against ICMPv6 error/redirect,
 	 * don't do it.
 	 */
 	nxt = -1;
 	off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt);
 	if (off >= 0 && nxt == IPPROTO_ICMPV6) {
 		struct icmp6_hdr *icp;
 
 #ifndef PULLDOWN_TEST
 		IP6_EXTHDR_CHECK(m, 0, off + sizeof(struct icmp6_hdr), );
 		icp = (struct icmp6_hdr *)(mtod(m, caddr_t) + off);
 #else
 		IP6_EXTHDR_GET(icp, struct icmp6_hdr *, m, off,
 			sizeof(*icp));
 		if (icp == NULL) {
 			ICMP6STAT_INC(icp6s_tooshort);
 			return;
 		}
 #endif
 		/* Types below ICMP6_ECHO_REQUEST are treated as errors here. */
 		if (icp->icmp6_type < ICMP6_ECHO_REQUEST ||
 		    icp->icmp6_type == ND_REDIRECT) {
 			/*
 			 * ICMPv6 error
 			 * Special case: for redirect (which is
 			 * informational) we must not send icmp6 error.
 			 */
 			ICMP6STAT_INC(icp6s_canterror);
 			goto freeit;
 		} else {
 			/* ICMPv6 informational - send the error */
 		}
 	} else {
 		/* non-ICMPv6 - send the error */
 	}
 
 	oip6 = mtod(m, struct ip6_hdr *); /* adjust pointer */
 
 	/* Finally, do rate limitation check. */
 	if (icmp6_ratelimit(&oip6->ip6_src, type, code)) {
 		ICMP6STAT_INC(icp6s_toofreq);
 		goto freeit;
 	}
 
 	/*
 	 * OK, ICMP6 can be generated.
 	 */
 
 	/*
 	 * Limit the quoted packet to ICMPV6_PLD_MAXLEN bytes.  The count
 	 * passed to m_adj() is <= 0 here; per mbuf(9) a negative count trims
 	 * from the tail.
 	 */
 	if (m->m_pkthdr.len >= ICMPV6_PLD_MAXLEN)
 		m_adj(m, ICMPV6_PLD_MAXLEN - m->m_pkthdr.len);
 
 	preplen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr);
 	M_PREPEND(m, preplen, M_NOWAIT);	/* FIB is also copied over. */
 	if (m == NULL) {
 		nd6log((LOG_DEBUG, "ENOBUFS in icmp6_error %d\n", __LINE__));
 		return;
 	}
 
 	/*
 	 * Seed the new header with the offender's addresses, unswapped;
 	 * NOTE(review): presumably icmp6_reflect() turns them around.
 	 */
 	nip6 = mtod(m, struct ip6_hdr *);
 	nip6->ip6_src  = oip6->ip6_src;
 	nip6->ip6_dst  = oip6->ip6_dst;
 
 	in6_clearscope(&oip6->ip6_src);
 	in6_clearscope(&oip6->ip6_dst);
 
 	/* The ICMPv6 header sits directly behind the new IPv6 header. */
 	icmp6 = (struct icmp6_hdr *)(nip6 + 1);
 	icmp6->icmp6_type = type;
 	icmp6->icmp6_code = code;
 	icmp6->icmp6_pptr = htonl((u_int32_t)param);
 
 	/*
 	 * icmp6_reflect() is designed to be in the input path.
 	 * icmp6_error() can be called from both input and output path,
 	 * and if we are in output path rcvif could contain bogus value.
 	 * clear m->m_pkthdr.rcvif for safety, we should have enough scope
 	 * information in ip header (nip6).
 	 */
 	m->m_pkthdr.rcvif = NULL;
 
 	ICMP6STAT_INC(icp6s_outhist[type]);
 	icmp6_reflect(m, sizeof(struct ip6_hdr)); /* header order: IPv6 - ICMPv6 */
 
 	return;
 
   freeit:
 	/*
 	 * If we can't tell whether or not we can generate ICMP6, free it.
 	 */
 	m_freem(m);
 }
 
 /*
  * Process a received ICMP6 message.
  */
 int
 icmp6_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m = *mp, *n;
 	struct ifnet *ifp;
 	struct ip6_hdr *ip6, *nip6;
 	struct icmp6_hdr *icmp6, *nicmp6;
 	int off = *offp;
 	int icmp6len = m->m_pkthdr.len - *offp;
 	int code, sum, noff;
 	char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 	int ip6len, error;
 
 	ifp = m->m_pkthdr.rcvif;
 
 #ifndef PULLDOWN_TEST
 	IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr), IPPROTO_DONE);
 	/* m might change if M_LOOP.  So, call mtod after this */
 #endif
 
 	/*
 	 * Locate icmp6 structure in mbuf, and check
 	 * that not corrupted and of at least minimum length
 	 */
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen);
 	if (icmp6len < sizeof(struct icmp6_hdr)) {
 		ICMP6STAT_INC(icp6s_tooshort);
 		goto freeit;
 	}
 
 	/*
 	 * Check multicast group membership.
 	 * Note: SSM filters are not applied for ICMPv6 traffic.
 	 */
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		struct in6_multi	*inm;
 
 		inm = in6m_lookup(ifp, &ip6->ip6_dst);
 		if (inm == NULL) {
 			IP6STAT_INC(ip6s_notmember);
 			in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
 			goto freeit;
 		}
 	}
 
 	/*
 	 * calculate the checksum
 	 */
 #ifndef PULLDOWN_TEST
 	icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off);
 #else
 	IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6));
 	if (icmp6 == NULL) {
 		ICMP6STAT_INC(icp6s_tooshort);
 		return IPPROTO_DONE;
 	}
 #endif
 	code = icmp6->icmp6_code;
 
 	if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) {
 		nd6log((LOG_ERR,
 		    "ICMP6 checksum error(%d|%x) %s\n",
 		    icmp6->icmp6_type, sum,
 		    ip6_sprintf(ip6bufs, &ip6->ip6_src)));
 		ICMP6STAT_INC(icp6s_checksum);
 		goto freeit;
 	}
 
 	ICMP6STAT_INC(icp6s_inhist[icmp6->icmp6_type]);
 	icmp6_ifstat_inc(ifp, ifs6_in_msg);
 	if (icmp6->icmp6_type < ICMP6_INFOMSG_MASK)
 		icmp6_ifstat_inc(ifp, ifs6_in_error);
 
 	switch (icmp6->icmp6_type) {
 	case ICMP6_DST_UNREACH:
 		icmp6_ifstat_inc(ifp, ifs6_in_dstunreach);
 		switch (code) {
 		case ICMP6_DST_UNREACH_NOROUTE:
 			code = PRC_UNREACH_NET;
 			break;
 		case ICMP6_DST_UNREACH_ADMIN:
 			icmp6_ifstat_inc(ifp, ifs6_in_adminprohib);
 			code = PRC_UNREACH_PROTOCOL; /* is this a good code? */
 			break;
 		case ICMP6_DST_UNREACH_ADDR:
 			code = PRC_HOSTDEAD;
 			break;
 		case ICMP6_DST_UNREACH_BEYONDSCOPE:
 			/* I mean "source address was incorrect." */
 			code = PRC_PARAMPROB;
 			break;
 		case ICMP6_DST_UNREACH_NOPORT:
 			code = PRC_UNREACH_PORT;
 			break;
 		default:
 			goto badcode;
 		}
 		goto deliver;
 		break;
 
 	case ICMP6_PACKET_TOO_BIG:
 		icmp6_ifstat_inc(ifp, ifs6_in_pkttoobig);
 
 		/* validation is made in icmp6_mtudisc_update */
 
 		code = PRC_MSGSIZE;
 
 		/*
 		 * Updating the path MTU will be done after examining
 		 * intermediate extension headers.
 		 */
 		goto deliver;
 		break;
 
 	case ICMP6_TIME_EXCEEDED:
 		icmp6_ifstat_inc(ifp, ifs6_in_timeexceed);
 		switch (code) {
 		case ICMP6_TIME_EXCEED_TRANSIT:
 			code = PRC_TIMXCEED_INTRANS;
 			break;
 		case ICMP6_TIME_EXCEED_REASSEMBLY:
 			code = PRC_TIMXCEED_REASS;
 			break;
 		default:
 			goto badcode;
 		}
 		goto deliver;
 		break;
 
 	case ICMP6_PARAM_PROB:
 		icmp6_ifstat_inc(ifp, ifs6_in_paramprob);
 		switch (code) {
 		case ICMP6_PARAMPROB_NEXTHEADER:
 			code = PRC_UNREACH_PROTOCOL;
 			break;
 		case ICMP6_PARAMPROB_HEADER:
 		case ICMP6_PARAMPROB_OPTION:
 			code = PRC_PARAMPROB;
 			break;
 		default:
 			goto badcode;
 		}
 		goto deliver;
 		break;
 
 	case ICMP6_ECHO_REQUEST:
 		icmp6_ifstat_inc(ifp, ifs6_in_echo);
 		if (code != 0)
 			goto badcode;
 		if ((n = m_copy(m, 0, M_COPYALL)) == NULL) {
 			/* Give up remote */
 			break;
 		}
 		if (!M_WRITABLE(n)
 		 || n->m_len < off + sizeof(struct icmp6_hdr)) {
 			struct mbuf *n0 = n;
 			int n0len;
 
 			CTASSERT(sizeof(*nip6) + sizeof(*nicmp6) <= MHLEN);
 			n = m_gethdr(M_NOWAIT, n0->m_type);
 			if (n == NULL) {
 				/* Give up remote */
 				m_freem(n0);
 				break;
 			}
 
 			m_move_pkthdr(n, n0);	/* FIB copied. */
 			n0len = n0->m_pkthdr.len;	/* save for use below */
 			/*
 			 * Copy IPv6 and ICMPv6 only.
 			 */
 			nip6 = mtod(n, struct ip6_hdr *);
 			bcopy(ip6, nip6, sizeof(struct ip6_hdr));
 			nicmp6 = (struct icmp6_hdr *)(nip6 + 1);
 			bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr));
 			noff = sizeof(struct ip6_hdr);
 			/* new mbuf contains only ipv6+icmpv6 headers */
 			n->m_len = noff + sizeof(struct icmp6_hdr);
 			/*
 			 * Adjust mbuf.  ip6_plen will be adjusted in
 			 * ip6_output().
 			 */
 			m_adj(n0, off + sizeof(struct icmp6_hdr));
 			/* recalculate complete packet size */
 			n->m_pkthdr.len = n0len + (noff - off);
 			n->m_next = n0;
 		} else {
 			nip6 = mtod(n, struct ip6_hdr *);
 			IP6_EXTHDR_GET(nicmp6, struct icmp6_hdr *, n, off,
 			    sizeof(*nicmp6));
 			noff = off;
 		}
 		nicmp6->icmp6_type = ICMP6_ECHO_REPLY;
 		nicmp6->icmp6_code = 0;
 		if (n) {
 			ICMP6STAT_INC(icp6s_reflect);
 			ICMP6STAT_INC(icp6s_outhist[ICMP6_ECHO_REPLY]);
 			icmp6_reflect(n, noff);
 		}
 		break;
 
 	case ICMP6_ECHO_REPLY:
 		icmp6_ifstat_inc(ifp, ifs6_in_echoreply);
 		if (code != 0)
 			goto badcode;
 		break;
 
 	case MLD_LISTENER_QUERY:
 	case MLD_LISTENER_REPORT:
 	case MLD_LISTENER_DONE:
 	case MLDV2_LISTENER_REPORT:
 		/*
 		 * Drop MLD traffic which is not link-local, has a hop limit
 		 * of greater than 1 hop, or which does not have the
 		 * IPv6 HBH Router Alert option.
 		 * As IPv6 HBH options are stripped in ip6_input() we must
 		 * check an mbuf header flag.
 		 * XXX Should we also sanity check that these messages
 		 * were directed to a link-local multicast prefix?
 		 */
 		if ((ip6->ip6_hlim != 1) || (m->m_flags & M_RTALERT_MLD) == 0)
 			goto freeit;
 		if (mld_input(m, off, icmp6len) != 0)
 			return (IPPROTO_DONE);
 		/* m stays. */
 		break;
 
 	case ICMP6_WRUREQUEST:	/* ICMP6_FQDN_QUERY */
 	    {
 		enum { WRU, FQDN } mode;
 
 		if (!V_icmp6_nodeinfo)
 			break;
 
 		if (icmp6len == sizeof(struct icmp6_hdr) + 4)
 			mode = WRU;
 		else if (icmp6len >= sizeof(struct icmp6_nodeinfo))
 			mode = FQDN;
 		else
 			goto badlen;
 
 		if (mode == FQDN) {
 #ifndef PULLDOWN_TEST
 			IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_nodeinfo),
 			    IPPROTO_DONE);
 #endif
 			n = m_copy(m, 0, M_COPYALL);
 			if (n)
 				n = ni6_input(n, off);
 			/* XXX meaningless if n == NULL */
 			noff = sizeof(struct ip6_hdr);
 		} else {
 			struct prison *pr;
 			u_char *p;
 			int maxhlen, hlen;
 
 			/*
 			 * XXX: this combination of flags is pointless,
 			 * but should we keep this for compatibility?
 			 */
 			if ((V_icmp6_nodeinfo & 5) != 5)
 				break;
 
 			if (code != 0)
 				goto badcode;
 
 			CTASSERT(sizeof(*nip6) + sizeof(*nicmp6) + 4 <= MHLEN);
 			n = m_gethdr(M_NOWAIT, m->m_type);
 			if (n == NULL) {
 				/* Give up remote */
 				break;
 			}
 			if (!m_dup_pkthdr(n, m, M_NOWAIT)) {
 				/*
 				 * Previous code did a blind M_COPY_PKTHDR
 				 * and said "just for rcvif".  If true, then
 				 * we could tolerate the dup failing (due to
 				 * the deep copy of the tag chain).  For now
 				 * be conservative and just fail.
 				 */
 				m_free(n);
 				n = NULL;
 			}
 			maxhlen = M_TRAILINGSPACE(n) -
 			    (sizeof(*nip6) + sizeof(*nicmp6) + 4);
 			pr = curthread->td_ucred->cr_prison;
 			mtx_lock(&pr->pr_mtx);
 			hlen = strlen(pr->pr_hostname);
 			if (maxhlen > hlen)
 				maxhlen = hlen;
 			/*
 			 * Copy IPv6 and ICMPv6 only.
 			 */
 			nip6 = mtod(n, struct ip6_hdr *);
 			bcopy(ip6, nip6, sizeof(struct ip6_hdr));
 			nicmp6 = (struct icmp6_hdr *)(nip6 + 1);
 			bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr));
 			p = (u_char *)(nicmp6 + 1);
 			bzero(p, 4);
 			/* meaningless TTL */
 			bcopy(pr->pr_hostname, p + 4, maxhlen);
 			mtx_unlock(&pr->pr_mtx);
 			noff = sizeof(struct ip6_hdr);
 			n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) +
 				sizeof(struct icmp6_hdr) + 4 + maxhlen;
 			nicmp6->icmp6_type = ICMP6_WRUREPLY;
 			nicmp6->icmp6_code = 0;
 		}
 		if (n) {
 			ICMP6STAT_INC(icp6s_reflect);
 			ICMP6STAT_INC(icp6s_outhist[ICMP6_WRUREPLY]);
 			icmp6_reflect(n, noff);
 		}
 		break;
 	    }
 
 	case ICMP6_WRUREPLY:
 		if (code != 0)
 			goto badcode;
 		break;
 
 	case ND_ROUTER_SOLICIT:
 		icmp6_ifstat_inc(ifp, ifs6_in_routersolicit);
 		if (code != 0)
 			goto badcode;
 		if (icmp6len < sizeof(struct nd_router_solicit))
 			goto badlen;
 		if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) {
 			/* give up local */
 
 			/* Send incoming SeND packet to user space. */
 			if (send_sendso_input_hook != NULL) {
 				IP6_EXTHDR_CHECK(m, off,
 				    icmp6len, IPPROTO_DONE);
 				error = send_sendso_input_hook(m, ifp,
 				    SND_IN, ip6len);
 				/* -1 == no app on SEND socket */
 				if (error == 0)
 					return (IPPROTO_DONE);
 				nd6_rs_input(m, off, icmp6len);
 			} else
 				nd6_rs_input(m, off, icmp6len);
 			m = NULL;
 			goto freeit;
 		}
 		if (send_sendso_input_hook != NULL) {
 			IP6_EXTHDR_CHECK(n, off,
 			    icmp6len, IPPROTO_DONE);
                         error = send_sendso_input_hook(n, ifp,
 			    SND_IN, ip6len);
 			if (error == 0)
 				goto freeit;
 			/* -1 == no app on SEND socket */
 			nd6_rs_input(n, off, icmp6len);
 		} else
 			nd6_rs_input(n, off, icmp6len);
 		/* m stays. */
 		break;
 
 	case ND_ROUTER_ADVERT:
 		icmp6_ifstat_inc(ifp, ifs6_in_routeradvert);
 		if (code != 0)
 			goto badcode;
 		if (icmp6len < sizeof(struct nd_router_advert))
 			goto badlen;
 		if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) {
 
 			/* Send incoming SeND-protected/ND packet to user space. */
 			if (send_sendso_input_hook != NULL) {
 				error = send_sendso_input_hook(m, ifp,
 				    SND_IN, ip6len);
 				if (error == 0)
 					return (IPPROTO_DONE);
 				nd6_ra_input(m, off, icmp6len);
 			} else
 				nd6_ra_input(m, off, icmp6len);
 			m = NULL;
 			goto freeit;
 		}
 		if (send_sendso_input_hook != NULL) {
 			error = send_sendso_input_hook(n, ifp,
 			    SND_IN, ip6len);
 			if (error == 0)
 				goto freeit;
 			nd6_ra_input(n, off, icmp6len);
 		} else
 			nd6_ra_input(n, off, icmp6len);
 		/* m stays. */
 		break;
 
 	case ND_NEIGHBOR_SOLICIT:
 		icmp6_ifstat_inc(ifp, ifs6_in_neighborsolicit);
 		if (code != 0)
 			goto badcode;
 		if (icmp6len < sizeof(struct nd_neighbor_solicit))
 			goto badlen;
 		if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) {
 			if (send_sendso_input_hook != NULL) {
 				error = send_sendso_input_hook(m, ifp,
 				    SND_IN, ip6len);
 				if (error == 0)
 					return (IPPROTO_DONE);
 				nd6_ns_input(m, off, icmp6len);
 			} else
 				nd6_ns_input(m, off, icmp6len);
 			m = NULL;
 			goto freeit;
 		}
 		if (send_sendso_input_hook != NULL) {
 			error = send_sendso_input_hook(n, ifp,
 			    SND_IN, ip6len);
 			if (error == 0)
 				goto freeit;
 			nd6_ns_input(n, off, icmp6len);
 		} else
 			nd6_ns_input(n, off, icmp6len);
 		/* m stays. */
 		break;
 
 	case ND_NEIGHBOR_ADVERT:
 		icmp6_ifstat_inc(ifp, ifs6_in_neighboradvert);
 		if (code != 0)
 			goto badcode;
 		if (icmp6len < sizeof(struct nd_neighbor_advert))
 			goto badlen;
 		if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) {
 
 			/* Send incoming SeND-protected/ND packet to user space. */
 			if (send_sendso_input_hook != NULL) {
 				error = send_sendso_input_hook(m, ifp,
 				    SND_IN, ip6len);
 				if (error == 0)
 					return (IPPROTO_DONE);
 				nd6_na_input(m, off, icmp6len);
 			} else
 				nd6_na_input(m, off, icmp6len);
 			m = NULL;
 			goto freeit;
 		}
 		if (send_sendso_input_hook != NULL) {
 			error = send_sendso_input_hook(n, ifp,
 			    SND_IN, ip6len);
 			if (error == 0)
 				goto freeit;
 			nd6_na_input(n, off, icmp6len);
 		} else
 			nd6_na_input(n, off, icmp6len);
 		/* m stays. */
 		break;
 
 	case ND_REDIRECT:
 		icmp6_ifstat_inc(ifp, ifs6_in_redirect);
 		if (code != 0)
 			goto badcode;
 		if (icmp6len < sizeof(struct nd_redirect))
 			goto badlen;
 		if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) {
 			if (send_sendso_input_hook != NULL) {
 				error = send_sendso_input_hook(m, ifp,
 				    SND_IN, ip6len);
 		 		if (error == 0)
 					return (IPPROTO_DONE);
 			    icmp6_redirect_input(m, off);
 			} else
 				icmp6_redirect_input(m, off);
 			m = NULL;
 			goto freeit;
 		}
 		if (send_sendso_input_hook != NULL) {
 			error = send_sendso_input_hook(n, ifp,
 			    SND_IN, ip6len);
 			if (error == 0)
 				goto freeit;
 			icmp6_redirect_input(n, off);
 		} else
 			icmp6_redirect_input(n, off);
 		/* m stays. */
 		break;
 
 	case ICMP6_ROUTER_RENUMBERING:
 		if (code != ICMP6_ROUTER_RENUMBERING_COMMAND &&
 		    code != ICMP6_ROUTER_RENUMBERING_RESULT)
 			goto badcode;
 		if (icmp6len < sizeof(struct icmp6_router_renum))
 			goto badlen;
 		break;
 
 	default:
 		nd6log((LOG_DEBUG,
 		    "icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%d)\n",
 		    icmp6->icmp6_type, ip6_sprintf(ip6bufs, &ip6->ip6_src),
 		    ip6_sprintf(ip6bufd, &ip6->ip6_dst),
 		    ifp ? ifp->if_index : 0));
 		if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST) {
 			/* ICMPv6 error: MUST deliver it by spec... */
 			code = PRC_NCMDS;
 			/* deliver */
 		} else {
 			/* ICMPv6 informational: MUST not deliver */
 			break;
 		}
 	deliver:
 		if (icmp6_notify_error(&m, off, icmp6len, code) != 0) {
 			/* In this case, m should've been freed. */
 			return (IPPROTO_DONE);
 		}
 		break;
 
 	badcode:
 		ICMP6STAT_INC(icp6s_badcode);
 		break;
 
 	badlen:
 		ICMP6STAT_INC(icp6s_badlen);
 		break;
 	}
 
 	/* deliver the packet to appropriate sockets */
 	icmp6_rip6_input(&m, *offp);
 
 	return IPPROTO_DONE;
 
  freeit:
 	m_freem(m);
 	return IPPROTO_DONE;
 }
 
 /*
  * Handle a received ICMPv6 error message: look into the packet quoted inside
  * it, work out that packet's upper-layer protocol and final destination,
  * and notify the protocol through its pr_ctlinput handler.  For Packet Too
  * Big messages icmp6_mtudisc_update() is called first.
  *
  *	mp		in/out mbuf pointer; on success *mp is set to the
  *			mbuf in use at the end of processing.
  *	off		offset of the ICMPv6 header within the mbuf.
  *	icmp6len	length of the ICMPv6 message starting at off.
  *	code		passed unchanged to the pr_ctlinput handler (the
  *			caller in this file supplies a PRC_* value).
  *
  * Returns 0 on success and -1 on failure.  On the "freeit" paths the mbuf
  * is freed here; the -1 returns issued via IP6_EXTHDR_CHECK/IP6_EXTHDR_GET
  * presumably rely on those macros having freed it -- TODO confirm.
  */
 static int
 icmp6_notify_error(struct mbuf **mp, int off, int icmp6len, int code)
 {
 	struct mbuf *m = *mp;
 	struct icmp6_hdr *icmp6;
 	struct ip6_hdr *eip6;
 	u_int32_t notifymtu;
 	struct sockaddr_in6 icmp6src, icmp6dst;
 
 	/* The message must at least quote a complete IPv6 header. */
 	if (icmp6len < sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr)) {
 		ICMP6STAT_INC(icp6s_tooshort);
 		goto freeit;
 	}
 #ifndef PULLDOWN_TEST
 	IP6_EXTHDR_CHECK(m, off,
 	    sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr), -1);
 	icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off);
 #else
 	IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off,
 	    sizeof(*icmp6) + sizeof(struct ip6_hdr));
 	if (icmp6 == NULL) {
 		ICMP6STAT_INC(icp6s_tooshort);
 		return (-1);
 	}
 #endif
 	/* The quoted (erroneous) IPv6 header follows the ICMPv6 header. */
 	eip6 = (struct ip6_hdr *)(icmp6 + 1);
 
 	/* Detect the upper level protocol */
 	{
 		void (*ctlfunc)(int, struct sockaddr *, void *);
 		u_int8_t nxt = eip6->ip6_nxt;
 		int eoff = off + sizeof(struct icmp6_hdr) +
 		    sizeof(struct ip6_hdr);
 		struct ip6ctlparam ip6cp;
 		struct in6_addr *finaldst = NULL;
 		int icmp6type = icmp6->icmp6_type;
 		struct ip6_frag *fh;
 		struct ip6_rthdr *rth;
 		struct ip6_rthdr0 *rth0;
 		int rthlen;
 
 		/*
 		 * Walk the extension headers of the quoted packet.  eoff is
 		 * the mbuf offset of the header being examined and nxt its
 		 * protocol number; the loop is left via "goto notify".
 		 */
 		while (1) { /* XXX: should avoid infinite loop explicitly? */
 			struct ip6_ext *eh;
 
 			switch (nxt) {
 			case IPPROTO_HOPOPTS:
 			case IPPROTO_DSTOPTS:
 			case IPPROTO_AH:
 #ifndef PULLDOWN_TEST
 				IP6_EXTHDR_CHECK(m, 0,
 				    eoff + sizeof(struct ip6_ext), -1);
 				eh = (struct ip6_ext *)(mtod(m, caddr_t) + eoff);
 #else
 				IP6_EXTHDR_GET(eh, struct ip6_ext *, m,
 				    eoff, sizeof(*eh));
 				if (eh == NULL) {
 					ICMP6STAT_INC(icp6s_tooshort);
 					return (-1);
 				}
 #endif
 
 				/*
 				 * The length field is scaled differently:
 				 * (len + 2) * 4 bytes for AH, (len + 1) * 8
 				 * bytes for the option headers.
 				 */
 				if (nxt == IPPROTO_AH)
 					eoff += (eh->ip6e_len + 2) << 2;
 				else
 					eoff += (eh->ip6e_len + 1) << 3;
 				nxt = eh->ip6e_nxt;
 				break;
 			case IPPROTO_ROUTING:
 				/*
 				 * When the erroneous packet contains a
 				 * routing header, we should examine the
 				 * header to determine the final destination.
 				 * Otherwise, we can't properly update
 				 * information that depends on the final
 				 * destination (e.g. path MTU).
 				 */
 #ifndef PULLDOWN_TEST
 				IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(*rth), -1);
 				rth = (struct ip6_rthdr *)
 				    (mtod(m, caddr_t) + eoff);
 #else
 				IP6_EXTHDR_GET(rth, struct ip6_rthdr *, m,
 				    eoff, sizeof(*rth));
 				if (rth == NULL) {
 					ICMP6STAT_INC(icp6s_tooshort);
 					return (-1);
 				}
 #endif
 				rthlen = (rth->ip6r_len + 1) << 3;
 				/*
 				 * XXX: currently there is no
 				 * officially defined type other
 				 * than type-0.
 				 * Note that if the segment left field
 				 * is 0, all intermediate hops must
 				 * have been passed.
 				 */
 				if (rth->ip6r_segleft &&
 				    rth->ip6r_type == IPV6_RTHDR_TYPE_0) {
 					int hops;
 
 #ifndef PULLDOWN_TEST
 					IP6_EXTHDR_CHECK(m, 0, eoff + rthlen, -1);
 					rth0 = (struct ip6_rthdr0 *)
 					    (mtod(m, caddr_t) + eoff);
 #else
 					IP6_EXTHDR_GET(rth0,
 					    struct ip6_rthdr0 *, m,
 					    eoff, rthlen);
 					if (rth0 == NULL) {
 						ICMP6STAT_INC(icp6s_tooshort);
 						return (-1);
 					}
 #endif
 					/* just ignore a bogus header */
 					/*
 					 * len/2 addresses follow the fixed
 					 * part; the last one listed is taken
 					 * as the final destination.
 					 */
 					if ((rth0->ip6r0_len % 2) == 0 &&
 					    (hops = rth0->ip6r0_len/2))
 						finaldst = (struct in6_addr *)(rth0 + 1) + (hops - 1);
 				}
 				eoff += rthlen;
 				nxt = rth->ip6r_nxt;
 				break;
 			case IPPROTO_FRAGMENT:
 #ifndef PULLDOWN_TEST
 				IP6_EXTHDR_CHECK(m, 0, eoff +
 				    sizeof(struct ip6_frag), -1);
 				fh = (struct ip6_frag *)(mtod(m, caddr_t) +
 				    eoff);
 #else
 				IP6_EXTHDR_GET(fh, struct ip6_frag *, m,
 				    eoff, sizeof(*fh));
 				if (fh == NULL) {
 					ICMP6STAT_INC(icp6s_tooshort);
 					return (-1);
 				}
 #endif
 				/*
 				 * Data after a fragment header is meaningless
 				 * unless it is the first fragment, but
 				 * we'll go to the notify label for path MTU
 				 * discovery.
 				 */
 				if (fh->ip6f_offlg & IP6F_OFF_MASK)
 					goto notify;
 
 				eoff += sizeof(struct ip6_frag);
 				nxt = fh->ip6f_nxt;
 				break;
 			default:
 				/*
 				 * This case includes ESP and the No Next
 				 * Header.  In such cases going to the notify
 				 * label does not have any meaning
 				 * (i.e. ctlfunc will be NULL), but we go
 				 * anyway since we might have to update
 				 * path MTU information.
 				 */
 				goto notify;
 			}
 		}
 	  notify:
 		/*
 		 * Re-derive icmp6 from the mbuf; presumably the header
 		 * checks above may have changed where the data lives.
 		 */
 #ifndef PULLDOWN_TEST
 		icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off);
 #else
 		IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off,
 		    sizeof(*icmp6) + sizeof(struct ip6_hdr));
 		if (icmp6 == NULL) {
 			ICMP6STAT_INC(icp6s_tooshort);
 			return (-1);
 		}
 #endif
 
 		/*
 		 * retrieve parameters from the inner IPv6 header, and convert
 		 * them into sockaddr structures.
 		 * XXX: there is no guarantee that the source or destination
 		 * addresses of the inner packet are in the same scope as
 		 * the addresses of the icmp packet.  But there is no other
 		 * way to determine the zone.
 		 */
 		eip6 = (struct ip6_hdr *)(icmp6 + 1);
 
 		/* Destination: the routing-header final hop if one was found. */
 		bzero(&icmp6dst, sizeof(icmp6dst));
 		icmp6dst.sin6_len = sizeof(struct sockaddr_in6);
 		icmp6dst.sin6_family = AF_INET6;
 		if (finaldst == NULL)
 			icmp6dst.sin6_addr = eip6->ip6_dst;
 		else
 			icmp6dst.sin6_addr = *finaldst;
 		if (in6_setscope(&icmp6dst.sin6_addr, m->m_pkthdr.rcvif, NULL))
 			goto freeit;
 		bzero(&icmp6src, sizeof(icmp6src));
 		icmp6src.sin6_len = sizeof(struct sockaddr_in6);
 		icmp6src.sin6_family = AF_INET6;
 		icmp6src.sin6_addr = eip6->ip6_src;
 		if (in6_setscope(&icmp6src.sin6_addr, m->m_pkthdr.rcvif, NULL))
 			goto freeit;
 		icmp6src.sin6_flowinfo =
 		    (eip6->ip6_flow & IPV6_FLOWLABEL_MASK);
 
 		/* Bundle everything the protocol handlers may need. */
 		if (finaldst == NULL)
 			finaldst = &eip6->ip6_dst;
 		ip6cp.ip6c_m = m;
 		ip6cp.ip6c_icmp6 = icmp6;
 		ip6cp.ip6c_ip6 = (struct ip6_hdr *)(icmp6 + 1);
 		ip6cp.ip6c_off = eoff;
 		ip6cp.ip6c_finaldst = finaldst;
 		ip6cp.ip6c_src = &icmp6src;
 		ip6cp.ip6c_nxt = nxt;
 
 		if (icmp6type == ICMP6_PACKET_TOO_BIG) {
 			notifymtu = ntohl(icmp6->icmp6_mtu);
 			ip6cp.ip6c_cmdarg = (void *)&notifymtu;
 			icmp6_mtudisc_update(&ip6cp, 1);	/*XXX*/
 		}
 
 		/* Look up the control-input handler of protocol nxt, if any. */
 		ctlfunc = (void (*)(int, struct sockaddr *, void *))
 		    (inet6sw[ip6_protox[nxt]].pr_ctlinput);
 		if (ctlfunc) {
 			(void) (*ctlfunc)(code, (struct sockaddr *)&icmp6dst,
 			    &ip6cp);
 		}
 	}
 	*mp = m;
 	return (0);
 
   freeit:
 	m_freem(m);
 	return (-1);
 }
 
 void
 icmp6_mtudisc_update(struct ip6ctlparam *ip6cp, int validated)
 {
 	struct in6_addr *dst = ip6cp->ip6c_finaldst;
 	struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6;
 	struct mbuf *m = ip6cp->ip6c_m;	/* will be necessary for scope issue */
 	u_int mtu = ntohl(icmp6->icmp6_mtu);
 	struct in_conninfo inc;
 
 #if 0
 	/*
 	 * RFC2460 section 5, last paragraph.
 	 * even though minimum link MTU for IPv6 is IPV6_MMTU,
 	 * we may see ICMPv6 too big with mtu < IPV6_MMTU
 	 * due to packet translator in the middle.
 	 * see ip6_output() and ip6_getpmtu() "alwaysfrag" case for
 	 * special handling.
 	 */
 	if (mtu < IPV6_MMTU)
 		return;
 #endif
 
 	/*
 	 * we reject ICMPv6 too big with abnormally small value.
 	 * XXX what is the good definition of "abnormally small"?
 	 */
 	if (mtu < sizeof(struct ip6_hdr) + sizeof(struct ip6_frag) + 8)
 		return;
 
 	if (!validated)
 		return;
 
 	/*
 	 * In case the suggested mtu is less than IPV6_MMTU, we
 	 * only need to remember that it was for above mentioned
 	 * "alwaysfrag" case.
 	 * Try to be as close to the spec as possible.
 	 */
 	if (mtu < IPV6_MMTU)
 		mtu = IPV6_MMTU - 8;
 
 	bzero(&inc, sizeof(inc));
 	inc.inc_fibnum = M_GETFIB(m);
 	inc.inc_flags |= INC_ISIPV6;
 	inc.inc6_faddr = *dst;
 	if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL))
 		return;
 
 	if (mtu < tcp_maxmtu6(&inc, NULL)) {
 		tcp_hc_updatemtu(&inc, mtu);
 		ICMP6STAT_INC(icp6s_pmtuchg);
 	}
 }
 
 /*
  * Process a Node Information Query packet, based on
  * draft-ietf-ipngwg-icmp-name-lookups-07.
  *
  * Spec incompatibilities:
  * - IPv6 Subject address handling
  * - IPv4 Subject address handling support missing
  * - Proxy reply (answer even if it's not for me)
  * - joins NI group address at in6_ifattach() time only, does not cope
  *   with hostname changes by sethostname(3)
  *
  * m   - the received query; the IPv6 header is read from its start.
  * off - offset within m of the Node Information header.
  *
  * Returns a newly allocated mbuf holding the reply (IPv6 header, Node
  * Information header and Qtype-dependent data), or NULL if the query is
  * dropped or a reply could not be built.  The query mbuf m is released on
  * every path through this function.
  */
 static struct mbuf *
 ni6_input(struct mbuf *m, int off)
 {
 	struct icmp6_nodeinfo *ni6, *nni6;
 	struct mbuf *n = NULL;
 	struct prison *pr;
 	u_int16_t qtype;
 	int subjlen;
 	int replylen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo);
 	struct ni_reply_fqdn *fqdn;
 	int addrs;		/* for NI_QTYPE_NODEADDR */
 	struct ifnet *ifp = NULL; /* for NI_QTYPE_NODEADDR */
 	struct in6_addr in6_subj; /* subject address */
 	struct ip6_hdr *ip6;
 	int oldfqdn = 0;	/* if 1, return pascal string (03 draft) */
 	char *subj = NULL;
 	struct in6_ifaddr *ia6 = NULL;
 
 	ip6 = mtod(m, struct ip6_hdr *);
 #ifndef PULLDOWN_TEST
 	ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off);
 #else
 	IP6_EXTHDR_GET(ni6, struct icmp6_nodeinfo *, m, off, sizeof(*ni6));
 	if (ni6 == NULL) {
 		/* m is already reclaimed */
 		return (NULL);
 	}
 #endif
 
 	/*
 	 * Validate IPv6 source address.
 	 * The default configuration MUST be to refuse answering queries from
 	 * global-scope addresses according to RFC4620.
 	 * Notes:
 	 *  - it's not very clear what "refuse" means; this implementation
 	 *    simply drops it.
 	 *  - it's not very easy to identify global-scope (unicast) addresses
 	 *    since there are many prefixes for them.  It should be safer
 	 *    and in practice sufficient to check "all" but loopback and
 	 *    link-local (note that site-local unicast was deprecated and
 	 *    ULA is defined as global scope-wise)
 	 */
 	if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_GLOBALOK) == 0 &&
 	    !IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) &&
 	    !IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src))
 		goto bad;
 
 	/*
 	 * Validate IPv6 destination address.
 	 *
 	 * The Responder must discard the Query without further processing
 	 * unless it is one of the Responder's unicast or anycast addresses, or
 	 * a link-local scope multicast address which the Responder has joined.
 	 * [RFC4620, Section 5.]
 	 */
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		if (!IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst))
 			goto bad;
 		/* else it's a link-local multicast, fine */
 	} else {		/* unicast or anycast */
 		ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */);
 		if (ia6 == NULL)
 			goto bad; /* XXX impossible */
 
 		/*
 		 * Queries to a temporary address are answered only when
 		 * ICMP6_NODEINFO_TMPADDROK is configured.  The reference
 		 * obtained above is dropped on both outcomes.
 		 */
 		if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) &&
 		    !(V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK)) {
 			ifa_free(&ia6->ia_ifa);
 			nd6log((LOG_DEBUG, "ni6_input: ignore node info to "
 				"a temporary address in %s:%d",
 			       __FILE__, __LINE__));
 			goto bad;
 		}
 		ifa_free(&ia6->ia_ifa);
 	}
 
 	/* validate query Subject field. */
 	qtype = ntohs(ni6->ni_qtype);
 	/* Subject length: whatever follows the Node Information header. */
 	subjlen = m->m_pkthdr.len - off - sizeof(struct icmp6_nodeinfo);
 	switch (qtype) {
 	case NI_QTYPE_NOOP:
 	case NI_QTYPE_SUPTYPES:
 		/* 07 draft */
 		if (ni6->ni_code == ICMP6_NI_SUBJ_FQDN && subjlen == 0)
 			break;
 		/* FALLTHROUGH */
 	case NI_QTYPE_FQDN:
 	case NI_QTYPE_NODEADDR:
 	case NI_QTYPE_IPV4ADDR:
 		switch (ni6->ni_code) {
 		case ICMP6_NI_SUBJ_IPV6:
 #if ICMP6_NI_SUBJ_IPV6 != 0
 		case 0:
 #endif
 			/*
 			 * backward compatibility - try to accept 03 draft
 			 * format, where no Subject is present.
 			 */
 			if (qtype == NI_QTYPE_FQDN && ni6->ni_code == 0 &&
 			    subjlen == 0) {
 				oldfqdn++;
 				break;
 			}
 #if ICMP6_NI_SUBJ_IPV6 != 0
 			if (ni6->ni_code != ICMP6_NI_SUBJ_IPV6)
 				goto bad;
 #endif
 
 			if (subjlen != sizeof(struct in6_addr))
 				goto bad;
 
 			/*
 			 * Validate Subject address.
 			 *
 			 * Not sure what exactly "address belongs to the node"
 			 * means in the spec, is it just unicast, or what?
 			 *
 			 * At this moment we consider Subject address as
 			 * "belong to the node" if the Subject address equals
 			 * to the IPv6 destination address; validation for
 			 * IPv6 destination address should have done enough
 			 * check for us.
 			 *
 			 * We do not do proxy at this moment.
 			 */
 			/* m_pulldown instead of copy? */
 			m_copydata(m, off + sizeof(struct icmp6_nodeinfo),
 			    subjlen, (caddr_t)&in6_subj);
 			if (in6_setscope(&in6_subj, m->m_pkthdr.rcvif, NULL))
 				goto bad;
 
 			subj = (char *)&in6_subj;
 			if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &in6_subj))
 				break;
 
 			/*
 			 * XXX if we are to allow other cases, we should really
 			 * be careful about scope here.
 			 * basically, we should disallow queries toward IPv6
 			 * destination X with subject Y,
 			 * if scope(X) > scope(Y).
 			 * if we allow scope(X) > scope(Y), it will result in
 			 * information leakage across scope boundary.
 			 */
 			goto bad;
 
 		case ICMP6_NI_SUBJ_FQDN:
 			/*
 			 * Validate Subject name with gethostname(3).
 			 *
 			 * The behavior may need some debate, since:
 			 * - we are not sure if the node has FQDN as
 			 *   hostname (returned by gethostname(3)).
 			 * - the code does wildcard match for truncated names.
 			 *   however, we are not sure if we want to perform
 			 *   wildcard match, if gethostname(3) side has
 			 *   truncated hostname.
 			 */
 			/*
 			 * n is used here only as scratch space holding the
 			 * DNS-encoded hostname; it is released again before
 			 * leaving this case (or by the "bad" exit).
 			 */
 			pr = curthread->td_ucred->cr_prison;
 			mtx_lock(&pr->pr_mtx);
 			n = ni6_nametodns(pr->pr_hostname,
 			    strlen(pr->pr_hostname), 0);
 			mtx_unlock(&pr->pr_mtx);
 			if (!n || n->m_next || n->m_len == 0)
 				goto bad;
 			IP6_EXTHDR_GET(subj, char *, m,
 			    off + sizeof(struct icmp6_nodeinfo), subjlen);
 			if (subj == NULL)
 				goto bad;
 			if (!ni6_dnsmatch(subj, subjlen, mtod(n, const char *),
 			    n->m_len)) {
 				goto bad;
 			}
 			m_freem(n);
 			n = NULL;
 			break;
 
 		case ICMP6_NI_SUBJ_IPV4:	/* XXX: to be implemented? */
 		default:
 			goto bad;
 		}
 		break;
 	}
 
 	/* refuse based on configuration.  XXX ICMP6_NI_REFUSED? */
 	switch (qtype) {
 	case NI_QTYPE_FQDN:
 		if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_FQDNOK) == 0)
 			goto bad;
 		break;
 	case NI_QTYPE_NODEADDR:
 	case NI_QTYPE_IPV4ADDR:
 		if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_NODEADDROK) == 0)
 			goto bad;
 		break;
 	}
 
 	/* guess reply length */
 	switch (qtype) {
 	case NI_QTYPE_NOOP:
 		break;		/* no reply data */
 	case NI_QTYPE_SUPTYPES:
 		replylen += sizeof(u_int32_t);
 		break;
 	case NI_QTYPE_FQDN:
 		/* XXX will append an mbuf */
 		replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen);
 		break;
 	case NI_QTYPE_NODEADDR:
 		/* Each reported address takes an in6_addr plus a 32-bit TTL. */
 		addrs = ni6_addrs(ni6, m, &ifp, (struct in6_addr *)subj);
 		if ((replylen += addrs * (sizeof(struct in6_addr) +
 		    sizeof(u_int32_t))) > MCLBYTES)
 			replylen = MCLBYTES; /* XXX: will truncate pkt later */
 		break;
 	case NI_QTYPE_IPV4ADDR:
 		/* unsupported - should respond with unknown Qtype? */
 		break;
 	default:
 		/*
 		 * XXX: We must return a reply with the ICMP6 code
 		 * `unknown Qtype' in this case.  However we regard the case
 		 * as an FQDN query for backward compatibility.
 		 * Older versions set a random value to this field,
 		 * so it rarely varies in the defined qtypes.
 		 * But the mechanism is not reliable...
 		 * maybe we should obsolete older versions.
 		 */
 		qtype = NI_QTYPE_FQDN;
 		/* XXX will append an mbuf */
 		replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen);
 		oldfqdn++;
 		break;
 	}
 
 	/* Allocate an mbuf to reply. */
 	if (replylen > MCLBYTES) {
 		/*
 		 * XXX: should we try to allocate more? But MCLBYTES
 		 * is probably much larger than IPV6_MMTU...
 		 */
 		goto bad;
 	}
 	/* Use a cluster only when the reply does not fit a header mbuf. */
 	if (replylen > MHLEN)
 		n = m_getcl(M_NOWAIT, m->m_type, M_PKTHDR);
 	else
 		n = m_gethdr(M_NOWAIT, m->m_type);
 	if (n == NULL) {
 		m_freem(m);
 		return (NULL);
 	}
 	m_move_pkthdr(n, m); /* just for recvif and FIB */
 	n->m_pkthdr.len = n->m_len = replylen;
 
 	/* copy mbuf header and IPv6 + Node Information base headers */
 	bcopy(mtod(m, caddr_t), mtod(n, caddr_t), sizeof(struct ip6_hdr));
 	nni6 = (struct icmp6_nodeinfo *)(mtod(n, struct ip6_hdr *) + 1);
 	bcopy((caddr_t)ni6, (caddr_t)nni6, sizeof(struct icmp6_nodeinfo));
 
 	/* qtype dependent procedure */
 	switch (qtype) {
 	case NI_QTYPE_NOOP:
 		nni6->ni_code = ICMP6_NI_SUCCESS;
 		nni6->ni_flags = 0;
 		break;
 	case NI_QTYPE_SUPTYPES:
 	{
 		u_int32_t v;
 		nni6->ni_code = ICMP6_NI_SUCCESS;
 		nni6->ni_flags = htons(0x0000);	/* raw bitmap */
 		/* supports NOOP, SUPTYPES, FQDN, and NODEADDR */
 		v = (u_int32_t)htonl(0x0000000f);
 		bcopy(&v, nni6 + 1, sizeof(u_int32_t));
 		break;
 	}
 	case NI_QTYPE_FQDN:
 		nni6->ni_code = ICMP6_NI_SUCCESS;
 		fqdn = (struct ni_reply_fqdn *)(mtod(n, caddr_t) +
 		    sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo));
 		nni6->ni_flags = 0; /* XXX: meaningless TTL */
 		fqdn->ni_fqdn_ttl = 0;	/* ditto. */
 		/*
 		 * XXX do we really have FQDN in hostname?
 		 */
 		/* The encoded name is chained behind n as a second mbuf. */
 		pr = curthread->td_ucred->cr_prison;
 		mtx_lock(&pr->pr_mtx);
 		n->m_next = ni6_nametodns(pr->pr_hostname,
 		    strlen(pr->pr_hostname), oldfqdn);
 		mtx_unlock(&pr->pr_mtx);
 		if (n->m_next == NULL)
 			goto bad;
 		/* XXX we assume that n->m_next is not a chain */
 		if (n->m_next->m_next != NULL)
 			goto bad;
 		n->m_pkthdr.len += n->m_next->m_len;
 		break;
 	case NI_QTYPE_NODEADDR:
 	{
 		int lenlim, copied;
 
 		nni6->ni_code = ICMP6_NI_SUCCESS;
 		/*
 		 * Shrink the mbuf to the headers only, so that
 		 * M_TRAILINGSPACE() reports the room left for addresses.
 		 */
 		n->m_pkthdr.len = n->m_len =
 		    sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo);
 		lenlim = M_TRAILINGSPACE(n);
 		copied = ni6_store_addrs(ni6, nni6, ifp, lenlim);
 		/* XXX: reset mbuf length */
 		n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) +
 		    sizeof(struct icmp6_nodeinfo) + copied;
 		break;
 	}
 	default:
 		break;		/* XXX impossible! */
 	}
 
 	nni6->ni_type = ICMP6_NI_REPLY;
 	m_freem(m);
 	return (n);
 
   bad:
 	/*
 	 * Common failure exit: release the query and any scratch or
 	 * partially built reply mbuf.
 	 */
 	m_freem(m);
 	if (n)
 		m_freem(n);
 	return (NULL);
 }
 
 /*
  * Build an mbuf holding `name' (namelen bytes) as a DNS-encoded string,
  * i.e. a sequence of length-prefixed labels.  No compression support.
  *
  * XXX names with less than 2 dots (like "foo" or "foo.section") are
  * treated as truncated names and get two \0 at the end instead of one.
  * this is a wild guess.
  *
  * old - if non-zero, return a pascal string (one length byte followed by
  *       the raw name) instead.
  *
  * Returns the new mbuf, or NULL if allocation fails or the name cannot be
  * encoded (empty or over-long label, result does not fit).  An empty name
  * yields an mbuf with m_len == 0 in the non-"old" format.
  */
 static struct mbuf *
 ni6_nametodns(const char *name, int namelen, int old)
 {
 	struct mbuf *m;
 	char *cp, *ep;
 	const char *end, *p, *q;
 	int dots, lablen, len, nterm;
 
 	len = old ? namelen + 1 : MCLBYTES;
 
 	/* Because MAXHOSTNAMELEN is usually 256, we use cluster mbuf. */
 	if (len > MLEN)
 		m = m_getcl(M_NOWAIT, MT_DATA, 0);
 	else
 		m = m_get(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (NULL);
 
 	if (old) {
 		/* pascal string: length byte, then the name as is */
 		cp = mtod(m, char *);
 		m->m_len = len;
 		cp[0] = namelen;
 		bcopy(name, cp + 1, namelen);
 		return (m);
 	}
 
 	m->m_len = 0;
 	cp = mtod(m, char *);
 	ep = cp + M_TRAILINGSPACE(m);
 
 	/* if not certain about my name, return empty buffer */
 	if (namelen == 0)
 		return (m);
 	end = name + namelen;
 
 	/*
 	 * guess if it looks like shortened hostname, or FQDN.
 	 * shortened hostname needs two trailing "\0".
 	 */
 	dots = 0;
 	for (p = name; p < end; p++) {
 		if (*p == '.')
 			dots++;
 	}
 	nterm = (dots < 2) ? 2 : 1;
 
 	/* emit one length-prefixed label per iteration */
 	for (p = name; cp < ep && p < end; ) {
 		for (q = p; q < end && *q != '\0' && *q != '.'; q++)
 			continue;
 		lablen = q - p;
 
 		/* result does not fit into mbuf */
 		if (cp + lablen + 1 >= ep)
 			goto fail;
 		/*
 		 * DNS label length restriction, RFC1035 page 8.
 		 * The empty label is rejected too, so that "foo..bar"
 		 * never produces a 0-length label.
 		 */
 		if (lablen <= 0 || lablen >= 64)
 			goto fail;
 
 		*cp++ = lablen;
 		bcopy(p, cp, lablen);
 		cp += lablen;
 
 		/* step over the separating dot, if any */
 		p = q;
 		if (p < end && *p == '.')
 			p++;
 	}
 
 	/* termination */
 	if (cp + nterm >= ep)
 		goto fail;
 	while (nterm-- > 0)
 		*cp++ = '\0';
 	m->m_len = cp - mtod(m, char *);
 	return (m);
 
  fail:
 	m_freem(m);
 	return (NULL);
 }
 
 /*
  * Compare two DNS-encoded (length-prefixed label) names.
  *
  * a, alen - first encoded name and its length in bytes
  * b, blen - second encoded name and its length in bytes
  *
  * Returns 1 if the names match, 0 otherwise.  A name in truncated form
  * (ending in \0\0) matches any name it is a label-wise prefix of.
  * No compression support.
  * XXX upper/lowercase match (see RFC2065)
  */
 static int
 ni6_dnsmatch(const char *a, int alen, const char *b, int blen)
 {
 	int aoff, boff, lablen;
 	signed char la, lb;
 
 	/* simplest case - need validation? */
 	if (alen == blen && bcmp(a, b, alen) == 0)
 		return (1);
 
 	/* termination is mandatory */
 	if (alen < 2 || blen < 2)
 		return (0);
 	if (a[alen - 1] != '\0' || b[blen - 1] != '\0')
 		return (0);
 	/* from here on the final terminator is not part of the name */
 	alen--;
 	blen--;
 
 	/* walk both names label by label */
 	aoff = boff = 0;
 	while (aoff < alen && boff < blen) {
 		la = (signed char)a[aoff];
 		lb = (signed char)b[boff];
 
 		/*
 		 * A length octet with the high bit set, or of 64 and above,
 		 * is not a plain label; we don't support compression yet.
 		 */
 		if (la < 0 || lb < 0 || la >= 64 || lb >= 64)
 			return (0);
 
 		/* truncated case */
 		if (la == 0 && aoff == alen - 1)
 			return (1);
 		if (lb == 0 && boff == blen - 1)
 			return (1);
 		if (la == 0 || lb == 0)
 			return (0);
 
 		if (la != lb)
 			return (0);
 		lablen = la;
 		if (aoff + 1 + lablen > alen || boff + 1 + lablen > blen)
 			return (0);
 		if (bcmp(a + aoff + 1, b + boff + 1, lablen) != 0)
 			return (0);
 
 		aoff += 1 + lablen;
 		boff += 1 + lablen;
 	}
 
 	/* both names must have been consumed completely */
 	return (aoff == alen && boff == blen);
 }
 
 /*
  * Calculate the number of addresses to be returned in the node info reply.
  *
  * ni6  - the query header; its flags select the scopes to report and its
  *        code gives the Subject type
  * m    - the query packet (not used here)
  * ifpp - set to the interface that owns the Subject address, when the
  *        query is not for "all" interfaces and such an interface is found
  * subj - Subject address; may be NULL if NI_NODEADDR_FLAG_ALL is set
  *
  * Returns the number of eligible addresses: those of the Subject's
  * interface if one was found, otherwise those of all interfaces.
  */
 static int
 ni6_addrs(struct icmp6_nodeinfo *ni6, struct mbuf *m, struct ifnet **ifpp,
     struct in6_addr *subj)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct in6_ifaddr *ia6;
 	int flags = ni6->ni_flags;
 	int want_all = (flags & NI_NODEADDR_FLAG_ALL) != 0;
 	int total = 0, onthisif, matched = 0, scopeflag;
 
 	/*
 	 * XXX: without the "all" flag we only support an IPv6 Subject
 	 * address for this Qtype (and a missing one must be impossible...).
 	 */
 	if (!want_all &&
 	    (ni6->ni_code != ICMP6_NI_SUBJ_IPV6 || subj == NULL))
 		return (0);
 
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		onthisif = 0;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET6)
 				continue;
 			ia6 = (struct in6_ifaddr *)ifa;
 
 			/* remember whether this interface owns the Subject */
 			if (!want_all &&
 			    IN6_ARE_ADDR_EQUAL(subj, &ia6->ia_addr.sin6_addr))
 				matched = 1;
 
 			/*
 			 * IPv4-mapped addresses can only be returned by a
 			 * Node Information proxy, since they represent
 			 * addresses of IPv4-only nodes, which perforce do
 			 * not implement this protocol.
 			 * [icmp-name-lookups-07, Section 5.4]
 			 * So we don't support NI_NODEADDR_FLAG_COMPAT in
 			 * this function at this moment.
 			 */
 
 			/*
 			 * Map the address scope to the query flag asking for
 			 * it; scopes without a flag are never counted.
 			 * What do we have to do about ::1?
 			 */
 			switch (in6_addrscope(&ia6->ia_addr.sin6_addr)) {
 			case IPV6_ADDR_SCOPE_LINKLOCAL:
 				scopeflag = NI_NODEADDR_FLAG_LINKLOCAL;
 				break;
 			case IPV6_ADDR_SCOPE_SITELOCAL:
 				scopeflag = NI_NODEADDR_FLAG_SITELOCAL;
 				break;
 			case IPV6_ADDR_SCOPE_GLOBAL:
 				scopeflag = NI_NODEADDR_FLAG_GLOBAL;
 				break;
 			default:
 				scopeflag = 0;
 				break;
 			}
 			if ((flags & scopeflag) == 0)
 				continue;
 
 			/*
 			 * Skip anycast addresses unless they were asked for
 			 * (XXX: just experimental.  not in the spec.), and
 			 * temporary addresses unless configuration allows.
 			 */
 			if (((ia6->ia6_flags & IN6_IFF_ANYCAST) != 0 &&
 			    (flags & NI_NODEADDR_FLAG_ANYCAST) == 0) ||
 			    ((ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
 			    (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0))
 				continue;
 
 			onthisif++; /* count the address */
 		}
 		IF_ADDR_RUNLOCK(ifp);
 
 		if (matched) {
 			/* only the Subject's interface is reported */
 			*ifpp = ifp;
 			total = onthisif;
 			break;
 		}
 		total += onthisif;
 	}
 	IFNET_RUNLOCK_NOSLEEP();
 
 	return (total);
 }
 
 /*
  * Store the addresses for a NODEADDR reply right behind the reply header.
  *
  * ni6   - the query header; its flags select which addresses are reported
  * nni6  - the reply header; entries are written immediately after it and
  *         NI_NODEADDR_FLAG_TRUNCATE is set in it when space runs out
  * ifp0  - if not NULL, only this interface is examined; otherwise all
  *         interfaces are walked (which requires NI_NODEADDR_FLAG_ALL)
  * resid - number of bytes available behind nni6
  *
  * Each entry is a 32-bit TTL followed by the address.  Addresses that are
  * not deprecated are stored first; deprecated ones follow in a second pass.
  *
  * Returns the number of bytes stored.
  */
 static int
 ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6,
     struct ifnet *ifp0, int resid)
 {
 	struct ifnet *ifp;
 	struct in6_ifaddr *ifa6;
 	struct ifaddr *ifa;
 	struct ifnet *ifp_dep = NULL;
 	int copied = 0, allow_deprecated = 0;
 	u_char *cp = (u_char *)(nni6 + 1);
 	int niflags = ni6->ni_flags;
 	u_int32_t ltime;
 
 	if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL))
 		return (0);	/* needless to copy */
 
 	IFNET_RLOCK_NOSLEEP();
 	ifp = ifp0 ? ifp0 : TAILQ_FIRST(&V_ifnet);
   again:
 	/*
 	 * First pass (allow_deprecated == 0) collects the addresses that are
 	 * not deprecated, second pass collects the deprecated ones only.
 	 */
 
 	for (; ifp; ifp = TAILQ_NEXT(ifp, if_link)) {
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET6)
 				continue;
 			ifa6 = (struct in6_ifaddr *)ifa;
 
 			if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) != 0 &&
 			    allow_deprecated == 0) {
 				/*
 				 * preferred address should be put before
 				 * deprecated addresses.
 				 */
 
 				/* record the interface for later search */
 				if (ifp_dep == NULL)
 					ifp_dep = ifp;
 
 				continue;
 			} else if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) == 0 &&
 			    allow_deprecated != 0)
 				continue; /* we now collect deprecated addrs */
 
 			/* What do we have to do about ::1? */
 			switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) {
 			case IPV6_ADDR_SCOPE_LINKLOCAL:
 				if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0)
 					continue;
 				break;
 			case IPV6_ADDR_SCOPE_SITELOCAL:
 				if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0)
 					continue;
 				break;
 			case IPV6_ADDR_SCOPE_GLOBAL:
 				if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0)
 					continue;
 				break;
 			default:
 				continue;
 			}
 
 			/*
 			 * check if anycast is okay.
 			 * XXX: just experimental.  not in the spec.
 			 */
 			if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 &&
 			    (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0)
 				continue;
 			/* temporary addresses only if configuration allows */
 			if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
 			    (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) {
 				continue;
 			}
 
 			/* now we can copy the address */
 			if (resid < sizeof(struct in6_addr) +
 			    sizeof(u_int32_t)) {
 				IF_ADDR_RUNLOCK(ifp);
 				/*
 				 * No room for another entry: give up copying.
 				 * Set the truncate flag and return.
 				 */
 				nni6->ni_flags |= NI_NODEADDR_FLAG_TRUNCATE;
 				IFNET_RUNLOCK_NOSLEEP();
 				return (copied);
 			}
 
 			/*
 			 * Set the TTL of the address.
 			 * The TTL value should be one of the following
 			 * according to the specification:
 			 *
 			 * 1. The remaining lifetime of a DHCP lease on the
 			 *    address, or
 			 * 2. The remaining Valid Lifetime of a prefix from
 			 *    which the address was derived through Stateless
 			 *    Autoconfiguration.
 			 *
 			 * Note that we currently do not support stateful
 			 * address configuration by DHCPv6, so the former
 			 * case can't happen.
 			 */
 			/* An expiry time of 0 is reported as infinite. */
 			if (ifa6->ia6_lifetime.ia6t_expire == 0)
 				ltime = ND6_INFINITE_LIFETIME;
 			else {
 				if (ifa6->ia6_lifetime.ia6t_expire >
 				    time_uptime)
 					ltime = htonl(ifa6->ia6_lifetime.ia6t_expire - time_uptime);
 				else
 					ltime = 0;
 			}
 
 			bcopy(&ltime, cp, sizeof(u_int32_t));
 			cp += sizeof(u_int32_t);
 
 			/* copy the address itself */
 			bcopy(&ifa6->ia_addr.sin6_addr, cp,
 			    sizeof(struct in6_addr));
 			in6_clearscope((struct in6_addr *)cp); /* XXX */
 			cp += sizeof(struct in6_addr);
 
 			resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t));
 			copied += (sizeof(struct in6_addr) + sizeof(u_int32_t));
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		if (ifp0)	/* we need search only on the specified IF */
 			break;
 	}
 
 	/*
 	 * If deprecated addresses were skipped, run the second pass starting
 	 * at the first interface on which one was seen.
 	 */
 	if (allow_deprecated == 0 && ifp_dep != NULL) {
 		ifp = ifp_dep;
 		allow_deprecated = 1;
 
 		goto again;
 	}
 
 	IFNET_RUNLOCK_NOSLEEP();
 
 	return (copied);
 }
 
 /*
  * Deliver a received ICMPv6 packet to the matching raw sockets.
  *
  * mp  - points to the received packet; the packet is consumed (queued on
  *       a socket or freed) on every path
  * off - offset of the ICMPv6 header within the packet
  *
  * Every PCB on V_ripcb that is IPv6, bound to IPPROTO_ICMPV6, compatible
  * with the packet's addresses and not blocking the message type through its
  * ICMPv6 filter gets the packet.  All matching PCBs but the last receive a
  * copy; the last one receives the original mbuf.
  *
  * Always returns IPPROTO_DONE.
  *
  * XXX almost dup'ed code with rip6_input.
  */
 static int
 icmp6_rip6_input(struct mbuf **mp, int off)
 {
 	struct mbuf *m = *mp;
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	struct inpcb *in6p;
 	struct inpcb *last = NULL;
 	struct sockaddr_in6 fromsa;
 	struct icmp6_hdr *icmp6;
 	struct mbuf *opts = NULL;
 
 #ifndef PULLDOWN_TEST
 	/* this is assumed to be safe. */
 	icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off);
 #else
 	IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6));
 	if (icmp6 == NULL) {
 		/* m is already reclaimed */
 		return (IPPROTO_DONE);
 	}
 #endif
 
 	/*
 	 * XXX: the address may have embedded scope zone ID, which should be
 	 * hidden from applications.
 	 */
 	bzero(&fromsa, sizeof(fromsa));
 	fromsa.sin6_family = AF_INET6;
 	fromsa.sin6_len = sizeof(struct sockaddr_in6);
 	fromsa.sin6_addr = ip6->ip6_src;
 	if (sa6_recoverscope(&fromsa)) {
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	INP_INFO_RLOCK(&V_ripcbinfo);
 	LIST_FOREACH(in6p, &V_ripcb, inp_list) {
 		if ((in6p->inp_vflag & INP_IPV6) == 0)
 			continue;
 		if (in6p->inp_ip_p != IPPROTO_ICMPV6)
 			continue;
 		/* a bound local/foreign address must match the packet */
 		if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) &&
 		   !IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst))
 			continue;
 		if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) &&
 		   !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src))
 			continue;
 		INP_RLOCK(in6p);
 		if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type,
 		    in6p->in6p_icmp6filt)) {
 			INP_RUNLOCK(in6p);
 			continue;
 		}
 		/*
 		 * in6p matches and stays read-locked.  Before it becomes the
 		 * new "last", the previous match is served with a copy of the
 		 * packet and unlocked.
 		 */
 		if (last != NULL) {
 			struct	mbuf *n = NULL;
 
 			/*
 			 * Recent network drivers tend to allocate a single
 			 * mbuf cluster, rather than to make a couple of
 			 * mbufs without clusters.  Also, since the IPv6 code
 			 * path tries to avoid m_pullup(), it is highly
 			 * probable that we still have an mbuf cluster here
 			 * even though the necessary length can be stored in an
 			 * mbuf's internal buffer.
 			 * Meanwhile, the default size of the receive socket
 			 * buffer for raw sockets is not so large.  This means
 			 * the possibility of packet loss is relatively higher
 			 * than before.  To avoid this scenario, we copy the
 			 * received data to a separate mbuf that does not use
 			 * a cluster, if possible.
 			 * XXX: it is better to copy the data after stripping
 			 * intermediate headers.
 			 */
 			if ((m->m_flags & M_EXT) && m->m_next == NULL &&
 			    m->m_len <= MHLEN) {
 				n = m_get(M_NOWAIT, m->m_type);
 				if (n != NULL) {
 					if (m_dup_pkthdr(n, m, M_NOWAIT)) {
 						bcopy(m->m_data, n->m_data,
 						      m->m_len);
 						n->m_len = m->m_len;
 					} else {
 						m_free(n);
 						n = NULL;
 					}
 				}
 			}
 			/* fall back to a regular copy of the whole chain */
 			if (n != NULL ||
 			    (n = m_copy(m, 0, (int)M_COPYALL)) != NULL) {
 				if (last->inp_flags & INP_CONTROLOPTS)
 					ip6_savecontrol(last, n, &opts);
 				/* strip intermediate headers */
 				m_adj(n, off);
 				SOCKBUF_LOCK(&last->inp_socket->so_rcv);
 				if (sbappendaddr_locked(
 				    &last->inp_socket->so_rcv,
 				    (struct sockaddr *)&fromsa, n, opts)
 				    == 0) {
 					/* should notify about lost packet */
 					m_freem(n);
 					if (opts) {
 						m_freem(opts);
 					}
 					SOCKBUF_UNLOCK(
 					    &last->inp_socket->so_rcv);
 				} else
 					sorwakeup_locked(last->inp_socket);
 				opts = NULL;
 			}
 			INP_RUNLOCK(last);
 		}
 		last = in6p;
 	}
 	INP_INFO_RUNLOCK(&V_ripcbinfo);
 	/* The final match (still read-locked) gets the original packet. */
 	if (last != NULL) {
 		if (last->inp_flags & INP_CONTROLOPTS)
 			ip6_savecontrol(last, m, &opts);
 		/* strip intermediate headers */
 		m_adj(m, off);
 
 		/* avoid using mbuf clusters if possible (see above) */
 		if ((m->m_flags & M_EXT) && m->m_next == NULL &&
 		    m->m_len <= MHLEN) {
 			struct mbuf *n;
 
 			n = m_get(M_NOWAIT, m->m_type);
 			if (n != NULL) {
 				if (m_dup_pkthdr(n, m, M_NOWAIT)) {
 					bcopy(m->m_data, n->m_data, m->m_len);
 					n->m_len = m->m_len;
 
 					m_freem(m);
 					m = n;
 				} else {
 					m_freem(n);
 					n = NULL;
 				}
 			}
 		}
 		SOCKBUF_LOCK(&last->inp_socket->so_rcv);
 		if (sbappendaddr_locked(&last->inp_socket->so_rcv,
 		    (struct sockaddr *)&fromsa, m, opts) == 0) {
 			m_freem(m);
 			if (opts)
 				m_freem(opts);
 			SOCKBUF_UNLOCK(&last->inp_socket->so_rcv);
 		} else
 			sorwakeup_locked(last->inp_socket);
 		INP_RUNLOCK(last);
 	} else {
 		/* nobody wanted the packet: drop it and undo the accounting */
 		m_freem(m);
 		IP6STAT_DEC(ip6s_delivered);
 	}
 	return IPPROTO_DONE;
 }
 
 /*
  * Reflect the ip6 packet back to the source.
  *
  * m   - the packet to send back; it is consumed on every path (handed to
  *       ip6_output(), freed here, or reclaimed by a failed m_pullup())
  * off - offset of the icmp6 header, counted from the top of the mbuf
  *
  * Any headers between the IPv6 and the ICMPv6 header are stripped, source
  * and destination are swapped (choosing a suitable source address of ours),
  * the ICMPv6 checksum is recomputed and the packet is transmitted.
  */
 void
 icmp6_reflect(struct mbuf *m, size_t off)
 {
 	struct in6_addr selsrc, *srcp = NULL;
 	struct ip6_hdr saved_ip6;
 	struct ip6_hdr *ip6;
 	struct icmp6_hdr *icmp6;
 	struct in6_ifaddr *ia = NULL;
 	struct ifnet *outif = NULL;
 	size_t hdrlen;
 	int plen, type, code, stripped;
 
 	/* too short to reflect */
 	if (off < sizeof(struct ip6_hdr)) {
 		nd6log((LOG_DEBUG,
 		    "sanity fail: off=%lx, sizeof(ip6)=%lx in %s:%d\n",
 		    (u_long)off, (u_long)sizeof(struct ip6_hdr),
 		    __FILE__, __LINE__));
 		goto bad;
 	}
 
 #ifdef DIAGNOSTIC
 	if (sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) > MHLEN)
 		panic("assumption failed in icmp6_reflect");
 #endif
 
 	/*
 	 * If there are extra headers between IPv6 and ICMPv6, strip
 	 * off those headers first: save the IPv6 header, trim the front of
 	 * the chain and put the saved header back once both headers are
 	 * known to be contiguous.
 	 */
 	stripped = (off > sizeof(struct ip6_hdr));
 	if (stripped) {
 		m_copydata(m, 0, sizeof(saved_ip6), (caddr_t)&saved_ip6);
 		m_adj(m, off - sizeof(struct ip6_hdr));
 	}
 	hdrlen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr);
 	if (m->m_len < hdrlen && (m = m_pullup(m, hdrlen)) == NULL)
 		return;
 	if (stripped)
 		bcopy((caddr_t)&saved_ip6, mtod(m, caddr_t),
 		    sizeof(saved_ip6));
 
 	plen = m->m_pkthdr.len - sizeof(struct ip6_hdr);
 	ip6 = mtod(m, struct ip6_hdr *);
 	ip6->ip6_nxt = IPPROTO_ICMPV6;
 	icmp6 = (struct icmp6_hdr *)(ip6 + 1);
 	/* keep type and code for statistics */
 	type = icmp6->icmp6_type;
 	code = icmp6->icmp6_code;
 
 	/*
 	 * If the incoming packet was addressed directly to us (i.e. unicast),
 	 * use dst as the src for the reply.
 	 * The IN6_IFF_NOTREADY case should be VERY rare, but is possible
 	 * (for example) when we encounter an error while forwarding procedure
 	 * destined to a duplicated address of ours.
 	 */
 	if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */);
 		if (ia != NULL &&
 		    (ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY)) == 0)
 			srcp = &ia->ia_addr.sin6_addr;
 	}
 
 	if (srcp == NULL) {
 		struct sockaddr_in6 dstsa;
 		struct route_in6 ro;
 		int error;
 
 		/*
 		 * This case matches to multicasts, our anycast, or unicasts
 		 * that we do not own.  Select a source address based on the
 		 * source address of the erroneous packet.
 		 */
 		bzero(&dstsa, sizeof(dstsa));
 		dstsa.sin6_family = AF_INET6;
 		dstsa.sin6_len = sizeof(dstsa);
 		dstsa.sin6_addr = ip6->ip6_dst; /* zone ID should be embedded */
 
 		bzero(&ro, sizeof(ro));
 		error = in6_selectsrc(&dstsa, NULL, NULL, &ro, NULL, &outif,
 		    &selsrc);
 		if (ro.ro_rt)
 			RTFREE(ro.ro_rt); /* XXX: we could use this */
 		if (error) {
 			char ip6buf[INET6_ADDRSTRLEN];
 			nd6log((LOG_DEBUG,
 			    "icmp6_reflect: source can't be determined: "
 			    "dst=%s, error=%d\n",
 			    ip6_sprintf(ip6buf, &dstsa.sin6_addr), error));
 			goto bad;
 		}
 		srcp = &selsrc;
 	}
 
 	/*
 	 * ip6_input() drops a packet if its src is multicast.
 	 * So, the src is never multicast.
 	 */
 	ip6->ip6_dst = ip6->ip6_src;
 	ip6->ip6_src = *srcp;
 	/* clear the flow word first, then put the version back into it */
 	ip6->ip6_flow = 0;
 	ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | IPV6_VERSION;
 	ip6->ip6_nxt = IPPROTO_ICMPV6;
 	if (outif != NULL) {
 		ip6->ip6_hlim = ND_IFINFO(outif)->chlim;
 	} else if (m->m_pkthdr.rcvif != NULL) {
 		/* XXX: This may not be the outgoing interface */
 		ip6->ip6_hlim = ND_IFINFO(m->m_pkthdr.rcvif)->chlim;
 	} else {
 		ip6->ip6_hlim = V_ip6_defhlim;
 	}
 
 	/* the checksum is computed with its own field zeroed */
 	icmp6->icmp6_cksum = 0;
 	icmp6->icmp6_cksum = in6_cksum(m, IPPROTO_ICMPV6,
 	    sizeof(struct ip6_hdr), plen);
 
 	/*
 	 * XXX option handling
 	 */
 
 	m->m_flags &= ~(M_BCAST|M_MCAST);
 
 	ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL);
 	if (outif != NULL)
 		icmp6_ifoutstat_inc(outif, type, code);
 
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 	return;
 
  bad:
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 	m_freem(m);
 	return;
 }
 
 /*
  * ICMPv6 fast timeout routine.  No ICMPv6-specific work is done here; the
  * call is simply forwarded to mld_fasttimo().
  */
 void
 icmp6_fasttimo(void)
 {
 
 	mld_fasttimo();
 }
 
 /*
  * ICMPv6 slow timeout routine.  No ICMPv6-specific work is done here; the
  * call is simply forwarded to mld_slowtimo().
  */
 void
 icmp6_slowtimo(void)
 {
 
 	mld_slowtimo();
 }
 
 /*
  * Format the three addresses of a redirect as "(src=... dst=... tgt=...)"
  * for use in diagnostic messages.
  *
  * The returned string lives in a static buffer that is overwritten by
  * every call.
  */
 static const char *
 icmp6_redirect_diag(struct in6_addr *src6, struct in6_addr *dst6,
     struct in6_addr *tgt6)
 {
 	static char diag[1024];
 	char srcbuf[INET6_ADDRSTRLEN];
 	char dstbuf[INET6_ADDRSTRLEN];
 	char tgtbuf[INET6_ADDRSTRLEN];
 	const char *srcstr, *dststr, *tgtstr;
 
 	/* each address is printed into its own scratch buffer */
 	srcstr = ip6_sprintf(srcbuf, src6);
 	dststr = ip6_sprintf(dstbuf, dst6);
 	tgtstr = ip6_sprintf(tgtbuf, tgt6);
 
 	snprintf(diag, sizeof(diag), "(src=%s dst=%s tgt=%s)",
 	    srcstr, dststr, tgtstr);
 	return (diag);
 }
 
 /*
  * Handle a received ICMPv6 Redirect message.
  *
  * m   - the packet; the IPv6 header is read from the start of its data
  * off - byte offset from the IPv6 header to the redirect message
  *
  * The message is ignored when forwarding is enabled or redirect acceptance
  * is disabled.  Otherwise it must pass the checks below: link-local source,
  * hop limit 255, sender equal to the current gateway for the redirect
  * destination, unicast destination, "router" or "on-link" case, and sane
  * ND options.  On success the target's link-layer address is handed to
  * nd6_cache_lladdr(), a host route is requested in every FIB through
  * in6_rtredirect(), and pfctlinput(PRC_REDIRECT_HOST) is issued.
  *
  * Every exit through the freeit/bad labels frees m.  The early exits hidden
  * in the IP6_EXTHDR_* macros presumably rely on those macros to dispose of
  * m -- TODO confirm.
  */
 void
 icmp6_redirect_input(struct mbuf *m, int off)
 {
 	struct ifnet *ifp;
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	struct nd_redirect *nd_rd;
 	int icmp6len = ntohs(ip6->ip6_plen);
 	char *lladdr = NULL;
 	int lladdrlen = 0;
-	struct rtentry *rt = NULL;
 	int is_router;
 	int is_onlink;
 	struct in6_addr src6 = ip6->ip6_src;
 	struct in6_addr redtgt6;
 	struct in6_addr reddst6;
 	union nd_opts ndopts;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	M_ASSERTPKTHDR(m);
 	KASSERT(m->m_pkthdr.rcvif != NULL, ("%s: no rcvif", __func__));
 
 	ifp = m->m_pkthdr.rcvif;
 
 	/* XXX if we are router, we don't update route by icmp6 redirect */
 	if (V_ip6_forwarding)
 		goto freeit;
 	if (!V_icmp6_rediraccept)
 		goto freeit;
 
 #ifndef PULLDOWN_TEST
 	IP6_EXTHDR_CHECK(m, off, icmp6len,);
 	nd_rd = (struct nd_redirect *)((caddr_t)ip6 + off);
 #else
 	IP6_EXTHDR_GET(nd_rd, struct nd_redirect *, m, off, icmp6len);
 	if (nd_rd == NULL) {
 		ICMP6STAT_INC(icp6s_tooshort);
 		return;
 	}
 #endif
 	/* work on local copies of the target and destination addresses */
 	redtgt6 = nd_rd->nd_rd_target;
 	reddst6 = nd_rd->nd_rd_dst;
 
 	if (in6_setscope(&redtgt6, m->m_pkthdr.rcvif, NULL) ||
 	    in6_setscope(&reddst6, m->m_pkthdr.rcvif, NULL)) {
 		goto freeit;
 	}
 
 	/* validation */
 	if (!IN6_IS_ADDR_LINKLOCAL(&src6)) {
 		nd6log((LOG_ERR,
 		    "ICMP6 redirect sent from %s rejected; "
 		    "must be from linklocal\n",
 		    ip6_sprintf(ip6buf, &src6)));
 		goto bad;
 	}
 	if (ip6->ip6_hlim != 255) {
 		nd6log((LOG_ERR,
 		    "ICMP6 redirect sent from %s rejected; "
 		    "hlim=%d (must be 255)\n",
 		    ip6_sprintf(ip6buf, &src6), ip6->ip6_hlim));
 		goto bad;
 	}
     {
 	/* ip6->ip6_src must be equal to gw for icmp6->icmp6_reddst */
-	struct sockaddr_in6 sin6;
-	struct in6_addr *gw6;
+	struct nhop6_basic nh6;
+	struct in6_addr kdst;
+	uint32_t scopeid;
 
-	bzero(&sin6, sizeof(sin6));
-	sin6.sin6_family = AF_INET6;
-	sin6.sin6_len = sizeof(struct sockaddr_in6);
-	bcopy(&reddst6, &sin6.sin6_addr, sizeof(reddst6));
-	rt = in6_rtalloc1((struct sockaddr *)&sin6, 0, 0UL, RT_DEFAULT_FIB);
-	if (rt) {
-		if (rt->rt_gateway == NULL ||
-		    rt->rt_gateway->sa_family != AF_INET6) {
-			RTFREE_LOCKED(rt);
+	in6_splitscope(&reddst6, &kdst, &scopeid);
+	if (fib6_lookup_nh_basic(RT_DEFAULT_FIB, &kdst, scopeid, 0, 0,&nh6)==0){
+		if ((nh6.nh_flags & NHF_GATEWAY) == 0) {
 			nd6log((LOG_ERR,
 			    "ICMP6 redirect rejected; no route "
 			    "with inet6 gateway found for redirect dst: %s\n",
 			    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 			goto bad;
 		}
 
-		gw6 = &(((struct sockaddr_in6 *)rt->rt_gateway)->sin6_addr);
-		if (bcmp(&src6, gw6, sizeof(struct in6_addr)) != 0) {
-			RTFREE_LOCKED(rt);
+		if (IN6_ARE_ADDR_EQUAL(&src6, &nh6.nh_addr) == 0) {
 			nd6log((LOG_ERR,
 			    "ICMP6 redirect rejected; "
 			    "not equal to gw-for-src=%s (must be same): "
 			    "%s\n",
-			    ip6_sprintf(ip6buf, gw6),
+			    ip6_sprintf(ip6buf, &nh6.nh_addr),
 			    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 			goto bad;
 		}
 	} else {
 		nd6log((LOG_ERR,
 		    "ICMP6 redirect rejected; "
 		    "no route found for redirect dst: %s\n",
 		    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 		goto bad;
 	}
-	RTFREE_LOCKED(rt);
-	rt = NULL;
     }
 	if (IN6_IS_ADDR_MULTICAST(&reddst6)) {
 		nd6log((LOG_ERR,
 		    "ICMP6 redirect rejected; "
 		    "redirect dst must be unicast: %s\n",
 		    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 		goto bad;
 	}
 
 	/* classify: link-local target => router, target == dst => on-link */
 	is_router = is_onlink = 0;
 	if (IN6_IS_ADDR_LINKLOCAL(&redtgt6))
 		is_router = 1;	/* router case */
 	if (bcmp(&redtgt6, &reddst6, sizeof(redtgt6)) == 0)
 		is_onlink = 1;	/* on-link destination case */
 	if (!is_router && !is_onlink) {
 		nd6log((LOG_ERR,
 		    "ICMP6 redirect rejected; "
 		    "neither router case nor onlink case: %s\n",
 		    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 		goto bad;
 	}
 
 	/* what remains after the fixed redirect header is the option area */
 	icmp6len -= sizeof(*nd_rd);
 	nd6_option_init(nd_rd + 1, icmp6len, &ndopts);
 	if (nd6_options(&ndopts) < 0) {
 		nd6log((LOG_INFO, "%s: invalid ND option, rejected: %s\n",
 		    __func__, icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 		/* nd6_options have incremented stats */
 		goto freeit;
 	}
 
 	if (ndopts.nd_opts_tgt_lladdr) {
 		lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1);
 		/* nd_opt_len counts 8-byte units */
 		lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3;
 	}
 
 	/* option size must equal if_addrlen + 2 rounded up to 8 bytes */
 	if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
 		nd6log((LOG_INFO, "%s: lladdrlen mismatch for %s "
 		    "(if %d, icmp6 packet %d): %s\n",
 		    __func__, ip6_sprintf(ip6buf, &redtgt6),
 		    ifp->if_addrlen, lladdrlen - 2,
 		    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 		goto bad;
 	}
 
 	/* Validation passed. */
 
 	/* RFC 2461 8.3 */
 	nd6_cache_lladdr(ifp, &redtgt6, lladdr, lladdrlen, ND_REDIRECT,
 	    is_onlink ? ND_REDIRECT_ONLINK : ND_REDIRECT_ROUTER);
 
 	/*
 	 * Install a gateway route in the better-router case or an interface
 	 * route in the on-link-destination case.
 	 */
 	{
 		struct sockaddr_in6 sdst;
 		struct sockaddr_in6 sgw;
 		struct sockaddr_in6 ssrc;
 		struct sockaddr *gw;
 		int rt_flags;
 		u_int fibnum;
 
 		bzero(&sdst, sizeof(sdst));
 		bzero(&ssrc, sizeof(ssrc));
 		sdst.sin6_family = ssrc.sin6_family = AF_INET6;
 		sdst.sin6_len = ssrc.sin6_len = sizeof(struct sockaddr_in6);
 		bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr));
 		bcopy(&src6, &ssrc.sin6_addr, sizeof(struct in6_addr));
 		rt_flags = RTF_HOST;
 		if (is_router) {
 			bzero(&sgw, sizeof(sgw));
 			sgw.sin6_family = AF_INET6;
 			sgw.sin6_len = sizeof(struct sockaddr_in6);
 			bcopy(&redtgt6, &sgw.sin6_addr,
 				sizeof(struct in6_addr));
 			gw = (struct sockaddr *)&sgw;
 			rt_flags |= RTF_GATEWAY;
 		} else
 			gw = ifp->if_addr->ifa_addr;
 		/* the redirect is applied to every FIB, not only the default */
 		for (fibnum = 0; fibnum < rt_numfibs; fibnum++)
 			in6_rtredirect((struct sockaddr *)&sdst, gw,
 			    (struct sockaddr *)NULL, rt_flags,
 			    (struct sockaddr *)&ssrc, fibnum);
 	}
 	/* finally update cached route in each socket via pfctlinput */
     {
 	struct sockaddr_in6 sdst;
 
 	bzero(&sdst, sizeof(sdst));
 	sdst.sin6_family = AF_INET6;
 	sdst.sin6_len = sizeof(struct sockaddr_in6);
 	bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr));
 	pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&sdst);
     }
 
 	/* success falls through here; also the exit for uncounted drops */
  freeit:
 	m_freem(m);
 	return;
 
 	/* validation failure: count it as a bad redirect, then free */
  bad:
 	ICMP6STAT_INC(icp6s_badredirect);
 	m_freem(m);
 }
 
 void
 icmp6_redirect_output(struct mbuf *m0, struct rtentry *rt)
 {
 	struct ifnet *ifp;	/* my outgoing interface */
 	struct in6_addr *ifp_ll6;
 	struct in6_addr *router_ll6;
 	struct ip6_hdr *sip6;	/* m0 as struct ip6_hdr */
 	struct mbuf *m = NULL;	/* newly allocated one */
 	struct m_tag *mtag;
 	struct ip6_hdr *ip6;	/* m as struct ip6_hdr */
 	struct nd_redirect *nd_rd;
 	struct llentry *ln = NULL;
 	size_t maxlen;
 	u_char *p;
 	struct ifnet *outif = NULL;
 	struct sockaddr_in6 src_sa;
 
 	icmp6_errcount(ND_REDIRECT, 0);
 
 	/* if we are not router, we don't send icmp6 redirect */
 	if (!V_ip6_forwarding)
 		goto fail;
 
 	/* sanity check */
 	if (!m0 || !rt || !(rt->rt_flags & RTF_UP) || !(ifp = rt->rt_ifp))
 		goto fail;
 
 	/*
 	 * Address check:
 	 *  the source address must identify a neighbor, and
 	 *  the destination address must not be a multicast address
 	 *  [RFC 2461, sec 8.2]
 	 */
 	sip6 = mtod(m0, struct ip6_hdr *);
 	bzero(&src_sa, sizeof(src_sa));
 	src_sa.sin6_family = AF_INET6;
 	src_sa.sin6_len = sizeof(src_sa);
 	src_sa.sin6_addr = sip6->ip6_src;
 	if (nd6_is_addr_neighbor(&src_sa, ifp) == 0)
 		goto fail;
 	if (IN6_IS_ADDR_MULTICAST(&sip6->ip6_dst))
 		goto fail;	/* what should we do here? */
 
 	/* rate limit */
 	if (icmp6_ratelimit(&sip6->ip6_src, ND_REDIRECT, 0))
 		goto fail;
 
 	/*
 	 * Since we are going to append up to 1280 bytes (= IPV6_MMTU),
 	 * we almost always ask for an mbuf cluster for simplicity.
 	 * (MHLEN < IPV6_MMTU is almost always true)
 	 */
 #if IPV6_MMTU >= MCLBYTES
 # error assumption failed about IPV6_MMTU and MCLBYTES
 #endif
 	m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 	if (m == NULL)
 		goto fail;
 	M_SETFIB(m, rt->rt_fibnum);
 	maxlen = M_TRAILINGSPACE(m);
 	maxlen = min(IPV6_MMTU, maxlen);
 	/* just for safety */
 	if (maxlen < sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) +
 	    ((sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7)) {
 		goto fail;
 	}
 
 	{
 		/* get ip6 linklocal address for ifp(my outgoing interface). */
 		struct in6_ifaddr *ia;
 		if ((ia = in6ifa_ifpforlinklocal(ifp,
 						 IN6_IFF_NOTREADY|
 						 IN6_IFF_ANYCAST)) == NULL)
 			goto fail;
 		ifp_ll6 = &ia->ia_addr.sin6_addr;
 		/* XXXRW: reference released prematurely. */
 		ifa_free(&ia->ia_ifa);
 	}
 
 	/* get ip6 linklocal address for the router. */
 	if (rt->rt_gateway && (rt->rt_flags & RTF_GATEWAY)) {
 		struct sockaddr_in6 *sin6;
 		sin6 = (struct sockaddr_in6 *)rt->rt_gateway;
 		router_ll6 = &sin6->sin6_addr;
 		if (!IN6_IS_ADDR_LINKLOCAL(router_ll6))
 			router_ll6 = (struct in6_addr *)NULL;
 	} else
 		router_ll6 = (struct in6_addr *)NULL;
 
 	/* ip6 */
 	ip6 = mtod(m, struct ip6_hdr *);
 	ip6->ip6_flow = 0;
 	ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
 	ip6->ip6_vfc |= IPV6_VERSION;
 	/* ip6->ip6_plen will be set later */
 	ip6->ip6_nxt = IPPROTO_ICMPV6;
 	ip6->ip6_hlim = 255;
 	/* ip6->ip6_src must be linklocal addr for my outgoing if. */
 	bcopy(ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr));
 	bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr));
 
 	/* ND Redirect */
 	nd_rd = (struct nd_redirect *)(ip6 + 1);
 	nd_rd->nd_rd_type = ND_REDIRECT;
 	nd_rd->nd_rd_code = 0;
 	nd_rd->nd_rd_reserved = 0;
 	if (rt->rt_flags & RTF_GATEWAY) {
 		/*
 		 * nd_rd->nd_rd_target must be a link-local address in
 		 * better router cases.
 		 */
 		if (!router_ll6)
 			goto fail;
 		bcopy(router_ll6, &nd_rd->nd_rd_target,
 		    sizeof(nd_rd->nd_rd_target));
 		bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst,
 		    sizeof(nd_rd->nd_rd_dst));
 	} else {
 		/* make sure redtgt == reddst */
 		bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_target,
 		    sizeof(nd_rd->nd_rd_target));
 		bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst,
 		    sizeof(nd_rd->nd_rd_dst));
 	}
 
 	p = (u_char *)(nd_rd + 1);
 
 	if (!router_ll6)
 		goto nolladdropt;
 
 	{
 		/* target lladdr option */
 		int len;
 		struct nd_opt_hdr *nd_opt;
 		char *lladdr;
 
 		IF_AFDATA_RLOCK(ifp);
 		ln = nd6_lookup(router_ll6, 0, ifp);
 		IF_AFDATA_RUNLOCK(ifp);
 		if (ln == NULL)
 			goto nolladdropt;
 
 		len = sizeof(*nd_opt) + ifp->if_addrlen;
 		len = (len + 7) & ~7;	/* round by 8 */
 		/* safety check */
 		if (len + (p - (u_char *)ip6) > maxlen) 			
 			goto nolladdropt;
 
 		if (ln->la_flags & LLE_VALID) {
 			nd_opt = (struct nd_opt_hdr *)p;
 			nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
 			nd_opt->nd_opt_len = len >> 3;
 			lladdr = (char *)(nd_opt + 1);
 			bcopy(&ln->ll_addr, lladdr, ifp->if_addrlen);
 			p += len;
 		}
 	}
 nolladdropt:
 	if (ln != NULL)
 		LLE_RUNLOCK(ln);
 		
 	m->m_pkthdr.len = m->m_len = p - (u_char *)ip6;
 
 	/* just to be safe */
 #ifdef M_DECRYPTED	/*not openbsd*/
 	if (m0->m_flags & M_DECRYPTED)
 		goto noredhdropt;
 #endif
 	if (p - (u_char *)ip6 > maxlen)
 		goto noredhdropt;
 
 	{
 		/* redirected header option */
 		int len;
 		struct nd_opt_rd_hdr *nd_opt_rh;
 
 		/*
 		 * compute the maximum size for icmp6 redirect header option.
 		 * XXX room for auth header?
 		 */
 		len = maxlen - (p - (u_char *)ip6);
 		len &= ~7;
 
 		/* This is just for simplicity. */
 		if (m0->m_pkthdr.len != m0->m_len) {
 			if (m0->m_next) {
 				m_freem(m0->m_next);
 				m0->m_next = NULL;
 			}
 			m0->m_pkthdr.len = m0->m_len;
 		}
 
 		/*
 		 * Redirected header option spec (RFC2461 4.6.3) talks nothing
 		 * about padding/truncate rule for the original IP packet.
 		 * From the discussion on IPv6imp in Feb 1999,
 		 * the consensus was:
 		 * - "attach as much as possible" is the goal
 		 * - pad if not aligned (original size can be guessed by
 		 *   original ip6 header)
 		 * Following code adds the padding if it is simple enough,
 		 * and truncates if not.
 		 */
 		if (m0->m_next || m0->m_pkthdr.len != m0->m_len)
 			panic("assumption failed in %s:%d", __FILE__,
 			    __LINE__);
 
 		if (len - sizeof(*nd_opt_rh) < m0->m_pkthdr.len) {
 			/* not enough room, truncate */
 			m0->m_pkthdr.len = m0->m_len = len -
 			    sizeof(*nd_opt_rh);
 		} else {
 			/* enough room, pad or truncate */
 			size_t extra;
 
 			extra = m0->m_pkthdr.len % 8;
 			if (extra) {
 				/* pad if easy enough, truncate if not */
 				if (8 - extra <= M_TRAILINGSPACE(m0)) {
 					/* pad */
 					m0->m_len += (8 - extra);
 					m0->m_pkthdr.len += (8 - extra);
 				} else {
 					/* truncate */
 					m0->m_pkthdr.len -= extra;
 					m0->m_len -= extra;
 				}
 			}
 			len = m0->m_pkthdr.len + sizeof(*nd_opt_rh);
 			m0->m_pkthdr.len = m0->m_len = len -
 			    sizeof(*nd_opt_rh);
 		}
 
 		nd_opt_rh = (struct nd_opt_rd_hdr *)p;
 		bzero(nd_opt_rh, sizeof(*nd_opt_rh));
 		nd_opt_rh->nd_opt_rh_type = ND_OPT_REDIRECTED_HEADER;
 		nd_opt_rh->nd_opt_rh_len = len >> 3;
 		p += sizeof(*nd_opt_rh);
 		m->m_pkthdr.len = m->m_len = p - (u_char *)ip6;
 
 		/* connect m0 to m */
 		m_tag_delete_chain(m0, NULL);
 		m0->m_flags &= ~M_PKTHDR;
 		m->m_next = m0;
 		m->m_pkthdr.len = m->m_len + m0->m_len;
 		m0 = NULL;
 	}
 noredhdropt:;
 	if (m0) {
 		m_freem(m0);
 		m0 = NULL;
 	}
 
 	/* XXX: clear embedded link IDs in the inner header */
 	in6_clearscope(&sip6->ip6_src);
 	in6_clearscope(&sip6->ip6_dst);
 	in6_clearscope(&nd_rd->nd_rd_target);
 	in6_clearscope(&nd_rd->nd_rd_dst);
 
 	ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr));
 
 	nd_rd->nd_rd_cksum = 0;
 	nd_rd->nd_rd_cksum = in6_cksum(m, IPPROTO_ICMPV6,
 	    sizeof(*ip6), ntohs(ip6->ip6_plen));
 
         if (send_sendso_input_hook != NULL) {
 		mtag = m_tag_get(PACKET_TAG_ND_OUTGOING, sizeof(unsigned short),
 			M_NOWAIT);
 		if (mtag == NULL)
 			goto fail;
 		*(unsigned short *)(mtag + 1) = nd_rd->nd_rd_type;
 		m_tag_prepend(m, mtag);
 	}
 
 	/* send the packet to outside... */
 	ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL);
 	if (outif) {
 		icmp6_ifstat_inc(outif, ifs6_out_msg);
 		icmp6_ifstat_inc(outif, ifs6_out_redirect);
 	}
 	ICMP6STAT_INC(icp6s_outhist[ND_REDIRECT]);
 
 	return;
 
 fail:
 	if (m)
 		m_freem(m);
 	if (m0)
 		m_freem(m0);
 }
 
 /*
  * ICMPv6 socket option processing.
  */
 int
 icmp6_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int error = 0;
 	int optlen;
 	struct inpcb *inp = sotoinpcb(so);
 	int level, op, optname;
 
 	if (sopt) {
 		level = sopt->sopt_level;
 		op = sopt->sopt_dir;
 		optname = sopt->sopt_name;
 		optlen = sopt->sopt_valsize;
 	} else
 		level = op = optname = optlen = 0;
 
 	if (level != IPPROTO_ICMPV6) {
 		return EINVAL;
 	}
 
 	switch (op) {
 	case PRCO_SETOPT:
 		switch (optname) {
 		case ICMP6_FILTER:
 		    {
 			struct icmp6_filter ic6f;
 
 			if (optlen != sizeof(ic6f)) {
 				error = EMSGSIZE;
 				break;
 			}
 			error = sooptcopyin(sopt, &ic6f, optlen, optlen);
 			if (error == 0) {
 				INP_WLOCK(inp);
 				*inp->in6p_icmp6filt = ic6f;
 				INP_WUNLOCK(inp);
 			}
 			break;
 		    }
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 
 	case PRCO_GETOPT:
 		switch (optname) {
 		case ICMP6_FILTER:
 		    {
 			struct icmp6_filter ic6f;
 
 			INP_RLOCK(inp);
 			ic6f = *inp->in6p_icmp6filt;
 			INP_RUNLOCK(inp);
 			error = sooptcopyout(sopt, &ic6f, sizeof(ic6f));
 			break;
 		    }
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Rate limit check for outgoing ICMPv6 messages.
  *
  * Returns 0 when the packet may be sent and 1 when the router SHOULD NOT
  * send it because the packets-per-second budget is exhausted.
  *
  * XXX per-destination/type check necessary?
  *
  * dst, type and code are accepted but not consulted at this moment.
  */
 static int
 icmp6_ratelimit(const struct in6_addr *dst, const int type,
     const int code)
 {
 
 	/* PPS limit: a non-zero result from ppsratecheck() means go ahead */
 	if (ppsratecheck(&V_icmp6errppslim_last, &V_icmp6errpps_count,
 	    V_icmp6errppslim))
 		return (0);	/* okay to send */
 
 	/* The packet is subject to rate limit */
 	return (1);
 }
Index: head/sys/netinet6/in6.c
===================================================================
--- head/sys/netinet6/in6.c	(revision 292014)
+++ head/sys/netinet6/in6.c	(revision 292015)
@@ -1,2504 +1,2505 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: in6.c,v 1.259 2002/01/21 11:37:50 keiichi Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in.c	8.2 (Berkeley) 11/15/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/errno.h>
 #include <sys/jail.h>
 #include <sys/malloc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/systm.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/if_dl.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <net/if_llatbl.h>
 #include <netinet/if_ether.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_carp.h>
 
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/mld6_var.h>
 #include <netinet6/ip6_mroute.h>
 #include <netinet6/in6_ifattach.h>
 #include <netinet6/scope6_var.h>
+#include <netinet6/in6_fib.h>
 #include <netinet6/in6_pcb.h>
 
 /* Declared here, defined elsewhere; accessed through the V_ wrapper below. */
 VNET_DECLARE(int, icmp6_nodeinfo_oldmcprefix);
 #define V_icmp6_nodeinfo_oldmcprefix	VNET(icmp6_nodeinfo_oldmcprefix)
 
 /*
  * Definitions of some constant IP6 addresses.
  */
 const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
 const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT;
 const struct in6_addr in6addr_nodelocal_allnodes =
 	IN6ADDR_NODELOCAL_ALLNODES_INIT;
 const struct in6_addr in6addr_linklocal_allnodes =
 	IN6ADDR_LINKLOCAL_ALLNODES_INIT;
 const struct in6_addr in6addr_linklocal_allrouters =
 	IN6ADDR_LINKLOCAL_ALLROUTERS_INIT;
 const struct in6_addr in6addr_linklocal_allv2routers =
 	IN6ADDR_LINKLOCAL_ALLV2ROUTERS_INIT;
 
 /* Constant masks, each initialised from the IN6MASK* macro of the same name. */
 const struct in6_addr in6mask0 = IN6MASK0;
 const struct in6_addr in6mask32 = IN6MASK32;
 const struct in6_addr in6mask64 = IN6MASK64;
 const struct in6_addr in6mask96 = IN6MASK96;
 const struct in6_addr in6mask128 = IN6MASK128;
 
 /* An AF_INET6 sockaddr carrying IN6ADDR_ANY_INIT; remaining fields are 0. */
 const struct sockaddr_in6 sa6_any =
 	{ sizeof(sa6_any), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0 };
 
 /* Forward declarations of helpers private to this file. */
 static int in6_notify_ifa(struct ifnet *, struct in6_ifaddr *,
 	struct in6_aliasreq *, int);
 static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *);
 
 static int in6_validate_ifra(struct ifnet *, struct in6_aliasreq *,
     struct in6_ifaddr *, int);
 static struct in6_ifaddr *in6_alloc_ifa(struct ifnet *,
     struct in6_aliasreq *, int flags);
 static int in6_update_ifa_internal(struct ifnet *, struct in6_aliasreq *,
     struct in6_ifaddr *, int, int);
 static int in6_broadcast_ifa(struct ifnet *, struct in6_aliasreq *,
     struct in6_ifaddr *, int);
 
 /* ifa2ia6: plain pointer cast; ia62ifa: address of the embedded ia_ifa. */
 #define ifa2ia6(ifa)	((struct in6_ifaddr *)(ifa))
 #define ia62ifa(ia6)	(&((ia6)->ia_ifa))
 
 
 void
 in6_newaddrmsg(struct in6_ifaddr *ia, int cmd)
 {
 	struct sockaddr_dl gateway;
 	struct sockaddr_in6 mask, addr;
 	struct rtentry rt;
 
 	/*
 	 * initialize for rtmsg generation
 	 */
 	bzero(&gateway, sizeof(gateway));
 	gateway.sdl_len = sizeof(gateway);
 	gateway.sdl_family = AF_LINK;
 
 	bzero(&rt, sizeof(rt));
 	rt.rt_gateway = (struct sockaddr *)&gateway;
 	memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask));
 	memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr));
 	rt_mask(&rt) = (struct sockaddr *)&mask;
 	rt_key(&rt) = (struct sockaddr *)&addr;
 	rt.rt_flags = RTF_HOST | RTF_STATIC;
 	if (cmd == RTM_ADD)
 		rt.rt_flags |= RTF_UP;
 	/* Announce arrival of local address to all FIBs. */
 	rt_newaddrmsg(cmd, &ia->ia_ifa, 0, &rt);
 }
 
 /*
  * Convert a contiguous netmask into a prefix length.
  *
  * mask - the mask to examine
  * lim0 - one past the last byte to examine; when NULL, or more than
  *        sizeof(*mask) bytes beyond the start, the whole address is used
  *
  * Returns the number of leading one bits, or -1 when set bits follow the
  * first zero bit after at least one leading one bit in that byte, or when
  * any later byte is non-zero.
  */
 int
 in6_mask2len(struct in6_addr *mask, u_char *lim0)
 {
 	u_char *cur, *end;
 	int nbytes, nbits;
 
 	/* ignore the scope_id part */
 	end = lim0;
 	if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask))
 		end = (u_char *)mask + sizeof(*mask);
 
 	/* whole bytes of ones */
 	nbytes = 0;
 	cur = (u_char *)mask;
 	while (cur < end && *cur == 0xff) {
 		nbytes++;
 		cur++;
 	}
 
 	nbits = 0;
 	if (cur < end) {
 		/* leading ones inside the first byte that is not 0xff */
 		while (nbits < 8 && (*cur & (0x80 >> nbits)) != 0)
 			nbits++;
 
 		/*
 		 * do a stricter check on the remaining bits: nothing may be
 		 * set after the prefix.
 		 */
 		if (nbits != 0 && (*cur & (0x00ff >> nbits)) != 0)
 			return (-1);
 		for (cur++; cur < end; cur++) {
 			if (*cur != 0)
 				return (-1);
 		}
 	}
 
 	return nbytes * 8 + nbits;
 }
 
 #ifdef COMPAT_FREEBSD32
 /*
  * Request layout used with SIOCGDEFIFACE32_IN6: an interface name and a
  * 32-bit interface index.
  */
 struct in6_ndifreq32 {
 	char ifname[IFNAMSIZ];
 	uint32_t ifindex;
 };
 /* ioctl command code whose argument is the structure above */
 #define	SIOCGDEFIFACE32_IN6	_IOWR('i', 86, struct in6_ndifreq32)
 #endif
 
 int
 in6_control(struct socket *so, u_long cmd, caddr_t data,
     struct ifnet *ifp, struct thread *td)
 {
 	struct	in6_ifreq *ifr = (struct in6_ifreq *)data;
 	struct	in6_ifaddr *ia = NULL;
 	struct	in6_aliasreq *ifra = (struct in6_aliasreq *)data;
 	struct sockaddr_in6 *sa6;
 	int carp_attached = 0;
 	int error;
 	u_long ocmd = cmd;
 
 	/*
 	 * Compat to make pre-10.x ifconfig(8) operable.
 	 */
 	if (cmd == OSIOCAIFADDR_IN6)
 		cmd = SIOCAIFADDR_IN6;
 
 	switch (cmd) {
 	case SIOCGETSGCNT_IN6:
 	case SIOCGETMIFCNT_IN6:
 		/*
 		 * XXX mrt_ioctl has a 3rd, unused, FIB argument in route.c.
 		 * We cannot see how that would be needed, so do not adjust the
 		 * KPI blindly; more likely should clean up the IPv4 variant.
 		 */
 		return (mrt6_ioctl ? mrt6_ioctl(cmd, data) : EOPNOTSUPP);
 	}
 
 	switch (cmd) {
 	case SIOCAADDRCTL_POLICY:
 	case SIOCDADDRCTL_POLICY:
 		if (td != NULL) {
 			error = priv_check(td, PRIV_NETINET_ADDRCTRL6);
 			if (error)
 				return (error);
 		}
 		return (in6_src_ioctl(cmd, data));
 	}
 
 	if (ifp == NULL)
 		return (EOPNOTSUPP);
 
 	switch (cmd) {
 	case SIOCSNDFLUSH_IN6:
 	case SIOCSPFXFLUSH_IN6:
 	case SIOCSRTRFLUSH_IN6:
 	case SIOCSDEFIFACE_IN6:
 	case SIOCSIFINFO_FLAGS:
 	case SIOCSIFINFO_IN6:
 		if (td != NULL) {
 			error = priv_check(td, PRIV_NETINET_ND6);
 			if (error)
 				return (error);
 		}
 		/* FALLTHROUGH */
 	case OSIOCGIFINFO_IN6:
 	case SIOCGIFINFO_IN6:
 	case SIOCGNBRINFO_IN6:
 	case SIOCGDEFIFACE_IN6:
 		return (nd6_ioctl(cmd, data, ifp));
 
 #ifdef COMPAT_FREEBSD32
 	case SIOCGDEFIFACE32_IN6:
 		{
 			struct in6_ndifreq ndif;
 			struct in6_ndifreq32 *ndif32;
 
 			error = nd6_ioctl(SIOCGDEFIFACE_IN6, (caddr_t)&ndif,
 			    ifp);
 			if (error)
 				return (error);
 			ndif32 = (struct in6_ndifreq32 *)data;
 			ndif32->ifindex = ndif.ifindex;
 			return (0);
 		}
 #endif
 	}
 
 	switch (cmd) {
 	case SIOCSIFPREFIX_IN6:
 	case SIOCDIFPREFIX_IN6:
 	case SIOCAIFPREFIX_IN6:
 	case SIOCCIFPREFIX_IN6:
 	case SIOCSGIFPREFIX_IN6:
 	case SIOCGIFPREFIX_IN6:
 		log(LOG_NOTICE,
 		    "prefix ioctls are now invalidated. "
 		    "please use ifconfig.\n");
 		return (EOPNOTSUPP);
 	}
 
 	switch (cmd) {
 	case SIOCSSCOPE6:
 		if (td != NULL) {
 			error = priv_check(td, PRIV_NETINET_SCOPE6);
 			if (error)
 				return (error);
 		}
 		/* FALLTHROUGH */
 	case SIOCGSCOPE6:
 	case SIOCGSCOPE6DEF:
 		return (scope6_ioctl(cmd, data, ifp));
 	}
 
 	/*
 	 * Find address for this interface, if it exists.
 	 *
 	 * In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation
 	 * only, and used the first interface address as the target of other
 	 * operations (without checking ifra_addr).  This was because netinet
 	 * code/API assumed at most 1 interface address per interface.
 	 * Since IPv6 allows a node to assign multiple addresses
 	 * on a single interface, we almost always look and check the
 	 * presence of ifra_addr, and reject invalid ones here.
 	 * It also decreases duplicated code among SIOC*_IN6 operations.
 	 */
 	switch (cmd) {
 	case SIOCAIFADDR_IN6:
 	case SIOCSIFPHYADDR_IN6:
 		sa6 = &ifra->ifra_addr;
 		break;
 	case SIOCSIFADDR_IN6:
 	case SIOCGIFADDR_IN6:
 	case SIOCSIFDSTADDR_IN6:
 	case SIOCSIFNETMASK_IN6:
 	case SIOCGIFDSTADDR_IN6:
 	case SIOCGIFNETMASK_IN6:
 	case SIOCDIFADDR_IN6:
 	case SIOCGIFPSRCADDR_IN6:
 	case SIOCGIFPDSTADDR_IN6:
 	case SIOCGIFAFLAG_IN6:
 	case SIOCSNDFLUSH_IN6:
 	case SIOCSPFXFLUSH_IN6:
 	case SIOCSRTRFLUSH_IN6:
 	case SIOCGIFALIFETIME_IN6:
 	case SIOCSIFALIFETIME_IN6:
 	case SIOCGIFSTAT_IN6:
 	case SIOCGIFSTAT_ICMP6:
 		sa6 = &ifr->ifr_addr;
 		break;
 	case SIOCSIFADDR:
 	case SIOCSIFBRDADDR:
 	case SIOCSIFDSTADDR:
 	case SIOCSIFNETMASK:
 		/*
 		 * Although we should pass any non-INET6 ioctl requests
 		 * down to driver, we filter some legacy INET requests.
 		 * Drivers trust SIOCSIFADDR et al to come from an already
 		 * privileged layer, and do not perform any credentials
 		 * checks or input validation.
 		 */
 		return (EINVAL);
 	default:
 		sa6 = NULL;
 		break;
 	}
 	if (sa6 && sa6->sin6_family == AF_INET6) {
 		if (sa6->sin6_scope_id != 0)
 			error = sa6_embedscope(sa6, 0);
 		else
 			error = in6_setscope(&sa6->sin6_addr, ifp, NULL);
 		if (error != 0)
 			return (error);
 		if (td != NULL && (error = prison_check_ip6(td->td_ucred,
 		    &sa6->sin6_addr)) != 0)
 			return (error);
 		ia = in6ifa_ifpwithaddr(ifp, &sa6->sin6_addr);
 	} else
 		ia = NULL;
 
 	switch (cmd) {
 	case SIOCSIFADDR_IN6:
 	case SIOCSIFDSTADDR_IN6:
 	case SIOCSIFNETMASK_IN6:
 		/*
 		 * Since IPv6 allows a node to assign multiple addresses
 		 * on a single interface, SIOCSIFxxx ioctls are deprecated.
 		 */
 		/* we decided to obsolete this command (20000704) */
 		error = EINVAL;
 		goto out;
 
 	case SIOCDIFADDR_IN6:
 		/*
 		 * for IPv4, we look for existing in_ifaddr here to allow
 		 * "ifconfig if0 delete" to remove the first IPv4 address on
 		 * the interface.  For IPv6, as the spec allows multiple
 		 * interface address from the day one, we consider "remove the
 		 * first one" semantics to be not preferable.
 		 */
 		if (ia == NULL) {
 			error = EADDRNOTAVAIL;
 			goto out;
 		}
 		/* FALLTHROUGH */
 	case SIOCAIFADDR_IN6:
 		/*
 		 * We always require users to specify a valid IPv6 address for
 		 * the corresponding operation.
 		 */
 		if (ifra->ifra_addr.sin6_family != AF_INET6 ||
 		    ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6)) {
 			error = EAFNOSUPPORT;
 			goto out;
 		}
 
 		if (td != NULL) {
 			error = priv_check(td, (cmd == SIOCDIFADDR_IN6) ?
 			    PRIV_NET_DELIFADDR : PRIV_NET_ADDIFADDR);
 			if (error)
 				goto out;
 		}
 		/* FALLTHROUGH */
 	case SIOCGIFSTAT_IN6:
 	case SIOCGIFSTAT_ICMP6:
 		if (ifp->if_afdata[AF_INET6] == NULL) {
 			error = EPFNOSUPPORT;
 			goto out;
 		}
 		break;
 
 	case SIOCGIFADDR_IN6:
 		/* This interface is basically deprecated. use SIOCGIFCONF. */
 		/* FALLTHROUGH */
 	case SIOCGIFAFLAG_IN6:
 	case SIOCGIFNETMASK_IN6:
 	case SIOCGIFDSTADDR_IN6:
 	case SIOCGIFALIFETIME_IN6:
 		/* must think again about its semantics */
 		if (ia == NULL) {
 			error = EADDRNOTAVAIL;
 			goto out;
 		}
 		break;
 
 	case SIOCSIFALIFETIME_IN6:
 	    {
 		struct in6_addrlifetime *lt;
 
 		if (td != NULL) {
 			error = priv_check(td, PRIV_NETINET_ALIFETIME6);
 			if (error)
 				goto out;
 		}
 		if (ia == NULL) {
 			error = EADDRNOTAVAIL;
 			goto out;
 		}
 		/* sanity for overflow - beware unsigned */
 		lt = &ifr->ifr_ifru.ifru_lifetime;
 		if (lt->ia6t_vltime != ND6_INFINITE_LIFETIME &&
 		    lt->ia6t_vltime + time_uptime < time_uptime) {
 			error = EINVAL;
 			goto out;
 		}
 		if (lt->ia6t_pltime != ND6_INFINITE_LIFETIME &&
 		    lt->ia6t_pltime + time_uptime < time_uptime) {
 			error = EINVAL;
 			goto out;
 		}
 		break;
 	    }
 	}
 
 	switch (cmd) {
 	case SIOCGIFADDR_IN6:
 		ifr->ifr_addr = ia->ia_addr;
 		if ((error = sa6_recoverscope(&ifr->ifr_addr)) != 0)
 			goto out;
 		break;
 
 	case SIOCGIFDSTADDR_IN6:
 		if ((ifp->if_flags & IFF_POINTOPOINT) == 0) {
 			error = EINVAL;
 			goto out;
 		}
 		/*
 		 * XXX: should we check if ifa_dstaddr is NULL and return
 		 * an error?
 		 */
 		ifr->ifr_dstaddr = ia->ia_dstaddr;
 		if ((error = sa6_recoverscope(&ifr->ifr_dstaddr)) != 0)
 			goto out;
 		break;
 
 	case SIOCGIFNETMASK_IN6:
 		ifr->ifr_addr = ia->ia_prefixmask;
 		break;
 
 	case SIOCGIFAFLAG_IN6:
 		ifr->ifr_ifru.ifru_flags6 = ia->ia6_flags;
 		break;
 
 	case SIOCGIFSTAT_IN6:
 		COUNTER_ARRAY_COPY(((struct in6_ifextra *)
 		    ifp->if_afdata[AF_INET6])->in6_ifstat,
 		    &ifr->ifr_ifru.ifru_stat,
 		    sizeof(struct in6_ifstat) / sizeof(uint64_t));
 		break;
 
 	case SIOCGIFSTAT_ICMP6:
 		COUNTER_ARRAY_COPY(((struct in6_ifextra *)
 		    ifp->if_afdata[AF_INET6])->icmp6_ifstat,
 		    &ifr->ifr_ifru.ifru_icmp6stat,
 		    sizeof(struct icmp6_ifstat) / sizeof(uint64_t));
 		break;
 
 	case SIOCGIFALIFETIME_IN6:
 		ifr->ifr_ifru.ifru_lifetime = ia->ia6_lifetime;
 		if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
 			time_t maxexpire;
 			struct in6_addrlifetime *retlt =
 			    &ifr->ifr_ifru.ifru_lifetime;
 
 			/*
 			 * XXX: adjust expiration time assuming time_t is
 			 * signed.
 			 */
 			maxexpire = (-1) &
 			    ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1));
 			if (ia->ia6_lifetime.ia6t_vltime <
 			    maxexpire - ia->ia6_updatetime) {
 				retlt->ia6t_expire = ia->ia6_updatetime +
 				    ia->ia6_lifetime.ia6t_vltime;
 			} else
 				retlt->ia6t_expire = maxexpire;
 		}
 		if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
 			time_t maxexpire;
 			struct in6_addrlifetime *retlt =
 			    &ifr->ifr_ifru.ifru_lifetime;
 
 			/*
 			 * XXX: adjust expiration time assuming time_t is
 			 * signed.
 			 */
 			maxexpire = (-1) &
 			    ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1));
 			if (ia->ia6_lifetime.ia6t_pltime <
 			    maxexpire - ia->ia6_updatetime) {
 				retlt->ia6t_preferred = ia->ia6_updatetime +
 				    ia->ia6_lifetime.ia6t_pltime;
 			} else
 				retlt->ia6t_preferred = maxexpire;
 		}
 		break;
 
 	case SIOCSIFALIFETIME_IN6:
 		ia->ia6_lifetime = ifr->ifr_ifru.ifru_lifetime;
 		/* for sanity */
 		if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
 			ia->ia6_lifetime.ia6t_expire =
 				time_uptime + ia->ia6_lifetime.ia6t_vltime;
 		} else
 			ia->ia6_lifetime.ia6t_expire = 0;
 		if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
 			ia->ia6_lifetime.ia6t_preferred =
 				time_uptime + ia->ia6_lifetime.ia6t_pltime;
 		} else
 			ia->ia6_lifetime.ia6t_preferred = 0;
 		break;
 
 	case SIOCAIFADDR_IN6:
 	{
 		struct nd_prefixctl pr0;
 		struct nd_prefix *pr;
 
 		/*
 		 * first, make or update the interface address structure,
 		 * and link it to the list.
 		 */
 		if ((error = in6_update_ifa(ifp, ifra, ia, 0)) != 0)
 			goto out;
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		if ((ia = in6ifa_ifpwithaddr(ifp, &ifra->ifra_addr.sin6_addr))
 		    == NULL) {
 			/*
 			 * this can happen when the user specify the 0 valid
 			 * lifetime.
 			 */
 			break;
 		}
 
 		if (cmd == ocmd && ifra->ifra_vhid > 0) {
 			if (carp_attach_p != NULL)
 				error = (*carp_attach_p)(&ia->ia_ifa,
 				    ifra->ifra_vhid);
 			else
 				error = EPROTONOSUPPORT;
 			if (error)
 				goto out;
 			else
 				carp_attached = 1;
 		}
 
 		/*
 		 * then, make the prefix on-link on the interface.
 		 * XXX: we'd rather create the prefix before the address, but
 		 * we need at least one address to install the corresponding
 		 * interface route, so we configure the address first.
 		 */
 
 		/*
 		 * convert mask to prefix length (prefixmask has already
 		 * been validated in in6_update_ifa().
 		 */
 		bzero(&pr0, sizeof(pr0));
 		pr0.ndpr_ifp = ifp;
 		pr0.ndpr_plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr,
 		    NULL);
 		if (pr0.ndpr_plen == 128) {
 			/* we don't need to install a host route. */
 			goto aifaddr_out;
 		}
 		pr0.ndpr_prefix = ifra->ifra_addr;
 		/* apply the mask for safety. */
 		IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr,
 		    &ifra->ifra_prefixmask.sin6_addr);
 
 		/*
 		 * XXX: since we don't have an API to set prefix (not address)
 		 * lifetimes, we just use the same lifetimes as addresses.
 		 * The (temporarily) installed lifetimes can be overridden by
 		 * later advertised RAs (when accept_rtadv is non 0), which is
 		 * an intended behavior.
 		 */
 		pr0.ndpr_raf_onlink = 1; /* should be configurable? */
 		pr0.ndpr_raf_auto =
 		    ((ifra->ifra_flags & IN6_IFF_AUTOCONF) != 0);
 		pr0.ndpr_vltime = ifra->ifra_lifetime.ia6t_vltime;
 		pr0.ndpr_pltime = ifra->ifra_lifetime.ia6t_pltime;
 
 		/* add the prefix if not yet. */
 		if ((pr = nd6_prefix_lookup(&pr0)) == NULL) {
 			/*
 			 * nd6_prelist_add will install the corresponding
 			 * interface route.
 			 */
 			if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) {
 				if (carp_attached)
 					(*carp_detach_p)(&ia->ia_ifa);
 				goto out;
 			}
 			if (pr == NULL) {
 				if (carp_attached)
 					(*carp_detach_p)(&ia->ia_ifa);
 				log(LOG_ERR, "nd6_prelist_add succeeded but "
 				    "no prefix\n");
 				error = EINVAL;
 				goto out;
 			}
 		}
 
 		/* relate the address to the prefix */
 		if (ia->ia6_ndpr == NULL) {
 			ia->ia6_ndpr = pr;
 			pr->ndpr_refcnt++;
 
 			/*
 			 * If this is the first autoconf address from the
 			 * prefix, create a temporary address as well
 			 * (when required).
 			 */
 			if ((ia->ia6_flags & IN6_IFF_AUTOCONF) &&
 			    V_ip6_use_tempaddr && pr->ndpr_refcnt == 1) {
 				int e;
 				if ((e = in6_tmpifadd(ia, 1, 0)) != 0) {
 					log(LOG_NOTICE, "in6_control: failed "
 					    "to create a temporary address, "
 					    "errno=%d\n", e);
 				}
 			}
 		}
 
 		/*
 		 * this might affect the status of autoconfigured addresses,
 		 * that is, this address might make other addresses detached.
 		 */
 		pfxlist_onlink_check();
 aifaddr_out:
 		if (error != 0 || ia == NULL)
 			break;
 		/*
 		 * Try to clear the flag when a new IPv6 address is added
 		 * onto an IFDISABLED interface and it succeeds.
 		 */
 		if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) {
 			struct in6_ndireq nd;
 
 			memset(&nd, 0, sizeof(nd));
 			nd.ndi.flags = ND_IFINFO(ifp)->flags;
 			nd.ndi.flags &= ~ND6_IFF_IFDISABLED;
 			if (nd6_ioctl(SIOCSIFINFO_FLAGS, (caddr_t)&nd, ifp) < 0)
 				log(LOG_NOTICE, "SIOCAIFADDR_IN6: "
 				    "SIOCSIFINFO_FLAGS for -ifdisabled "
 				    "failed.");
 			/*
 			 * Ignore failure of clearing the flag intentionally.
 			 * The failure means address duplication was detected.
 			 */
 		}
 		EVENTHANDLER_INVOKE(ifaddr_event, ifp);
 		break;
 	}
 
 	case SIOCDIFADDR_IN6:
 	{
 		struct nd_prefix *pr;
 
 		/*
 		 * If the address being deleted is the only one that owns
 		 * the corresponding prefix, expire the prefix as well.
 		 * XXX: theoretically, we don't have to worry about such
 		 * relationship, since we separate the address management
 		 * and the prefix management.  We do this, however, to provide
 		 * as much backward compatibility as possible in terms of
 		 * the ioctl operation.
 		 * Note that in6_purgeaddr() will decrement ndpr_refcnt.
 		 */
 		pr = ia->ia6_ndpr;
 		in6_purgeaddr(&ia->ia_ifa);
 		if (pr && pr->ndpr_refcnt == 0)
 			prelist_remove(pr);
 		EVENTHANDLER_INVOKE(ifaddr_event, ifp);
 		break;
 	}
 
 	default:
 		if (ifp->if_ioctl == NULL) {
 			error = EOPNOTSUPP;
 			goto out;
 		}
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		goto out;
 	}
 
 	error = 0;
 out:
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 	return (error);
 }
 
 
 /*
  * Join necessary multicast groups.  Factored out from in6_update_ifa().
  * This entire work should only be done once, for the default FIB.
  *
  * Groups are joined in this order: the solicited-node group derived from
  * ifra->ifra_addr, link-local all-nodes, the node information group(s),
  * and interface-local all-nodes.  Every successful membership is linked
  * onto ia->ia6_memberships.
  *
  * On return *in6m_sol points at the solicited-node group membership's
  * multicast record (left untouched if that join was never reached or
  * failed).
  *
  * Returns 0 on success or the first fatal error.  Memberships joined
  * before a failure are NOT undone here; they stay on ia6_memberships.
  * Failures to join the node information groups are logged and ignored.
  */
 static int
 in6_update_ifa_join_mc(struct ifnet *ifp, struct in6_aliasreq *ifra,
     struct in6_ifaddr *ia, int flags, struct in6_multi **in6m_sol)
 {
 	char ip6buf[INET6_ADDRSTRLEN];
 	struct in6_addr mltaddr;
 	struct in6_multi_mship *imm;
 	int delay, error;
 
 	KASSERT(in6m_sol != NULL, ("%s: in6m_sol is NULL", __func__));
 
 	/*
 	 * Join solicited multicast addr for new host id.
 	 * The group address is built from the multicast prefix constant,
 	 * a 1 in the third 32-bit word, and the low 32 bits of the unicast
 	 * address with the top byte of that word forced to 0xff.
 	 */
 	bzero(&mltaddr, sizeof(struct in6_addr));
 	mltaddr.s6_addr32[0] = IPV6_ADDR_INT32_MLL;
 	mltaddr.s6_addr32[2] = htonl(1);
 	mltaddr.s6_addr32[3] = ifra->ifra_addr.sin6_addr.s6_addr32[3];
 	mltaddr.s6_addr8[12] = 0xff;
 	if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) {
 		/* XXX: should not happen */
 		log(LOG_ERR, "%s: in6_setscope failed\n", __func__);
 		goto cleanup;
 	}
 	delay = error = 0;
 	if ((flags & IN6_IFAUPDATE_DADDELAY)) {
 		/*
 		 * We need a random delay for DAD on the address being
 		 * configured.  It also means delaying transmission of the
 		 * corresponding MLD report to avoid report collision.
 		 * [RFC 4861, Section 6.3.7]
 		 */
 		delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz);
 	}
 	imm = in6_joingroup(ifp, &mltaddr, &error, delay);
 	if (imm == NULL) {
 		nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s "
 		    "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr),
 		    if_name(ifp), error));
 		goto cleanup;
 	}
 	LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
 	/* Hand the solicited-node group back to the caller. */
 	*in6m_sol = imm->i6mm_maddr;
 
 	/*
 	 * Join link-local all-nodes address.
 	 */
 	mltaddr = in6addr_linklocal_allnodes;
 	if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0)
 		goto cleanup; /* XXX: should not fail */
 
 	imm = in6_joingroup(ifp, &mltaddr, &error, 0);
 	if (imm == NULL) {
 		nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s "
 		    "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr),
 		    if_name(ifp), error));
 		goto cleanup;
 	}
 	LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
 
 	/*
 	 * Join node information group address.
 	 */
 	delay = 0;
 	if ((flags & IN6_IFAUPDATE_DADDELAY)) {
 		/*
 		 * The spec does not say anything about delay for this group,
 		 * but the same logic should apply.
 		 */
 		delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz);
 	}
 	/*
 	 * A failed join below only logs; execution continues.  The later
 	 * "error = in6_setscope(...)" assignment overwrites whatever value
 	 * in6_joingroup() stored in error here.
 	 */
 	if (in6_nigroup(ifp, NULL, -1, &mltaddr) == 0) {
 		/* XXX jinmei */
 		imm = in6_joingroup(ifp, &mltaddr, &error, delay);
 		if (imm == NULL)
 			nd6log((LOG_WARNING,
 			    "%s: in6_joingroup failed for %s on %s "
 			    "(errno=%d)\n", __func__, ip6_sprintf(ip6buf,
 			    &mltaddr), if_name(ifp), error));
 			/* XXX not very fatal, go on... */
 		else
 			LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
 	}
 	/* Same group computed with the old multicast prefix, if enabled. */
 	if (V_icmp6_nodeinfo_oldmcprefix &&
 	    in6_nigroup_oldmcprefix(ifp, NULL, -1, &mltaddr) == 0) {
 		imm = in6_joingroup(ifp, &mltaddr, &error, delay);
 		if (imm == NULL)
 			nd6log((LOG_WARNING,
 			    "%s: in6_joingroup failed for %s on %s "
 			    "(errno=%d)\n", __func__, ip6_sprintf(ip6buf,
 			    &mltaddr), if_name(ifp), error));
 			/* XXX not very fatal, go on... */
 		else
 			LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
 	}
 
 	/*
 	 * Join interface-local all-nodes address.
 	 * (ff01::1%ifN, and ff01::%ifN/32)
 	 */
 	mltaddr = in6addr_nodelocal_allnodes;
 	if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0)
 		goto cleanup; /* XXX: should not fail */
 
 	imm = in6_joingroup(ifp, &mltaddr, &error, 0);
 	if (imm == NULL) {
 		nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s "
 		    "(errno=%d)\n", __func__, ip6_sprintf(ip6buf,
 		    &mltaddr), if_name(ifp), error));
 		goto cleanup;
 	}
 	LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
 
 cleanup:
 	/* No resources are released here; this label is just the exit. */
 	return (error);
 }
 
 /*
  * Create or refresh an IPv6 address on an interface.
  *
  * The request is validated first.  When no existing address (ia) is
  * supplied, a fresh one is allocated and linked into the address chains,
  * configured, and then announced (multicast joins, DAD, routing message).
  * When ia is supplied, only its parameters are updated.
  * This function is separated from in6_control().
  *
  * Returns 0 on success, ENOBUFS when allocation fails, or the error
  * reported by the validation/configuration/announcement step.
  */
 int
 in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra,
     struct in6_ifaddr *ia, int flags)
 {
 	int created, rc;
 
 	rc = in6_validate_ifra(ifp, ifra, ia, flags);
 	if (rc != 0)
 		return (rc);
 
 	/* No address handed in: this request creates a new one. */
 	created = (ia == NULL);
 	if (created) {
 		ia = in6_alloc_ifa(ifp, ifra, flags);
 		if (ia == NULL)
 			return (ENOBUFS);
 	}
 
 	rc = in6_update_ifa_internal(ifp, ifra, ia, created, flags);
 	if (rc == 0) {
 		/* Only brand new addresses are announced. */
 		if (created)
 			rc = in6_broadcast_ifa(ifp, ifra, ia, flags);
 		return (rc);
 	}
 
 	/* Configuration failed: undo the allocation made above, if any. */
 	if (created) {
 		in6_unlink_ifa(ia, ifp);
 		ifa_free(&ia->ia_ifa);
 	}
 	return (rc);
 }
 
 /*
  * Fill in basic IPv6 address request info.
  */
 void
 in6_prepare_ifra(struct in6_aliasreq *ifra, const struct in6_addr *addr,
     const struct in6_addr *mask)
 {
 
 	memset(ifra, 0, sizeof(struct in6_aliasreq));
 
 	ifra->ifra_addr.sin6_family = AF_INET6;
 	ifra->ifra_addr.sin6_len = sizeof(struct sockaddr_in6);
 	if (addr != NULL)
 		ifra->ifra_addr.sin6_addr = *addr;
 
 	ifra->ifra_prefixmask.sin6_family = AF_INET6;
 	ifra->ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6);
 	if (mask != NULL)
 		ifra->ifra_prefixmask.sin6_addr = *mask;
 }
 
 /*
  * Validate an IPv6 address request before it is applied.
  *
  * ifp   - interface the address belongs to
  * ifra  - the request; ifra_dstaddr is rewritten in place with the
  *         scope zone embedded (see below)
  * ia    - existing address being updated, or NULL for a new address
  * flags - not examined by this function
  *
  * Returns 0 when the request is acceptable, EAFNOSUPPORT for a bad
  * destination family on a p2p interface, and EINVAL for every other
  * rejected request.
  *
  * NOTE(review): a zero valid lifetime with ia == NULL returns 0
  * ("nothing to do"), but in6_update_ifa() treats 0 as "validated" and
  * goes on to allocate the address -- confirm this is intended.
  */
 static int
 in6_validate_ifra(struct ifnet *ifp, struct in6_aliasreq *ifra,
     struct in6_ifaddr *ia, int flags)
 {
 	int plen = -1;
 	struct sockaddr_in6 dst6;
 	struct in6_addrlifetime *lt;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	/* Validate parameters */
 	if (ifp == NULL || ifra == NULL) /* this maybe redundant */
 		return (EINVAL);
 
 	/*
 	 * The destination address for a p2p link must have a family
 	 * of AF_UNSPEC or AF_INET6.
 	 */
 	if ((ifp->if_flags & IFF_POINTOPOINT) != 0 &&
 	    ifra->ifra_dstaddr.sin6_family != AF_INET6 &&
 	    ifra->ifra_dstaddr.sin6_family != AF_UNSPEC)
 		return (EAFNOSUPPORT);
 
 	/*
 	 * Validate address
 	 */
 	if (ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6) ||
 	    ifra->ifra_addr.sin6_family != AF_INET6)
 		return (EINVAL);
 
 	/*
 	 * validate ifra_prefixmask.  don't check sin6_family, netmask
 	 * does not carry fields other than sin6_len.
 	 */
 	if (ifra->ifra_prefixmask.sin6_len > sizeof(struct sockaddr_in6))
 		return (EINVAL);
 	/*
 	 * Because the IPv6 address architecture is classless, we require
 	 * users to specify a (non 0) prefix length (mask) for a new address.
 	 * We also require the prefix (when specified) mask is valid, and thus
 	 * reject a non-consecutive mask.
 	 */
 	if (ia == NULL && ifra->ifra_prefixmask.sin6_len == 0)
 		return (EINVAL);
 	if (ifra->ifra_prefixmask.sin6_len != 0) {
 		/* The second argument bounds the scan to sin6_len bytes. */
 		plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr,
 		    (u_char *)&ifra->ifra_prefixmask +
 		    ifra->ifra_prefixmask.sin6_len);
 		if (plen <= 0)
 			return (EINVAL);
 	} else {
 		/*
 		 * In this case, ia must not be NULL.  We just use its prefix
 		 * length.
 		 */
 		plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL);
 	}
 	/*
 	 * If the destination address on a p2p interface is specified,
 	 * and the address is a scoped one, validate/set the scope
 	 * zone identifier.
 	 */
 	dst6 = ifra->ifra_dstaddr;
 	if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) != 0 &&
 	    (dst6.sin6_family == AF_INET6)) {
 		struct in6_addr in6_tmp;
 		u_int32_t zoneid;
 
 		/* Work on a copy; only the resulting zone id is wanted. */
 		in6_tmp = dst6.sin6_addr;
 		if (in6_setscope(&in6_tmp, ifp, &zoneid))
 			return (EINVAL); /* XXX: should be impossible */
 
 		if (dst6.sin6_scope_id != 0) {
 			if (dst6.sin6_scope_id != zoneid)
 				return (EINVAL);
 		} else		/* user omit to specify the ID. */
 			dst6.sin6_scope_id = zoneid;
 
 		/* convert into the internal form */
 		if (sa6_embedscope(&dst6, 0))
 			return (EINVAL); /* XXX: should be impossible */
 	}
 	/* Modify original ifra_dstaddr to reflect changes */
 	ifra->ifra_dstaddr = dst6;
 
 	/*
 	 * The destination address can be specified only for a p2p or a
 	 * loopback interface.  If specified, the corresponding prefix length
 	 * must be 128.
 	 */
 	if (ifra->ifra_dstaddr.sin6_family == AF_INET6) {
 		if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) == 0) {
 			/* XXX: noisy message */
 			nd6log((LOG_INFO, "in6_update_ifa: a destination can "
 			    "be specified for a p2p or a loopback IF only\n"));
 			return (EINVAL);
 		}
 		if (plen != 128) {
 			nd6log((LOG_INFO, "in6_update_ifa: prefixlen should "
 			    "be 128 when dstaddr is specified\n"));
 			return (EINVAL);
 		}
 	}
 	/* lifetime consistency check */
 	lt = &ifra->ifra_lifetime;
 	if (lt->ia6t_pltime > lt->ia6t_vltime)
 		return (EINVAL);
 	if (lt->ia6t_vltime == 0) {
 		/*
 		 * the following log might be noisy, but this is a typical
 		 * configuration mistake or a tool's bug.
 		 */
 		nd6log((LOG_INFO,
 		    "in6_update_ifa: valid lifetime is 0 for %s\n",
 		    ip6_sprintf(ip6buf, &ifra->ifra_addr.sin6_addr)));
 
 		if (ia == NULL)
 			return (0); /* there's nothing to do */
 	}
 
 	/* Check prefix mask */
 	if (ia != NULL && ifra->ifra_prefixmask.sin6_len != 0) {
 		/*
 		 * We prohibit changing the prefix length of an existing
 		 * address, because
 		 * + such an operation should be rare in IPv6, and
 		 * + the operation would confuse prefix management.
 		 */
 		if (ia->ia_prefixmask.sin6_len != 0 &&
 		    in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL) != plen) {
 			nd6log((LOG_INFO, "in6_validate_ifa: the prefix length "
 			    "of an existing %s address should not be changed\n",
 			    ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr)));
 
 			return (EINVAL);
 		}
 	}
 
 	return (0);
 }
 
 
 /*
  * Allocate a new ifaddr and link it into chains.
  *
  * The new in6_ifaddr is initialized from ifra (address and, when given,
  * prefix mask), stamped with its creation time, and inserted into the
  * per-interface address list, the global IPv6 address list and the
  * address hash.  One reference is taken for the interface list and one
  * for the global list, in addition to whatever ifa_alloc() hands back
  * (presumably the caller's own reference -- confirm against ifa_alloc()).
  *
  * Returns the new address, or NULL when the allocation fails.
  * The flags argument is not used here.
  */
 static struct in6_ifaddr *
 in6_alloc_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, int flags)
 {
 	struct in6_ifaddr *ia;
 
 	/*
 	 * When in6_alloc_ifa() is called in a process of a received
 	 * RA, it is called under an interrupt context.  So, we should
 	 * call malloc with M_NOWAIT.
 	 */
 	ia = (struct in6_ifaddr *)ifa_alloc(sizeof(*ia), M_NOWAIT);
 	if (ia == NULL)
 		return (NULL);
 	LIST_INIT(&ia->ia6_memberships);
 	/* Initialize the address and masks, and put time stamp */
 	ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr;
 	ia->ia_addr.sin6_family = AF_INET6;
 	ia->ia_addr.sin6_len = sizeof(ia->ia_addr);
 	/* XXX: Can we assign ,sin6_addr and skip the rest? */
 	/* The struct copy below overwrites the family/len set just above. */
 	ia->ia_addr = ifra->ifra_addr;
 	ia->ia6_createtime = time_uptime;
 	if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) != 0) {
 		/*
 		 * Some functions expect that ifa_dstaddr is not
 		 * NULL for p2p interfaces.
 		 */
 		ia->ia_ifa.ifa_dstaddr =
 		    (struct sockaddr *)&ia->ia_dstaddr;
 	} else {
 		ia->ia_ifa.ifa_dstaddr = NULL;
 	}
 
 	/* set prefix mask if any */
 	ia->ia_ifa.ifa_netmask = (struct sockaddr *)&ia->ia_prefixmask;
 	if (ifra->ifra_prefixmask.sin6_len != 0) {
 		ia->ia_prefixmask.sin6_family = AF_INET6;
 		ia->ia_prefixmask.sin6_len = ifra->ifra_prefixmask.sin6_len;
 		ia->ia_prefixmask.sin6_addr = ifra->ifra_prefixmask.sin6_addr;
 	}
 
 	ia->ia_ifp = ifp;
 	/* Link into the per-interface list, holding a reference for it. */
 	ifa_ref(&ia->ia_ifa);			/* if_addrhead */
 	IF_ADDR_WLOCK(ifp);
 	TAILQ_INSERT_TAIL(&ifp->if_addrhead, &ia->ia_ifa, ifa_link);
 	IF_ADDR_WUNLOCK(ifp);
 
 	/* Link into the global list and hash, holding a second reference. */
 	ifa_ref(&ia->ia_ifa);			/* in6_ifaddrhead */
 	IN6_IFADDR_WLOCK();
 	TAILQ_INSERT_TAIL(&V_in6_ifaddrhead, ia, ia_link);
 	LIST_INSERT_HEAD(IN6ADDR_HASH(&ia->ia_addr.sin6_addr), ia, ia6_hash);
 	IN6_IFADDR_WUNLOCK();
 
 	return (ia);
 }
 
 /*
  * Update/configure interface address parameters:
  *
  * 1) Update lifetime
  * 2) Update interface metric and flags
  * 3) Notify other subsystems
  *
  * hostIsNew is non-zero when ia was just allocated for this request.
  * The flags argument is not examined here.
  * Returns the result of in6_notify_ifa().
  */
 static int
 in6_update_ifa_internal(struct ifnet *ifp, struct in6_aliasreq *ifra,
     struct in6_ifaddr *ia, int hostIsNew, int flags)
 {
 	int error;
 
 	/* update timestamp */
 	ia->ia6_updatetime = time_uptime;
 
 	/*
 	 * Set lifetimes.  We do not refer to ia6t_expire and ia6t_preferred
 	 * to see if the address is deprecated or invalidated, but initialize
 	 * these members for applications.
 	 * An infinite lifetime is represented by an expiry time of 0.
 	 */
 	ia->ia6_lifetime = ifra->ifra_lifetime;
 	if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
 		ia->ia6_lifetime.ia6t_expire =
 		    time_uptime + ia->ia6_lifetime.ia6t_vltime;
 	} else
 		ia->ia6_lifetime.ia6t_expire = 0;
 	if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
 		ia->ia6_lifetime.ia6t_preferred =
 		    time_uptime + ia->ia6_lifetime.ia6t_pltime;
 	} else
 		ia->ia6_lifetime.ia6t_preferred = 0;
 
 	/*
 	 * backward compatibility - if IN6_IFF_DEPRECATED is set from the
 	 * userland, make it deprecated.
 	 * (Preferred lifetime becomes 0, i.e. it ends right now.)
 	 */
 	if ((ifra->ifra_flags & IN6_IFF_DEPRECATED) != 0) {
 		ia->ia6_lifetime.ia6t_pltime = 0;
 		ia->ia6_lifetime.ia6t_preferred = time_uptime;
 	}
 
 	/*
 	 * configure address flags.
 	 */
 	ia->ia6_flags = ifra->ifra_flags;
 
 	/*
 	 * Make the address tentative before joining multicast addresses,
 	 * so that corresponding MLD responses would not have a tentative
 	 * source address.
 	 */
 	/* A stale "duplicated" mark from userland is never carried over. */
 	ia->ia6_flags &= ~IN6_IFF_DUPLICATED;	/* safety */
 
 	/*
 	 * DAD should be performed for an new address or addresses on
 	 * an interface with ND6_IFF_IFDISABLED.
 	 */
 	if (in6if_do_dad(ifp) &&
 	    (hostIsNew || (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)))
 		ia->ia6_flags |= IN6_IFF_TENTATIVE;
 
 	/* notify other subsystems */
 	error = in6_notify_ifa(ifp, ia, ifra, hostIsNew);
 
 	return (error);
 }
 
 /*
  * Do link-level ifa job:
  * 1) Add lle entry for added address
  * 2) Notifies routing socket users about new address
  * 3) join appropriate multicast group
  * 4) start DAD if enabled
  *
  * Called by in6_update_ifa() for newly created addresses only.
  * On every path, success or failure, one reference on ia is released
  * before returning (presumably the one obtained at allocation time --
  * confirm against ifa_alloc()).  On failure the address is additionally
  * purged via in6_purgeaddr().
  *
  * Returns 0 on success, or the error from adding the lle entry or from
  * joining the multicast groups.
  */
 static int
 in6_broadcast_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra,
     struct in6_ifaddr *ia, int flags)
 {
 	struct in6_multi *in6m_sol;
 	int error = 0;
 
 	/* Add local address to lltable, if necessary (ex. on p2p link). */
 	if ((error = nd6_add_ifa_lle(ia)) != 0) {
 		in6_purgeaddr(&ia->ia_ifa);
 		ifa_free(&ia->ia_ifa);
 		return (error);
 	}
 
 	/* Join necessary multicast groups. */
 	in6m_sol = NULL;
 	if ((ifp->if_flags & IFF_MULTICAST) != 0) {
 		error = in6_update_ifa_join_mc(ifp, ifra, ia, flags, &in6m_sol);
 		if (error != 0) {
 			/* Purging also leaves any groups already joined. */
 			in6_purgeaddr(&ia->ia_ifa);
 			ifa_free(&ia->ia_ifa);
 			return (error);
 		}
 	}
 
 	/* Perform DAD, if the address is TENTATIVE. */
 	if ((ia->ia6_flags & IN6_IFF_TENTATIVE)) {
 		int delay, mindelay, maxdelay;
 
 		delay = 0;
 		if ((flags & IN6_IFAUPDATE_DADDELAY)) {
 			/*
 			 * We need to impose a delay before sending an NS
 			 * for DAD.  Check if we also needed a delay for the
 			 * corresponding MLD message.  If we did, the delay
 			 * should be larger than the MLD delay (this could be
 			 * relaxed a bit, but this simple logic is at least
 			 * safe).
 			 * XXX: Break data hiding guidelines and look at
 			 * state for the solicited multicast group.
 			 */
 			mindelay = 0;
 			if (in6m_sol != NULL &&
 			    in6m_sol->in6m_state == MLD_REPORTING_MEMBER) {
 				mindelay = in6m_sol->in6m_timer;
 			}
 			maxdelay = MAX_RTR_SOLICITATION_DELAY * hz;
 			/* Guard the modulo below against a zero divisor. */
 			if (maxdelay - mindelay == 0)
 				delay = 0;
 			else {
 				delay =
 				    (arc4random() % (maxdelay - mindelay)) +
 				    mindelay;
 			}
 		}
 		nd6_dad_start((struct ifaddr *)ia, delay);
 	}
 
 	/* Tell routing socket listeners about the new address. */
 	in6_newaddrmsg(ia, RTM_ADD);
 	ifa_free(&ia->ia_ifa);
 	return (error);
 }
 
 /*
  * Tear down an IPv6 interface address.
  *
  * In order: detach CARP (if attached), remove the loopback route to the
  * address, stop DAD, leave every multicast group recorded on the
  * address, delete the host route installed for a /128 address, announce
  * the removal on the routing socket, and finally unlink the address from
  * the interface and global lists via in6_unlink_ifa().
  *
  * Route removal failures are logged (or silently tolerated for the
  * loopback route) and do not stop the purge.
  */
 void
 in6_purgeaddr(struct ifaddr *ifa)
 {
 	struct ifnet *ifp = ifa->ifa_ifp;
 	struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa;
 	struct in6_multi_mship *imm;
 	int plen, error;
 
 	if (ifa->ifa_carp)
 		(*carp_detach_p)(ifa);
 
 	/*
 	 * Remove the loopback route to the interface address.
 	 * The check for the current setting of "nd6_useloopback"
 	 * is not needed.
 	 */
 	if (ia->ia_flags & IFA_RTSELF) {
 		error = ifa_del_loopback_route((struct ifaddr *)ia,
 		    (struct sockaddr *)&ia->ia_addr);
 		if (error == 0)
 			ia->ia_flags &= ~IFA_RTSELF;
 	}
 
 	/* stop DAD processing */
 	nd6_dad_stop(ifa);
 
 	/* Leave multicast groups. */
 	while ((imm = LIST_FIRST(&ia->ia6_memberships)) != NULL) {
 		LIST_REMOVE(imm, i6mm_chain);
 		in6_leavegroup(imm);
 	}
 	plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */
 	if ((ia->ia_flags & IFA_ROUTE) && plen == 128) {
 		/*
 		 * The conditional must be parenthesized: "|" binds tighter
 		 * than "?:", so without the parentheses the whole
 		 * "ia_flags | (family == AF_INET6)" expression became the
 		 * condition and rtinit() was always handed bare RTF_HOST.
 		 */
 		error = rtinit(&(ia->ia_ifa), RTM_DELETE, ia->ia_flags |
 		    (ia->ia_dstaddr.sin6_family == AF_INET6 ? RTF_HOST : 0));
 		if (error != 0)
 			log(LOG_INFO, "%s: err=%d, destination address delete "
 			    "failed\n", __func__, error);
 		/* The flag is cleared even when the delete failed. */
 		ia->ia_flags &= ~IFA_ROUTE;
 	}
 
 	in6_newaddrmsg(ia, RTM_DELETE);
 	in6_unlink_ifa(ia, ifp);
 }
 
 /*
  * Unlink an IPv6 address from the per-interface list, the global IPv6
  * address list and the address hash, dropping the reference held by
  * each list.  Also drops the address's reference on its prefix (if any)
  * and removes the related link-layer entries.
  *
  * The reference held for the global list is released last, so that ia
  * stays valid for the remainder of the cleanup.
  */
 static void
 in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp)
 {
 	char ip6buf[INET6_ADDRSTRLEN];
 	int remove_lle;
 
 	IF_ADDR_WLOCK(ifp);
 	TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link);
 	IF_ADDR_WUNLOCK(ifp);
 	ifa_free(&ia->ia_ifa);			/* if_addrhead */
 
 	/*
 	 * Defer the release of what might be the last reference to the
 	 * in6_ifaddr so that it can't be freed before the remainder of the
 	 * cleanup.
 	 */
 	IN6_IFADDR_WLOCK();
 	TAILQ_REMOVE(&V_in6_ifaddrhead, ia, ia_link);
 	LIST_REMOVE(ia, ia6_hash);
 	IN6_IFADDR_WUNLOCK();
 
 	/*
 	 * Release the reference to the base prefix.  There should be a
 	 * positive reference.
 	 */
 	remove_lle = 0;
 	if (ia->ia6_ndpr == NULL) {
 		nd6log((LOG_NOTICE,
 		    "in6_unlink_ifa: autoconf'ed address "
 		    "%s has no prefix\n", ip6_sprintf(ip6buf, IA6_IN6(ia))));
 	} else {
 		ia->ia6_ndpr->ndpr_refcnt--;
 		/* Do not delete lles within prefix if refcont != 0 */
 		if (ia->ia6_ndpr->ndpr_refcnt == 0)
 			remove_lle = 1;
 		ia->ia6_ndpr = NULL;
 	}
 
 	/* remove_lle is set only when this was the prefix's last address. */
 	nd6_rem_ifa_lle(ia, remove_lle);
 
 	/*
 	 * Also, if the address being removed is autoconf'ed, call
 	 * pfxlist_onlink_check() since the release might affect the status of
 	 * other (detached) addresses.
 	 */
 	if ((ia->ia6_flags & IN6_IFF_AUTOCONF)) {
 		pfxlist_onlink_check();
 	}
 	/* Finally drop the reference that the global list was holding. */
 	ifa_free(&ia->ia_ifa);			/* in6_ifaddrhead */
 }
 
 /*
  * Notifies other subsystems about address change/arrival:
  * 1) Notifies device handler on the first IPv6 address assignment
  * 2) Handle routing table changes for P2P links and route
  * 3) Handle routing table changes for address host route
  *
  * Returns 0 on success, the driver's SIOCSIFADDR error, the error from
  * installing the destination route, or the error from adding the
  * loopback route (the last assignment to error wins).
  */
 static int
 in6_notify_ifa(struct ifnet *ifp, struct in6_ifaddr *ia,
     struct in6_aliasreq *ifra, int hostIsNew)
 {
 	int	error = 0, plen, ifacount = 0;
 	struct ifaddr *ifa;
 	struct sockaddr_in6 *pdst;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	/*
 	 * Give the interface a chance to initialize
 	 * if this is its first address,
 	 */
 	if (hostIsNew != 0) {
 		/* Count the AF_INET6 addresses on the interface. */
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET6)
 				continue;
 			ifacount++;
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 
 	/*
 	 * ifacount stays 0 when hostIsNew is 0, so the driver is also
 	 * called on every update of an existing address.
 	 */
 	if (ifacount <= 1 && ifp->if_ioctl) {
 		error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia);
 		if (error)
 			return (error);
 	}
 
 	/*
 	 * If a new destination address is specified, scrub the old one and
 	 * install the new destination.  Note that the interface must be
 	 * p2p or loopback.
 	 */
 	pdst = &ifra->ifra_dstaddr;
 	if (pdst->sin6_family == AF_INET6 &&
 	    !IN6_ARE_ADDR_EQUAL(&pdst->sin6_addr, &ia->ia_dstaddr.sin6_addr)) {
 		if ((ia->ia_flags & IFA_ROUTE) != 0 &&
 		    (rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST) != 0)) {
 			nd6log((LOG_ERR, "in6_update_ifa_internal: failed to "
 			    "remove a route to the old destination: %s\n",
 			    ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr)));
 			/* proceed anyway... */
 		} else
 			ia->ia_flags &= ~IFA_ROUTE;
 		ia->ia_dstaddr = *pdst;
 	}
 
 	/*
 	 * If a new destination address is specified for a point-to-point
 	 * interface, install a route to the destination as an interface
 	 * direct route.
 	 * XXX: the logic below rejects assigning multiple addresses on a p2p
 	 * interface that share the same destination.
 	 */
 	plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */
 	if (!(ia->ia_flags & IFA_ROUTE) && plen == 128 &&
 	    ia->ia_dstaddr.sin6_family == AF_INET6) {
 		int rtflags = RTF_UP | RTF_HOST;
 		/*
 		 * Handle the case for ::1 .
 		 */
 		if (ifp->if_flags & IFF_LOOPBACK)
 			ia->ia_flags |= IFA_RTSELF;
 		error = rtinit(&ia->ia_ifa, RTM_ADD, ia->ia_flags | rtflags);
 		if (error)
 			return (error);
 		ia->ia_flags |= IFA_ROUTE;
 	}
 
 	/*
 	 * add a loopback route to self if not exists
 	 */
 	if (!(ia->ia_flags & IFA_RTSELF) && V_nd6_useloopback) {
 		error = ifa_add_loopback_route((struct ifaddr *)ia,
 		    (struct sockaddr *)&ia->ia_addr);
 		if (error == 0)
 			ia->ia_flags |= IFA_RTSELF;
 	}
 
 	return (error);
 }
 
 /*
  * Look up a link-local IPv6 address configured on ifp.
  * Addresses carrying any of the bits in ignoreflags are passed over.
  * The match, if any, is returned with a reference held; NULL is
  * returned when the interface has no suitable address.
  */
 struct in6_ifaddr *
 in6ifa_ifpforlinklocal(struct ifnet *ifp, int ignoreflags)
 {
 	struct ifaddr *ifa;
 	struct in6_ifaddr *cand;
 
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family == AF_INET6 &&
 		    IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa))) {
 			cand = (struct in6_ifaddr *)ifa;
 			if ((cand->ia6_flags & ignoreflags) == 0) {
 				ifa_ref(ifa);
 				break;
 			}
 		}
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	/* ifa is NULL here when the list was exhausted without a match. */
 	return ((struct in6_ifaddr *)ifa);
 }
 
 
 /*
  * Look up the configured IPv6 address equal to addr in the global
  * address hash.  A non-zero zoneid additionally requires the entry's
  * scope id to match.
  * The match, if any, is returned with a reference held; otherwise NULL.
  */
 struct in6_ifaddr *
 in6ifa_ifwithaddr(const struct in6_addr *addr, uint32_t zoneid)
 {
 	struct rm_priotracker in6_ifa_tracker;
 	struct in6_ifaddr *ia;
 
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	LIST_FOREACH(ia, IN6ADDR_HASH(addr), ia6_hash) {
 		if (!IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), addr))
 			continue;
 		/* Same address in another zone: keep looking. */
 		if (zoneid != 0 && zoneid != ia->ia_addr.sin6_scope_id)
 			continue;
 		ifa_ref(&ia->ia_ifa);
 		break;
 	}
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 
 	return (ia);
 }
 
 /*
  * Look up the IPv6 address equal to addr among the addresses configured
  * on ifp.
  * The match, if any, is returned with a reference held; otherwise NULL.
  */
 struct in6_ifaddr *
 in6ifa_ifpwithaddr(struct ifnet *ifp, const struct in6_addr *addr)
 {
 	struct ifaddr *ifa;
 
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family == AF_INET6 &&
 		    IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa))) {
 			ifa_ref(ifa);
 			break;
 		}
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	/* ifa is NULL here when nothing on the interface matched. */
 	return ((struct in6_ifaddr *)ifa);
 }
 
 /*
  * Return the first address on ifp whose scope is link-local (or that is
  * an interface-local / node-local multicast address), or NULL if there
  * is none or the interface has ND6_IFF_IFDISABLED set.
  * Unlike the other lookup routines, no reference is taken on the
  * returned address.
  */
 struct in6_ifaddr *
 in6ifa_llaonifp(struct ifnet *ifp)
 {
 	struct in6_addr *in6;
 	struct ifaddr *ifa;
 
 	if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) != 0)
 		return (NULL);
 
 	if_addr_rlock(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family == AF_INET6) {
 			in6 = &((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr;
 			if (IN6_IS_SCOPE_LINKLOCAL(in6) ||
 			    IN6_IS_ADDR_MC_INTFACELOCAL(in6) ||
 			    IN6_IS_ADDR_MC_NODELOCAL(in6))
 				break;
 		}
 	}
 	if_addr_runlock(ifp);
 
 	/* ifa is NULL here when the scan found nothing. */
 	return ((struct in6_ifaddr *)ifa);
 }
 
 /*
  * Convert IP6 address to printable (loggable) representation. Caller
  * has to make sure that ip6buf is at least INET6_ADDRSTRLEN long.
  *
  * The address is printed as eight colon-separated hexadecimal groups
  * without leading zeros.  The longest run of two or more all-zero groups
  * (the first such run on a tie) is abbreviated as "::".
  *
  * Returns ip6buf.
  */
 static char digits[] = "0123456789abcdef";
 char *
 ip6_sprintf(char *ip6buf, const struct in6_addr *addr)
 {
 	const u_int8_t *bytes = (const u_int8_t *)addr;
 	char *cp = ip6buf;
 	int i, j, run = 0, runstart = 0, maxrun = 0, maxstart = -1;
 	int nib[4];
 
 	/*
 	 * Locate the longest run of zero groups.  The run counter is reset
 	 * on every non-zero group so that separate runs are never merged,
 	 * and only bytes inside the 16-byte address are examined.
 	 */
 	for (i = 0; i < 8; i++) {
 		if (bytes[2 * i] == 0 && bytes[2 * i + 1] == 0) {
 			if (run == 0)
 				runstart = i;
 			run++;
 			if (run > maxrun) {
 				maxrun = run;
 				maxstart = runstart;
 			}
 		} else
 			run = 0;
 	}
 	/* A lone zero group is printed as "0", not abbreviated. */
 	if (maxrun < 2)
 		maxstart = -1;
 
 	for (i = 0; i < 8; i++) {
 		if (i == maxstart) {
 			/*
 			 * Every printed group is followed by ':', so one more
 			 * ':' yields "::"; at the very start two are needed.
 			 */
 			if (i == 0)
 				*cp++ = ':';
 			*cp++ = ':';
 			i += maxrun - 1;
 			/*
 			 * When the run reaches the end, add a colon for the
 			 * final trim below to consume.
 			 */
 			if (i == 7)
 				*cp++ = ':';
 			continue;
 		}
 		nib[0] = bytes[2 * i] >> 4;
 		nib[1] = bytes[2 * i] & 0xf;
 		nib[2] = bytes[2 * i + 1] >> 4;
 		nib[3] = bytes[2 * i + 1] & 0xf;
 		/* Skip leading zeros but always keep the last digit. */
 		for (j = 0; j < 3 && nib[j] == 0; j++)
 			;
 		for (; j < 4; j++)
 			*cp++ = digits[nib[j]];
 		*cp++ = ':';
 	}
 	/* Replace the trailing ':' with the terminator. */
 	*--cp = '\0';
 	return (ip6buf);
 }
 
 /*
  * Return 1 if in6 is the loopback address, a link-local address, or
  * falls inside the prefix of any configured IPv6 address; 0 otherwise.
  */
 int
 in6_localaddr(struct in6_addr *in6)
 {
 	struct rm_priotracker in6_ifa_tracker;
 	struct in6_ifaddr *ia;
 	int local;
 
 	if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6))
 		return (1);
 
 	local = 0;
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
 		if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr,
 		    &ia->ia_prefixmask.sin6_addr)) {
 			local = 1;
 			break;
 		}
 	}
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 
 	return (local);
 }
 
 /*
  * Return 1 if the given IPv6 address is configured on one of the local
  * host's interfaces (exact match in the global address hash), else 0.
  */
 int
 in6_localip(struct in6_addr *in6)
 {
 	struct rm_priotracker in6_ifa_tracker;
 	struct in6_ifaddr *ia;
 	int mine;
 
 	mine = 0;
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	LIST_FOREACH(ia, IN6ADDR_HASH(in6), ia6_hash) {
 		if (IN6_ARE_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr)) {
 			mine = 1;
 			break;
 		}
 	}
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 
 	return (mine);
 }
  
 /*
  * Return 1 if an internet address is configured on an interface.
  */
 int
 in6_ifhasaddr(struct ifnet *ifp, struct in6_addr *addr)
 {
 	struct in6_addr in6;
 	struct ifaddr *ifa;
 	struct in6_ifaddr *ia6;
 
 	in6 = *addr;
 	if (in6_clearscope(&in6))
 		return (0);
 	in6_setscope(&in6, ifp, NULL);
 
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		ia6 = (struct in6_ifaddr *)ifa;
 		if (IN6_ARE_ADDR_EQUAL(&ia6->ia_addr.sin6_addr, &in6)) {
 			IF_ADDR_RUNLOCK(ifp);
 			return (1);
 		}
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	return (0);
 }
 
 /*
  * Return 1 if the first hash entry matching sa6's address carries
  * IN6_IFF_DEPRECATED; return 0 if it does not or no entry matches.
  */
 int
 in6_is_addr_deprecated(struct sockaddr_in6 *sa6)
 {
 	struct rm_priotracker in6_ifa_tracker;
 	struct in6_ifaddr *cur;
 	int deprecated = 0;
 
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	LIST_FOREACH(cur, IN6ADDR_HASH(&sa6->sin6_addr), ia6_hash) {
 		if (!IN6_ARE_ADDR_EQUAL(IA6_IN6(cur), &sa6->sin6_addr))
 			continue;
 		/* Only the first matching entry is examined. */
 		if (cur->ia6_flags & IN6_IFF_DEPRECATED)
 			deprecated = 1;
 		break;
 	}
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 
 	return (deprecated);
 }
 
 /*
  * Return the number of leading bits (0..128) that src and dst have in
  * common.  The 16-byte address length is hard-coded.
  */
 int
 in6_matchlen(struct in6_addr *src, struct in6_addr *dst)
 {
 	const u_char *sp = (const u_char *)src;
 	const u_char *dp = (const u_char *)dst;
 	u_char diff;
 	int bits = 0, i;
 
 	for (i = 0; i < 16; i++) {
 		diff = sp[i] ^ dp[i];
 		if (diff == 0) {
 			/* Whole byte identical. */
 			bits += 8;
 			continue;
 		}
 		/* Count the equal high-order bits of the first differing byte. */
 		for (; (diff & 0x80) == 0; diff <<= 1)
 			bits++;
 		break;
 	}
 	return (bits);
 }
 
 /*
  * Return 1 if the first len bits of p1 and p2 are equal, 0 otherwise.
  * A len outside 0..128 is logged and treated as "not equal".
  * XXX: to be scope conscious
  */
 int
 in6_are_prefix_equal(struct in6_addr *p1, struct in6_addr *p2, int len)
 {
 	int whole, rem, shift;
 
 	/* sanity check */
 	if (len < 0 || len > 128) {
 		log(LOG_ERR, "in6_are_prefix_equal: invalid prefix length(%d)\n",
 		    len);
 		return (0);
 	}
 
 	whole = len / 8;	/* full bytes covered by the prefix */
 	rem = len % 8;		/* leftover bits in the following byte */
 
 	if (bcmp(&p1->s6_addr, &p2->s6_addr, whole) != 0)
 		return (0);
 	if (rem == 0)
 		return (1);
 
 	/* Compare only the high-order rem bits of the partial byte. */
 	shift = 8 - rem;
 	return ((p1->s6_addr[whole] >> shift) == (p2->s6_addr[whole] >> shift));
 }
 
 void
 in6_prefixlen2mask(struct in6_addr *maskp, int len)
 {
 	u_char maskarray[8] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};
 	int bytelen, bitlen, i;
 
 	/* sanity check */
 	if (0 > len || len > 128) {
 		log(LOG_ERR, "in6_prefixlen2mask: invalid prefix length(%d)\n",
 		    len);
 		return;
 	}
 
 	bzero(maskp, sizeof(*maskp));
 	bytelen = len / 8;
 	bitlen = len % 8;
 	for (i = 0; i < bytelen; i++)
 		maskp->s6_addr[i] = 0xff;
 	if (bitlen)
 		maskp->s6_addr[bytelen] = maskarray[bitlen - 1];
 }
 
 /*
  * return the best address out of the same scope. if no address was
  * found, return the first valid address from designated IF.
  */
 struct in6_ifaddr *
 in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst)
 {
 	int dst_scope =	in6_addrscope(dst), blen = -1, tlen;
 	struct ifaddr *ifa;
 	struct in6_ifaddr *besta = 0;
 	struct in6_ifaddr *dep[2];	/* last-resort: deprecated */
 
 	dep[0] = dep[1] = NULL;
 
 	/*
 	 * We first look for addresses in the same scope.
 	 * If there is one, return it.
 	 * If two or more, return one which matches the dst longest.
 	 * If none, return one of global addresses assigned other ifs.
 	 */
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST)
 			continue; /* XXX: is there any case to allow anycast? */
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY)
 			continue; /* don't use this interface */
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED)
 			continue;
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) {
 			if (V_ip6_use_deprecated)
 				dep[0] = (struct in6_ifaddr *)ifa;
 			continue;
 		}
 
 		if (dst_scope == in6_addrscope(IFA_IN6(ifa))) {
 			/*
 			 * call in6_matchlen() as few as possible
 			 */
 			if (besta) {
 				if (blen == -1)
 					blen = in6_matchlen(&besta->ia_addr.sin6_addr, dst);
 				tlen = in6_matchlen(IFA_IN6(ifa), dst);
 				if (tlen > blen) {
 					blen = tlen;
 					besta = (struct in6_ifaddr *)ifa;
 				}
 			} else
 				besta = (struct in6_ifaddr *)ifa;
 		}
 	}
 	if (besta) {
 		ifa_ref(&besta->ia_ifa);
 		IF_ADDR_RUNLOCK(ifp);
 		return (besta);
 	}
 
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST)
 			continue; /* XXX: is there any case to allow anycast? */
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY)
 			continue; /* don't use this interface */
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED)
 			continue;
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) {
 			if (V_ip6_use_deprecated)
 				dep[1] = (struct in6_ifaddr *)ifa;
 			continue;
 		}
 
 		if (ifa != NULL)
 			ifa_ref(ifa);
 		IF_ADDR_RUNLOCK(ifp);
 		return (struct in6_ifaddr *)ifa;
 	}
 
 	/* use the last-resort values, that are, deprecated addresses */
 	if (dep[0]) {
 		ifa_ref((struct ifaddr *)dep[0]);
 		IF_ADDR_RUNLOCK(ifp);
 		return dep[0];
 	}
 	if (dep[1]) {
 		ifa_ref((struct ifaddr *)dep[1]);
 		IF_ADDR_RUNLOCK(ifp);
 		return dep[1];
 	}
 
 	IF_ADDR_RUNLOCK(ifp);
 	return NULL;
 }
 
 /*
  * Called when an interface becomes IFF_UP: start DAD, after a random
  * delay, on every IPv6 address of ifp that is flagged tentative, then
  * let in6_ifattach() handle the remaining work.
  */
 void
 in6_if_up(struct ifnet *ifp)
 {
 	struct ifaddr *ifa;
 
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		if ((((struct in6_ifaddr *)ifa)->ia6_flags &
 		    IN6_IFF_TENTATIVE) == 0)
 			continue;
 		/*
 		 * TENTATIVE was probably set by hand earlier, which
 		 * implies that DAD is wanted.  The random delay could
 		 * perhaps be skipped here, but it is applied anyway to
 		 * be safe.
 		 */
 		nd6_dad_start(ifa,
 		    arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz));
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	/* Special cases such as 6to4 are dealt with in in6_ifattach(). */
 	in6_ifattach(ifp, NULL);
 }
 
 /*
  * Decide whether DAD should be performed on ifp.
  * Returns 0 when DAD must not run (loopback interface, or the interface
  * has ND6_IFF_IFDISABLED or ND6_IFF_NO_DAD set), EAGAIN when the
  * interface is not yet both up and running, and 1 when DAD can start.
  */
 int
 in6if_do_dad(struct ifnet *ifp)
 {
 	if (ifp->if_flags & IFF_LOOPBACK)
 		return (0);
 
 	if (ND_IFINFO(ifp)->flags & (ND6_IFF_IFDISABLED | ND6_IFF_NO_DAD))
 		return (0);
 
 	/*
 	 * The DAD routine needs the interface to be up and running.
 	 * Some interfaces are up before they reach the RUNNING state,
 	 * and users may additionally assign addresses before the
 	 * interface is up (or running).  EAGAIN is returned in those
 	 * cases; the caller should mark the address "tentative" rather
 	 * than perform DAD right away.
 	 */
 	if ((ifp->if_flags & IFF_UP) == 0 ||
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 		return (EAGAIN);
 
 	return (1);
 }
 
 /*
  * Find the largest IPv6 link MTU among all non-loopback interfaces and
  * store it in V_in6_maxmtu.  The stored value is left alone when no
  * interface yields a non-zero MTU.
  */
 void
 in6_setmaxmtu(void)
 {
 	unsigned long largest = 0;
 	struct ifnet *ifp;
 
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		/* May run during ifnet initialization, before afdata exists. */
 		if (ifp->if_afdata[AF_INET6] == NULL)
 			continue;
 		if (ifp->if_flags & IFF_LOOPBACK)
 			continue;
 		if (IN6_LINKMTU(ifp) > largest)
 			largest = IN6_LINKMTU(ifp);
 	}
 	IFNET_RUNLOCK_NOSLEEP();
 
 	/* update only when a positive value was found */
 	if (largest != 0)
 		V_in6_maxmtu = largest;
 }
 
 /*
  * Return the interface identifier length, in bits, for the link that
  * ifp is attached to.  The length is specified by the relevant "IPv6
  * over xxx-link" document; the address architecture may also fix it for
  * particular prefixes.  rfc2462bis clarifies that both definitions must
  * agree, and as of August 2004 they do.
  *
  * Every link type handled here uses 64; unknown types also get 64 but
  * are reported with a message.
  */
 int
 in6_if2idlen(struct ifnet *ifp)
 {
 	switch (ifp->if_type) {
 	case IFT_ETHER:		/* RFC2464 */
 	case IFT_PROPVIRTUAL:	/* XXX: no RFC. treat it as ether */
 	case IFT_L2VLAN:	/* ditto */
 	case IFT_IEEE80211:	/* ditto */
 	case IFT_INFINIBAND:
 	case IFT_FDDI:		/* RFC2467 */
 	case IFT_ISO88025:	/* RFC2470 (IPv6 over Token Ring) */
 	case IFT_PPP:		/* RFC2472 */
 	case IFT_ARCNET:	/* RFC2497 */
 	case IFT_FRELAY:	/* RFC2590 */
 	case IFT_IEEE1394:	/* RFC3146 */
 	case IFT_GIF:		/* draft-ietf-v6ops-mech-v2-07 */
 	case IFT_LOOP:		/* XXX: is this really correct? */
 		break;
 	default:
 		/*
 		 * Unknown link type.  Returning an error would be the
 		 * strictly compliant choice, but a link type we merely
 		 * do not know about (or a newly standardized one) will
 		 * very likely use the common constant of 64 as well.
 		 * As a compromise the constant is used, with an explicit
 		 * notice about the "unknown" case.
 		 */
 		printf("in6_if2idlen: unknown link type (%d)\n", ifp->if_type);
 		break;
 	}
 
 	return (64);
 }
 
 #include <sys/sysctl.h>
 
 /*
  * IPv6 link-layer table entry.  It currently holds nothing beyond the
  * generic struct llentry.
  */
 struct in6_llentry {
 	struct llentry		base;	/* generic link-layer entry */
 };
 
 /* Hash size passed to lltable_allocate_htbl() for IPv6 tables. */
 #define	IN6_LLTBL_DEFAULT_HSIZE	32
 /*
  * XOR-fold the bytes of key k together and mask the result with (h) - 1.
  * Both arguments are fully parenthesized so that expression arguments
  * (e.g. "a ^ b") expand with the intended precedence.  Note that k is
  * evaluated more than once, so it must be free of side effects.
  */
 #define	IN6_LLTBL_HASH(k, h) \
 	((((((((k) >> 8) ^ (k)) >> 8) ^ (k)) >> 8) ^ (k)) & ((h) - 1))
 
 /*
  * Do actual deallocation of @lle.
  * Called by LLE_FREE_LOCKED when number of references
  * drops to zero.
  *
  * Installed as the lle_free callback by in6_lltable_new().  The entry
  * is write-unlocked here, so it is expected to arrive write-locked.
  */
 static void
 in6_lltable_destroy_lle(struct llentry *lle)
 {
 
 	/* Drop the lock before destroying it, then release the memory. */
 	LLE_WUNLOCK(lle);
 	LLE_LOCK_DESTROY(lle);
 	free(lle, M_LLTABLE);
 }
 
 /*
  * Allocate a zeroed IPv6 link-layer entry for addr6 without sleeping.
  * The entry starts with one reference, its lock and timer callout
  * initialized, and in6_lltable_destroy_lle() as its free routine.
  * Returns NULL if the allocation fails.  flags is not used here.
  */
 static struct llentry *
 in6_lltable_new(const struct in6_addr *addr6, u_int flags)
 {
 	struct in6_llentry *ent;
 	struct llentry *base;
 
 	ent = malloc(sizeof(*ent), M_LLTABLE, M_NOWAIT | M_ZERO);
 	if (ent == NULL) {
 		/* NB: caller generates msg */
 		return (NULL);
 	}
 
 	base = &ent->base;
 	base->r_l3addr.addr6 = *addr6;
 	base->lle_refcnt = 1;
 	base->lle_free = in6_lltable_destroy_lle;
 	LLE_LOCK_INIT(base);
 	callout_init(&base->lle_timer, 1);
 
 	return (base);
 }
 
 /*
  * Prefix-match callback: return 1 if lle falls inside saddr/smask and
  * is eligible under flags, 0 otherwise.
  */
 static int
 in6_lltable_match_prefix(const struct sockaddr *saddr,
     const struct sockaddr *smask, u_int flags, struct llentry *lle)
 {
 	const struct in6_addr *pfx, *pfxmask, *entry_addr;
 	int want_static;
 
 	pfx = &((const struct sockaddr_in6 *)saddr)->sin6_addr;
 	pfxmask = &((const struct sockaddr_in6 *)smask)->sin6_addr;
 	entry_addr = &lle->r_l3addr.addr6;
 	want_static = (flags & LLE_STATIC) != 0;
 
 	if (!IN6_ARE_MASKED_ADDR_EQUAL(entry_addr, pfx, pfxmask))
 		return (0);
 
 	if (lle->la_flags & LLE_IFADDR) {
 		/*
 		 * LLE_IFADDR records match only when both the address and
 		 * the flag agree.  Here saddr is the interface address
 		 * inside the prefix being matched.
 		 */
 		return (IN6_ARE_ADDR_EQUAL(pfx, entry_addr) && want_static);
 	}
 
 	/*
 	 * LLE_STATIC in flags selects dynamic and static entries alike;
 	 * without it only non-static entries match.
 	 */
 	if (want_static)
 		return (1);
 	return ((lle->la_flags & LLE_STATIC) == 0);
 }
 
 /*
  * Free-entry callback: unlink lle from llt if it is still linked, stop
  * its timer (dropping the timer's reference when the stop succeeds) and
  * hand the entry to llentry_free().  The entry must be write-locked.
  */
 static void
 in6_lltable_free_entry(struct lltable *llt, struct llentry *lle)
 {
 
 	LLE_WLOCK_ASSERT(lle);
 	KASSERT(llt != NULL, ("lltable is NULL"));
 
 	/* Unlink entry from table */
 	if (lle->la_flags & LLE_LINKED) {
 		IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp);
 		lltable_unlink_entry(llt, lle);
 	}
 
 	if (callout_stop(&lle->lle_timer) > 0) {
 		LLE_REMREF(lle);
 	}
 
 	llentry_free(lle);
 }
 
 static int
 in6_lltable_rtcheck(struct ifnet *ifp,
 		    u_int flags,
 		    const struct sockaddr *l3addr)
 {
-	struct rtentry *rt;
+	const struct sockaddr_in6 *sin6;
+	struct nhop6_basic nh6;
+	struct in6_addr dst;
+	uint32_t scopeid;
+	int error;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	KASSERT(l3addr->sa_family == AF_INET6,
 	    ("sin_family %d", l3addr->sa_family));
 
 	/* Our local addresses are always only installed on the default FIB. */
-	/* XXX rtalloc1 should take a const param */
-	rt = in6_rtalloc1(__DECONST(struct sockaddr *, l3addr), 0, 0,
-	    RT_DEFAULT_FIB);
-	if (rt == NULL || (rt->rt_flags & RTF_GATEWAY) || rt->rt_ifp != ifp) {
+
+	sin6 = (const struct sockaddr_in6 *)l3addr;
+	in6_splitscope(&sin6->sin6_addr, &dst, &scopeid);
+	error = fib6_lookup_nh_basic(RT_DEFAULT_FIB, &dst, scopeid, 0, 0, &nh6);
+	if (error != 0 || (nh6.nh_flags & NHF_GATEWAY) || nh6.nh_ifp != ifp) {
 		struct ifaddr *ifa;
 		/*
 		 * Create an ND6 cache for an IPv6 neighbor
 		 * that is not covered by our own prefix.
 		 */
 		ifa = ifaof_ifpforaddr(l3addr, ifp);
 		if (ifa != NULL) {
 			ifa_free(ifa);
-			if (rt != NULL)
-				RTFREE_LOCKED(rt);
 			return 0;
 		}
 		log(LOG_INFO, "IPv6 address: \"%s\" is not on the network\n",
-		    ip6_sprintf(ip6buf, &((const struct sockaddr_in6 *)l3addr)->sin6_addr));
-		if (rt != NULL)
-			RTFREE_LOCKED(rt);
+		    ip6_sprintf(ip6buf, &sin6->sin6_addr));
 		return EINVAL;
 	}
-	RTFREE_LOCKED(rt);
 	return 0;
 }
 
 /*
  * Compute the hash bucket for dst in a table of hsize buckets.
  * Only the last 32-bit word of the address (s6_addr32[3]) is hashed.
  */
 static inline uint32_t
 in6_lltable_hash_dst(const struct in6_addr *dst, uint32_t hsize)
 {
 
 	return (IN6_LLTBL_HASH(dst->s6_addr32[3], hsize));
 }
 
 /*
  * Hash callback installed as llt_hash: hash the IPv6 address stored in
  * lle via in6_lltable_hash_dst().
  */
 static uint32_t
 in6_lltable_hash(const struct llentry *lle, uint32_t hsize)
 {
 
 	return (in6_lltable_hash_dst(&lle->r_l3addr.addr6, hsize));
 }
 
 /*
  * Fill sa, viewed as a sockaddr_in6, with the address of lle.
  * All other fields apart from family and length are zeroed.
  */
 static void
 in6_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa)
 {
 	struct sockaddr_in6 *out = (struct sockaddr_in6 *)sa;
 
 	bzero(out, sizeof(*out));
 	out->sin6_len = sizeof(*out);
 	out->sin6_family = AF_INET6;
 	out->sin6_addr = lle->r_l3addr.addr6;
 }
 
 /*
  * Search dst's hash bucket in llt for a non-deleted entry with that
  * address.  Returns the entry, or NULL when none is found.
  */
 static inline struct llentry *
 in6_lltable_find_dst(struct lltable *llt, const struct in6_addr *dst)
 {
 	struct llentries *bucket;
 	struct llentry *cur;
 
 	bucket = &llt->lle_head[in6_lltable_hash_dst(dst, llt->llt_hsize)];
 	LIST_FOREACH(cur, bucket, lle_next) {
 		if ((cur->la_flags & LLE_DELETED) == 0 &&
 		    IN6_ARE_ADDR_EQUAL(&cur->r_l3addr.addr6, dst))
 			return (cur);
 	}
 
 	return (NULL);
 }
 
 /*
  * Delete-entry callback: mark lle as deleted, notify lle_event
  * listeners with LLENTRY_DELETED and free the entry.
  * llt is not used by this implementation.
  */
 static void
 in6_lltable_delete_entry(struct lltable *llt, struct llentry *lle)
 {
 
 	/* Flag first so lookups skip the entry from here on. */
 	lle->la_flags |= LLE_DELETED;
 	EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED);
 #ifdef DIAGNOSTIC
 	log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle);
 #endif
 	llentry_free(lle);
 }
 
 /*
  * Allocate-entry callback: create a new link-layer entry for l3addr.
  * Unless LLE_IFADDR is requested, the address must first pass
  * in6_lltable_rtcheck().  LLE_IFADDR entries get the interface's own
  * link-layer address and become static; static entries start out in
  * the ND6_LLINFO_REACHABLE state.  Returns NULL on failure.
  */
 static struct llentry *
 in6_lltable_alloc(struct lltable *llt, u_int flags,
 	const struct sockaddr *l3addr)
 {
 	const struct sockaddr_in6 *sin6;
 	struct ifnet *ifp;
 	struct llentry *lle;
 
 	KASSERT(l3addr->sa_family == AF_INET6,
 	    ("sin_family %d", l3addr->sa_family));
 
 	sin6 = (const struct sockaddr_in6 *)l3addr;
 	ifp = llt->llt_ifp;
 
 	/*
 	 * Since this is a resolution, a route covering the address
 	 * should already have been installed; verify that.
 	 */
 	if ((flags & LLE_IFADDR) == 0) {
 		if (in6_lltable_rtcheck(ifp, flags, l3addr) != 0)
 			return (NULL);
 	}
 
 	lle = in6_lltable_new(&sin6->sin6_addr, flags);
 	if (lle == NULL) {
 		log(LOG_INFO, "lla_lookup: new lle malloc failed\n");
 		return (NULL);
 	}
 
 	lle->la_flags = flags;
 	if ((flags & LLE_IFADDR) == LLE_IFADDR) {
 		lltable_set_entry_addr(ifp, lle, IF_LLADDR(ifp));
 		lle->la_flags |= LLE_STATIC;
 	}
 	if (lle->la_flags & LLE_STATIC)
 		lle->ln_state = ND6_LLINFO_REACHABLE;
 
 	return (lle);
 }
 
 /*
  * Lookup callback: find the entry for l3addr in llt and return it
  * locked: write-locked when LLE_EXCLUSIVE is set in flags, read-locked
  * otherwise.  Returns NULL when there is no such entry.
  */
 static struct llentry *
 in6_lltable_lookup(struct lltable *llt, u_int flags,
 	const struct sockaddr *l3addr)
 {
 	const struct sockaddr_in6 *sin6;
 	struct llentry *found;
 
 	IF_AFDATA_LOCK_ASSERT(llt->llt_ifp);
 	KASSERT(l3addr->sa_family == AF_INET6,
 	    ("sin_family %d", l3addr->sa_family));
 
 	sin6 = (const struct sockaddr_in6 *)l3addr;
 	found = in6_lltable_find_dst(llt, &sin6->sin6_addr);
 	if (found != NULL) {
 		if (flags & LLE_EXCLUSIVE) {
 			LLE_WLOCK(found);
 		} else {
 			LLE_RLOCK(found);
 		}
 	}
 
 	return (found);
 }
 
 static int
 in6_lltable_dump_entry(struct lltable *llt, struct llentry *lle,
     struct sysctl_req *wr)
 {
 	struct ifnet *ifp = llt->llt_ifp;
 	/* XXX stack use */
 	struct {
 		struct rt_msghdr	rtm;
 		struct sockaddr_in6	sin6;
 		/*
 		 * ndp.c assumes that sdl is word aligned
 		 */
 #ifdef __LP64__
 		uint32_t		pad;
 #endif
 		struct sockaddr_dl	sdl;
 	} ndpc;
 	struct sockaddr_dl *sdl;
 	int error;
 
 	bzero(&ndpc, sizeof(ndpc));
 			/* skip deleted entries */
 			if ((lle->la_flags & LLE_DELETED) == LLE_DELETED)
 				return (0);
 			/* Skip if jailed and not a valid IP of the prison. */
 			lltable_fill_sa_entry(lle,
 			    (struct sockaddr *)&ndpc.sin6);
 			if (prison_if(wr->td->td_ucred,
 			    (struct sockaddr *)&ndpc.sin6) != 0)
 				return (0);
 			/*
 			 * produce a msg made of:
 			 *  struct rt_msghdr;
 			 *  struct sockaddr_in6 (IPv6)
 			 *  struct sockaddr_dl;
 			 */
 			ndpc.rtm.rtm_msglen = sizeof(ndpc);
 			ndpc.rtm.rtm_version = RTM_VERSION;
 			ndpc.rtm.rtm_type = RTM_GET;
 			ndpc.rtm.rtm_flags = RTF_UP;
 			ndpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY;
 			if (V_deembed_scopeid)
 				sa6_recoverscope(&ndpc.sin6);
 
 			/* publish */
 			if (lle->la_flags & LLE_PUB)
 				ndpc.rtm.rtm_flags |= RTF_ANNOUNCE;
 
 			sdl = &ndpc.sdl;
 			sdl->sdl_family = AF_LINK;
 			sdl->sdl_len = sizeof(*sdl);
 			sdl->sdl_alen = ifp->if_addrlen;
 			sdl->sdl_index = ifp->if_index;
 			sdl->sdl_type = ifp->if_type;
 			bcopy(&lle->ll_addr, LLADDR(sdl), ifp->if_addrlen);
 			ndpc.rtm.rtm_rmx.rmx_expire =
 			    lle->la_flags & LLE_STATIC ? 0 : lle->la_expire;
 			ndpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA);
 			if (lle->la_flags & LLE_STATIC)
 				ndpc.rtm.rtm_flags |= RTF_STATIC;
 			if (lle->la_flags & LLE_IFADDR)
 				ndpc.rtm.rtm_flags |= RTF_PINNED;
 			ndpc.rtm.rtm_index = ifp->if_index;
 			error = SYSCTL_OUT(wr, &ndpc, sizeof(ndpc));
 
 	return (error);
 }
 
 /*
  * Create the IPv6 link-layer table for ifp: allocate a hash table of
  * IN6_LLTBL_DEFAULT_HSIZE buckets, install the in6_lltable_* callbacks
  * and link the table.  Returns the new table.
  */
 static struct lltable *
 in6_lltattach(struct ifnet *ifp)
 {
 	struct lltable *tbl;
 
 	tbl = lltable_allocate_htbl(IN6_LLTBL_DEFAULT_HSIZE);
 	tbl->llt_ifp = ifp;
 	tbl->llt_af = AF_INET6;
 
 	/* Entry lifecycle. */
 	tbl->llt_alloc_entry = in6_lltable_alloc;
 	tbl->llt_delete_entry = in6_lltable_delete_entry;
 	tbl->llt_free_entry = in6_lltable_free_entry;
 
 	/* Lookup, hashing and matching. */
 	tbl->llt_lookup = in6_lltable_lookup;
 	tbl->llt_hash = in6_lltable_hash;
 	tbl->llt_match_prefix = in6_lltable_match_prefix;
 
 	/* Export. */
 	tbl->llt_dump_entry = in6_lltable_dump_entry;
 	tbl->llt_fill_sa_entry = in6_lltable_fill_sa_entry;
 
 	lltable_link(tbl);
 
 	return (tbl);
 }
 
 void *
 in6_domifattach(struct ifnet *ifp)
 {
 	struct in6_ifextra *ext;
 
 	/* There are not IPv6-capable interfaces. */
 	switch (ifp->if_type) {
 	case IFT_PFLOG:
 	case IFT_PFSYNC:
 	case IFT_USB:
 		return (NULL);
 	}
 	ext = (struct in6_ifextra *)malloc(sizeof(*ext), M_IFADDR, M_WAITOK);
 	bzero(ext, sizeof(*ext));
 
 	ext->in6_ifstat = malloc(sizeof(counter_u64_t) *
 	    sizeof(struct in6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK);
 	COUNTER_ARRAY_ALLOC(ext->in6_ifstat,
 	    sizeof(struct in6_ifstat) / sizeof(uint64_t), M_WAITOK);
 
 	ext->icmp6_ifstat = malloc(sizeof(counter_u64_t) *
 	    sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_IFADDR,
 	    M_WAITOK);
 	COUNTER_ARRAY_ALLOC(ext->icmp6_ifstat,
 	    sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_WAITOK);
 
 	ext->nd_ifinfo = nd6_ifattach(ifp);
 	ext->scope6_id = scope6_ifattach(ifp);
 	ext->lltable = in6_lltattach(ifp);
 
 	ext->mld_ifinfo = mld_domifattach(ifp);
 
 	return ext;
 }
 
 /*
  * Return the IPv6 link MTU of ifp, as given by IN6_LINKMTU().
  */
 int
 in6_domifmtu(struct ifnet *ifp)
 {
 
 	return (IN6_LINKMTU(ifp));
 }
 
 /*
  * Tear down the per-interface IPv6 state created by in6_domifattach().
  * aux is the struct in6_ifextra that in6_domifattach() returned.
  */
 void
 in6_domifdetach(struct ifnet *ifp, void *aux)
 {
 	struct in6_ifextra *ext = (struct in6_ifextra *)aux;
 
 	/* Detach the sub-modules first. */
 	mld_domifdetach(ifp);
 	scope6_ifdetach(ext->scope6_id);
 	nd6_ifdetach(ext->nd_ifinfo);
 	lltable_free(ext->lltable);
 	/* Release the counters, then the arrays that held them. */
 	COUNTER_ARRAY_FREE(ext->in6_ifstat,
 	    sizeof(struct in6_ifstat) / sizeof(uint64_t));
 	free(ext->in6_ifstat, M_IFADDR);
 	COUNTER_ARRAY_FREE(ext->icmp6_ifstat,
 	    sizeof(struct icmp6_ifstat) / sizeof(uint64_t));
 	free(ext->icmp6_ifstat, M_IFADDR);
 	/* Finally free the container itself. */
 	free(ext, M_IFADDR);
 }
 
 /*
  * Build an AF_INET sockaddr in *sin from *sin6 by copying the port and
  * the last 32 bits of the IPv6 address.  The source must hold a
  * v4-mapped or v4-compatible address.
  */
 void
 in6_sin6_2_sin(struct sockaddr_in *sin, struct sockaddr_in6 *sin6)
 {
 
 	bzero(sin, sizeof(*sin));
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(struct sockaddr_in);
 	sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3];
 	sin->sin_port = sin6->sin6_port;
 }
 
 /*
  * Build an AF_INET6 sockaddr in *sin6 holding the v4-mapped form of the
  * address in *sin, with the same port.
  */
 void
 in6_sin_2_v4mapsin6(struct sockaddr_in *sin, struct sockaddr_in6 *sin6)
 {
 	/* Address words 0 and 1 stay zero from this bzero(). */
 	bzero(sin6, sizeof(*sin6));
 	sin6->sin6_family = AF_INET6;
 	sin6->sin6_len = sizeof(struct sockaddr_in6);
 	sin6->sin6_port = sin->sin_port;
 	sin6->sin6_addr.s6_addr32[3] = sin->sin_addr.s_addr;
 	sin6->sin6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP;
 }
 
 /* Convert sockaddr_in6 into sockaddr_in. */
 void
 in6_sin6_2_sin_in_sock(struct sockaddr *nam)
 {
 	struct sockaddr_in *sin_p;
 	struct sockaddr_in6 sin6;
 
 	/*
 	 * Save original sockaddr_in6 addr and convert it
 	 * to sockaddr_in.
 	 */
 	sin6 = *(struct sockaddr_in6 *)nam;
 	sin_p = (struct sockaddr_in *)nam;
 	in6_sin6_2_sin(sin_p, &sin6);
 }
 
 /*
  * Replace the sockaddr_in at *nam with a newly allocated sockaddr_in6
  * holding its v4-mapped form.  The old M_SONAME buffer is freed.
  */
 void
 in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam)
 {
 	struct sockaddr_in6 *mapped;
 	struct sockaddr *old;
 
 	mapped = malloc(sizeof(*mapped), M_SONAME, M_WAITOK);
 	old = *nam;
 	in6_sin_2_v4mapsin6((struct sockaddr_in *)old, mapped);
 	free(old, M_SONAME);
 	*nam = (struct sockaddr *)mapped;
 }
Index: head/sys/netinet6/in6_fib.c
===================================================================
--- head/sys/netinet6/in6_fib.c	(revision 292014)
+++ head/sys/netinet6/in6_fib.c	(revision 292015)
@@ -1,264 +1,268 @@
 /*-
  * Copyright (c) 2015
  * 	Alexander V. Chernikov <melifaro@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_route.h"
 #include "opt_mpath.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/kernel.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #ifdef RADIX_MPATH
 #include <net/radix_mpath.h>
 #endif
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_mroute.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/scope6_var.h>
 
 #include <net/if_types.h>
 
 #ifdef INET6
 static void fib6_rte_to_nh_extended(struct rtentry *rte,
     const struct in6_addr *dst, uint32_t flags, struct nhop6_extended *pnh6);
 static void fib6_rte_to_nh_basic(struct rtentry *rte, const struct in6_addr *dst,
     uint32_t flags, struct nhop6_basic *pnh6);
 static struct ifnet *fib6_get_ifaifp(struct rtentry *rte);
 #define RNTORT(p)	((struct rtentry *)(p))
 
 /*
  * Return the "real" interface for @rte.
  * For routes whose rt_ifp is not a loopback interface this is rt_ifp.
  * For loopback routes with an AF_LINK gateway, the interface is looked
  * up from the index stored in that gateway sockaddr_dl.
  */
 static struct ifnet *
 fib6_get_ifaifp(struct rtentry *rte)
 {
 	struct ifnet *ifp = rte->rt_ifp;
 	struct sockaddr_dl *sdl;
 
 	if ((ifp->if_flags & IFF_LOOPBACK) == 0)
 		return (ifp);
 	if (rte->rt_gateway->sa_family != AF_LINK)
 		return (ifp);
 
 	sdl = (struct sockaddr_dl *)rte->rt_gateway;
 	return (ifnet_byindex(sdl->sdl_index));
 }
 
 /*
  * Fill the basic nexthop structure @pnh6 from route @rte.
  * nh_addr is the scope-cleared gateway address for RTF_GATEWAY routes
  * and @dst otherwise.  With NHR_IFAIF in @flags, nh_ifp is the "real"
  * interface from fib6_get_ifaifp() instead of rt_ifp.  Routes whose key
  * is the unspecified address are marked NHF_DEFAULT.
  */
 static void
 fib6_rte_to_nh_basic(struct rtentry *rte, const struct in6_addr *dst,
     uint32_t flags, struct nhop6_basic *pnh6)
 {
 	struct sockaddr_in6 *gw6, *key6;
 
 	/* Start from an all-zero nexthop. */
 	memset(pnh6, 0, sizeof(*pnh6));
 
 	pnh6->nh_ifp = ((flags & NHR_IFAIF) != 0) ?
 	    fib6_get_ifaifp(rte) : rte->rt_ifp;
 	pnh6->nh_mtu = min(rte->rt_mtu, IN6_LINKMTU(rte->rt_ifp));
 
 	if ((rte->rt_flags & RTF_GATEWAY) == 0) {
 		pnh6->nh_addr = *dst;
 	} else {
 		gw6 = (struct sockaddr_in6 *)rte->rt_gateway;
 		pnh6->nh_addr = gw6->sin6_addr;
 		in6_clearscope(&pnh6->nh_addr);
 	}
 
 	/* Set flags */
 	pnh6->nh_flags = fib_rte_to_nh_flags(rte->rt_flags);
 	key6 = (struct sockaddr_in6 *)rt_key(rte);
 	if (IN6_IS_ADDR_UNSPECIFIED(&key6->sin6_addr))
 		pnh6->nh_flags |= NHF_DEFAULT;
 }
 
 /*
  * Fill the extended nexthop structure @pnh6 from route @rte.
  * nh_addr is the scope-cleared gateway address for RTF_GATEWAY routes
  * and @dst otherwise.  With NHR_IFAIF in @flags, nh_ifp is the "real"
  * interface from fib6_get_ifaifp() instead of rt_ifp.  Routes whose key
  * is the unspecified address are marked NHF_DEFAULT.
  */
 static void
 fib6_rte_to_nh_extended(struct rtentry *rte, const struct in6_addr *dst,
     uint32_t flags, struct nhop6_extended *pnh6)
 {
 	struct sockaddr_in6 *gw6, *key6;
 
 	/* Start from an all-zero nexthop. */
 	memset(pnh6, 0, sizeof(*pnh6));
 
 	pnh6->nh_ifp = ((flags & NHR_IFAIF) != 0) ?
 	    fib6_get_ifaifp(rte) : rte->rt_ifp;
 	pnh6->nh_mtu = min(rte->rt_mtu, IN6_LINKMTU(rte->rt_ifp));
 
 	if ((rte->rt_flags & RTF_GATEWAY) == 0) {
 		pnh6->nh_addr = *dst;
 	} else {
 		gw6 = (struct sockaddr_in6 *)rte->rt_gateway;
 		pnh6->nh_addr = gw6->sin6_addr;
 		in6_clearscope(&pnh6->nh_addr);
 	}
 
 	/* Set flags */
 	pnh6->nh_flags = fib_rte_to_nh_flags(rte->rt_flags);
 	key6 = (struct sockaddr_in6 *)rt_key(rte);
 	if (IN6_IS_ADDR_UNSPECIFIED(&key6->sin6_addr))
 		pnh6->nh_flags |= NHF_DEFAULT;
 }
 
 /*
  * Performs IPv6 route table lookup on @dst. Returns 0 on success.
  * Stores basic nexthop info into provided @pnh6 structure.
  * Note that
  * - nh_ifp represents logical transmit interface (rt_ifp) by default
  * - nh_ifp represents "address" interface if NHR_IFAIF flag is passed
  * - mtu from logical transmit interface will be returned.
  * - nh_ifp cannot be safely dereferenced
  * - nh_ifp represents rt_ifp (e.g. if looking up address on
  *   interface "ix0" pointer to "ix0" interface will be returned instead
  *   of "lo0")
  * - however mtu from "transmit" interface will be returned.
+ * - scope will be embedded in nh_addr
  */
 int
 fib6_lookup_nh_basic(uint32_t fibnum, const struct in6_addr *dst, uint32_t scopeid,
     uint32_t flags, uint32_t flowid, struct nhop6_basic *pnh6)
 {
 	struct radix_node_head *rh;
 	struct radix_node *rn;
 	struct sockaddr_in6 sin6;
 	struct rtentry *rte;
 
 	KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_basic: bad fibnum"));
 	rh = rt_tables_get_rnh(fibnum, AF_INET6);
 	if (rh == NULL)
 		return (ENOENT);
 
 	/* Prepare lookup key */
 	memset(&sin6, 0, sizeof(sin6));
 	sin6.sin6_addr = *dst;
+	sin6.sin6_len = sizeof(struct sockaddr_in6);
 	/* Assume scopeid is valid and embed it directly */
 	if (IN6_IS_SCOPE_LINKLOCAL(dst))
 		sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff);
 
 	RADIX_NODE_HEAD_RLOCK(rh);
 	rn = rh->rnh_matchaddr((void *)&sin6, rh);
 	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
 		rte = RNTORT(rn);
 		/* Ensure route & ifp is UP */
 		if (RT_LINK_IS_UP(rte->rt_ifp)) {
-			fib6_rte_to_nh_basic(rte, dst, flags, pnh6);
+			fib6_rte_to_nh_basic(rte, &sin6.sin6_addr, flags, pnh6);
 			RADIX_NODE_HEAD_RUNLOCK(rh);
 			return (0);
 		}
 	}
 	RADIX_NODE_HEAD_RUNLOCK(rh);
 
 	return (ENOENT);
 }
 
 /*
  * Performs IPv6 route table lookup on @dst. Returns 0 on success.
  * Stores extended nexthop info into provided @pnh6 structure.
  * Note that
  * - nh_ifp cannot be safely dereferenced unless NHR_REF is specified.
  * - in that case you need to call fib6_free_nh_ext()
  * - nh_ifp represents logical transmit interface (rt_ifp) by default
  * - nh_ifp represents "address" interface if NHR_IFAIF flag is passed
  * - mtu from logical transmit interface will be returned.
+ * - scope will be embedded in nh_addr
  */
 int
 fib6_lookup_nh_ext(uint32_t fibnum, const struct in6_addr *dst,uint32_t scopeid,
     uint32_t flags, uint32_t flowid, struct nhop6_extended *pnh6)
 {
 	struct radix_node_head *rh;
 	struct radix_node *rn;
 	struct sockaddr_in6 sin6;
 	struct rtentry *rte;
 
 	KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_ext: bad fibnum"));
 	rh = rt_tables_get_rnh(fibnum, AF_INET6);
 	if (rh == NULL)
 		return (ENOENT);
 
 	/* Prepare lookup key */
 	memset(&sin6, 0, sizeof(sin6));
 	sin6.sin6_len = sizeof(struct sockaddr_in6);
 	sin6.sin6_addr = *dst;
 	/* Assume scopeid is valid and embed it directly */
 	if (IN6_IS_SCOPE_LINKLOCAL(dst))
 		sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff);
 
 	RADIX_NODE_HEAD_RLOCK(rh);
 	rn = rh->rnh_matchaddr((void *)&sin6, rh);
 	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
 		rte = RNTORT(rn);
 		/* Ensure route & ifp is UP */
 		if (RT_LINK_IS_UP(rte->rt_ifp)) {
-			fib6_rte_to_nh_extended(rte, dst, flags, pnh6);
+			fib6_rte_to_nh_extended(rte, &sin6.sin6_addr, flags,
+			    pnh6);
 			if ((flags & NHR_REF) != 0) {
 				/* TODO: Do lwref on egress ifp's */
 			}
 			RADIX_NODE_HEAD_RUNLOCK(rh);
 
 			return (0);
 		}
 	}
 	RADIX_NODE_HEAD_RUNLOCK(rh);
 
 	return (ENOENT);
 }
 
 /*
  * Release a nexthop obtained from fib6_lookup_nh_ext() with NHR_REF.
  * Currently a no-op: the body is empty and neither argument is used.
  */
 void
 fib6_free_nh_ext(uint32_t fibnum, struct nhop6_extended *pnh6)
 {
 
 }
 
 #endif
 
Index: head/sys/netinet6/in6_mcast.c
===================================================================
--- head/sys/netinet6/in6_mcast.c	(revision 292014)
+++ head/sys/netinet6/in6_mcast.c	(revision 292015)
@@ -1,2836 +1,2833 @@
 /*
  * Copyright (c) 2009 Bruce Simpson.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * IPv6 multicast socket, group, and socket option processing module.
  * Normative references: RFC 2292, RFC 3492, RFC 3542, RFC 3678, RFC 3810.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/sysctl.h>
 #include <sys/priv.h>
 #include <sys/ktr.h>
 #include <sys/tree.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
+#include <netinet6/in6_fib.h>
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/mld6_var.h>
 #include <netinet6/scope6_var.h>
 
 #ifndef KTR_MLD
 #define KTR_MLD KTR_INET6
 #endif
 
 /*
  * Overlay of the socket address layouts used in this file, so one
  * buffer can be viewed as generic, link-layer or IPv6 sockaddr.
  * Guarded by __SOCKUNION_DECLARED so it is only declared once.
  */
 #ifndef __SOCKUNION_DECLARED
 union sockunion {
 	struct sockaddr_storage	ss;	/* storage-sized view */
 	struct sockaddr		sa;	/* generic view */
 	struct sockaddr_dl	sdl;	/* link-layer view */
 	struct sockaddr_in6	sin6;	/* IPv6 view */
 };
 typedef union sockunion sockunion_t;
 #define __SOCKUNION_DECLARED
 #endif /* __SOCKUNION_DECLARED */
 
 /*
  * Malloc types for the allocations made in this file: socket-layer
  * source filters, group records, per-socket multicast options and
  * MLD-layer source filter nodes.
  */
 static MALLOC_DEFINE(M_IN6MFILTER, "in6_mfilter",
     "IPv6 multicast PCB-layer source filter");
 static MALLOC_DEFINE(M_IP6MADDR, "in6_multi", "IPv6 multicast group");
 static MALLOC_DEFINE(M_IP6MOPTS, "ip6_moptions", "IPv6 multicast options");
 static MALLOC_DEFINE(M_IP6MSOURCE, "ip6_msource",
     "IPv6 multicast MLD-layer source filter");
 
 /*
  * Emit the red-black tree functions for ip6_msource_tree: nodes are
  * struct ip6_msource linked via im6s_link and ordered by
  * ip6_msource_cmp.
  */
 RB_GENERATE(ip6_msource_tree, ip6_msource, im6s_link, ip6_msource_cmp);
 
 /*
  * Locking:
  * - Lock order is: Giant, INP_WLOCK, IN6_MULTI_LOCK, MLD_LOCK, IF_ADDR_LOCK.
  * - The IF_ADDR_LOCK is implicitly taken by in6m_lookup() earlier, however
  *   it can be taken by code in net/if.c also.
  * - ip6_moptions and in6_mfilter are covered by the INP_WLOCK.
  *
  * struct in6_multi is covered by IN6_MULTI_LOCK. There isn't strictly
  * any need for in6_multi itself to be virtualized -- it is bound to an ifp
  * anyway no matter what happens.
  *
  * The mutex below is a default (MTX_DEF) mutex set up through
  * MTX_SYSINIT.
  * NOTE(review): presumably this is the mutex behind the IN6_MULTI_LOCK
  * macros; their definition is not in this file -- confirm.
  */
 struct mtx in6_multi_mtx;
 MTX_SYSINIT(in6_multi_mtx, &in6_multi_mtx, "in6_multi_mtx", MTX_DEF);
 
 /*
  * Forward declarations for the file-local helpers.
  * Naming: im6f_* operate on a socket-layer filter set (in6_mfilter),
  * im6o_* on per-socket multicast options (ip6_moptions), im6s_* on a
  * single source node, in6m_* on a group record (in6_multi) and in6p_*
  * take an inpcb.
  */
 static void	im6f_commit(struct in6_mfilter *);
 static int	im6f_get_source(struct in6_mfilter *imf,
 		    const struct sockaddr_in6 *psin,
 		    struct in6_msource **);
 static struct in6_msource *
 		im6f_graft(struct in6_mfilter *, const uint8_t,
 		    const struct sockaddr_in6 *);
 static void	im6f_leave(struct in6_mfilter *);
 static int	im6f_prune(struct in6_mfilter *, const struct sockaddr_in6 *);
 static void	im6f_purge(struct in6_mfilter *);
 static void	im6f_rollback(struct in6_mfilter *);
 static void	im6f_reap(struct in6_mfilter *);
 static int	im6o_grow(struct ip6_moptions *);
 static size_t	im6o_match_group(const struct ip6_moptions *,
 		    const struct ifnet *, const struct sockaddr *);
 static struct in6_msource *
 		im6o_match_source(const struct ip6_moptions *, const size_t,
 		    const struct sockaddr *);
 static void	im6s_merge(struct ip6_msource *ims,
 		    const struct in6_msource *lims, const int rollback);
 static int	in6_mc_get(struct ifnet *, const struct in6_addr *,
 		    struct in6_multi **);
 static int	in6m_get_source(struct in6_multi *inm,
 		    const struct in6_addr *addr, const int noalloc,
 		    struct ip6_msource **pims);
 /* Only built when kernel tracing (KTR) is compiled in. */
 #ifdef KTR
 static int	in6m_is_ifp_detached(const struct in6_multi *);
 #endif
 static int	in6m_merge(struct in6_multi *, /*const*/ struct in6_mfilter *);
 static void	in6m_purge(struct in6_multi *);
 static void	in6m_reap(struct in6_multi *);
 static struct ip6_moptions *
 		in6p_findmoptions(struct inpcb *);
 static int	in6p_get_source_filters(struct inpcb *, struct sockopt *);
 static int	in6p_join_group(struct inpcb *, struct sockopt *);
 static int	in6p_leave_group(struct inpcb *, struct sockopt *);
 static struct ifnet *
 		in6p_lookup_mcast_ifp(const struct inpcb *,
 		    const struct sockaddr_in6 *);
 static int	in6p_block_unblock_source(struct inpcb *, struct sockopt *);
 static int	in6p_set_multicast_if(struct inpcb *, struct sockopt *);
 static int	in6p_set_source_filters(struct inpcb *, struct sockopt *);
 static int	sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_DECL(_net_inet6_ip6);	/* XXX Not in any common header. */
 
 /* Parent node "mcast" for the knobs below, hung off _net_inet6_ip6. */
 static SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, mcast, CTLFLAG_RW, 0,
     "IPv6 multicast");
 
 /*
  * Cap on source nodes per group record; compared against in6m_nsrc
  * before a new node is allocated.
  */
 static u_long in6_mcast_maxgrpsrc = IPV6_MAX_GROUP_SRC_FILTER;
 SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxgrpsrc,
     CTLFLAG_RWTUN, &in6_mcast_maxgrpsrc, 0,
     "Max source filters per group");
 
 /*
  * Cap on source nodes per socket filter set; compared against
  * im6f_nsrc before a new node is allocated.
  */
 static u_long in6_mcast_maxsocksrc = IPV6_MAX_SOCK_SRC_FILTER;
 SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxsocksrc,
     CTLFLAG_RWTUN, &in6_mcast_maxsocksrc, 0,
     "Max source filters per socket");
 
 /* TODO Virtualize this switch. */
 int in6_mcast_loop = IPV6_DEFAULT_MULTICAST_LOOP;
 SYSCTL_INT(_net_inet6_ip6_mcast, OID_AUTO, loop, CTLFLAG_RWTUN,
     &in6_mcast_loop, 0, "Loopback multicast datagrams by default");
 
 /* Read-only "filters" node served by sysctl_ip6_mcast_filters(). */
 static SYSCTL_NODE(_net_inet6_ip6_mcast, OID_AUTO, filters,
     CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip6_mcast_filters,
     "Per-interface stack-wide source filters");
 
 #ifdef KTR
 /*
  * Tell whether the interface behind a group record has been detached.
  * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp
  * is detached, so that pointer is the one consulted here.
  *
  * Returns 1 if detached, 0 otherwise.  Also asserts that a live
  * link-layer ifp agrees with the one cached in the group record.
  */
 static int __inline
 in6m_is_ifp_detached(const struct in6_multi *inm)
 {
 	struct ifnet *llifp;
 
 	KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__));
 	llifp = inm->in6m_ifma->ifma_ifp;
 	if (llifp == NULL)
 		return (1);
 
 	/*
 	 * Sanity check that network-layer notion of ifp is the
 	 * same as that of link-layer.
 	 */
 	KASSERT(inm->in6m_ifp == llifp, ("%s: bad ifp", __func__));
 	return (0);
 }
 #endif
 
 /*
  * Reset a socket-layer filter set: zero the whole structure, start
  * with an empty source tree, and record the requested filter modes
  * for t0 (st0) and t1 (st1).
  */
 static __inline void
 im6f_init(struct in6_mfilter *imf, const int st0, const int st1)
 {
 	memset(imf, 0, sizeof(*imf));
 	imf->im6f_st[0] = st0;
 	imf->im6f_st[1] = st1;
 	RB_INIT(&imf->im6f_sources);
 }
 
 /*
  * Resize the ip6_moptions vector to the next power-of-two minus 1.
  * May be called with locks held; do not sleep.
  *
  * Both per-socket vectors (memberships and source filter heads) are
  * grown to the same new size.  im6o_max_memberships is only raised
  * once both vectors have been grown and the new filter heads have
  * been initialized.
  *
  * Returns 0 on success, or ETOOMANYREFS if the new size would exceed
  * IPV6_MAX_MEMBERSHIPS or if either allocation fails.
  */
 static int
 im6o_grow(struct ip6_moptions *imo)
 {
 	struct in6_multi	**nmships;
 	struct in6_mfilter	 *nmfilters;
 	size_t			  idx;
 	size_t			  newmax;
 	size_t			  oldmax;
 
 	oldmax = imo->im6o_max_memberships;
 	newmax = ((oldmax + 1) * 2) - 1;
 	if (newmax > IPV6_MAX_MEMBERSHIPS)
 		return (ETOOMANYREFS);
 
 	/*
 	 * A successful realloc() releases (or reuses) the old block, so
 	 * the returned pointer is the only valid one from then on and is
 	 * published right away.  A failed realloc() leaves the old block
 	 * untouched, so nothing needs undoing.  A grown vector must never
 	 * be freed on the error path: imo would be left pointing at freed
 	 * memory.
 	 */
 	nmships = (struct in6_multi **)realloc(imo->im6o_membership,
 	    sizeof(struct in6_multi *) * newmax, M_IP6MOPTS, M_NOWAIT);
 	if (nmships == NULL)
 		return (ETOOMANYREFS);
 	imo->im6o_membership = nmships;
 
 	nmfilters = (struct in6_mfilter *)realloc(imo->im6o_mfilters,
 	    sizeof(struct in6_mfilter) * newmax, M_IN6MFILTER,
 	    M_NOWAIT);
 	if (nmfilters == NULL) {
 		/*
 		 * The membership vector stays enlarged; that is harmless
 		 * because im6o_max_memberships still holds the old size.
 		 */
 		return (ETOOMANYREFS);
 	}
 	imo->im6o_mfilters = nmfilters;
 
 	/* Initialize newly allocated source filter heads. */
 	for (idx = oldmax; idx < newmax; idx++)
 		im6f_init(&nmfilters[idx], MCAST_UNDEFINED, MCAST_EXCLUDE);
 
 	imo->im6o_max_memberships = newmax;
 
 	return (0);
 }
 
 /*
  * Search this socket's membership vector for an entry whose group
  * address equals the one in 'group' (read as a sockaddr_in6).
  * When ifp is non-NULL the entry must also be bound to that interface.
  *
  * Returns the index of the first matching slot, or -1 (converted to
  * size_t) when there is no match or the vector is empty/unallocated.
  */
 static size_t
 im6o_match_group(const struct ip6_moptions *imo, const struct ifnet *ifp,
     const struct sockaddr *group)
 {
 	const struct sockaddr_in6 *gsin6;
 	struct in6_multi	*inm;
 	int			 idx;
 	int			 nmships;
 
 	gsin6 = (const struct sockaddr_in6 *)group;
 
 	/* The im6o_membership array may be lazy allocated. */
 	if (imo->im6o_membership == NULL || imo->im6o_num_memberships == 0)
 		return (-1);
 
 	nmships = imo->im6o_num_memberships;
 	for (idx = 0; idx < nmships; idx++) {
 		inm = imo->im6o_membership[idx];
 		if (inm == NULL)
 			continue;
 		/* An interface filter, when given, must match too. */
 		if (ifp != NULL && inm->in6m_ifp != ifp)
 			continue;
 		if (IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &gsin6->sin6_addr))
 			return (idx);
 	}
 
 	return (-1);
 }
 
 /*
  * Find an IPv6 multicast source entry for this imo which matches
  * the given group index for this socket, and source address.
  *
  * XXX TODO: The scope ID, if present in src, is stripped before
  * any comparison. We SHOULD enforce scope/zone checks where the source
  * filter entry has a link scope.
  *
  * NOTE: This does not check if the entry is in-mode, merely if
  * it exists, which may not be the desired behaviour.
  */
 static struct in6_msource *
 im6o_match_source(const struct ip6_moptions *imo, const size_t gidx,
     const struct sockaddr *src)
 {
 	struct ip6_msource	 find;
 	struct in6_mfilter	*imf;
 	struct ip6_msource	*ims;
 	const sockunion_t	*psa;
 
 	KASSERT(src->sa_family == AF_INET6, ("%s: !AF_INET6", __func__));
 	KASSERT(gidx != -1 && gidx < imo->im6o_num_memberships,
 	    ("%s: invalid index %d\n", __func__, (int)gidx));
 
 	/* The im6o_mfilters array may be lazy allocated. */
 	if (imo->im6o_mfilters == NULL)
 		return (NULL);
 	imf = &imo->im6o_mfilters[gidx];
 
 	psa = (const sockunion_t *)src;
 	find.im6s_addr = psa->sin6.sin6_addr;
 	in6_clearscope(&find.im6s_addr);		/* XXX */
 	ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
 
 	return ((struct in6_msource *)ims);
 }
 
 /*
  * Decide whether a multicast datagram for (group, src) received on ifp
  * may be delivered to the socket owning imo.
  *
  * Returns MCAST_PASS to allow delivery, MCAST_NOTGMEMBER if the socket
  * has no membership for the group on ifp, or MCAST_NOTSMEMBER if the
  * source is filtered out.
  */
 int
 im6o_mc_filter(const struct ip6_moptions *imo, const struct ifnet *ifp,
     const struct sockaddr *group, const struct sockaddr *src)
 {
 	struct in6_msource *ims;
 	size_t gidx;
 	int mode;
 
 	KASSERT(ifp != NULL, ("%s: null ifp", __func__));
 
 	gidx = im6o_match_group(imo, ifp, group);
 	if (gidx == -1)
 		return (MCAST_NOTGMEMBER);
 
 	/*
 	 * Check if the source was included in an (S,G) join.
 	 * Allow reception on exclusive memberships by default,
 	 * reject reception on inclusive memberships by default.
 	 * Exclude source only if an in-mode exclude filter exists.
 	 * Include source only if an in-mode include filter exists.
 	 * NOTE: We are comparing group state here at MLD t1 (now)
 	 * with socket-layer t0 (since last downcall).
 	 */
 	mode = imo->im6o_mfilters[gidx].im6f_st[1];
 	ims = im6o_match_source(imo, gidx, src);
 
 	if (ims == NULL) {
 		/* No entry: only an inclusive membership rejects. */
 		if (mode == MCAST_INCLUDE)
 			return (MCAST_NOTSMEMBER);
 	} else if (ims->im6sl_st[0] != mode) {
 		/* Entry exists but is not in the membership's mode. */
 		return (MCAST_NOTSMEMBER);
 	}
 
 	return (MCAST_PASS);
 }
 
 /*
  * Find and return a reference to an in6_multi record for (ifp, group),
  * and bump its reference count.
  * If one does not exist, try to allocate it, and update link-layer multicast
  * filters on ifp to listen for group.
  * Assumes the IN6_MULTI lock is held across the call.
  * Return 0 if successful, otherwise return an appropriate error code.
  *
  * ifp is the interface to join on.
  * group is the IPv6 group address.
  * *pinm receives the record on success; it is not written on failure.
  *
  * Failure codes visible here: whatever if_addmulti() returns, or
  * ENOMEM if the record cannot be allocated.
  * The IF_ADDR write lock is taken internally and is released again on
  * every return path.
  */
 static int
 in6_mc_get(struct ifnet *ifp, const struct in6_addr *group,
     struct in6_multi **pinm)
 {
 	struct sockaddr_in6	 gsin6;
 	struct ifmultiaddr	*ifma;
 	struct in6_multi	*inm;
 	int			 error;
 
 	error = 0;
 
 	/*
 	 * XXX: Accesses to ifma_protospec must be covered by IF_ADDR_LOCK;
 	 * if_addmulti() takes this mutex itself, so we must drop and
 	 * re-acquire around the call.
 	 */
 	IN6_MULTI_LOCK_ASSERT();
 	IF_ADDR_WLOCK(ifp);
 
 	inm = in6m_lookup_locked(ifp, group);
 	if (inm != NULL) {
 		/*
 		 * If we already joined this group, just bump the
 		 * refcount and return it.
 		 */
 		KASSERT(inm->in6m_refcount >= 1,
 		    ("%s: bad refcount %d", __func__, inm->in6m_refcount));
 		++inm->in6m_refcount;
 		*pinm = inm;
 		goto out_locked;
 	}
 
 	/* Build the sockaddr form of the group for if_addmulti(). */
 	memset(&gsin6, 0, sizeof(gsin6));
 	gsin6.sin6_family = AF_INET6;
 	gsin6.sin6_len = sizeof(struct sockaddr_in6);
 	gsin6.sin6_addr = *group;
 
 	/*
 	 * Check if a link-layer group is already associated
 	 * with this network-layer group on the given ifnet.
 	 */
 	IF_ADDR_WUNLOCK(ifp);
 	error = if_addmulti(ifp, (struct sockaddr *)&gsin6, &ifma);
 	if (error != 0)
 		return (error);		/* IF_ADDR lock already dropped */
 	IF_ADDR_WLOCK(ifp);
 
 	/*
 	 * If something other than netinet6 is occupying the link-layer
 	 * group, print a meaningful error message and back out of
 	 * the allocation.
 	 * Otherwise, bump the refcount on the existing network-layer
 	 * group association and return it.
 	 */
 	if (ifma->ifma_protospec != NULL) {
 		inm = (struct in6_multi *)ifma->ifma_protospec;
 #ifdef INVARIANTS
 		KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr",
 		    __func__));
 		KASSERT(ifma->ifma_addr->sa_family == AF_INET6,
 		    ("%s: ifma not AF_INET6", __func__));
 		KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__));
 		if (inm->in6m_ifma != ifma || inm->in6m_ifp != ifp ||
 		    !IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, group))
 			panic("%s: ifma %p is inconsistent with %p (%p)",
 			    __func__, ifma, inm, group);
 #endif
 		++inm->in6m_refcount;
 		*pinm = inm;
 		goto out_locked;
 	}
 
 	IF_ADDR_WLOCK_ASSERT(ifp);
 
 	/*
 	 * A new in6_multi record is needed; allocate and initialize it.
 	 * We DO NOT perform an MLD join as the in6_ layer may need to
 	 * push an initial source list down to MLD to support SSM.
 	 *
 	 * The initial source filter state is INCLUDE, {} as per the RFC.
 	 * Pending state-changes per group are subject to a bounds check.
 	 */
 	inm = malloc(sizeof(*inm), M_IP6MADDR, M_NOWAIT | M_ZERO);
 	if (inm == NULL) {
 		/* Undo the link-layer join made by if_addmulti() above. */
 		IF_ADDR_WUNLOCK(ifp);
 		if_delmulti_ifma(ifma);
 		return (ENOMEM);
 	}
 	inm->in6m_addr = *group;
 	inm->in6m_ifp = ifp;
 	inm->in6m_mli = MLD_IFINFO(ifp);
 	inm->in6m_ifma = ifma;
 	inm->in6m_refcount = 1;
 	inm->in6m_state = MLD_NOT_MEMBER;
 	mbufq_init(&inm->in6m_scq, MLD_MAX_STATE_CHANGES);
 
 	inm->in6m_st[0].iss_fmode = MCAST_UNDEFINED;
 	inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED;
 	RB_INIT(&inm->in6m_srcs);
 
 	/* Cross-link the link-layer entry with the new record. */
 	ifma->ifma_protospec = inm;
 	*pinm = inm;
 
 out_locked:
 	IF_ADDR_WUNLOCK(ifp);
 	return (error);
 }
 
 /*
  * Drop a reference to an in6_multi record.
  *
  * If the refcount drops to 0, free the in6_multi record and
  * delete the underlying link-layer membership.
  *
  * Caller must hold the IN6_MULTI lock (asserted below).
  */
 void
 in6m_release_locked(struct in6_multi *inm)
 {
 	struct ifmultiaddr *ifma;
 
 	IN6_MULTI_LOCK_ASSERT();
 
 	CTR2(KTR_MLD, "%s: refcount is %d", __func__, inm->in6m_refcount);
 
 	/* Other holders remain: nothing more to do. */
 	if (--inm->in6m_refcount > 0) {
 		CTR2(KTR_MLD, "%s: refcount is now %d", __func__,
 		    inm->in6m_refcount);
 		return;
 	}
 
 	CTR2(KTR_MLD, "%s: freeing inm %p", __func__, inm);
 
 	/* Save the ifma pointer: it is still needed after inm is freed. */
 	ifma = inm->in6m_ifma;
 
 	/* XXX this access is not covered by IF_ADDR_LOCK */
 	CTR2(KTR_MLD, "%s: purging ifma %p", __func__, ifma);
 	KASSERT(ifma->ifma_protospec == inm,
 	    ("%s: ifma_protospec != inm", __func__));
 	/* Unhook the record from the link-layer entry before freeing it. */
 	ifma->ifma_protospec = NULL;
 
 	/* Release source nodes and queued state changes, then the record. */
 	in6m_purge(inm);
 
 	free(inm, M_IP6MADDR);
 
 	/* Finally drop the link-layer membership itself. */
 	if_delmulti_ifma(ifma);
 }
 
 /*
  * Un-mark every source of a group that is currently flagged as
  * recorded, decrementing the group's recorded-source count for each.
  * Used by the MLD code. Caller must hold the IN6_MULTI lock.
  * FIXME: Should reap.
  */
 void
 in6m_clear_recorded(struct in6_multi *inm)
 {
 	struct ip6_msource	*src;
 
 	IN6_MULTI_LOCK_ASSERT();
 
 	RB_FOREACH(src, ip6_msource_tree, &inm->in6m_srcs) {
 		if (src->im6s_stp == 0)
 			continue;
 		src->im6s_stp = 0;
 		--inm->in6m_st[1].iss_rec;
 	}
 
 	/* Every recorded source must have been accounted for. */
 	KASSERT(inm->in6m_st[1].iss_rec == 0,
 	    ("%s: iss_rec %d not 0", __func__, inm->in6m_st[1].iss_rec));
 }
 
 /*
  * Record a source as pending for a Source-Group MLDv2 query.
  * This lives here as it modifies the shared tree.
  *
  * inm is the group descriptor.
  * naddr is the address of the source to record in network-byte order.
  *
  * If the net.inet6.mld.sgalloc sysctl is non-zero, we will
  * lazy-allocate a source node in response to an SG query.
  * Otherwise, no allocation is performed. This saves some memory
  * with the trade-off that the source will not be reported to the
  * router if joined in the window between the query response and
  * the group actually being joined on the local host.
  *
  * VIMAGE: XXX: Currently the mld_sgalloc feature has been removed.
  * This turns off the allocation of a recorded source entry if
  * the group has not been joined.
  *
  * Return 0 if the source didn't exist or was already marked as recorded.
  * Return 1 if the source was marked as recorded by this function.
  * Return <0 if any error occured (negated errno code).
  */
 int
 in6m_record_source(struct in6_multi *inm, const struct in6_addr *addr)
 {
 	struct ip6_msource	 find;
 	struct ip6_msource	*ims, *nims;
 
 	IN6_MULTI_LOCK_ASSERT();
 
 	find.im6s_addr = *addr;
 	ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find);
 	if (ims && ims->im6s_stp)
 		return (0);
 	if (ims == NULL) {
 		if (inm->in6m_nsrc == in6_mcast_maxgrpsrc)
 			return (-ENOSPC);
 		nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (-ENOMEM);
 		nims->im6s_addr = find.im6s_addr;
 		RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims);
 		++inm->in6m_nsrc;
 		ims = nims;
 	}
 
 	/*
 	 * Mark the source as recorded and update the recorded
 	 * source count.
 	 */
 	++ims->im6s_stp;
 	++inm->in6m_st[1].iss_rec;
 
 	return (1);
 }
 
 /*
  * Return a pointer to an in6_msource owned by an in6_mfilter,
  * given its source address.
  * Lazy-allocate if needed. If this is a new entry its filter state is
  * undefined at t0.
  *
  * imf is the filter set being modified.
  * addr is the source address.
  *
  * SMPng: May be called with locks held; malloc must not block.
  */
 static int
 im6f_get_source(struct in6_mfilter *imf, const struct sockaddr_in6 *psin,
     struct in6_msource **plims)
 {
 	struct ip6_msource	 find;
 	struct ip6_msource	*ims, *nims;
 	struct in6_msource	*lims;
 	int			 error;
 
 	error = 0;
 	ims = NULL;
 	lims = NULL;
 
 	find.im6s_addr = psin->sin6_addr;
 	ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
 	lims = (struct in6_msource *)ims;
 	if (lims == NULL) {
 		if (imf->im6f_nsrc == in6_mcast_maxsocksrc)
 			return (ENOSPC);
 		nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (ENOMEM);
 		lims = (struct in6_msource *)nims;
 		lims->im6s_addr = find.im6s_addr;
 		lims->im6sl_st[0] = MCAST_UNDEFINED;
 		RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims);
 		++imf->im6f_nsrc;
 	}
 
 	*plims = lims;
 
 	return (error);
 }
 
 /*
  * Graft a source entry into an existing socket-layer filter set,
  * maintaining any required invariants and checking allocations.
  *
  * The source is marked as being in the new filter mode at t1.
  *
  * Return the pointer to the new node, otherwise return NULL.
  */
 static struct in6_msource *
 im6f_graft(struct in6_mfilter *imf, const uint8_t st1,
     const struct sockaddr_in6 *psin)
 {
 	struct ip6_msource	*nims;
 	struct in6_msource	*lims;
 
 	nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER,
 	    M_NOWAIT | M_ZERO);
 	if (nims == NULL)
 		return (NULL);
 	lims = (struct in6_msource *)nims;
 	lims->im6s_addr = psin->sin6_addr;
 	lims->im6sl_st[0] = MCAST_UNDEFINED;
 	lims->im6sl_st[1] = st1;
 	RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims);
 	++imf->im6f_nsrc;
 
 	return (lims);
 }
 
 /*
  * Prune a source entry from an existing socket-layer filter set,
  * maintaining any required invariants and checking allocations.
  *
  * The source is marked as being left at t1, it is not freed.
  *
  * Return 0 if no error occurred, otherwise return an errno value.
  */
 static int
 im6f_prune(struct in6_mfilter *imf, const struct sockaddr_in6 *psin)
 {
 	struct ip6_msource	 find;
 	struct ip6_msource	*ims;
 	struct in6_msource	*lims;
 
 	find.im6s_addr = psin->sin6_addr;
 	ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
 	if (ims == NULL)
 		return (ENOENT);
 	lims = (struct in6_msource *)ims;
 	lims->im6sl_st[1] = MCAST_UNDEFINED;
 	return (0);
 }
 
 /*
  * Undo pending (t1) changes to a socket-layer filter set, restoring
  * the t0 state: modified sources get their t0 mode back, sources that
  * only exist at t1 are removed and freed, and the set's t1 filter mode
  * is reset to its t0 value.
  */
 static void
 im6f_rollback(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims, *next;
 	struct in6_msource	*lims;
 
 	RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, next) {
 		lims = (struct in6_msource *)ims;
 
 		/* no change at t1 */
 		if (lims->im6sl_st[0] == lims->im6sl_st[1])
 			continue;
 
 		if (lims->im6sl_st[0] == MCAST_UNDEFINED) {
 			/* revert source added t1 */
 			CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
 			RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims);
 			free(ims, M_IN6MFILTER);
 			imf->im6f_nsrc--;
 			continue;
 		}
 
 		/* revert change to existing source at t1 */
 		lims->im6sl_st[1] = lims->im6sl_st[0];
 	}
 
 	imf->im6f_st[1] = imf->im6f_st[0];
 }
 
 /*
  * Mark socket-layer filter set as INCLUDE {} at t1.
  */
 static void
 im6f_leave(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims;
 	struct in6_msource	*lims;
 
 	RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
 		lims = (struct in6_msource *)ims;
 		lims->im6sl_st[1] = MCAST_UNDEFINED;
 	}
 	imf->im6f_st[1] = MCAST_INCLUDE;
 }
 
 /*
  * Commit pending changes of a socket-layer filter set: the t1 state of
  * each source, and of the set itself, becomes the t0 state.
  */
 static void
 im6f_commit(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims;
 	struct in6_msource	*entry;
 
 	RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
 		entry = (struct in6_msource *)ims;
 		entry->im6sl_st[0] = entry->im6sl_st[1];
 	}
 
 	imf->im6f_st[0] = imf->im6f_st[1];
 }
 
 /*
  * Free every source of a socket-layer filter set that is undefined at
  * both t0 and t1, i.e. no longer referenced by either state.
  */
 static void
 im6f_reap(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims, *next;
 	struct in6_msource	*lims;
 
 	RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, next) {
 		lims = (struct in6_msource *)ims;
 
 		/* Keep anything still defined at either time. */
 		if (lims->im6sl_st[0] != MCAST_UNDEFINED ||
 		    lims->im6sl_st[1] != MCAST_UNDEFINED)
 			continue;
 
 		CTR2(KTR_MLD, "%s: free lims %p", __func__, ims);
 		RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims);
 		free(ims, M_IN6MFILTER);
 		imf->im6f_nsrc--;
 	}
 }
 
 /*
  * Purge socket-layer filter set.
  */
 static void
 im6f_purge(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) {
 		CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims);
 		free(ims, M_IN6MFILTER);
 		imf->im6f_nsrc--;
 	}
 	imf->im6f_st[0] = imf->im6f_st[1] = MCAST_UNDEFINED;
 	KASSERT(RB_EMPTY(&imf->im6f_sources),
 	    ("%s: im6f_sources not empty", __func__));
 }
 
 /*
  * Look up a source filter entry for a multicast group.
  *
  * inm is the group descriptor to work with.
  * addr is the IPv6 address to look up.
  * noalloc may be non-zero to suppress allocation of sources.
  * *pims will be set to the address of the retrieved or allocated source.
  *
  * SMPng: NOTE: may be called with locks held.
  * Return 0 if successful, otherwise return a non-zero error code.
  */
 static int
 in6m_get_source(struct in6_multi *inm, const struct in6_addr *addr,
     const int noalloc, struct ip6_msource **pims)
 {
 	struct ip6_msource	 find;
 	struct ip6_msource	*ims, *nims;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	find.im6s_addr = *addr;
 	ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find);
 	if (ims == NULL && !noalloc) {
 		if (inm->in6m_nsrc == in6_mcast_maxgrpsrc)
 			return (ENOSPC);
 		nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (ENOMEM);
 		nims->im6s_addr = *addr;
 		RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims);
 		++inm->in6m_nsrc;
 		ims = nims;
 		CTR3(KTR_MLD, "%s: allocated %s as %p", __func__,
 		    ip6_sprintf(ip6tbuf, addr), ims);
 	}
 
 	*pims = ims;
 	return (0);
 }
 
 /*
  * Apply one socket-layer source's t0 -> t1 change to the t1 counters
  * of the matching MLD-layer source: the counter for its t0 mode goes
  * down and the counter for its t1 mode goes up.
  * If rollback is non-zero, the inverse adjustment is made instead.
  */
 static void
 im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims,
     const int rollback)
 {
 	int delta = rollback ? -1 : 1;
 #ifdef KTR
 	char ip6tbuf[INET6_ADDRSTRLEN];
 
 	ip6_sprintf(ip6tbuf, &lims->im6s_addr);
 #endif
 
 	/* Retire the contribution made by the t0 mode. */
 	switch (lims->im6sl_st[0]) {
 	case MCAST_EXCLUDE:
 		CTR3(KTR_MLD, "%s: t1 ex -= %d on %s", __func__, delta,
 		    ip6tbuf);
 		ims->im6s_st[1].ex -= delta;
 		break;
 	case MCAST_INCLUDE:
 		CTR3(KTR_MLD, "%s: t1 in -= %d on %s", __func__, delta,
 		    ip6tbuf);
 		ims->im6s_st[1].in -= delta;
 		break;
 	default:
 		break;
 	}
 
 	/* Account for the t1 mode. */
 	switch (lims->im6sl_st[1]) {
 	case MCAST_EXCLUDE:
 		CTR3(KTR_MLD, "%s: t1 ex += %d on %s", __func__, delta,
 		    ip6tbuf);
 		ims->im6s_st[1].ex += delta;
 		break;
 	case MCAST_INCLUDE:
 		CTR3(KTR_MLD, "%s: t1 in += %d on %s", __func__, delta,
 		    ip6tbuf);
 		ims->im6s_st[1].in += delta;
 		break;
 	default:
 		break;
 	}
 }
 
 /*
  * Atomically update the global in6_multi state, when a membership's
  * filter list is being updated in any way.
  *
  * inm is the group record whose t1 state is updated.
  * imf is the per-inpcb-membership group filter pointer.
  * A fake imf may be passed for in-kernel consumers.
  *
  * XXX This is a candidate for a set-symmetric-difference style loop
  * which would eliminate the repeated lookup from root of ims nodes,
  * as they share the same key space.
  *
  * If any error occurred this function will back out of refcounts
  * and return a non-zero value (the error from in6m_get_source()).
  * Returns 0 on success.
  */
 static int
 in6m_merge(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims, *nims;
 	struct in6_msource	*lims;
 	int			 schanged, error;
 	int			 nsrc0, nsrc1;
 
 	schanged = 0;
 	error = 0;
 	nsrc1 = nsrc0 = 0;
 
 	/*
 	 * Update the source filters first, as this may fail.
 	 * Maintain count of in-mode filters at t0, t1. These are
 	 * used to work out if we transition into ASM mode or not.
 	 * Maintain a count of source filters whose state was
 	 * actually modified by this operation.
 	 */
 	RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
 		lims = (struct in6_msource *)ims;
 		if (lims->im6sl_st[0] == imf->im6f_st[0])
 			nsrc0++;
 		if (lims->im6sl_st[1] == imf->im6f_st[1])
 			nsrc1++;
 		if (lims->im6sl_st[0] == lims->im6sl_st[1])
 			continue;
 		error = in6m_get_source(inm, &lims->im6s_addr, 0, &nims);
 		++schanged;
 		if (error)
 			break;
 		im6s_merge(nims, lims, 0);
 	}
 	if (error) {
 		struct ip6_msource *bims;
 
 		/*
 		 * NOTE(review): the walk below starts from nims, which is
 		 * the last node returned by in6m_get_source() (a node of
 		 * inm's tree) and is not written at all if the very first
 		 * lookup failed.  Left as-is here; worth confirming that
 		 * this is the intended starting point for the back-out.
 		 */
 		RB_FOREACH_REVERSE_FROM(ims, ip6_msource_tree, nims) {
 			lims = (struct in6_msource *)ims;
 			if (lims->im6sl_st[0] == lims->im6sl_st[1])
 				continue;
 			(void)in6m_get_source(inm, &lims->im6s_addr, 1, &bims);
 			if (bims == NULL)
 				continue;
 			im6s_merge(bims, lims, 1);
 		}
 		goto out_reap;
 	}
 
 	CTR3(KTR_MLD, "%s: imf filters in-mode: %d at t0, %d at t1",
 	    __func__, nsrc0, nsrc1);
 
 	/* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */
 	if (imf->im6f_st[0] == imf->im6f_st[1] &&
 	    imf->im6f_st[1] == MCAST_INCLUDE) {
 		if (nsrc1 == 0) {
 			CTR1(KTR_MLD, "%s: --in on inm at t1", __func__);
 			--inm->in6m_st[1].iss_in;
 		}
 	}
 
 	/* Handle filter mode transition on socket. */
 	if (imf->im6f_st[0] != imf->im6f_st[1]) {
 		CTR3(KTR_MLD, "%s: imf transition %d to %d",
 		    __func__, imf->im6f_st[0], imf->im6f_st[1]);
 
 		if (imf->im6f_st[0] == MCAST_EXCLUDE) {
 			CTR1(KTR_MLD, "%s: --ex on inm at t1", __func__);
 			--inm->in6m_st[1].iss_ex;
 		} else if (imf->im6f_st[0] == MCAST_INCLUDE) {
 			CTR1(KTR_MLD, "%s: --in on inm at t1", __func__);
 			--inm->in6m_st[1].iss_in;
 		}
 
 		if (imf->im6f_st[1] == MCAST_EXCLUDE) {
 			CTR1(KTR_MLD, "%s: ex++ on inm at t1", __func__);
 			inm->in6m_st[1].iss_ex++;
 		} else if (imf->im6f_st[1] == MCAST_INCLUDE && nsrc1 > 0) {
 			CTR1(KTR_MLD, "%s: in++ on inm at t1", __func__);
 			inm->in6m_st[1].iss_in++;
 		}
 	}
 
 	/*
 	 * Track inm filter state in terms of listener counts.
 	 * If there are any exclusive listeners, stack-wide
 	 * membership is exclusive.
 	 * Otherwise, if only inclusive listeners, stack-wide is inclusive.
 	 * If no listeners remain, state is undefined at t1,
 	 * and the MLD lifecycle for this group should finish.
 	 */
 	if (inm->in6m_st[1].iss_ex > 0) {
 		CTR1(KTR_MLD, "%s: transition to EX", __func__);
 		inm->in6m_st[1].iss_fmode = MCAST_EXCLUDE;
 	} else if (inm->in6m_st[1].iss_in > 0) {
 		CTR1(KTR_MLD, "%s: transition to IN", __func__);
 		inm->in6m_st[1].iss_fmode = MCAST_INCLUDE;
 	} else {
 		CTR1(KTR_MLD, "%s: transition to UNDEF", __func__);
 		inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED;
 	}
 
 	/*
 	 * Drop the ASM listener count whenever the socket was an ASM
 	 * listener at t0 (EXCLUDE with no in-mode sources).  The decrement
 	 * is deliberately unconditional within this block: if the socket is
 	 * still an ASM listener at t1, the increment just below re-adds it,
 	 * so the count nets out unchanged.  Only the trace is limited to a
 	 * real transition out of ASM mode.  Moving the decrement inside the
 	 * inner condition would leak a count each time a socket stays in
 	 * ASM mode across a merge.
 	 */
 	if (imf->im6f_st[0] == MCAST_EXCLUDE && nsrc0 == 0) {
 		if ((imf->im6f_st[1] != MCAST_EXCLUDE) ||
 		    (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) {
 			CTR1(KTR_MLD, "%s: --asm on inm at t1", __func__);
 		}
 		--inm->in6m_st[1].iss_asm;
 	}
 
 	/* Count the socket as an ASM listener if it is one at t1. */
 	if (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 == 0) {
 		CTR1(KTR_MLD, "%s: asm++ on inm at t1", __func__);
 		inm->in6m_st[1].iss_asm++;
 	}
 
 	CTR3(KTR_MLD, "%s: merged imf %p to inm %p", __func__, imf, inm);
 	in6m_print(inm);
 
 out_reap:
 	/* Sources touched above may now be unreferenced; release them. */
 	if (schanged > 0) {
 		CTR1(KTR_MLD, "%s: sources changed; reaping", __func__);
 		in6m_reap(inm);
 	}
 	return (error);
 }
 
 /*
  * Mark an in6_multi's filter set deltas as committed.
  * Called by MLD after a state change has been enqueued.
  */
 void
 in6m_commit(struct in6_multi *inm)
 {
 	struct ip6_msource	*ims;
 
 	CTR2(KTR_MLD, "%s: commit inm %p", __func__, inm);
 	CTR1(KTR_MLD, "%s: pre commit:", __func__);
 	in6m_print(inm);
 
 	RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) {
 		ims->im6s_st[0] = ims->im6s_st[1];
 	}
 	inm->in6m_st[0] = inm->in6m_st[1];
 }
 
 /*
  * Free every source node of an in6_multi that has no include or
  * exclude references at t0 or t1 and is not marked as recorded.
  */
 static void
 in6m_reap(struct in6_multi *inm)
 {
 	struct ip6_msource	*ims, *next;
 	int			 busy;
 
 	RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, next) {
 		busy = (ims->im6s_st[0].ex > 0 || ims->im6s_st[0].in > 0 ||
 		    ims->im6s_st[1].ex > 0 || ims->im6s_st[1].in > 0 ||
 		    ims->im6s_stp != 0);
 		if (!busy) {
 			CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
 			RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims);
 			free(ims, M_IP6MSOURCE);
 			inm->in6m_nsrc--;
 		}
 	}
 }
 
 /*
  * Purge all source nodes from an in6_multi's filter set.
  */
 static void
 in6m_purge(struct in6_multi *inm)
 {
 	struct ip6_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) {
 		CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims);
 		free(ims, M_IP6MSOURCE);
 		inm->in6m_nsrc--;
 	}
 	/* Free state-change requests that might be queued. */
 	mbufq_drain(&inm->in6m_scq);
 }
 
 /*
  * KAME compatibility entry point: join a multicast address without
  * any source filter.
  *
  * Returns a newly allocated membership record on success.  On failure
  * returns NULL and stores the error code through errorp (ENOBUFS if
  * the record itself could not be allocated, otherwise whatever
  * in6_mc_join() reported).
  *
  * SMPng: Assume no mc locks held by caller.
  */
 struct in6_multi_mship *
 in6_joingroup(struct ifnet *ifp, struct in6_addr *mcaddr,
     int *errorp, int delay)
 {
 	struct in6_multi_mship *mship;
 	int rc;
 
 	mship = malloc(sizeof(*mship), M_IP6MADDR, M_NOWAIT);
 	if (mship == NULL) {
 		*errorp = ENOBUFS;
 		return (NULL);
 	}
 
 	/* Rescale the caller's delay by PR_FASTHZ / hz. */
 	delay = (delay * PR_FASTHZ) / hz;
 
 	rc = in6_mc_join(ifp, mcaddr, NULL, &mship->i6mm_maddr, delay);
 	if (rc == 0)
 		return (mship);
 
 	/* Join failed: report the error and discard the record. */
 	*errorp = rc;
 	free(mship, M_IP6MADDR);
 	return (NULL);
 }
 
 /*
  * KAME compatibility entry point: leave a multicast address joined
  * without sources and free the membership record.
  * Always returns 0.
  *
  * SMPng: Assume no mc locks held by caller.
  */
 int
 in6_leavegroup(struct in6_multi_mship *imm)
 {
 	struct in6_multi *inm;
 
 	inm = imm->i6mm_maddr;
 	if (inm != NULL) {
 		in6_mc_leave(inm, NULL);
 	}
 	free(imm, M_IP6MADDR);
 
 	return (0);
 }
 
 /*
  * Unlocked entry point for joining a multicast group: takes the
  * IN6_MULTI lock around in6_mc_join_locked() and returns its result.
  *
  * SMPng: XXX: in6_mc_join() is called from in6_control() when upper
  * locks are not held. Fortunately, ifp is unlikely to have been detached
  * at this point, so we assume it's OK to recurse.
  */
 int
 in6_mc_join(struct ifnet *ifp, const struct in6_addr *mcaddr,
     /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm,
     const int delay)
 {
 	int rc;
 
 	IN6_MULTI_LOCK();
 	rc = in6_mc_join_locked(ifp, mcaddr, imf, pinm, delay);
 	IN6_MULTI_UNLOCK();
 	return (rc);
 }
 
 /*
  * Join a multicast group; real entry point.
  *
  * Only preserves atomicity at inm level.
  * NOTE: imf argument cannot be const due to sys/tree.h limitations.
  *
  * If the MLD downcall fails, the group is not joined, and an error
  * code is returned.
  */
 int
 in6_mc_join_locked(struct ifnet *ifp, const struct in6_addr *mcaddr,
     /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm,
     const int delay)
 {
 	struct in6_mfilter	 timf;
 	struct in6_multi	*inm;
 	int			 error;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 #ifdef INVARIANTS
 	/*
 	 * Sanity: Check scope zone ID was set for ifp, if and
 	 * only if group is scoped to an interface.
 	 */
 	KASSERT(IN6_IS_ADDR_MULTICAST(mcaddr),
 	    ("%s: not a multicast address", __func__));
 	if (IN6_IS_ADDR_MC_LINKLOCAL(mcaddr) ||
 	    IN6_IS_ADDR_MC_INTFACELOCAL(mcaddr)) {
 		KASSERT(mcaddr->s6_addr16[1] != 0,
 		    ("%s: scope zone ID not set", __func__));
 	}
 #endif
 
 	IN6_MULTI_LOCK_ASSERT();
 
 	CTR4(KTR_MLD, "%s: join %s on %p(%s))", __func__,
 	    ip6_sprintf(ip6tbuf, mcaddr), ifp, if_name(ifp));
 
 	error = 0;
 	inm = NULL;
 
 	/*
 	 * If no imf was specified (i.e. kernel consumer),
 	 * fake one up and assume it is an ASM join.
 	 */
 	if (imf == NULL) {
 		im6f_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE);
 		imf = &timf;
 	}
 
 	error = in6_mc_get(ifp, mcaddr, &inm);
 	if (error) {
 		CTR1(KTR_MLD, "%s: in6_mc_get() failure", __func__);
 		return (error);
 	}
 
 	CTR1(KTR_MLD, "%s: merge inm state", __func__);
 	error = in6m_merge(inm, imf);
 	if (error) {
 		CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
 		goto out_in6m_release;
 	}
 
 	CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 	error = mld_change_state(inm, delay);
 	if (error) {
 		CTR1(KTR_MLD, "%s: failed to update source", __func__);
 		goto out_in6m_release;
 	}
 
 out_in6m_release:
 	if (error) {
 		CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm);
 		in6m_release_locked(inm);
 	} else {
 		*pinm = inm;
 	}
 
 	return (error);
 }
 
 /*
  * Leave a multicast group; unlocked entry point.
  *
  * Takes the IN6_MULTI lock around in6_mc_leave_locked() and returns
  * its result.  imf may be NULL, in which case the locked routine
  * substitutes a temporary filter.
  */
 int
 in6_mc_leave(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
 {
 	int error;
 
 	IN6_MULTI_LOCK();
 	error = in6_mc_leave_locked(inm, imf);
 	IN6_MULTI_UNLOCK();
 
 	return (error);
 }
 
 /*
  * Leave a multicast group; real entry point.
  * All source filters will be expunged.
  *
  * Only preserves atomicity at inm level.
  *
  * Holding the write lock for the INP which contains imf
  * is highly advisable. We can't assert for it as imf does not
  * contain a back-pointer to the owning inp.
  *
  * Note: This is not the same as in6m_release(*) as this function also
  * makes a state change downcall into MLD.
  */
 int
 in6_mc_leave_locked(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
 {
 	struct in6_mfilter	 timf;
 	int			 error;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	error = 0;
 
 	IN6_MULTI_LOCK_ASSERT();
 
 	CTR5(KTR_MLD, "%s: leave inm %p, %s/%s, imf %p", __func__,
 	    inm, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    (in6m_is_ifp_detached(inm) ? "null" : if_name(inm->in6m_ifp)),
 	    imf);
 
 	/*
 	 * If no imf was specified (i.e. kernel consumer),
 	 * fake one up and assume it is an ASM join.
 	 */
 	if (imf == NULL) {
 		im6f_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED);
 		imf = &timf;
 	}
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 *
 	 * As this particular invocation should not cause any memory
 	 * to be allocated, and there is no opportunity to roll back
 	 * the transaction, it MUST NOT fail.
 	 */
 	CTR1(KTR_MLD, "%s: merge inm state", __func__);
 	error = in6m_merge(inm, imf);
 	KASSERT(error == 0, ("%s: failed to merge inm state", __func__));
 
 	CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 	error = mld_change_state(inm, 0);
 	if (error)
 		CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
 
 	CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm);
 	in6m_release_locked(inm);
 
 	return (error);
 }
 
 /*
  * Block or unblock an ASM multicast source on an inpcb.
  * This implements the delta-based API described in RFC 3678.
  *
  * The delta-based API applies only to exclusive-mode memberships.
  * An MLD downcall will be performed.
  *
  * The request is copied in from sopt as a struct group_source_req.
  * inp is passed in unlocked; it is write-locked by in6p_findmoptions()
  * and unlocked again before returning.
  *
  * SMPng: NOTE: Must take Giant as a join may create a new ifma.
  *
  * Return 0 if successful, otherwise return an appropriate error code:
  * EOPNOTSUPP for an unknown option, EINVAL for malformed addresses or
  * a non-exclusive membership, EADDRNOTAVAIL if the interface index does
  * not resolve, the group is not joined, or the source is already in
  * the requested state, ENOMEM if a new filter entry cannot be
  * allocated, or the error from the MLD layer.
  */
 static int
 in6p_block_unblock_source(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct group_source_req		 gsr;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in6_mfilter		*imf;
 	struct ip6_moptions		*imo;
 	struct in6_msource		*ims;
 	struct in6_multi		*inm;
 	size_t				 idx;
 	uint16_t			 fmode;
 	int				 error, doblock;
 #ifdef KTR
 	char				 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	ifp = NULL;
 	error = 0;
 	doblock = 0;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 
 	switch (sopt->sopt_name) {
 	case MCAST_BLOCK_SOURCE:
 	case MCAST_UNBLOCK_SOURCE:
 		error = sooptcopyin(sopt, &gsr,
 		    sizeof(struct group_source_req),
 		    sizeof(struct group_source_req));
 		if (error)
 			return (error);
 
 		if (gsa->sin6.sin6_family != AF_INET6 ||
 		    gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 
 		if (ssa->sin6.sin6_family != AF_INET6 ||
 		    ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 
 		if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
 			return (EADDRNOTAVAIL);
 
 		/*
 		 * An index within range may still have no interface
 		 * attached; ifp is handed to in6_setscope() and
 		 * im6o_match_group() below, so reject that case here.
 		 */
 		ifp = ifnet_byindex(gsr.gsr_interface);
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 
 		if (sopt->sopt_name == MCAST_BLOCK_SOURCE)
 			doblock = 1;
 		break;
 
 	default:
 		CTR2(KTR_MLD, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 
 	/*
 	 * Check if we are actually a member of this group.
 	 * in6p_findmoptions() returns with the inpcb write-locked, so
 	 * every exit from here on goes through out_in6p_locked.
 	 */
 	imo = in6p_findmoptions(inp);
 	idx = im6o_match_group(imo, ifp, &gsa->sa);
 	if (idx == -1 || imo->im6o_mfilters == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_in6p_locked;
 	}
 
 	KASSERT(imo->im6o_mfilters != NULL,
 	    ("%s: im6o_mfilters not allocated", __func__));
 	imf = &imo->im6o_mfilters[idx];
 	inm = imo->im6o_membership[idx];
 
 	/*
 	 * Attempting to use the delta-based API on an
 	 * non exclusive-mode membership is an error.
 	 */
 	fmode = imf->im6f_st[0];
 	if (fmode != MCAST_EXCLUDE) {
 		error = EINVAL;
 		goto out_in6p_locked;
 	}
 
 	/*
 	 * Deal with error cases up-front:
 	 *  Asked to block, but already blocked; or
 	 *  Asked to unblock, but nothing to unblock.
 	 * If adding a new block entry, allocate it.
 	 */
 	ims = im6o_match_source(imo, idx, &ssa->sa);
 	if ((ims != NULL && doblock) || (ims == NULL && !doblock)) {
 		CTR3(KTR_MLD, "%s: source %s %spresent", __func__,
 		    ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr),
 		    doblock ? "" : "not ");
 		error = EADDRNOTAVAIL;
 		goto out_in6p_locked;
 	}
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	if (doblock) {
 		CTR2(KTR_MLD, "%s: %s source", __func__, "block");
 		ims = im6f_graft(imf, fmode, &ssa->sin6);
 		if (ims == NULL)
 			error = ENOMEM;
 	} else {
 		CTR2(KTR_MLD, "%s: %s source", __func__, "allow");
 		error = im6f_prune(imf, &ssa->sin6);
 	}
 
 	if (error) {
 		CTR1(KTR_MLD, "%s: merge imf state failed", __func__);
 		goto out_im6f_rollback;
 	}
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 */
 	IN6_MULTI_LOCK();
 
 	CTR1(KTR_MLD, "%s: merge inm state", __func__);
 	error = in6m_merge(inm, imf);
 	if (error)
 		CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
 	else {
 		CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 		error = mld_change_state(inm, 0);
 		if (error)
 			CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
 	}
 
 	IN6_MULTI_UNLOCK();
 
 out_im6f_rollback:
 	/* Close the socket-layer transaction: undo on error, else commit. */
 	if (error)
 		im6f_rollback(imf);
 	else
 		im6f_commit(imf);
 
 	im6f_reap(imf);
 
 out_in6p_locked:
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Return the multicast options structure of an inpcb, creating and
  * attaching a default one if the inpcb has none yet.  Takes an
  * unlocked inpcb pointer and returns with it write-locked.  May sleep.
  *
  * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
  * SMPng: NOTE: Returns with the INP write lock held.
  */
 static struct ip6_moptions *
 in6p_findmoptions(struct inpcb *inp)
 {
 	struct ip6_moptions	 *newopts;
 	struct in6_multi	**mships;
 	struct in6_mfilter	 *filters;
 	size_t			  i;
 
 	/* Fast path: options already attached. */
 	INP_WLOCK(inp);
 	if (inp->in6p_moptions != NULL)
 		return (inp->in6p_moptions);
 
 	/* Drop the lock for the M_WAITOK allocations. */
 	INP_WUNLOCK(inp);
 
 	newopts = malloc(sizeof(*newopts), M_IP6MOPTS, M_WAITOK);
 	mships = malloc(sizeof(*mships) * IPV6_MIN_MEMBERSHIPS, M_IP6MOPTS,
 	    M_WAITOK | M_ZERO);
 	filters = malloc(sizeof(struct in6_mfilter) * IPV6_MIN_MEMBERSHIPS,
 	    M_IN6MFILTER, M_WAITOK);
 
 	/* Initialize per-group source filters. */
 	for (i = 0; i < IPV6_MIN_MEMBERSHIPS; i++)
 		im6f_init(&filters[i], MCAST_UNDEFINED, MCAST_EXCLUDE);
 
 	/* Populate the new options with defaults. */
 	newopts->im6o_multicast_ifp = NULL;
 	newopts->im6o_multicast_hlim = V_ip6_defmcasthlim;
 	newopts->im6o_multicast_loop = in6_mcast_loop;
 	newopts->im6o_num_memberships = 0;
 	newopts->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS;
 	newopts->im6o_membership = mships;
 	newopts->im6o_mfilters = filters;
 
 	INP_WLOCK(inp);
 	if (inp->in6p_moptions == NULL) {
 		inp->in6p_moptions = newopts;
 		return (newopts);
 	}
 
 	/*
 	 * Options were attached while the lock was dropped; discard
 	 * ours and hand back the existing ones.
 	 */
 	free(filters, M_IN6MFILTER);
 	free(mships, M_IP6MOPTS);
 	free(newopts, M_IP6MOPTS);
 	return (inp->in6p_moptions);
 }
 
 /*
  * Tear down an ip6_moptions: leave every group recorded in it, purge
  * the per-group source filters, and free the membership and filter
  * vectors together with the structure itself.
  *
  * SMPng: NOTE: assumes INP write lock is held.
  */
 void
 ip6_freemoptions(struct ip6_moptions *imo)
 {
 	struct in6_mfilter	*filt;
 	size_t			 i, count;
 
 	KASSERT(imo != NULL, ("%s: ip6_moptions is NULL", __func__));
 
 	count = imo->im6o_num_memberships;
 	for (i = 0; i < count; ++i) {
 		/* The filter vector is optional. */
 		if (imo->im6o_mfilters) {
 			filt = &imo->im6o_mfilters[i];
 			im6f_leave(filt);
 		} else {
 			filt = NULL;
 		}
 
 		/* XXX this will thrash the lock(s) */
 		(void)in6_mc_leave(imo->im6o_membership[i], filt);
 
 		if (filt != NULL)
 			im6f_purge(filt);
 	}
 
 	if (imo->im6o_mfilters) {
 		free(imo->im6o_mfilters, M_IN6MFILTER);
 	}
 	free(imo->im6o_membership, M_IP6MOPTS);
 	free(imo, M_IP6MOPTS);
 }
 
 /*
  * Atomically get source filters on a socket for an IPv6 multicast group.
  * Called with INP lock held; returns with lock released.
  *
  * The request is copied in from sopt as a struct __msfilterreq.  On
  * success the filter mode and the number of in-mode sources are written
  * back through sopt, and up to msfr_nsrcs source addresses are copied
  * to the user buffer msfr_srcs (if one was supplied).
  *
  * Returns 0 on success, EINVAL for a malformed group address,
  * EADDRNOTAVAIL if the interface or the membership cannot be found,
  * EAGAIN if the membership's filter mode is undefined, ENOBUFS if the
  * temporary buffer cannot be allocated, or a copyin/copyout error.
  */
 static int
 in6p_get_source_filters(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct __msfilterreq	 msfr;
 	sockunion_t		*gsa;
 	struct ifnet		*ifp;
 	struct ip6_moptions	*imo;
 	struct in6_mfilter	*imf;
 	struct ip6_msource	*ims;
 	struct in6_msource	*lims;
 	struct sockaddr_in6	*psin;
 	struct sockaddr_storage	*ptss;
 	struct sockaddr_storage	*tss;
 	int			 error;
 	size_t			 idx, nsrcs, ncsrcs;
 
 	INP_WLOCK_ASSERT(inp);
 
 	imo = inp->in6p_moptions;
 	KASSERT(imo != NULL, ("%s: null ip6_moptions", __func__));
 
 	/*
 	 * The inpcb lock is released while the request is copied in and
 	 * validated; it is taken again before the membership lookup.
 	 * NOTE(review): imo was read before the unlock and is reused
 	 * after the re-lock -- confirm it cannot be freed in between.
 	 */
 	INP_WUNLOCK(inp);
 
 	error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
 	    sizeof(struct __msfilterreq));
 	if (error)
 		return (error);
 
 	if (msfr.msfr_group.ss_family != AF_INET6 ||
 	    msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6))
 		return (EINVAL);
 
 	gsa = (sockunion_t *)&msfr.msfr_group;
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
 		return (EADDRNOTAVAIL);
 	ifp = ifnet_byindex(msfr.msfr_ifindex);
 	if (ifp == NULL)
 		return (EADDRNOTAVAIL);
 	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 
 	INP_WLOCK(inp);
 
 	/*
 	 * Lookup group on the socket.
 	 * idx is a size_t, so the test below compares against (size_t)-1,
 	 * presumably im6o_match_group()'s "not found" value.
 	 */
 	idx = im6o_match_group(imo, ifp, &gsa->sa);
 	if (idx == -1 || imo->im6o_mfilters == NULL) {
 		INP_WUNLOCK(inp);
 		return (EADDRNOTAVAIL);
 	}
 	imf = &imo->im6o_mfilters[idx];
 
 	/*
 	 * Ignore memberships which are in limbo.
 	 */
 	if (imf->im6f_st[1] == MCAST_UNDEFINED) {
 		INP_WUNLOCK(inp);
 		return (EAGAIN);
 	}
 	msfr.msfr_fmode = imf->im6f_st[1];
 
 	/*
 	 * If the user specified a buffer, copy out the source filter
 	 * entries to userland gracefully.
 	 * We only copy out the number of entries which userland
 	 * has asked for, but we always tell userland how big the
 	 * buffer really needs to be.
 	 * The requested count is first clamped to in6_mcast_maxsocksrc.
 	 */
 	if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc)
 		msfr.msfr_nsrcs = in6_mcast_maxsocksrc;
 	tss = NULL;
 	if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) {
 		/* M_NOWAIT: the inpcb lock is held here. */
 		tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
 		    M_TEMP, M_NOWAIT | M_ZERO);
 		if (tss == NULL) {
 			INP_WUNLOCK(inp);
 			return (ENOBUFS);
 		}
 	}
 
 	/*
 	 * Count number of sources in-mode at t0.
 	 * If buffer space exists and remains, copy out source entries.
 	 */
 	nsrcs = msfr.msfr_nsrcs;
 	ncsrcs = 0;
 	ptss = tss;
 	RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
 		lims = (struct in6_msource *)ims;
 		/* Skip entries whose t0 mode is undefined or not the filter's. */
 		if (lims->im6sl_st[0] == MCAST_UNDEFINED ||
 		    lims->im6sl_st[0] != imf->im6f_st[0])
 			continue;
 		++ncsrcs;
 		if (tss != NULL && nsrcs > 0) {
 			psin = (struct sockaddr_in6 *)ptss;
 			psin->sin6_family = AF_INET6;
 			psin->sin6_len = sizeof(struct sockaddr_in6);
 			psin->sin6_addr = lims->im6s_addr;
 			psin->sin6_port = 0;
 			--nsrcs;
 			++ptss;
 		}
 	}
 
 	INP_WUNLOCK(inp);
 
 	/*
 	 * Copy out with the lock released.  The whole msfr_nsrcs-sized
 	 * buffer is written; slots not filled above remain zeroed.
 	 */
 	if (tss != NULL) {
 		error = copyout(tss, msfr.msfr_srcs,
 		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
 		free(tss, M_TEMP);
 		if (error)
 			return (error);
 	}
 
 	/* Report the total number of in-mode sources, not the number copied. */
 	msfr.msfr_nsrcs = ncsrcs;
 	error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq));
 
 	return (error);
 }
 
 /*
  * Handle a user getsockopt() for the IPv6 multicast options.
  *
  * The inpcb is write-locked on entry to this routine's body and is
  * always unlocked again before returning.  Sockets which are divert
  * sockets, or are neither SOCK_RAW nor SOCK_DGRAM, get EOPNOTSUPP;
  * unknown option names get ENOPROTOOPT.
  */
 int
 ip6_getmoptions(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ip6_moptions	*im6o;
 	int			 error;
 	u_int			 optval;
 
 	INP_WLOCK(inp);
 	im6o = inp->in6p_moptions;
 	/*
 	 * If socket is neither of type SOCK_RAW or SOCK_DGRAM,
 	 * or is a divert socket, reject it.
 	 */
 	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
 	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
 	    inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) {
 		INP_WUNLOCK(inp);
 		return (EOPNOTSUPP);
 	}
 
 	error = 0;
 	switch (sopt->sopt_name) {
 	case IPV6_MULTICAST_IF:
 		if (im6o != NULL && im6o->im6o_multicast_ifp != NULL)
 			optval = im6o->im6o_multicast_ifp->if_index;
 		else
 			optval = 0;
 		break;
 
 	case IPV6_MULTICAST_HOPS:
 		if (im6o != NULL)
 			optval = im6o->im6o_multicast_hlim;
 		else
 			optval = V_ip6_defmcasthlim;
 		break;
 
 	case IPV6_MULTICAST_LOOP:
 		if (im6o != NULL)
 			optval = im6o->im6o_multicast_loop;
 		else
 			optval = in6_mcast_loop; /* XXX VIMAGE */
 		break;
 
 	case IPV6_MSFILTER:
 		/* in6p_get_source_filters() drops the lock itself. */
 		if (im6o != NULL) {
 			error = in6p_get_source_filters(inp, sopt);
 		} else {
 			INP_WUNLOCK(inp);
 			error = EADDRNOTAVAIL;
 		}
 		INP_UNLOCK_ASSERT(inp);
 		return (error);
 
 	default:
 		INP_WUNLOCK(inp);
 		INP_UNLOCK_ASSERT(inp);
 		return (ENOPROTOOPT);
 	}
 
 	/* Scalar options: unlock first, then copy the value out. */
 	INP_WUNLOCK(inp);
 	error = sooptcopyout(sopt, &optval, sizeof(u_int));
 
 	INP_UNLOCK_ASSERT(inp);
 
 	return (error);
 }
 
 /*
  * Look up the ifnet to use for a multicast group membership,
  * given the address of an IPv6 group.
  *
  * This routine exists to support legacy IPv6 multicast applications.
  *
  * If in6p is non-NULL, use that socket's FIB number (inp_inc.inc_fibnum)
  * for the lookup; otherwise RT_DEFAULT_FIB is used. Look up the group
  * address in the unicast FIB, and use its ifp; usually, this points to
  * the default next-hop.
  * If the FIB lookup fails, return NULL.
  *
  * NOTE(review): the first KASSERT dereferences in6p unconditionally, so
  * a NULL in6p is not handled when KASSERT is compiled in -- confirm
  * whether any caller actually passes NULL.
  *
  * FUTURE: Support multiple forwarding tables for IPv6.
  *
  * Returns NULL if no ifp could be found.
  */
 static struct ifnet *
 in6p_lookup_mcast_ifp(const struct inpcb *in6p,
     const struct sockaddr_in6 *gsin6)
 {
-	struct route_in6	 ro6;
-	struct ifnet		*ifp;
+	struct nhop6_basic	nh6;
+	struct in6_addr		dst;
+	uint32_t		scopeid;
+	uint32_t		fibnum;
 
 	KASSERT(in6p->inp_vflag & INP_IPV6,
 	    ("%s: not INP_IPV6 inpcb", __func__));
 	KASSERT(gsin6->sin6_family == AF_INET6,
 	    ("%s: not AF_INET6 group", __func__));
 
-	ifp = NULL;
-	memset(&ro6, 0, sizeof(struct route_in6));
-	memcpy(&ro6.ro_dst, gsin6, sizeof(struct sockaddr_in6));
-	rtalloc_ign_fib((struct route *)&ro6, 0,
-	    in6p ? in6p->inp_inc.inc_fibnum : RT_DEFAULT_FIB);
-	if (ro6.ro_rt != NULL) {
-		ifp = ro6.ro_rt->rt_ifp;
-		KASSERT(ifp != NULL, ("%s: null ifp", __func__));
-		RTFREE(ro6.ro_rt);
-	}
+	in6_splitscope(&gsin6->sin6_addr, &dst, &scopeid);
+	fibnum = in6p ? in6p->inp_inc.inc_fibnum : RT_DEFAULT_FIB;
+	if (fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6) != 0)
+		return (NULL);
 
-	return (ifp);
+	return (nh6.nh_ifp);
 }
 
 /*
  * Join an IPv6 multicast group, possibly with a source.
  *
  * FIXME: The KAME use of the unspecified address (::)
  * to join *all* multicast groups is currently unsupported.
  */
 static int
 in6p_join_group(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct group_source_req		 gsr;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in6_mfilter		*imf;
 	struct ip6_moptions		*imo;
 	struct in6_multi		*inm;
 	struct in6_msource		*lims;
 	size_t				 idx;
 	int				 error, is_new;
 
 	ifp = NULL;
 	imf = NULL;
 	lims = NULL;
 	error = 0;
 	is_new = 0;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	gsa->ss.ss_family = AF_UNSPEC;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 	ssa->ss.ss_family = AF_UNSPEC;
 
 	/*
 	 * Chew everything into struct group_source_req.
 	 * Overwrite the port field if present, as the sockaddr
 	 * being copied in may be matched with a binary comparison.
 	 * Ignore passed-in scope ID.
 	 */
 	switch (sopt->sopt_name) {
 	case IPV6_JOIN_GROUP: {
 		struct ipv6_mreq mreq;
 
 		error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq),
 		    sizeof(struct ipv6_mreq));
 		if (error)
 			return (error);
 
 		gsa->sin6.sin6_family = AF_INET6;
 		gsa->sin6.sin6_len = sizeof(struct sockaddr_in6);
 		gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr;
 
 		if (mreq.ipv6mr_interface == 0) {
 			ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6);
 		} else {
 			if (V_if_index < mreq.ipv6mr_interface)
 				return (EADDRNOTAVAIL);
 			ifp = ifnet_byindex(mreq.ipv6mr_interface);
 		}
 		CTR3(KTR_MLD, "%s: ipv6mr_interface = %d, ifp = %p",
 		    __func__, mreq.ipv6mr_interface, ifp);
 	} break;
 
 	case MCAST_JOIN_GROUP:
 	case MCAST_JOIN_SOURCE_GROUP:
 		if (sopt->sopt_name == MCAST_JOIN_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_req),
 			    sizeof(struct group_req));
 		} else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_source_req),
 			    sizeof(struct group_source_req));
 		}
 		if (error)
 			return (error);
 
 		if (gsa->sin6.sin6_family != AF_INET6 ||
 		    gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 
 		if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
 			if (ssa->sin6.sin6_family != AF_INET6 ||
 			    ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 				return (EINVAL);
 			if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr))
 				return (EINVAL);
 			/*
 			 * TODO: Validate embedded scope ID in source
 			 * list entry against passed-in ifp, if and only
 			 * if source list filter entry is iface or node local.
 			 */
 			in6_clearscope(&ssa->sin6.sin6_addr);
 			ssa->sin6.sin6_port = 0;
 			ssa->sin6.sin6_scope_id = 0;
 		}
 
 		if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
 			return (EADDRNOTAVAIL);
 		ifp = ifnet_byindex(gsr.gsr_interface);
 		break;
 
 	default:
 		CTR2(KTR_MLD, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0)
 		return (EADDRNOTAVAIL);
 
 	gsa->sin6.sin6_port = 0;
 	gsa->sin6.sin6_scope_id = 0;
 
 	/*
 	 * Always set the scope zone ID on memberships created from userland.
 	 * Use the passed-in ifp to do this.
 	 * XXX The in6_setscope() return value is meaningless.
 	 * XXX SCOPE6_LOCK() is taken by in6_setscope().
 	 */
 	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 
 	imo = in6p_findmoptions(inp);
 	idx = im6o_match_group(imo, ifp, &gsa->sa);
 	if (idx == -1) {
 		is_new = 1;
 	} else {
 		inm = imo->im6o_membership[idx];
 		imf = &imo->im6o_mfilters[idx];
 		if (ssa->ss.ss_family != AF_UNSPEC) {
 			/*
 			 * MCAST_JOIN_SOURCE_GROUP on an exclusive membership
 			 * is an error. On an existing inclusive membership,
 			 * it just adds the source to the filter list.
 			 */
 			if (imf->im6f_st[1] != MCAST_INCLUDE) {
 				error = EINVAL;
 				goto out_in6p_locked;
 			}
 			/*
 			 * Throw out duplicates.
 			 *
 			 * XXX FIXME: This makes a naive assumption that
 			 * even if entries exist for *ssa in this imf,
 			 * they will be rejected as dupes, even if they
 			 * are not valid in the current mode (in-mode).
 			 *
 			 * in6_msource is transactioned just as for anything
 			 * else in SSM -- but note naive use of in6m_graft()
 			 * below for allocating new filter entries.
 			 *
 			 * This is only an issue if someone mixes the
 			 * full-state SSM API with the delta-based API,
 			 * which is discouraged in the relevant RFCs.
 			 */
 			lims = im6o_match_source(imo, idx, &ssa->sa);
 			if (lims != NULL /*&&
 			    lims->im6sl_st[1] == MCAST_INCLUDE*/) {
 				error = EADDRNOTAVAIL;
 				goto out_in6p_locked;
 			}
 		} else {
 			/*
 			 * MCAST_JOIN_GROUP alone, on any existing membership,
 			 * is rejected, to stop the same inpcb tying up
 			 * multiple refs to the in_multi.
 			 * On an existing inclusive membership, this is also
 			 * an error; if you want to change filter mode,
 			 * you must use the userland API setsourcefilter().
 			 * XXX We don't reject this for imf in UNDEFINED
 			 * state at t1, because allocation of a filter
 			 * is atomic with allocation of a membership.
 			 */
 			error = EINVAL;
 			goto out_in6p_locked;
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	if (is_new) {
 		if (imo->im6o_num_memberships == imo->im6o_max_memberships) {
 			error = im6o_grow(imo);
 			if (error)
 				goto out_in6p_locked;
 		}
 		/*
 		 * Allocate the new slot upfront so we can deal with
 		 * grafting the new source filter in same code path
 		 * as for join-source on existing membership.
 		 */
 		idx = imo->im6o_num_memberships;
 		imo->im6o_membership[idx] = NULL;
 		imo->im6o_num_memberships++;
 		KASSERT(imo->im6o_mfilters != NULL,
 		    ("%s: im6f_mfilters vector was not allocated", __func__));
 		imf = &imo->im6o_mfilters[idx];
 		KASSERT(RB_EMPTY(&imf->im6f_sources),
 		    ("%s: im6f_sources not empty", __func__));
 	}
 
 	/*
 	 * Graft new source into filter list for this inpcb's
 	 * membership of the group. The in6_multi may not have
 	 * been allocated yet if this is a new membership, however,
 	 * the in_mfilter slot will be allocated and must be initialized.
 	 *
 	 * Note: Grafting of exclusive mode filters doesn't happen
 	 * in this path.
 	 * XXX: Should check for non-NULL lims (node exists but may
 	 * not be in-mode) for interop with full-state API.
 	 */
 	if (ssa->ss.ss_family != AF_UNSPEC) {
 		/* Membership starts in IN mode */
 		if (is_new) {
 			CTR1(KTR_MLD, "%s: new join w/source", __func__);
 			im6f_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE);
 		} else {
 			CTR2(KTR_MLD, "%s: %s source", __func__, "allow");
 		}
 		lims = im6f_graft(imf, MCAST_INCLUDE, &ssa->sin6);
 		if (lims == NULL) {
 			CTR1(KTR_MLD, "%s: merge imf state failed",
 			    __func__);
 			error = ENOMEM;
 			goto out_im6o_free;
 		}
 	} else {
 		/* No address specified; Membership starts in EX mode */
 		if (is_new) {
 			CTR1(KTR_MLD, "%s: new join w/o source", __func__);
 			im6f_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE);
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 */
 	IN6_MULTI_LOCK();
 
 	if (is_new) {
 		error = in6_mc_join_locked(ifp, &gsa->sin6.sin6_addr, imf,
 		    &inm, 0);
 		if (error) {
 			IN6_MULTI_UNLOCK();
 			goto out_im6o_free;
 		}
 		imo->im6o_membership[idx] = inm;
 	} else {
 		CTR1(KTR_MLD, "%s: merge inm state", __func__);
 		error = in6m_merge(inm, imf);
 		if (error)
 			CTR1(KTR_MLD, "%s: failed to merge inm state",
 			    __func__);
 		else {
 			CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 			error = mld_change_state(inm, 0);
 			if (error)
 				CTR1(KTR_MLD, "%s: failed mld downcall",
 				    __func__);
 		}
 	}
 
 	IN6_MULTI_UNLOCK();
 	INP_WLOCK_ASSERT(inp);
 	if (error) {
 		im6f_rollback(imf);
 		if (is_new)
 			im6f_purge(imf);
 		else
 			im6f_reap(imf);
 	} else {
 		im6f_commit(imf);
 	}
 
 out_im6o_free:
 	if (error && is_new) {
 		imo->im6o_membership[idx] = NULL;
 		--imo->im6o_num_memberships;
 	}
 
 out_in6p_locked:
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Leave an IPv6 multicast group on an inpcb, possibly with a source.
  */
 static int
 in6p_leave_group(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ipv6_mreq		 mreq;
 	struct group_source_req		 gsr;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in6_mfilter		*imf;
 	struct ip6_moptions		*imo;
 	struct in6_msource		*ims;
 	struct in6_multi		*inm;
 	uint32_t			 ifindex;
 	size_t				 idx;
 	int				 error, is_final;
 #ifdef KTR
 	char				 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	ifp = NULL;
 	ifindex = 0;
 	error = 0;
 	is_final = 1;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	gsa->ss.ss_family = AF_UNSPEC;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 	ssa->ss.ss_family = AF_UNSPEC;
 
 	/*
 	 * Chew everything passed in up into a struct group_source_req
 	 * as that is easier to process.
 	 * Note: Any embedded scope ID in the multicast group passed
 	 * in by userland is ignored, the interface index is the recommended
 	 * mechanism to specify an interface; see below.
 	 */
 	switch (sopt->sopt_name) {
 	case IPV6_LEAVE_GROUP:
 		error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq),
 		    sizeof(struct ipv6_mreq));
 		if (error)
 			return (error);
 		gsa->sin6.sin6_family = AF_INET6;
 		gsa->sin6.sin6_len = sizeof(struct sockaddr_in6);
 		gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr;
 		gsa->sin6.sin6_port = 0;
 		gsa->sin6.sin6_scope_id = 0;
 		ifindex = mreq.ipv6mr_interface;
 		break;
 
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 		if (sopt->sopt_name == MCAST_LEAVE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_req),
 			    sizeof(struct group_req));
 		} else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_source_req),
 			    sizeof(struct group_source_req));
 		}
 		if (error)
 			return (error);
 
 		if (gsa->sin6.sin6_family != AF_INET6 ||
 		    gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 		if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
 			if (ssa->sin6.sin6_family != AF_INET6 ||
 			    ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 				return (EINVAL);
 			if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr))
 				return (EINVAL);
 			/*
 			 * TODO: Validate embedded scope ID in source
 			 * list entry against passed-in ifp, if and only
 			 * if source list filter entry is iface or node local.
 			 */
 			in6_clearscope(&ssa->sin6.sin6_addr);
 		}
 		gsa->sin6.sin6_port = 0;
 		gsa->sin6.sin6_scope_id = 0;
 		ifindex = gsr.gsr_interface;
 		break;
 
 	default:
 		CTR2(KTR_MLD, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	/*
 	 * Validate interface index if provided. If no interface index
 	 * was provided separately, attempt to look the membership up
 	 * from the default scope as a last resort to disambiguate
 	 * the membership we are being asked to leave.
 	 * XXX SCOPE6 lock potentially taken here.
 	 */
 	if (ifindex != 0) {
 		if (V_if_index < ifindex)
 			return (EADDRNOTAVAIL);
 		ifp = ifnet_byindex(ifindex);
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 		(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 	} else {
 		error = sa6_embedscope(&gsa->sin6, V_ip6_use_defzone);
 		if (error)
 			return (EADDRNOTAVAIL);
 		/*
 		 * Some badly behaved applications don't pass an ifindex
 		 * or a scope ID, which is an API violation. In this case,
 		 * perform a lookup as per a v6 join.
 		 *
 		 * XXX For now, stomp on zone ID for the corner case.
 		 * This is not the 'KAME way', but we need to see the ifp
 		 * directly until such time as this implementation is
 		 * refactored, assuming the scope IDs are the way to go.
 		 */
 		ifindex = ntohs(gsa->sin6.sin6_addr.s6_addr16[1]);
 		if (ifindex == 0) {
 			CTR2(KTR_MLD, "%s: warning: no ifindex, looking up "
 			    "ifp for group %s.", __func__,
 			    ip6_sprintf(ip6tbuf, &gsa->sin6.sin6_addr));
 			ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6);
 		} else {
 			ifp = ifnet_byindex(ifindex);
 		}
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 	}
 
 	CTR2(KTR_MLD, "%s: ifp = %p", __func__, ifp);
 	KASSERT(ifp != NULL, ("%s: ifp did not resolve", __func__));
 
 	/*
 	 * Find the membership in the membership array.
 	 */
 	imo = in6p_findmoptions(inp);
 	idx = im6o_match_group(imo, ifp, &gsa->sa);
 	if (idx == -1) {
 		error = EADDRNOTAVAIL;
 		goto out_in6p_locked;
 	}
 	inm = imo->im6o_membership[idx];
 	imf = &imo->im6o_mfilters[idx];
 
 	if (ssa->ss.ss_family != AF_UNSPEC)
 		is_final = 0;
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * If we were instructed only to leave a given source, do so.
 	 * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships.
 	 */
 	if (is_final) {
 		im6f_leave(imf);
 	} else {
 		if (imf->im6f_st[0] == MCAST_EXCLUDE) {
 			error = EADDRNOTAVAIL;
 			goto out_in6p_locked;
 		}
 		ims = im6o_match_source(imo, idx, &ssa->sa);
 		if (ims == NULL) {
 			CTR3(KTR_MLD, "%s: source %p %spresent", __func__,
 			    ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr),
 			    "not ");
 			error = EADDRNOTAVAIL;
 			goto out_in6p_locked;
 		}
 		CTR2(KTR_MLD, "%s: %s source", __func__, "block");
 		error = im6f_prune(imf, &ssa->sin6);
 		if (error) {
 			CTR1(KTR_MLD, "%s: merge imf state failed",
 			    __func__);
 			goto out_in6p_locked;
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 */
 	IN6_MULTI_LOCK();
 
 	if (is_final) {
 		/*
 		 * Give up the multicast address record to which
 		 * the membership points.
 		 */
 		(void)in6_mc_leave_locked(inm, imf);
 	} else {
 		CTR1(KTR_MLD, "%s: merge inm state", __func__);
 		error = in6m_merge(inm, imf);
 		if (error)
 			CTR1(KTR_MLD, "%s: failed to merge inm state",
 			    __func__);
 		else {
 			CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 			error = mld_change_state(inm, 0);
 			if (error)
 				CTR1(KTR_MLD, "%s: failed mld downcall",
 				    __func__);
 		}
 	}
 
 	IN6_MULTI_UNLOCK();
 
 	if (error)
 		im6f_rollback(imf);
 	else
 		im6f_commit(imf);
 
 	im6f_reap(imf);
 
 	if (is_final) {
 		/* Remove the gap in the membership array. */
 		for (++idx; idx < imo->im6o_num_memberships; ++idx) {
 			imo->im6o_membership[idx-1] = imo->im6o_membership[idx];
 			imo->im6o_mfilters[idx-1] = imo->im6o_mfilters[idx];
 		}
 		imo->im6o_num_memberships--;
 	}
 
 out_in6p_locked:
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Select the interface for transmitting IPv6 multicast datagrams.
  *
  * The option value must be exactly one u_int holding an interface index.
  * An interface index of 0 removes a previous selection (the stored
  * im6o_multicast_ifp becomes NULL).
  * When no interface is selected, one is chosen for every send.
  *
  * Returns 0 on success, the sooptcopyin() error if the copy fails,
  * EINVAL for a wrongly sized value, an index above V_if_index or an index
  * with no ifnet, and EADDRNOTAVAIL if the interface lacks IFF_MULTICAST.
  * No INP lock is held on return.
  */
 static int
 in6p_set_multicast_if(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ip6_moptions	*mopts;
 	struct ifnet		*oifp;
 	u_int			 ifindex;
 	int			 error;
 
 	if (sopt->sopt_valsize != sizeof(u_int))
 		return (EINVAL);
 
 	error = sooptcopyin(sopt, &ifindex, sizeof(u_int), sizeof(u_int));
 	if (error)
 		return (error);
 
 	/* Index 0 means "no explicit interface"; anything else must resolve. */
 	oifp = NULL;
 	if (ifindex != 0) {
 		if (V_if_index < ifindex)
 			return (EINVAL);
 		oifp = ifnet_byindex(ifindex);
 		if (oifp == NULL)
 			return (EINVAL);
 		if (!(oifp->if_flags & IFF_MULTICAST))
 			return (EADDRNOTAVAIL);
 	}
 
 	/* The moptions lookup is paired with the INP_WUNLOCK() below. */
 	mopts = in6p_findmoptions(inp);
 	mopts->im6o_multicast_ifp = oifp;
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 /*
  * Atomically set source filters on a socket for an IPv6 multicast group.
  *
  * The request is a struct __msfilterreq naming the group, the interface
  * index, the filter mode (MCAST_INCLUDE or MCAST_EXCLUDE) and an optional
  * user-space vector of msfr_nsrcs source addresses.  The new state is
  * staged in the socket-layer filter at t1, merged into the in6_multi
  * record, and then either committed or rolled back as a unit.
  *
  * Returns 0 on success or an errno value; no INP lock is held on return.
  *
  * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
  */
 static int
 in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct __msfilterreq	 msfr;
 	sockunion_t		*gsa;
 	struct ifnet		*ifp;
 	struct in6_mfilter	*imf;
 	struct ip6_moptions	*imo;
 	struct in6_multi		*inm;
 	size_t			 idx;
 	int			 error;
 
 	error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
 	    sizeof(struct __msfilterreq));
 	if (error)
 		return (error);
 
 	if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc)
 		return (ENOBUFS);
 
 	if (msfr.msfr_fmode != MCAST_EXCLUDE &&
 	    msfr.msfr_fmode != MCAST_INCLUDE)
 		return (EINVAL);
 
 	if (msfr.msfr_group.ss_family != AF_INET6 ||
 	    msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6))
 		return (EINVAL);
 
 	gsa = (sockunion_t *)&msfr.msfr_group;
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	gsa->sin6.sin6_port = 0;	/* ignore port */
 
 	if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
 		return (EADDRNOTAVAIL);
 	ifp = ifnet_byindex(msfr.msfr_ifindex);
 	if (ifp == NULL)
 		return (EADDRNOTAVAIL);
 	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 
 	/*
 	 * Take the INP write lock.
 	 * Check if this socket is a member of this group.
 	 */
 	imo = in6p_findmoptions(inp);
 	idx = im6o_match_group(imo, ifp, &gsa->sa);
 	if (idx == -1 || imo->im6o_mfilters == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_in6p_locked;
 	}
 	inm = imo->im6o_membership[idx];
 	imf = &imo->im6o_mfilters[idx];
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	imf->im6f_st[1] = msfr.msfr_fmode;
 
 	/*
 	 * Apply any new source filters, if present.
 	 * Make a copy of the user-space source vector so
 	 * that we may copy them with a single copyin. This
 	 * allows us to deal with page faults up-front.
 	 */
 	if (msfr.msfr_nsrcs > 0) {
 		struct in6_msource	*lims;
 		struct sockaddr_in6	*psin;
 		struct sockaddr_storage	*kss, *pkss;
 		int			 i;
 
 		INP_WUNLOCK(inp);
 
 		CTR2(KTR_MLD, "%s: loading %lu source list entries",
 		    __func__, (unsigned long)msfr.msfr_nsrcs);
 		kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
 		    M_TEMP, M_WAITOK);
 		error = copyin(msfr.msfr_srcs, kss,
 		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
 		if (error) {
 			free(kss, M_TEMP);
 			/*
 			 * The t1 filter mode was already overwritten above.
 			 * Re-take the lock and unwind through the rollback
 			 * path so the uncommitted mode cannot leak into a
 			 * later merge on this membership.
 			 */
 			INP_WLOCK(inp);
 			goto out_im6f_rollback;
 		}
 
 		INP_WLOCK(inp);
 
 		/*
 		 * Mark all source filters as UNDEFINED at t1.
 		 * Restore new group filter mode, as im6f_leave()
 		 * will set it to INCLUDE.
 		 */
 		im6f_leave(imf);
 		imf->im6f_st[1] = msfr.msfr_fmode;
 
 		/*
 		 * Update socket layer filters at t1, lazy-allocating
 		 * new entries. This saves a bunch of memory at the
 		 * cost of one RB_FIND() per source entry; duplicate
 		 * entries in the msfr_nsrcs vector are ignored.
 		 * If we encounter an error, rollback transaction.
 		 *
 		 * XXX This too could be replaced with a set-symmetric
 		 * difference like loop to avoid walking from root
 		 * every time, as the key space is common.
 		 */
 		for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) {
 			psin = (struct sockaddr_in6 *)pkss;
 			if (psin->sin6_family != AF_INET6) {
 				error = EAFNOSUPPORT;
 				break;
 			}
 			if (psin->sin6_len != sizeof(struct sockaddr_in6)) {
 				error = EINVAL;
 				break;
 			}
 			if (IN6_IS_ADDR_MULTICAST(&psin->sin6_addr)) {
 				error = EINVAL;
 				break;
 			}
 			/*
 			 * TODO: Validate embedded scope ID in source
 			 * list entry against passed-in ifp, if and only
 			 * if source list filter entry is iface or node local.
 			 */
 			in6_clearscope(&psin->sin6_addr);
 			error = im6f_get_source(imf, psin, &lims);
 			if (error)
 				break;
 			lims->im6sl_st[1] = imf->im6f_st[1];
 		}
 		free(kss, M_TEMP);
 	}
 
 	if (error)
 		goto out_im6f_rollback;
 
 	INP_WLOCK_ASSERT(inp);
 	IN6_MULTI_LOCK();
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 */
 	CTR1(KTR_MLD, "%s: merge inm state", __func__);
 	error = in6m_merge(inm, imf);
 	if (error)
 		CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
 	else {
 		CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 		error = mld_change_state(inm, 0);
 		if (error)
 			CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
 	}
 
 	IN6_MULTI_UNLOCK();
 
 	/* Finish the socket-layer transaction: undo t1 on error, else commit. */
 out_im6f_rollback:
 	if (error)
 		im6f_rollback(imf);
 	else
 		im6f_commit(imf);
 
 	im6f_reap(imf);
 
 out_in6p_locked:
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Set the IP multicast options in response to user setsockopt().
  *
  * Many of the socket options handled in this function duplicate the
  * functionality of socket options in the regular unicast API. However,
  * it is not possible to merge the duplicate code, because the idempotence
  * of the IPv6 multicast part of the BSD Sockets API must be preserved;
  * the effects of these options must be treated as separate and distinct.
  *
  * SMPng: XXX: Unlocked read of inp_socket believed OK.
  */
 int
 ip6_setmoptions(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ip6_moptions	*im6o;
 	int			 error;
 
 	error = 0;
 
 	/*
 	 * If socket is neither of type SOCK_RAW or SOCK_DGRAM,
 	 * or is a divert socket, reject it.
 	 */
 	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
 	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
 	     inp->inp_socket->so_proto->pr_type != SOCK_DGRAM))
 		return (EOPNOTSUPP);
 
 	switch (sopt->sopt_name) {
 	case IPV6_MULTICAST_IF:
 		error = in6p_set_multicast_if(inp, sopt);
 		break;
 
 	case IPV6_MULTICAST_HOPS: {
 		int hlim;
 
 		if (sopt->sopt_valsize != sizeof(int)) {
 			error = EINVAL;
 			break;
 		}
 		error = sooptcopyin(sopt, &hlim, sizeof(hlim), sizeof(int));
 		if (error)
 			break;
 		if (hlim < -1 || hlim > 255) {
 			error = EINVAL;
 			break;
 		} else if (hlim == -1) {
 			hlim = V_ip6_defmcasthlim;
 		}
 		im6o = in6p_findmoptions(inp);
 		im6o->im6o_multicast_hlim = hlim;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IPV6_MULTICAST_LOOP: {
 		u_int loop;
 
 		/*
 		 * Set the loopback flag for outgoing multicast packets.
 		 * Must be zero or one.
 		 */
 		if (sopt->sopt_valsize != sizeof(u_int)) {
 			error = EINVAL;
 			break;
 		}
 		error = sooptcopyin(sopt, &loop, sizeof(u_int), sizeof(u_int));
 		if (error)
 			break;
 		if (loop > 1) {
 			error = EINVAL;
 			break;
 		}
 		im6o = in6p_findmoptions(inp);
 		im6o->im6o_multicast_loop = loop;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IPV6_JOIN_GROUP:
 	case MCAST_JOIN_GROUP:
 	case MCAST_JOIN_SOURCE_GROUP:
 		error = in6p_join_group(inp, sopt);
 		break;
 
 	case IPV6_LEAVE_GROUP:
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 		error = in6p_leave_group(inp, sopt);
 		break;
 
 	case MCAST_BLOCK_SOURCE:
 	case MCAST_UNBLOCK_SOURCE:
 		error = in6p_block_unblock_source(inp, sopt);
 		break;
 
 	case IPV6_MSFILTER:
 		error = in6p_set_source_filters(inp, sopt);
 		break;
 
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 
 	INP_UNLOCK_ASSERT(inp);
 
 	return (error);
 }
 
 /*
  * Expose MLD's multicast filter mode and source list(s) to userland,
  * keyed by (ifindex, group).
  * The filter mode is written out as a uint32_t, followed by
  * 0..n of struct in6_addr.
  * For use by ifmcstat(8).
  *
  * The node is read-only (EPERM on any write attempt).  The name vector
  * must hold exactly five ints: the interface index followed by the four
  * 32-bit words of the group address.  Returns 0, or EINVAL/ENOENT for a
  * malformed or unknown key, or the error reported while copying out.
  *
  * SMPng: NOTE: unlocked read of ifindex space.
  */
 static int
 sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS)
 {
 	struct in6_addr			 mcaddr;
 	struct in6_addr			 src;
 	struct ifnet			*ifp;
 	struct ifmultiaddr		*ifma;
 	struct in6_multi		*inm;
 	struct ip6_msource		*ims;
 	int				*name;
 	int				 retval;
 	u_int				 namelen;
 	uint32_t			 fmode, ifindex;
 #ifdef KTR
 	char				 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	name = (int *)arg1;
 	namelen = arg2;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	/* int: ifindex + 4 * 32 bits of IPv6 address */
 	if (namelen != 5)
 		return (EINVAL);
 
 	ifindex = name[0];
 	if (ifindex <= 0 || ifindex > V_if_index) {
 		CTR2(KTR_MLD, "%s: ifindex %u out of range",
 		    __func__, ifindex);
 		return (ENOENT);
 	}
 
 	memcpy(&mcaddr, &name[1], sizeof(struct in6_addr));
 	if (!IN6_IS_ADDR_MULTICAST(&mcaddr)) {
 		CTR2(KTR_MLD, "%s: group %s is not multicast",
 		    __func__, ip6_sprintf(ip6tbuf, &mcaddr));
 		return (EINVAL);
 	}
 
 	ifp = ifnet_byindex(ifindex);
 	if (ifp == NULL) {
 		CTR2(KTR_MLD, "%s: no ifp for ifindex %u",
 		    __func__, ifindex);
 		return (ENOENT);
 	}
 	/*
 	 * Internal MLD lookups require that scope/zone ID is set.
 	 */
 	(void)in6_setscope(&mcaddr, ifp, NULL);
 
 	retval = sysctl_wire_old_buffer(req,
 	    sizeof(uint32_t) + (in6_mcast_maxgrpsrc * sizeof(struct in6_addr)));
 	if (retval)
 		return (retval);
 
 	IN6_MULTI_LOCK();
 
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_INET6 ||
 		    ifma->ifma_protospec == NULL)
 			continue;
 		inm = (struct in6_multi *)ifma->ifma_protospec;
 		if (!IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &mcaddr))
 			continue;
 		fmode = inm->in6m_st[1].iss_fmode;
 		retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t));
 		if (retval != 0)
 			break;
 		RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) {
 			CTR2(KTR_MLD, "%s: visit node %p", __func__, ims);
 			/*
 			 * Only copy-out sources which are in-mode.
 			 */
 			if (fmode != im6s_get_mode(inm, ims, 1)) {
 				CTR1(KTR_MLD, "%s: skip non-in-mode",
 				    __func__);
 				continue;
 			}
 			src = ims->im6s_addr;
 			retval = SYSCTL_OUT(req, &src,
 			    sizeof(struct in6_addr));
 			if (retval != 0)
 				break;
 		}
 		/*
 		 * The break above only leaves the source walk; stop the
 		 * address walk too so a copy-out failure ends the request
 		 * instead of continuing with further entries.
 		 */
 		if (retval != 0)
 			break;
 	}
 	IF_ADDR_RUNLOCK(ifp);
 
 	IN6_MULTI_UNLOCK();
 
 	return (retval);
 }
 
 #ifdef KTR
 
 /* Short tags for the filter modes, indexed by mode value. */
 static const char *in6m_modestrs[] = { "un", "in", "ex" };
 
 /*
  * Map a filter mode to its short tag; anything outside
  * MCAST_UNDEFINED..MCAST_EXCLUDE yields "??".
  */
 static const char *
 in6m_mode_str(const int mode)
 {
 
 	if (mode < MCAST_UNDEFINED || mode > MCAST_EXCLUDE)
 		return ("??");
 	return (in6m_modestrs[mode]);
 }
 
 /* Printable names for the MLD membership states, indexed by state value. */
 static const char *in6m_statestrs[] = {
 	"not-member",
 	"silent",
 	"idle",
 	"lazy",
 	"sleeping",
 	"awakening",
 	"query-pending",
 	"sg-query-pending",
 	"leaving"
 };
 
 /*
  * Map an MLD state to its printable name; anything outside
  * MLD_NOT_MEMBER..MLD_LEAVING_MEMBER yields "??".
  */
 static const char *
 in6m_state_str(const int state)
 {
 
 	if (state < MLD_NOT_MEMBER || state > MLD_LEAVING_MEMBER)
 		return ("??");
 	return (in6m_statestrs[state]);
 }
 
 /*
  * Dump an in6_multi structure to the console.
  * Prints nothing unless KTR_MLD is enabled in ktr_mask.
  */
 void
 in6m_print(const struct in6_multi *inm)
 {
 	char addrbuf[INET6_ADDRSTRLEN];
 	int slot;
 
 	if (!(ktr_mask & KTR_MLD))
 		return;
 
 	printf("%s: --- begin in6m %p ---\n", __func__, inm);
 
 	/* Identity: group address, interface and link-layer membership. */
 	printf("addr %s ifp %p(%s) ifma %p\n",
 	    ip6_sprintf(addrbuf, &inm->in6m_addr), inm->in6m_ifp,
 	    if_name(inm->in6m_ifp), inm->in6m_ifma);
 
 	/* Timer, state, reference count and queue length. */
 	printf("timer %u state %s refcount %u scq.len %u\n",
 	    inm->in6m_timer, in6m_state_str(inm->in6m_state),
 	    inm->in6m_refcount, mbufq_len(&inm->in6m_scq));
 
 	printf("mli %p nsrc %lu sctimer %u scrv %u\n",
 	    inm->in6m_mli, inm->in6m_nsrc, inm->in6m_sctimer,
 	    inm->in6m_scrv);
 
 	/* Both in6m_st[] slots, t0 then t1. */
 	for (slot = 0; slot < 2; slot++) {
 		printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", slot,
 		    in6m_mode_str(inm->in6m_st[slot].iss_fmode),
 		    inm->in6m_st[slot].iss_asm,
 		    inm->in6m_st[slot].iss_ex,
 		    inm->in6m_st[slot].iss_in,
 		    inm->in6m_st[slot].iss_rec);
 	}
 
 	printf("%s: --- end in6m %p ---\n", __func__, inm);
 }
 
 #else /* !KTR */
 
 /*
  * Stub used when KTR is not compiled in: keeps in6m_print() callable
  * but does nothing.
  */
 void
 in6m_print(const struct in6_multi *inm)
 {
 
 }
 
 #endif /* KTR */
Index: head/sys/netinet6/in6_src.c
===================================================================
--- head/sys/netinet6/in6_src.c	(revision 292014)
+++ head/sys/netinet6/in6_src.c	(revision 292015)
@@ -1,1171 +1,1169 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: in6_src.c,v 1.132 2003/08/26 04:42:27 keiichi Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_pcb.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_mpath.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/errno.h>
 #include <sys/time.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/rmlock.h>
 #include <sys/sx.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/if_llatbl.h>
 #ifdef RADIX_MPATH
 #include <net/radix_mpath.h>
 #endif
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
+#include <netinet6/in6_fib.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 
 /* Mutex named "addrsel_lock" (MTX_DEF) and its wrapper macros. */
 static struct mtx addrsel_lock;
 #define	ADDRSEL_LOCK_INIT()	mtx_init(&addrsel_lock, "addrsel_lock", NULL, MTX_DEF)
 #define	ADDRSEL_LOCK()		mtx_lock(&addrsel_lock)
 #define	ADDRSEL_UNLOCK()	mtx_unlock(&addrsel_lock)
 #define	ADDRSEL_LOCK_ASSERT()	mtx_assert(&addrsel_lock, MA_OWNED)
 
 /* sx lock named "addrsel_sxlock" with shared (S) and exclusive (X) wrappers. */
 static struct sx addrsel_sxlock;
 #define	ADDRSEL_SXLOCK_INIT()	sx_init(&addrsel_sxlock, "addrsel_sxlock")
 #define	ADDRSEL_SLOCK()		sx_slock(&addrsel_sxlock)
 #define	ADDRSEL_SUNLOCK()	sx_sunlock(&addrsel_sxlock)
 #define	ADDRSEL_XLOCK()		sx_xlock(&addrsel_sxlock)
 #define	ADDRSEL_XUNLOCK()	sx_xunlock(&addrsel_sxlock)
 
 /*
  * Policy label value that in6_selectsrc() Rule 6 treats as "skip the
  * label comparison" when it is the destination's label.
  */
 #define ADDR_LABEL_NOTAPP (-1)
 /* Per-VNET policy object, accessed through V_defaultaddrpolicy. */
 static VNET_DEFINE(struct in6_addrpolicy, defaultaddrpolicy);
 #define	V_defaultaddrpolicy		VNET(defaultaddrpolicy)
 
 /* Per-VNET default for Rule 7 of in6_selectsrc(); initially 0. */
 VNET_DEFINE(int, ip6_prefer_tempaddr) = 0;
 
 /* Route and outgoing-interface selection helpers. */
 static int selectroute(struct sockaddr_in6 *, struct ip6_pktopts *,
 	struct ip6_moptions *, struct route_in6 *, struct ifnet **,
 	struct rtentry **, int, u_int);
 static int in6_selectif(struct sockaddr_in6 *, struct ip6_pktopts *,
 	struct ip6_moptions *, struct route_in6 *ro, struct ifnet **,
 	struct ifnet *, u_int);
 
 static struct in6_addrpolicy *lookup_addrsel_policy(struct sockaddr_in6 *);
 
 /* Address selection policy table helpers. */
 static void init_policy_queue(void);
 static int add_addrsel_policyent(struct in6_addrpolicy *);
 static int delete_addrsel_policyent(struct in6_addrpolicy *);
 static int walk_addrsel_policy(int (*)(struct in6_addrpolicy *, void *),
 	void *);
 static int dump_addrsel_policyent(struct in6_addrpolicy *, void *);
 static struct in6_addrpolicy *match_addrsel_policy(struct sockaddr_in6 *);
 
 /*
  * Return an IPv6 address, which is the most appropriate for a given
  * destination and user specified options.
  * If necessary, this function lookups the routing table and returns
  * an entry to the caller for later use.
  */
 /*
  * Control-flow helpers for the candidate loop in in6_selectsrc().  They
  * expand to gotos because the do/while(0) wrapper would capture a plain
  * 'continue' or 'break'.  'r' is the number of the rule that decided.
  *
  * REPLACE(r): count rule r in ip6s_sources_rule[], record it in 'rule'
  * and jump to the 'replace' label.
  */
 #define REPLACE(r) do {\
 	IP6STAT_INC(ip6s_sources_rule[(r)]); \
 	rule = (r);	\
 	/* { \
 	char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \
 	printf("in6_selectsrc: replace %s with %s by %d\n", ia_best ? ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \
 	} */ \
 	goto replace; \
 } while(0)
 /* NEXT(r): jump to the 'next' label; nothing is counted or recorded. */
 #define NEXT(r) do {\
 	/* { \
 	char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \
 	printf("in6_selectsrc: keep %s against %s by %d\n", ia_best ? ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \
 	} */ \
 	goto next;		/* XXX: we can't use 'continue' here */ \
 } while(0)
 /*
  * BREAK(r): count rule r, record it in 'rule' and jump to the 'out'
  * label.  Unlike REPLACE() this does not pass through 'replace'.
  */
 #define BREAK(r) do { \
 	IP6STAT_INC(ip6s_sources_rule[(r)]); \
 	rule = (r);	\
 	goto out;		/* XXX: we can't use 'break' here */ \
 } while(0)
 
 int
 in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
     struct inpcb *inp, struct route_in6 *ro, struct ucred *cred,
     struct ifnet **ifpp, struct in6_addr *srcp)
 {
 	struct rm_priotracker in6_ifa_tracker;
 	struct in6_addr dst, tmp;
 	struct ifnet *ifp = NULL, *oifp = NULL;
 	struct in6_ifaddr *ia = NULL, *ia_best = NULL;
 	struct in6_pktinfo *pi = NULL;
 	int dst_scope = -1, best_scope = -1, best_matchlen = -1;
 	struct in6_addrpolicy *dst_policy = NULL, *best_policy = NULL;
 	u_int32_t odstzone;
 	int prefer_tempaddr;
 	int error, rule;
 	struct ip6_moptions *mopts;
 
 	KASSERT(srcp != NULL, ("%s: srcp is NULL", __func__));
 
 	dst = dstsock->sin6_addr; /* make a copy for local operation */
 	if (ifpp) {
 		/*
 		 * Save a possibly passed in ifp for in6_selectsrc. Only
 		 * neighbor discovery code should use this feature, where
 		 * we may know the interface but not the FIB number holding
 		 * the connected subnet in case someone deleted it from the
 		 * default FIB and we need to check the interface.
 		 */
 		if (*ifpp != NULL)
 			oifp = *ifpp;
 		*ifpp = NULL;
 	}
 
 	if (inp != NULL) {
 		INP_LOCK_ASSERT(inp);
 		mopts = inp->in6p_moptions;
 	} else {
 		mopts = NULL;
 	}
 
 	/*
 	 * If the source address is explicitly specified by the caller,
 	 * check if the requested source address is indeed a unicast address
 	 * assigned to the node, and can be used as the packet's source
 	 * address.  If everything is okay, use the address as source.
 	 */
 	if (opts && (pi = opts->ip6po_pktinfo) &&
 	    !IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) {
 		struct sockaddr_in6 srcsock;
 		struct in6_ifaddr *ia6;
 
 		/* get the outgoing interface */
 		if ((error = in6_selectif(dstsock, opts, mopts, ro, &ifp, oifp,
 		    (inp != NULL) ? inp->inp_inc.inc_fibnum : RT_DEFAULT_FIB))
 		    != 0)
 			return (error);
 
 		/*
 		 * determine the appropriate zone id of the source based on
 		 * the zone of the destination and the outgoing interface.
 		 * If the specified address is ambiguous wrt the scope zone,
 		 * the interface must be specified; otherwise, ifa_ifwithaddr()
 		 * will fail matching the address.
 		 */
 		bzero(&srcsock, sizeof(srcsock));
 		srcsock.sin6_family = AF_INET6;
 		srcsock.sin6_len = sizeof(srcsock);
 		srcsock.sin6_addr = pi->ipi6_addr;
 		if (ifp) {
 			error = in6_setscope(&srcsock.sin6_addr, ifp, NULL);
 			if (error)
 				return (error);
 		}
 		if (cred != NULL && (error = prison_local_ip6(cred,
 		    &srcsock.sin6_addr, (inp != NULL &&
 		    (inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0)
 			return (error);
 
 		ia6 = (struct in6_ifaddr *)ifa_ifwithaddr(
 		    (struct sockaddr *)&srcsock);
 		if (ia6 == NULL ||
 		    (ia6->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY))) {
 			if (ia6 != NULL)
 				ifa_free(&ia6->ia_ifa);
 			return (EADDRNOTAVAIL);
 		}
 		pi->ipi6_addr = srcsock.sin6_addr; /* XXX: this overrides pi */
 		if (ifpp)
 			*ifpp = ifp;
 		bcopy(&ia6->ia_addr.sin6_addr, srcp, sizeof(*srcp));
 		ifa_free(&ia6->ia_ifa);
 		return (0);
 	}
 
 	/*
 	 * Otherwise, if the socket has already bound the source, just use it.
 	 */
 	if (inp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
 		if (cred != NULL &&
 		    (error = prison_local_ip6(cred, &inp->in6p_laddr,
 		    ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0)
 			return (error);
 		bcopy(&inp->in6p_laddr, srcp, sizeof(*srcp));
 		return (0);
 	}
 
 	/*
 	 * Bypass source address selection and use the primary jail IP
 	 * if requested.
 	 */
 	if (cred != NULL && !prison_saddrsel_ip6(cred, srcp))
 		return (0);
 
 	/*
 	 * If the address is not specified, choose the best one based on
 	 * the outgoing interface and the destination address.
 	 */
 	/* get the outgoing interface */
 	if ((error = in6_selectif(dstsock, opts, mopts, ro, &ifp, oifp,
 	    (inp != NULL) ? inp->inp_inc.inc_fibnum : RT_DEFAULT_FIB)) != 0)
 		return (error);
 
 #ifdef DIAGNOSTIC
 	if (ifp == NULL)	/* this should not happen */
 		panic("in6_selectsrc: NULL ifp");
 #endif
 	error = in6_setscope(&dst, ifp, &odstzone);
 	if (error)
 		return (error);
 
 	rule = 0;
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
 		int new_scope = -1, new_matchlen = -1;
 		struct in6_addrpolicy *new_policy = NULL;
 		u_int32_t srczone, osrczone, dstzone;
 		struct in6_addr src;
 		struct ifnet *ifp1 = ia->ia_ifp;
 
 		/*
 		 * We'll never take an address that breaks the scope zone
 		 * of the destination.  We also skip an address if its zone
 		 * does not contain the outgoing interface.
 		 * XXX: we should probably use sin6_scope_id here.
 		 */
 		if (in6_setscope(&dst, ifp1, &dstzone) ||
 		    odstzone != dstzone) {
 			continue;
 		}
 		src = ia->ia_addr.sin6_addr;
 		if (in6_setscope(&src, ifp, &osrczone) ||
 		    in6_setscope(&src, ifp1, &srczone) ||
 		    osrczone != srczone) {
 			continue;
 		}
 
 		/* avoid unusable addresses */
 		if ((ia->ia6_flags &
 		     (IN6_IFF_NOTREADY | IN6_IFF_ANYCAST | IN6_IFF_DETACHED))) {
 				continue;
 		}
 		if (!V_ip6_use_deprecated && IFA6_IS_DEPRECATED(ia))
 			continue;
 
 		/* If jailed only take addresses of the jail into account. */
 		if (cred != NULL &&
 		    prison_check_ip6(cred, &ia->ia_addr.sin6_addr) != 0)
 			continue;
 
 		/* Rule 1: Prefer same address */
 		if (IN6_ARE_ADDR_EQUAL(&dst, &ia->ia_addr.sin6_addr)) {
 			ia_best = ia;
 			BREAK(1); /* there should be no better candidate */
 		}
 
 		if (ia_best == NULL)
 			REPLACE(0);
 
 		/* Rule 2: Prefer appropriate scope */
 		if (dst_scope < 0)
 			dst_scope = in6_addrscope(&dst);
 		new_scope = in6_addrscope(&ia->ia_addr.sin6_addr);
 		if (IN6_ARE_SCOPE_CMP(best_scope, new_scope) < 0) {
 			if (IN6_ARE_SCOPE_CMP(best_scope, dst_scope) < 0)
 				REPLACE(2);
 			NEXT(2);
 		} else if (IN6_ARE_SCOPE_CMP(new_scope, best_scope) < 0) {
 			if (IN6_ARE_SCOPE_CMP(new_scope, dst_scope) < 0)
 				NEXT(2);
 			REPLACE(2);
 		}
 
 		/*
 		 * Rule 3: Avoid deprecated addresses.  Note that the case of
 		 * !ip6_use_deprecated is already rejected above.
 		 */
 		if (!IFA6_IS_DEPRECATED(ia_best) && IFA6_IS_DEPRECATED(ia))
 			NEXT(3);
 		if (IFA6_IS_DEPRECATED(ia_best) && !IFA6_IS_DEPRECATED(ia))
 			REPLACE(3);
 
 		/* Rule 4: Prefer home addresses */
 		/*
 		 * XXX: This is a TODO.  We should probably merge the MIP6
 		 * case above.
 		 */
 
 		/* Rule 5: Prefer outgoing interface */
 		if (!(ND_IFINFO(ifp)->flags & ND6_IFF_NO_PREFER_IFACE)) {
 			if (ia_best->ia_ifp == ifp && ia->ia_ifp != ifp)
 				NEXT(5);
 			if (ia_best->ia_ifp != ifp && ia->ia_ifp == ifp)
 				REPLACE(5);
 		}
 
 		/*
 		 * Rule 6: Prefer matching label
 		 * Note that best_policy should be non-NULL here.
 		 */
 		if (dst_policy == NULL)
 			dst_policy = lookup_addrsel_policy(dstsock);
 		if (dst_policy->label != ADDR_LABEL_NOTAPP) {
 			new_policy = lookup_addrsel_policy(&ia->ia_addr);
 			if (dst_policy->label == best_policy->label &&
 			    dst_policy->label != new_policy->label)
 				NEXT(6);
 			if (dst_policy->label != best_policy->label &&
 			    dst_policy->label == new_policy->label)
 				REPLACE(6);
 		}
 
 		/*
 		 * Rule 7: Prefer public addresses.
 		 * We allow users to reverse the logic by configuring
 		 * a sysctl variable, so that privacy conscious users can
 		 * always prefer temporary addresses.
 		 */
 		if (opts == NULL ||
 		    opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_SYSTEM) {
 			prefer_tempaddr = V_ip6_prefer_tempaddr;
 		} else if (opts->ip6po_prefer_tempaddr ==
 		    IP6PO_TEMPADDR_NOTPREFER) {
 			prefer_tempaddr = 0;
 		} else
 			prefer_tempaddr = 1;
 		if (!(ia_best->ia6_flags & IN6_IFF_TEMPORARY) &&
 		    (ia->ia6_flags & IN6_IFF_TEMPORARY)) {
 			if (prefer_tempaddr)
 				REPLACE(7);
 			else
 				NEXT(7);
 		}
 		if ((ia_best->ia6_flags & IN6_IFF_TEMPORARY) &&
 		    !(ia->ia6_flags & IN6_IFF_TEMPORARY)) {
 			if (prefer_tempaddr)
 				NEXT(7);
 			else
 				REPLACE(7);
 		}
 
 		/*
 		 * Rule 8: prefer addresses on alive interfaces.
 		 * This is a KAME specific rule.
 		 */
 		if ((ia_best->ia_ifp->if_flags & IFF_UP) &&
 		    !(ia->ia_ifp->if_flags & IFF_UP))
 			NEXT(8);
 		if (!(ia_best->ia_ifp->if_flags & IFF_UP) &&
 		    (ia->ia_ifp->if_flags & IFF_UP))
 			REPLACE(8);
 
 		/*
 		 * Rule 9: prefer address with better virtual status.
 		 */
 		if (ifa_preferred(&ia_best->ia_ifa, &ia->ia_ifa))
 			REPLACE(9);
 		if (ifa_preferred(&ia->ia_ifa, &ia_best->ia_ifa))
 			NEXT(9);
 
 		/*
 		 * Rule 10: prefer address with `prefer_source' flag.
 		 */
 		if ((ia_best->ia6_flags & IN6_IFF_PREFER_SOURCE) == 0 &&
 		    (ia->ia6_flags & IN6_IFF_PREFER_SOURCE) != 0)
 			REPLACE(10);
 		if ((ia_best->ia6_flags & IN6_IFF_PREFER_SOURCE) != 0 &&
 		    (ia->ia6_flags & IN6_IFF_PREFER_SOURCE) == 0)
 			NEXT(10);
 
 		/*
 		 * Rule 14: Use longest matching prefix.
 		 * Note: in the address selection draft, this rule is
 		 * documented as "Rule 8".  However, since it is also
 		 * documented that this rule can be overridden, we assign
 		 * a large number so that it is easy to assign smaller numbers
 		 * to more preferred rules.
 		 */
 		new_matchlen = in6_matchlen(&ia->ia_addr.sin6_addr, &dst);
 		if (best_matchlen < new_matchlen)
 			REPLACE(14);
 		if (new_matchlen < best_matchlen)
 			NEXT(14);
 
 		/* Rule 15 is reserved. */
 
 		/*
 		 * Last resort: just keep the current candidate.
 		 * Or, do we need more rules?
 		 */
 		continue;
 
 	  replace:
 		ia_best = ia;
 		best_scope = (new_scope >= 0 ? new_scope :
 			      in6_addrscope(&ia_best->ia_addr.sin6_addr));
 		best_policy = (new_policy ? new_policy :
 			       lookup_addrsel_policy(&ia_best->ia_addr));
 		best_matchlen = (new_matchlen >= 0 ? new_matchlen :
 				 in6_matchlen(&ia_best->ia_addr.sin6_addr,
 					      &dst));
 
 	  next:
 		continue;
 
 	  out:
 		break;
 	}
 
 	if ((ia = ia_best) == NULL) {
 		IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 		IP6STAT_INC(ip6s_sources_none);
 		return (EADDRNOTAVAIL);
 	}
 
 	/*
 	 * At this point at least one of the addresses belonged to the jail
 	 * but it could still be, that we want to further restrict it, e.g.
 	 * theoretically IN6_IS_ADDR_LOOPBACK.
 	 * It must not be IN6_IS_ADDR_UNSPECIFIED anymore.
 	 * prison_local_ip6() will fix an IN6_IS_ADDR_LOOPBACK but should
 	 * let all others previously selected pass.
 	 * Use tmp to not change ::1 on lo0 to the primary jail address.
 	 */
 	tmp = ia->ia_addr.sin6_addr;
 	if (cred != NULL && prison_local_ip6(cred, &tmp, (inp != NULL &&
 	    (inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)) != 0) {
 		IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 		IP6STAT_INC(ip6s_sources_none);
 		return (EADDRNOTAVAIL);
 	}
 
 	if (ifpp)
 		*ifpp = ifp;
 
 	bcopy(&tmp, srcp, sizeof(*srcp));
 	if (ia->ia_ifp == ifp)
 		IP6STAT_INC(ip6s_sources_sameif[best_scope]);
 	else
 		IP6STAT_INC(ip6s_sources_otherif[best_scope]);
 	if (dst_scope == best_scope)
 		IP6STAT_INC(ip6s_sources_samescope[best_scope]);
 	else
 		IP6STAT_INC(ip6s_sources_otherscope[best_scope]);
 	if (IFA6_IS_DEPRECATED(ia))
 		IP6STAT_INC(ip6s_sources_deprecated[best_scope]);
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 	return (0);
 }
 
 /*
  * Select the outgoing interface and/or route for a packet to dstsock.
  *
  * The hints are consulted in this order:
  *  - an explicit outgoing interface index in opts->ip6po_pktinfo;
  *  - for a multicast destination, mopts->im6o_multicast_ifp;
  *  - the zone id embedded in a link-local or node-local multicast
  *    destination;
  *  - an explicit next hop in opts->ip6po_nexthop (the route to it is cached
  *    in opts->ip6po_nextroute);
  *  - the route cached in ro, or a fresh lookup in FIB fibnum that is then
  *    stored in ro.
  *
  * When norouteok is non-zero, an explicitly specified interface is accepted
  * without looking up a route.
  *
  * On return *retifp (if retifp != NULL) is set to the chosen interface and
  * *retrt (if retrt != NULL) to the chosen route; either of them may be NULL.
  * Returns 0 on success and EHOSTUNREACH when neither an interface nor a
  * route could be determined or the result conflicts with the hints.
  */
 static int
 selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
     struct ip6_moptions *mopts, struct route_in6 *ro,
     struct ifnet **retifp, struct rtentry **retrt, int norouteok, u_int fibnum)
 {
 	int error = 0;
 	struct ifnet *ifp = NULL;
 	struct rtentry *rt = NULL;
 	struct sockaddr_in6 *sin6_next;
 	struct in6_pktinfo *pi = NULL;
 	struct in6_addr *dst = &dstsock->sin6_addr;
 	uint32_t zoneid;
 #if 0
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	if (dstsock->sin6_addr.s6_addr32[0] == 0 &&
 	    dstsock->sin6_addr.s6_addr32[1] == 0 &&
 	    !IN6_IS_ADDR_LOOPBACK(&dstsock->sin6_addr)) {
 		printf("in6_selectroute: strange destination %s\n",
 		       ip6_sprintf(ip6buf, &dstsock->sin6_addr));
 	} else {
 		printf("in6_selectroute: destination = %s%%%d\n",
 		       ip6_sprintf(ip6buf, &dstsock->sin6_addr),
 		       dstsock->sin6_scope_id); /* for debug */
 	}
 #endif
 
 	/* If the caller specify the outgoing interface explicitly, use it. */
 	if (opts && (pi = opts->ip6po_pktinfo) != NULL && pi->ipi6_ifindex) {
 		/* XXX boundary check is assumed to be already done. */
 		ifp = ifnet_byindex(pi->ipi6_ifindex);
 		if (ifp != NULL &&
 		    (norouteok || retrt == NULL ||
 		    IN6_IS_ADDR_MULTICAST(dst))) {
 			/*
 			 * we do not have to check or get the route for
 			 * multicast.
 			 */
 			goto done;
 		} else
 			goto getroute;
 	}
 	/*
 	 * If the destination address is a multicast address and the outgoing
 	 * interface for the address is specified by the caller, use it.
 	 */
 	if (IN6_IS_ADDR_MULTICAST(dst) &&
 	    mopts != NULL && (ifp = mopts->im6o_multicast_ifp) != NULL) {
 		goto done; /* we do not need a route for multicast. */
 	}
 	/*
 	 * If the destination address is a link-local or node-local multicast
 	 * address, use its embedded scope zone id to determine the outgoing
 	 * interface.
 	 */
 	if (IN6_IS_ADDR_MC_LINKLOCAL(dst) ||
 	    IN6_IS_ADDR_MC_NODELOCAL(dst)) {
 		zoneid = ntohs(in6_getscope(dst));
 		if (zoneid > 0) {
 			ifp = in6_getlinkifnet(zoneid);
 			goto done;
 		}
 	}
 
   getroute:
 	/*
 	 * If the next hop address for the packet is specified by the caller,
 	 * use it as the gateway.
 	 */
 	if (opts && opts->ip6po_nexthop) {
 		struct route_in6 *ron;
 
 		sin6_next = satosin6(opts->ip6po_nexthop);
 		if (IN6_IS_ADDR_LINKLOCAL(&sin6_next->sin6_addr)) {
 			/*
 			 * Next hop is LLA, thus it should be neighbor.
 			 * Determine outgoing interface by zone index.
 			 */
 			zoneid = ntohs(in6_getscope(&sin6_next->sin6_addr));
 			if (zoneid > 0) {
 				ifp = in6_getlinkifnet(zoneid);
 				goto done;
 			}
 		}
 		ron = &opts->ip6po_nextroute;
 		/* Use a cached route if it exists and is valid. */
 		if (ron->ro_rt != NULL && (
 		    (ron->ro_rt->rt_flags & RTF_UP) == 0 ||
 		    ron->ro_dst.sin6_family != AF_INET6 ||
 		    !IN6_ARE_ADDR_EQUAL(&ron->ro_dst.sin6_addr,
 			&sin6_next->sin6_addr)))
 			RO_RTFREE(ron);
 		if (ron->ro_rt == NULL) {
 			ron->ro_dst = *sin6_next;
 			in6_rtalloc(ron, fibnum); /* multi path case? */
 		}
 		/*
 		 * The node identified by that address must be a
 		 * neighbor of the sending host.
 		 */
 		if (ron->ro_rt == NULL ||
 		    (ron->ro_rt->rt_flags & RTF_GATEWAY) != 0)
 			error = EHOSTUNREACH;
 		else {
 			/*
 			 * Hand the on-link route and its interface to the
 			 * caller.  Without this both rt and ifp could stay
 			 * NULL and the check at "done" would turn a
 			 * successful lookup into EHOSTUNREACH.
 			 */
 			rt = ron->ro_rt;
 			ifp = rt->rt_ifp;
 		}
 		goto done;
 	}
 
 	/*
 	 * Use a cached route if it exists and is valid, else try to allocate
 	 * a new one.  Note that we should check the address family of the
 	 * cached destination, in case of sharing the cache with IPv4.
 	 */
 	if (ro) {
 		if (ro->ro_rt &&
 		    (!(ro->ro_rt->rt_flags & RTF_UP) ||
 		     ((struct sockaddr *)(&ro->ro_dst))->sa_family != AF_INET6 ||
 		     !IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr,
 		     dst))) {
 			RTFREE(ro->ro_rt);
 			ro->ro_rt = (struct rtentry *)NULL;
 		}
 		if (ro->ro_rt == (struct rtentry *)NULL) {
 			struct sockaddr_in6 *sa6;
 
 			/* No route yet, so try to acquire one */
 			bzero(&ro->ro_dst, sizeof(struct sockaddr_in6));
 			sa6 = (struct sockaddr_in6 *)&ro->ro_dst;
 			*sa6 = *dstsock;
 			sa6->sin6_scope_id = 0;
 
 #ifdef RADIX_MPATH
 				rtalloc_mpath_fib((struct route *)ro,
 				    ntohl(sa6->sin6_addr.s6_addr32[3]), fibnum);
 #else
 				ro->ro_rt = in6_rtalloc1((struct sockaddr *)
 				    &ro->ro_dst, 0, 0UL, fibnum);
 				if (ro->ro_rt)
 					RT_UNLOCK(ro->ro_rt);
 #endif
 		}
 
 		/*
 		 * The explicit next hop case never gets here: it always
 		 * leaves through "done" in the block above, so no further
 		 * check for opts->ip6po_nexthop is needed.
 		 */
 
 		if (ro->ro_rt) {
 			ifp = ro->ro_rt->rt_ifp;
 
 			if (ifp == NULL) { /* can this really happen? */
 				RTFREE(ro->ro_rt);
 				ro->ro_rt = NULL;
 			}
 		}
 		if (ro->ro_rt == NULL)
 			error = EHOSTUNREACH;
 		rt = ro->ro_rt;
 
 		/*
 		 * Check if the outgoing interface conflicts with
 		 * the interface specified by ipi6_ifindex (if specified).
 		 * Note that loopback interface is always okay.
 		 * (this may happen when we are sending a packet to one of
 		 *  our own addresses.)
 		 */
 		if (ifp && opts && opts->ip6po_pktinfo &&
 		    opts->ip6po_pktinfo->ipi6_ifindex) {
 			if (!(ifp->if_flags & IFF_LOOPBACK) &&
 			    ifp->if_index !=
 			    opts->ip6po_pktinfo->ipi6_ifindex) {
 				error = EHOSTUNREACH;
 				goto done;
 			}
 		}
 	}
 
   done:
 	if (ifp == NULL && rt == NULL) {
 		/*
 		 * This can happen if the caller did not pass a cached route
 		 * nor any other hints.  We treat this case an error.
 		 */
 		error = EHOSTUNREACH;
 	}
 	if (error == EHOSTUNREACH)
 		IP6STAT_INC(ip6s_noroute);
 
 	if (retifp != NULL) {
 		*retifp = ifp;
 
 		/*
 		 * Adjust the "outgoing" interface.  If we're going to loop
 		 * the packet back to ourselves, the ifp would be the loopback
 		 * interface. However, we'd rather know the interface associated
 		 * to the destination address (which should probably be one of
 		 * our own addresses.)
 		 */
 		if (rt) {
 			if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) &&
 			    (rt->rt_gateway->sa_family == AF_LINK))
 				*retifp =
 					ifnet_byindex(((struct sockaddr_dl *)
 						       rt->rt_gateway)->sdl_index);
 		}
 	}
 
 	if (retrt != NULL)
 		*retrt = rt;	/* rt may be NULL */
 
 	return (error);
 }
 
 /*
  * Determine the outgoing interface for dstsock and store it in *retifp.
  *
  * The work is delegated to selectroute().  When the caller passes no route
  * cache (ro == NULL) a temporary one on the stack is used and the route it
  * ends up holding is released before returning.
  *
  * Returns 0 on success.  If selectroute() fails but the caller supplied oifp
  * and the default FIB is in use, oifp is returned in *retifp and the error
  * is cleared.  A route flagged RTF_REJECT or RTF_BLACKHOLE yields
  * EHOSTUNREACH (host route) or ENETUNREACH (any other route).
  */
 static int
 in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
     struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp,
     struct ifnet *oifp, u_int fibnum)
 {
 	struct route_in6 sro;
 	struct rtentry *rt = NULL;
 	int error;
 
 	KASSERT(retifp != NULL, ("%s: retifp is NULL", __func__));
 
 	/* Fall back to a private, zeroed route cache. */
 	if (ro == NULL) {
 		bzero(&sro, sizeof(sro));
 		ro = &sro;
 	}
 
 	error = selectroute(dstsock, opts, mopts, ro, retifp, &rt, 1, fibnum);
 	if (error != 0) {
 		/* Help ND. See oifp comment in in6_selectsrc(). */
 		if (oifp != NULL && fibnum == RT_DEFAULT_FIB) {
 			*retifp = oifp;
 			error = 0;
 		}
 	} else if (rt != NULL &&
 	    (rt->rt_flags & (RTF_REJECT | RTF_BLACKHOLE)) != 0) {
 		/*
 		 * Do not use a rejected or black hole route.
 		 * XXX: this check should be done in the L2 output routine.
 		 * However, if it were skipped here, the following could
 		 * happen:
 		 * - a rejected route is installed for a scoped address prefix
 		 *   (like fe80::/10)
 		 * - a packet is sent to a destination matching that prefix,
 		 *   with ambiguity about the scope zone.
 		 * - the outgoing interface is taken from the route and used
 		 *   to disambiguate the scope zone.
 		 * - ip6_output() would try to get another route with the
 		 *   "new" destination, which may be valid.
 		 * - no error would be seen on output.
 		 * This may not be very harmful, but it would be confusing,
 		 * so the case is rejected here.
 		 */
 		if ((rt->rt_flags & RTF_HOST) != 0)
 			error = EHOSTUNREACH;
 		else
 			error = ENETUNREACH;
 	}
 
 	/* Release the route only if it lives in our private cache. */
 	if (ro == &sro && rt != NULL && rt == sro.ro_rt)
 		RTFREE(rt);
 	return (error);
 }
 
 /*
  * Public wrapper function to selectroute().
  *
  * XXX-BZ in6_selectroute() should and will grow the FIB argument. The
  * in6_selectroute_fib() function is only there for backward compat on stable.
  */
 int
 in6_selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
     struct ip6_moptions *mopts, struct route_in6 *ro,
     struct ifnet **retifp, struct rtentry **retrt)
 {
 
 	return (selectroute(dstsock, opts, mopts, ro, retifp,
 	    retrt, 0, RT_DEFAULT_FIB));
 }
 
 #ifndef BURN_BRIDGES
 /*
  * Same as in6_selectroute(), but the lookup is done in the FIB given by
  * fibnum instead of the default one.
  */
 int
 in6_selectroute_fib(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
     struct ip6_moptions *mopts, struct route_in6 *ro,
     struct ifnet **retifp, struct rtentry **retrt, u_int fibnum)
 {
 	int error;
 
 	error = selectroute(dstsock, opts, mopts, ro, retifp, retrt, 0,
 	    fibnum);
 	return (error);
 }
 #endif
 
 /*
  * Default hop limit selection. The precedence is as follows:
  * 1. Hoplimit value specified via ioctl (in6p->in6p_hops, used when it is
  *    non-negative).
  * 2. (If the outgoing interface is detected) the current
  *     hop limit of the interface specified by router advertisement.
  *     The interface is the ifp argument when it is non-NULL; otherwise,
  *     when the PCB has a foreign address, it is found by a routing lookup
  *     toward that address in the PCB's FIB.
  * 3. The system default hoplimit (V_ip6_defhlim).
  */
 int
 in6_selecthlim(struct inpcb *in6p, struct ifnet *ifp)
 {
 
 	if (in6p && in6p->in6p_hops >= 0)
 		return (in6p->in6p_hops);
 	else if (ifp)
 		return (ND_IFINFO(ifp)->chlim);
 	else if (in6p && !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) {
-		struct route_in6 ro6;
-		struct ifnet *lifp;
+		struct nhop6_basic nh6;
+		struct in6_addr dst;
+		uint32_t fibnum, scopeid;
+		int hlim;
 
-		bzero(&ro6, sizeof(ro6));
-		ro6.ro_dst.sin6_family = AF_INET6;
-		ro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
-		ro6.ro_dst.sin6_addr = in6p->in6p_faddr;
-		in6_rtalloc(&ro6, in6p->inp_inc.inc_fibnum);
-		if (ro6.ro_rt) {
-			lifp = ro6.ro_rt->rt_ifp;
-			RTFREE(ro6.ro_rt);
-			if (lifp)
-				return (ND_IFINFO(lifp)->chlim);
+		fibnum = in6p->inp_inc.inc_fibnum;
+		in6_splitscope(&in6p->in6p_faddr, &dst, &scopeid);
+		if (fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6)==0){
+			hlim = ND_IFINFO(nh6.nh_ifp)->chlim;
+			return (hlim);
 		}
 	}
 	return (V_ip6_defhlim);
 }
 
 /*
  * XXX: this is borrowed from in6_pcbbind(). If possible, we should
  * share this function by all *bsd*...
  */
 int
 in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct ucred *cred)
 {
 	struct socket *so = inp->inp_socket;
 	u_int16_t lport = 0;
 	int error, lookupflags = 0;
 #ifdef INVARIANTS
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	error = prison_local_ip6(cred, laddr,
 	    ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0));
 	if (error)
 		return(error);
 
 	/* XXX: this is redundant when called from in6_pcbbind */
 	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
 		lookupflags = INPLOOKUP_WILDCARD;
 
 	inp->inp_flags |= INP_ANONPORT;
 
 	error = in_pcb_lport(inp, NULL, &lport, cred, lookupflags);
 	if (error != 0)
 		return (error);
 
 	inp->inp_lport = lport;
 	if (in_pcbinshash(inp) != 0) {
 		inp->in6p_laddr = in6addr_any;
 		inp->inp_lport = 0;
 		return (EAGAIN);
 	}
 
 	return (0);
 }
 
 /*
  * Initialize the address selection policy state: the policy queue and the
  * "last resort" default policy.  The locks are initialized only when
  * running in the default vnet.
  */
 void
 addrsel_policy_init(void)
 {
 
 	init_policy_queue();
 
 	/* The "last resort" policy: all zero, with the "not applicable" label. */
 	bzero(&V_defaultaddrpolicy, sizeof(V_defaultaddrpolicy));
 	V_defaultaddrpolicy.label = ADDR_LABEL_NOTAPP;
 
 	if (IS_DEFAULT_VNET(curvnet)) {
 		ADDRSEL_LOCK_INIT();
 		ADDRSEL_SXLOCK_INIT();
 	}
 }
 
 /*
  * Return the policy table entry that best matches key, bumping its use
  * counter, or the default policy when no entry matches.  The lookup is
  * done under the address selection lock.
  */
 static struct in6_addrpolicy *
 lookup_addrsel_policy(struct sockaddr_in6 *key)
 {
 	struct in6_addrpolicy *pol;
 
 	ADDRSEL_LOCK();
 	pol = match_addrsel_policy(key);
 	if (pol != NULL)
 		pol->use++;
 	else
 		pol = &V_defaultaddrpolicy;
 	ADDRSEL_UNLOCK();
 
 	return (pol);
 }
 
 /*
  * Subroutines to manage the address selection policy table via sysctl.
  */
 /* Argument handed to the per-entry callback while walking the table. */
 struct walkarg {
 	struct sysctl_req *w_req;	/* sysctl request the entries are copied out to */
 };
 
 /*
  * Read-only sysctl node "addrctlpolicy" under _net_inet6_ip6; requests are
  * served by in6_src_sysctl().
  */
 static int in6_src_sysctl(SYSCTL_HANDLER_ARGS);
 SYSCTL_DECL(_net_inet6_ip6);
 static SYSCTL_NODE(_net_inet6_ip6, IPV6CTL_ADDRCTLPOLICY, addrctlpolicy,
 	CTLFLAG_RD, in6_src_sysctl, "");
 
 static int
 in6_src_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct walkarg w;
 
 	if (req->newptr)
 		return EPERM;
 
 	bzero(&w, sizeof(w));
 	w.w_req = req;
 
 	return (walk_addrsel_policy(dump_addrsel_policyent, &w));
 }
 
 /*
  * Handle the ioctls that add (SIOCAADDRCTL_POLICY) or delete
  * (SIOCDADDRCTL_POLICY) an address selection policy entry.  data points
  * to a struct in6_addrpolicy supplied by the caller.
  *
  * Returns EOPNOTSUPP for any other command, EINVAL for an entry with the
  * "not applicable" label or a non-contiguous prefix mask, and otherwise the
  * result of the add or delete operation.
  */
 int
 in6_src_ioctl(u_long cmd, caddr_t data)
 {
 	struct in6_addrpolicy pol;
 	int adding;
 
 	/* check for safety */
 	switch (cmd) {
 	case SIOCAADDRCTL_POLICY:
 		adding = 1;
 		break;
 	case SIOCDADDRCTL_POLICY:
 		adding = 0;
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	/* Work on a private copy of the caller's entry. */
 	pol = *(struct in6_addrpolicy *)data;
 
 	if (pol.label == ADDR_LABEL_NOTAPP)
 		return (EINVAL);
 	/* The prefix mask must be consecutive. */
 	if (in6_mask2len(&pol.addrmask.sin6_addr, NULL) < 0)
 		return (EINVAL);
 	/* Clear trailing garbage (if any) of the prefix address. */
 	IN6_MASK_ADDR(&pol.addr.sin6_addr, &pol.addrmask.sin6_addr);
 	pol.use = 0;
 
 	if (adding)
 		return (add_addrsel_policyent(&pol));
 	return (delete_addrsel_policyent(&pol));
 }
 
 /*
  * The following is an implementation of the policy table using a
  * simple tail queue.
  * XXX such details should be hidden.
  * XXX an implementation using a binary tree would be more efficient.
  */
 /* One element of the policy table. */
 struct addrsel_policyent {
 	TAILQ_ENTRY(addrsel_policyent) ape_entry;	/* linkage on the table's tail queue */
 	struct in6_addrpolicy ape_policy;		/* the policy carried by this element */
 };
 
 /* Head type of the tail queue of policy entries. */
 TAILQ_HEAD(addrsel_policyhead, addrsel_policyent);
 
 /* The policy table itself, defined per vnet. */
 static VNET_DEFINE(struct addrsel_policyhead, addrsel_policytab);
 #define	V_addrsel_policytab		VNET(addrsel_policytab)
 
 /* Initialize the policy table as an empty tail queue. */
 static void
 init_policy_queue(void)
 {
 
 	TAILQ_INIT(&V_addrsel_policytab);
 }
 
 static int
 add_addrsel_policyent(struct in6_addrpolicy *newpolicy)
 {
 	struct addrsel_policyent *new, *pol;
 
 	new = malloc(sizeof(*new), M_IFADDR,
 	       M_WAITOK);
 	ADDRSEL_XLOCK();
 	ADDRSEL_LOCK();
 
 	/* duplication check */
 	TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) {
 		if (IN6_ARE_ADDR_EQUAL(&newpolicy->addr.sin6_addr,
 				       &pol->ape_policy.addr.sin6_addr) &&
 		    IN6_ARE_ADDR_EQUAL(&newpolicy->addrmask.sin6_addr,
 				       &pol->ape_policy.addrmask.sin6_addr)) {
 			ADDRSEL_UNLOCK();
 			ADDRSEL_XUNLOCK();
 			free(new, M_IFADDR);
 			return (EEXIST);	/* or override it? */
 		}
 	}
 
 	bzero(new, sizeof(*new));
 
 	/* XXX: should validate entry */
 	new->ape_policy = *newpolicy;
 
 	TAILQ_INSERT_TAIL(&V_addrsel_policytab, new, ape_entry);
 	ADDRSEL_UNLOCK();
 	ADDRSEL_XUNLOCK();
 
 	return (0);
 }
 
 static int
 delete_addrsel_policyent(struct in6_addrpolicy *key)
 {
 	struct addrsel_policyent *pol;
 
 	ADDRSEL_XLOCK();
 	ADDRSEL_LOCK();
 
 	/* search for the entry in the table */
 	TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) {
 		if (IN6_ARE_ADDR_EQUAL(&key->addr.sin6_addr,
 		    &pol->ape_policy.addr.sin6_addr) &&
 		    IN6_ARE_ADDR_EQUAL(&key->addrmask.sin6_addr,
 		    &pol->ape_policy.addrmask.sin6_addr)) {
 			break;
 		}
 	}
 	if (pol == NULL) {
 		ADDRSEL_UNLOCK();
 		ADDRSEL_XUNLOCK();
 		return (ESRCH);
 	}
 
 	TAILQ_REMOVE(&V_addrsel_policytab, pol, ape_entry);
 	ADDRSEL_UNLOCK();
 	ADDRSEL_XUNLOCK();
 	free(pol, M_IFADDR);
 
 	return (0);
 }
 
 /*
  * Call callback on every policy in the table, in queue order, passing w
  * through unchanged.  The walk stops at the first non-zero return value,
  * which is then returned; otherwise 0 is returned.  The shared lock is
  * held for the duration of the walk.
  */
 static int
 walk_addrsel_policy(int (*callback)(struct in6_addrpolicy *, void *), void *w)
 {
 	struct addrsel_policyent *ent;
 	int rv = 0;
 
 	ADDRSEL_SLOCK();
 	TAILQ_FOREACH(ent, &V_addrsel_policytab, ape_entry) {
 		rv = (*callback)(&ent->ape_policy, w);
 		if (rv != 0)
 			break;
 	}
 	ADDRSEL_SUNLOCK();
 
 	return (rv);
 }
 
 /*
  * Table-walk callback: copy one policy out to the sysctl request stored
  * in the struct walkarg that arg points to.  Returns the SYSCTL_OUT()
  * result.
  */
 static int
 dump_addrsel_policyent(struct in6_addrpolicy *pol, void *arg)
 {
 	struct walkarg *warg = arg;
 
 	return (SYSCTL_OUT(warg->w_req, pol, sizeof(*pol)));
 }
 
 /*
  * Find the policy whose masked prefix matches key with the largest number
  * of leading mask bits.  Among equally long matches the first one in the
  * table wins.  Returns NULL when no policy matches.
  */
 static struct in6_addrpolicy *
 match_addrsel_policy(struct sockaddr_in6 *key)
 {
 	struct addrsel_policyent *ent;
 	struct in6_addrpolicy *cand, *winner = NULL;
 	u_char *maskp, *keyp, *prefp, bits;
 	int i, len, winlen = -1, mismatch;
 
 	TAILQ_FOREACH(ent, &V_addrsel_policytab, ape_entry) {
 		cand = &ent->ape_policy;
 		maskp = (u_char *)&cand->addrmask.sin6_addr;
 		keyp = (u_char *)&key->sin6_addr;
 		prefp = (u_char *)&cand->addr.sin6_addr;
 
 		len = 0;
 		mismatch = 0;
 		/* Walk the 16 address bytes until the mask runs out. */
 		/* XXX: scope field? */
 		for (i = 0; i < 16 && maskp[i] != 0; i++) {
 			bits = maskp[i];
 			if ((keyp[i] & bits) != prefp[i]) {
 				mismatch = 1;	/* not match */
 				break;
 			}
 			if (bits == 0xff) {
 				/* short cut for a typical case */
 				len += 8;
 				continue;
 			}
 			/* Count the leading one bits of a partial byte. */
 			for (; bits >= 0x80; bits <<= 1)
 				len++;
 		}
 		if (mismatch)
 			continue;
 
 		/* matched.  check if this is better than the current best. */
 		if (winner == NULL || len > winlen) {
 			winner = cand;
 			winlen = len;
 		}
 	}
 
 	return (winner);
 }