Index: head/sys/net/if_llatbl.c
===================================================================
--- head/sys/net/if_llatbl.c	(revision 361703)
+++ head/sys/net/if_llatbl.c	(revision 361704)
@@ -1,920 +1,921 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2004 Luigi Rizzo, Alessandro Cerri. All rights reserved.
  * Copyright (c) 2004-2008 Qing Li. All rights reserved.
  * Copyright (c) 2008 Kip Macy. All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/eventhandler.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 #include <sys/socket.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/rwlock.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <vm/uma.h>
 
 #include <netinet/in.h>
 #include <net/if_llatbl.h>
 #include <net/if.h>
 #include <net/if_dl.h>
 #include <net/if_var.h>
 #include <net/route.h>
+#include <net/route/route_ctl.h>
 #include <net/vnet.h>
 #include <netinet/if_ether.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/nd6.h>
 
 /* Malloc type used for lltable structures and their hash bucket arrays. */
 MALLOC_DEFINE(M_LLTABLE, "lltable", "link level address tables");
 
 /* Per-VNET singly-linked list of all registered lltables. */
 VNET_DEFINE_STATIC(SLIST_HEAD(, lltable), lltables) =
     SLIST_HEAD_INITIALIZER(lltables);
 #define	V_lltables	VNET(lltables)
 
 /*
  * Read/write lock taken around walks of, and changes to, the lltable list.
  * LLTABLE_LIST_LOCK_ASSERT() accepts either the read or the write mode.
  */
 static struct rwlock lltable_list_lock;
 RW_SYSINIT(lltable_list_lock, &lltable_list_lock, "lltable_list_lock");
 #define	LLTABLE_LIST_RLOCK()		rw_rlock(&lltable_list_lock)
 #define	LLTABLE_LIST_RUNLOCK()		rw_runlock(&lltable_list_lock)
 #define	LLTABLE_LIST_WLOCK()		rw_wlock(&lltable_list_lock)
 #define	LLTABLE_LIST_WUNLOCK()		rw_wunlock(&lltable_list_lock)
 #define	LLTABLE_LIST_LOCK_ASSERT()	rw_assert(&lltable_list_lock, RA_LOCKED)
 
 /* Forward declarations for helpers used before their definitions. */
 static void lltable_unlink(struct lltable *llt);
 static void llentries_unlink(struct lltable *llt, struct llentries *head);
 
 /*
  * Walk every entry of @llt and hand it to the table's llt_dump_entry
  * callback, passing @wr through as the callback argument.
  *
  * Tables attached to loopback interfaces are skipped and report success.
  * The lltable list lock must be held by the caller; the walk itself runs
  * inside a network epoch section.
  *
  * Returns the value produced by lltable_foreach_lle().
  */
 static int
 lltable_dump_af(struct lltable *llt, struct sysctl_req *wr)
 {
 	struct epoch_tracker et;
 	int rc;
 
 	LLTABLE_LIST_LOCK_ASSERT();
 
 	if ((llt->llt_ifp->if_flags & IFF_LOOPBACK) != 0)
 		return (0);
 
 	NET_EPOCH_ENTER(et);
 	rc = lltable_foreach_lle(llt,
 	    (llt_foreach_cb_t *)llt->llt_dump_entry, wr);
 	NET_EPOCH_EXIT(et);
 
 	return (rc);
 }
 
 /*
  * Dump the entries of every registered lltable whose address family is @af.
  *
  * The table list is walked under the list read lock.  The walk stops at the
  * first table whose dump reports a non-zero error, and that error is
  * returned; otherwise 0 is returned.
  */
 int
 lltable_sysctl_dumparp(int af, struct sysctl_req *wr)
 {
 	struct lltable *tbl;
 	int rc;
 
 	rc = 0;
 	LLTABLE_LIST_RLOCK();
 	SLIST_FOREACH(tbl, &V_lltables, llt_link) {
 		if (tbl->llt_af != af)
 			continue;
 		rc = lltable_dump_af(tbl, wr);
 		if (rc != 0)
 			break;
 	}
 	LLTABLE_LIST_RUNLOCK();
 
 	return (rc);
 }
 
 /*
  * Common function helpers for chained hash table.
  */
 
 /*
  * Runs callback @f, with @farg as its last argument, for each entry in
  * every hash bucket of @llt.  Caller does the locking.
  *
  * The walk stops at the first callback invocation that returns non-zero and
  * that value is returned to the caller.  Returns 0 when every callback
  * succeeded (or the table is empty).
  *
  * The _SAFE list iterator is used, so the callback may unlink the entry it
  * was handed.
  */
 static int
 htable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, void *farg)
 {
 	struct llentry *lle, *next;
 	int i, error;
 
 	for (i = 0; i < llt->llt_hsize; i++) {
 		CK_LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) {
 			error = f(llt, lle, farg);
 			/*
 			 * Leave both loops at once: a plain "break" would only
 			 * exit the current bucket, letting later buckets run
 			 * and overwrite the error.
 			 */
 			if (error != 0)
 				return (error);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Return convention shared by htable_link_entry() and htable_unlink_entry():
  *   0  - the entry was already (un)linked, nothing changed;
  *   1  - the entry was added to / removed from the table;
  *  -1  - error (for linking: the table's entry limit has been reached).
  * Unlinking is not expected to fail, but callers of lltable_link_entry()
  * have to check for and handle errors.
  */
 static int
 htable_link_entry(struct lltable *llt, struct llentry *lle)
 {
 	struct llentries *bucket;
 	uint32_t slot;
 
 	/* Already linked: report "no change". */
 	if ((lle->la_flags & LLE_LINKED) != 0)
 		return (0);
 
 	IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp);
 
 	/* The limit is enforced only when llt_maxentries is positive. */
 	if (llt->llt_maxentries > 0 &&
 	    llt->llt_entries >= llt->llt_maxentries)
 		return (-1);
 
 	slot = llt->llt_hash(lle, llt->llt_hsize);
 	bucket = &llt->lle_head[slot];
 
 	lle->lle_tbl = llt;
 	lle->lle_head = bucket;
 	lle->la_flags |= LLE_LINKED;
 	CK_LIST_INSERT_HEAD(bucket, lle, lle_next);
 	llt->llt_entries++;
 
 	return (1);
 }
 
 /*
  * Removes @lle from the hash table it is currently linked into.
  *
  * Returns 0 if the entry was not linked (nothing changed) and 1 after it
  * has been removed.  The interface AFDATA write lock is asserted once the
  * entry is known to be linked.
  */
 static int
 htable_unlink_entry(struct llentry *lle)
 {
 	struct lltable *llt;
 
 	if ((lle->la_flags & LLE_LINKED) == 0)
 		return (0);
 
 	llt = lle->lle_tbl;
 	IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp);
 	KASSERT(llt->llt_entries > 0, ("%s: lltable %p (%s) entries %d <= 0",
 	    __func__, llt, if_name(llt->llt_ifp), llt->llt_entries));
 
 	CK_LIST_REMOVE(lle, lle_next);
 	/* Unlinking clears LLE_VALID as well as LLE_LINKED. */
 	lle->la_flags &= ~(LLE_VALID | LLE_LINKED);
 	/* The back-pointers stay set: the reset below is compiled out. */
 #if 0
 	lle->lle_tbl = NULL;
 	lle->lle_head = NULL;
 #endif
 	llt->llt_entries--;
 
 	return (1);
 }
 
 /*
  * Argument block handed to htable_prefix_free_cb() while walking a table:
  * the prefix to match plus the list collecting the matching entries.
  */
 struct prefix_match_data {
 	const struct sockaddr *addr;	/* prefix address passed to the matcher */
 	const struct sockaddr *mask;	/* prefix mask passed to the matcher */
 	struct llentries dchain;	/* matching entries, chained by lle_chain */
 	u_int flags;			/* flags passed to the matcher */
 };
 
 /*
  * Table-walk callback used by htable_prefix_free().
  *
  * @farg points to a struct prefix_match_data.  An entry accepted by the
  * table's llt_match_prefix method is write-locked and pushed onto the
  * dchain list; any other entry is left untouched.
  *
  * Always returns 0, so the walk visits every entry.
  */
 static int
 htable_prefix_free_cb(struct lltable *llt, struct llentry *lle, void *farg)
 {
 	struct prefix_match_data *match = farg;
 
 	if (!llt->llt_match_prefix(match->addr, match->mask, match->flags,
 	    lle))
 		return (0);
 
 	LLE_WLOCK(lle);
 	CK_LIST_INSERT_HEAD(&match->dchain, lle, lle_chain);
 
 	return (0);
 }
 
 /*
  * Frees every entry of @llt that matches the prefix @addr/@mask (as decided
  * by the table's llt_match_prefix method, which also receives @flags).
  *
  * Matching entries are collected and unlinked while the interface AFDATA
  * write lock is held; they are handed to lltable_free_entry() only after
  * that lock has been dropped.
  */
 static void
 htable_prefix_free(struct lltable *llt, const struct sockaddr *addr,
     const struct sockaddr *mask, u_int flags)
 {
 	struct prefix_match_data pmd = {
 		.addr = addr,
 		.mask = mask,
 		.flags = flags,
 	};
 	struct llentry *victim, *tmp;
 
 	CK_LIST_INIT(&pmd.dchain);
 
 	IF_AFDATA_WLOCK(llt->llt_ifp);
 	/* Collect the matching (now write-locked) entries, then unlink them. */
 	lltable_foreach_lle(llt, htable_prefix_free_cb, &pmd);
 	llentries_unlink(llt, &pmd.dchain);
 	IF_AFDATA_WUNLOCK(llt->llt_ifp);
 
 	CK_LIST_FOREACH_SAFE(victim, &pmd.dchain, lle_chain, tmp)
 		lltable_free_entry(llt, victim);
 }
 
 /*
  * Default llt_free_tbl method: releases the bucket array and then the
  * table structure itself.  Entries are not touched here.
  */
 static void
 htable_free_tbl(struct lltable *llt)
 {
 
 	free(llt->lle_head, M_LLTABLE);
 	free(llt, M_LLTABLE);
 }
 
 /*
  * Calls the table's llt_unlink_entry method on every entry chained on @head
  * via lle_chain.  The chain itself is left intact for the caller.
  */
 static void
 llentries_unlink(struct lltable *llt, struct llentries *head)
 {
 	struct llentry *lle, *next;
 
 	CK_LIST_FOREACH_SAFE(lle, head, lle_chain, next)
 		llt->llt_unlink_entry(lle);
 }
 
 /*
  * Helper function used to drop all mbufs in hold queue.
  *
  * Walks the la_hold packet chain (linked through m_nextpkt), freeing each
  * packet and decrementing la_numheld.  @lle must be write-locked.
  *
  * Returns the number of held packets, if any, that were dropped.
  */
 size_t
 lltable_drop_entry_queue(struct llentry *lle)
 {
 	size_t pkts_dropped;
 	struct mbuf *next;
 
 	LLE_WLOCK_ASSERT(lle);
 
 	pkts_dropped = 0;
 	while ((lle->la_numheld > 0) && (lle->la_hold != NULL)) {
 		/* Save the successor before the head packet is freed. */
 		next = lle->la_hold->m_nextpkt;
 		m_freem(lle->la_hold);
 		lle->la_hold = next;
 		lle->la_numheld--;
 		pkts_dropped++;
 	}
 
 	/* pkts_dropped is a size_t, so it is printed with %zu. */
 	KASSERT(lle->la_numheld == 0,
 		("%s: la_numheld %d > 0, pkts_droped %zu", __func__,
 		 lle->la_numheld, pkts_dropped));
 
 	return (pkts_dropped);
 }
 
 /*
  * Stores a pre-computed link-layer header in @lle and marks it valid.
  *
  * @linkhdr/@linkhdrsize: header bytes copied into lle->r_linkdata.
  * @lladdr_off: offset of the link-layer address inside that header;
  *     lle->ll_addr is pointed at it.
  *
  * @ifp is not used by this function.  No locking is done here.
  */
 void
 lltable_set_entry_addr(struct ifnet *ifp, struct llentry *lle,
     const char *linkhdr, size_t linkhdrsize, int lladdr_off)
 {
 
 	memcpy(lle->r_linkdata, linkhdr, linkhdrsize);
 	lle->r_hdrlen = linkhdrsize;
 	lle->ll_addr = &lle->r_linkdata[lladdr_off];
 	lle->la_flags |= LLE_VALID;
 	lle->r_flags |= RLLE_VALID;
 }
 
 /*
  * Tries to update @lle link-level address.
  * Since update requires AFDATA WLOCK, function
  * drops @lle lock, acquires AFDATA lock and then acquires
  * @lle lock to maintain lock order.
  *
  * @lle must be write-locked on entry.  A reference is taken across the
  * window in which the entry lock is dropped.
  *
  * Returns 1 on success; no entry unlock is issued on that path.
  * Returns 0 if the entry was marked LLE_DELETED in the meantime; that path
  * ends with LLE_FREE_LOCKED() (presumably dropping the reference together
  * with the lock - confirm against the macro definition).
  */
 int
 lltable_try_set_entry_addr(struct ifnet *ifp, struct llentry *lle,
     const char *linkhdr, size_t linkhdrsize, int lladdr_off)
 {
 
 	/* Perform real LLE update */
 	/* use afdata WLOCK to update fields */
 	LLE_WLOCK_ASSERT(lle);
 	LLE_ADDREF(lle);
 	LLE_WUNLOCK(lle);
 	IF_AFDATA_WLOCK(ifp);
 	LLE_WLOCK(lle);
 
 	/*
 	 * Since we dropped the LLE lock, another thread might have deleted
 	 * this lle.  Check and return.
 	 */
 	if ((lle->la_flags & LLE_DELETED) != 0) {
 		IF_AFDATA_WUNLOCK(ifp);
 		LLE_FREE_LOCKED(lle);
 		return (0);
 	}
 
 	/* Update data */
 	lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off);
 
 	IF_AFDATA_WUNLOCK(ifp);
 
 	/* Drop the reference taken above. */
 	LLE_REMREF(lle);
 
 	return (1);
 }
 
  /*
  * Helper function used to pre-compute full/partial link-layer
  * header data suitable for feeding into if_output().
  */
 int
 lltable_calc_llheader(struct ifnet *ifp, int family, char *lladdr,
     char *buf, size_t *bufsize, int *lladdr_off)
 {
 	struct if_encap_req ereq;
 	int error;
 
 	bzero(buf, *bufsize);
 	bzero(&ereq, sizeof(ereq));
 	ereq.buf = buf;
 	ereq.bufsize = *bufsize;
 	ereq.rtype = IFENCAP_LL;
 	ereq.family = family;
 	ereq.lladdr = lladdr;
 	ereq.lladdr_len = ifp->if_addrlen;
 	error = ifp->if_requestencap(ifp, &ereq);
 	if (error == 0) {
 		*bufsize = ereq.bufsize;
 		*lladdr_off = ereq.lladdr_off;
 	}
 
 	return (error);
 }
 
 /*
  * Update link-layer header for given @lle after
  * interface lladdr was changed.
  *
  * Table-walk callback; @farg is the struct ifnet the table belongs to.
  * Entries without LLE_VALID are skipped.  For LLE_IFADDR entries the header
  * is rebuilt from the interface's current link-layer address, for all other
  * entries from the address already stored in the entry.
  *
  * If the header cannot be computed the entry keeps its previous header
  * rather than being overwritten with the zeroed scratch buffer.
  *
  * Always returns 0 so that the walk covers every entry.
  */
 static int
 llentry_update_ifaddr(struct lltable *llt, struct llentry *lle, void *farg)
 {
 	struct ifnet *ifp;
 	u_char linkhdr[LLE_MAX_LINKHDR];
 	size_t linkhdrsize;
 	u_char *lladdr;
 	int lladdr_off;
 
 	ifp = (struct ifnet *)farg;
 
 	LLE_WLOCK(lle);
 	if ((lle->la_flags & LLE_VALID) == 0) {
 		LLE_WUNLOCK(lle);
 		return (0);
 	}
 
 	/* Pick the source address only once the entry is locked. */
 	if ((lle->la_flags & LLE_IFADDR) != 0)
 		lladdr = IF_LLADDR(ifp);
 	else
 		lladdr = lle->ll_addr;
 
 	linkhdrsize = sizeof(linkhdr);
 	if (lltable_calc_llheader(ifp, llt->llt_af, lladdr, linkhdr,
 	    &linkhdrsize, &lladdr_off) == 0)
 		memcpy(lle->r_linkdata, linkhdr, linkhdrsize);
 	LLE_WUNLOCK(lle);
 
 	return (0);
 }
 
 /*
  * Update all calculated headers for given @llt
  */
 void
 lltable_update_ifaddr(struct lltable *llt)
 {
 
 	if (llt->llt_ifp->if_flags & IFF_LOOPBACK)
 		return;
 
 	IF_AFDATA_WLOCK(llt->llt_ifp);
 	lltable_foreach_lle(llt, llentry_update_ifaddr, llt->llt_ifp);
 	IF_AFDATA_WUNLOCK(llt->llt_ifp);
 }
 
 /*
  * Performs generic cleanup routines and frees lle.
  *
  * Called for non-linked entries (asserted), with callouts and
  * other AF-specific cleanups performed.
  *
  * @lle must be passed WLOCK'ed
  *
  * Returns the number of held packets, if any, that were dropped.
  */
 size_t
 llentry_free(struct llentry *lle)
 {
 	size_t pkts_dropped;
 
 	LLE_WLOCK_ASSERT(lle);
 
 	KASSERT((lle->la_flags & LLE_LINKED) == 0, ("freeing linked lle"));
 
 	pkts_dropped = lltable_drop_entry_queue(lle);
 
 	/*
 	 * Cancel the timer.  A positive callout_stop() result leads to one
 	 * reference being dropped - presumably the one held on behalf of
 	 * the pending callout (confirm where the timer is armed).
 	 */
 	if (callout_stop(&lle->lle_timer) > 0)
 		LLE_REMREF(lle);
 	LLE_FREE_LOCKED(lle);
 
 	return (pkts_dropped);
 }
 
 /*
  * Table-walk callback used by lltable_free().
  *
  * Write-locks @lle and pushes it (via lle_chain) onto the list that @farg
  * points to.  Always returns 0 so every entry is collected.
  */
 
 static int
 lltable_free_cb(struct lltable *llt, struct llentry *lle, void *farg)
 {
 	struct llentries *dchain;
 
 	dchain = (struct llentries *)farg;
 
 	LLE_WLOCK(lle);
 	CK_LIST_INSERT_HEAD(dchain, lle, lle_chain);
 
 	return (0);
 }
 
 /*
  * Free all entries from given table and free itself.
  *
  * The table is first removed from the global list.  All entries are then
  * collected and unlinked under the interface AFDATA write lock, freed with
  * llentry_free() after that lock is dropped, and finally the table's
  * llt_free_tbl method releases the table.
  */
 void
 lltable_free(struct lltable *llt)
 {
 	struct llentry *lle, *next;
 	struct llentries dchain;
 
 	KASSERT(llt != NULL, ("%s: llt is NULL", __func__));
 
 	lltable_unlink(llt);
 
 	CK_LIST_INIT(&dchain);
 	IF_AFDATA_WLOCK(llt->llt_ifp);
 	/* Push all lles to @dchain */
 	lltable_foreach_lle(llt, lltable_free_cb, &dchain);
 	llentries_unlink(llt, &dchain);
 	IF_AFDATA_WUNLOCK(llt->llt_ifp);
 
 	/* Entries arrive here write-locked by lltable_free_cb(). */
 	CK_LIST_FOREACH_SAFE(lle, &dchain, lle_chain, next) {
 		llentry_free(lle);
 	}
 
 	KASSERT(llt->llt_entries == 0, ("%s: lltable %p (%s) entires not 0: %d",
 	    __func__, llt, llt->llt_ifp->if_xname, llt->llt_entries));
 
 	llt->llt_free_tbl(llt);
 }
 
 /*
  * Deletes an address from given lltable.
  * Used for userland interaction to remove
  * individual entries. Skips entries added by OS.
  *
  * Returns:
  *   ENOENT - no entry for @l3addr exists;
  *   EPERM  - the entry has LLE_IFADDR set and @flags does not include
  *            LLE_IFADDR;
  *   0      - the entry was unlinked and passed to llt_delete_entry.
  */
 int
 lltable_delete_addr(struct lltable *llt, u_int flags,
     const struct sockaddr *l3addr)
 {
 	struct llentry *lle;
 	struct ifnet *ifp;
 
 	ifp = llt->llt_ifp;
 	IF_AFDATA_WLOCK(ifp);
 	/* LLE_EXCLUSIVE: a found entry is unlocked with LLE_WUNLOCK below. */
 	lle = lla_lookup(llt, LLE_EXCLUSIVE, l3addr);
 
 	if (lle == NULL) {
 		IF_AFDATA_WUNLOCK(ifp);
 		return (ENOENT);
 	}
 	if ((lle->la_flags & LLE_IFADDR) != 0 && (flags & LLE_IFADDR) == 0) {
 		IF_AFDATA_WUNLOCK(ifp);
 		LLE_WUNLOCK(lle);
 		return (EPERM);
 	}
 
 	lltable_unlink_entry(llt, lle);
 	IF_AFDATA_WUNLOCK(ifp);
 
 	/* The entry is handed over without being unlocked here. */
 	llt->llt_delete_entry(llt, lle);
 
 	return (0);
 }
 
 /*
  * Invokes the llt_prefix_free method, with @addr, @mask and @flags, on
  * every registered lltable whose address family equals @af.  The table
  * list is walked under the list read lock.
  */
 void
 lltable_prefix_free(int af, struct sockaddr *addr, struct sockaddr *mask,
     u_int flags)
 {
 	struct lltable *tbl;
 
 	LLTABLE_LIST_RLOCK();
 	SLIST_FOREACH(tbl, &V_lltables, llt_link) {
 		if (tbl->llt_af == af)
 			tbl->llt_prefix_free(tbl, addr, mask, flags);
 	}
 	LLTABLE_LIST_RUNLOCK();
 }
 
 /*
  * Allocates a zeroed lltable with @hsize hash buckets and installs the
  * default hash-table methods (link, unlink, prefix free, foreach, free).
  *
  * Both allocations use M_WAITOK.  The remaining methods and fields are left
  * zeroed for the caller to fill in.
  */
 struct lltable *
 lltable_allocate_htbl(uint32_t hsize)
 {
 	struct lltable *tbl;
 	int bucket;
 
 	tbl = malloc(sizeof(struct lltable), M_LLTABLE, M_WAITOK | M_ZERO);
 	tbl->llt_hsize = hsize;
 	tbl->lle_head = malloc(sizeof(struct llentries) * hsize,
 	    M_LLTABLE, M_WAITOK | M_ZERO);
 
 	for (bucket = 0; bucket < tbl->llt_hsize; bucket++)
 		CK_LIST_INIT(&tbl->lle_head[bucket]);
 
 	/* Default callbacks implementing the chained hash table. */
 	tbl->llt_foreach_entry = htable_foreach_lle;
 	tbl->llt_link_entry = htable_link_entry;
 	tbl->llt_unlink_entry = htable_unlink_entry;
 	tbl->llt_prefix_free = htable_prefix_free;
 	tbl->llt_free_tbl = htable_free_tbl;
 
 	return (tbl);
 }
 
 /*
  * Links lltable to global llt list.
  * The insertion is done under the list write lock.
  */
 void
 lltable_link(struct lltable *llt)
 {
 
 	LLTABLE_LIST_WLOCK();
 	SLIST_INSERT_HEAD(&V_lltables, llt, llt_link);
 	LLTABLE_LIST_WUNLOCK();
 }
 
 /*
  * Removes @llt from the global llt list under the list write lock.
  */
 static void
 lltable_unlink(struct lltable *llt)
 {
 
 	LLTABLE_LIST_WLOCK();
 	SLIST_REMOVE(&V_lltables, llt, lltable, llt_link);
 	LLTABLE_LIST_WUNLOCK();
 
 }
 
 /*
  * External methods used by lltable consumers
  */
 
 /*
  * Runs @f with @farg over the entries of @llt by delegating to the table's
  * llt_foreach_entry method; returns whatever that method returns.
  */
 int
 lltable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, void *farg)
 {
 
 	return (llt->llt_foreach_entry(llt, f, farg));
 }
 
 /*
  * Delegates to the table's llt_alloc_entry method with @flags and @l3addr
  * and returns its result.  Callers in this file treat NULL as an
  * allocation failure.
  */
 struct llentry *
 lltable_alloc_entry(struct lltable *llt, u_int flags,
     const struct sockaddr *l3addr)
 {
 
 	return (llt->llt_alloc_entry(llt, flags, l3addr));
 }
 
 /*
  * Hands @lle to the table's llt_free_entry method.
  */
 void
 lltable_free_entry(struct lltable *llt, struct llentry *lle)
 {
 
 	llt->llt_free_entry(llt, lle);
 }
 
 /*
  * Links @lle into @llt through the table's llt_link_entry method and
  * returns its result.  For the default hash-table method that is 0
  * (already linked), 1 (linked) or -1 (entry limit reached).
  */
 int
 lltable_link_entry(struct lltable *llt, struct llentry *lle)
 {
 
 	return (llt->llt_link_entry(llt, lle));
 }
 
 /*
  * Unlinks @lle through the llt_unlink_entry method of @llt and returns its
  * result.  @llt is used only to locate the method; the method itself
  * receives just the entry.
  */
 int
 lltable_unlink_entry(struct lltable *llt, struct llentry *lle)
 {
 
 	return (llt->llt_unlink_entry(lle));
 }
 
 /*
  * Fills @sa for @lle by calling the llt_fill_sa_entry method of the table
  * the entry points to (lle->lle_tbl).
  */
 void
 lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa)
 {
 
 	lle->lle_tbl->llt_fill_sa_entry(lle, sa);
 }
 
 /*
  * Accessor: returns the interface pointer stored in @llt.
  */
 struct ifnet *
 lltable_get_ifp(const struct lltable *llt)
 {
 
 	return (llt->llt_ifp);
 }
 
 /*
  * Accessor: returns the address family stored in @llt.
  */
 int
 lltable_get_af(const struct lltable *llt)
 {
 
 	return (llt->llt_af);
 }
 
 /*
  * Called in route_output when rtm_flags contains RTF_LLDATA.
  *
  * Adds (RTM_ADD) or deletes (RTM_DELETE) the link-layer entry for the
  * destination in @info on the interface named by the AF_LINK gateway
  * sockaddr in @info.
  *
  * Returns:
  *   EINVAL - gateway missing or not AF_LINK, unknown interface index,
  *            link-layer header could not be computed, or unsupported
  *            rtm_type;
  *   ESRCH  - no lltable for this interface/address family pair;
  *   ENOMEM - entry allocation failed;
  *   EPERM  - RTM_ADD would replace an LLE_IFADDR entry;
  *   otherwise 0, or the lltable_delete_addr() result for RTM_DELETE.
  */
 int
 lla_rt_output(struct rt_msghdr *rtm, struct rt_addrinfo *info)
 {
 	struct sockaddr_dl *dl =
 	    (struct sockaddr_dl *)info->rti_info[RTAX_GATEWAY];
 	struct sockaddr *dst = (struct sockaddr *)info->rti_info[RTAX_DST];
 	struct ifnet *ifp;
 	struct lltable *llt;
 	struct llentry *lle, *lle_tmp;
 	uint8_t linkhdr[LLE_MAX_LINKHDR];
 	size_t linkhdrsize;
 	int lladdr_off;
 	u_int laflags = 0;
 	int error;
 
 	if (dl == NULL || dl->sdl_family != AF_LINK)
 		return (EINVAL);
 
 	ifp = ifnet_byindex(dl->sdl_index);
 	if (ifp == NULL) {
 		log(LOG_INFO, "%s: invalid ifp (sdl_index %d)\n",
 		    __func__, dl->sdl_index);
 		return (EINVAL);
 	}
 
 	/* XXX linked list may be too expensive */
 	LLTABLE_LIST_RLOCK();
 	SLIST_FOREACH(llt, &V_lltables, llt_link) {
 		if (llt->llt_af == dst->sa_family &&
 		    llt->llt_ifp == ifp)
 			break;
 	}
 	LLTABLE_LIST_RUNLOCK();
 	/* The iterator leaves llt NULL when no table matched. */
 	if (llt == NULL)
 		return (ESRCH);
 
 	error = 0;
 
 	switch (rtm->rtm_type) {
 	case RTM_ADD:
 		/* Add static LLE */
 		laflags = 0;
 		if (rtm->rtm_rmx.rmx_expire == 0)
 			laflags = LLE_STATIC;
 		lle = lltable_alloc_entry(llt, laflags, dst);
 		if (lle == NULL)
 			return (ENOMEM);
 
 		linkhdrsize = sizeof(linkhdr);
 		if (lltable_calc_llheader(ifp, dst->sa_family, LLADDR(dl),
 		    linkhdr, &linkhdrsize, &lladdr_off) != 0) {
 			/*
 			 * Do not leak the entry allocated above.  It is
 			 * write-locked first, as on the other paths in this
 			 * file that hand an entry to lltable_free_entry().
 			 */
 			LLE_WLOCK(lle);
 			lltable_free_entry(llt, lle);
 			return (EINVAL);
 		}
 		lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize,
 		    lladdr_off);
 		if ((rtm->rtm_flags & RTF_ANNOUNCE))
 			lle->la_flags |= LLE_PUB;
 		lle->la_expire = rtm->rtm_rmx.rmx_expire;
 
 		/* Remember the flags for use after the entry is unlocked. */
 		laflags = lle->la_flags;
 
 		/* Try to link new entry */
 		lle_tmp = NULL;
 		IF_AFDATA_WLOCK(ifp);
 		LLE_WLOCK(lle);
 		lle_tmp = lla_lookup(llt, LLE_EXCLUSIVE, dst);
 		if (lle_tmp != NULL) {
 			/* Check if we are trying to replace immutable entry */
 			if ((lle_tmp->la_flags & LLE_IFADDR) != 0) {
 				IF_AFDATA_WUNLOCK(ifp);
 				LLE_WUNLOCK(lle_tmp);
 				lltable_free_entry(llt, lle);
 				return (EPERM);
 			}
 			/* Unlink existing entry from table */
 			lltable_unlink_entry(llt, lle_tmp);
 		}
 		/*
 		 * NOTE(review): the link result is not checked here; confirm
 		 * whether a table entry limit can make this fail.
 		 */
 		lltable_link_entry(llt, lle);
 		IF_AFDATA_WUNLOCK(ifp);
 
 		if (lle_tmp != NULL) {
 			EVENTHANDLER_INVOKE(lle_event, lle_tmp,LLENTRY_EXPIRED);
 			lltable_free_entry(llt, lle_tmp);
 		}
 
 		/*
 		 * By invoking LLE handler here we might get
 		 * two events on static LLE entry insertion
 		 * in routing socket. However, since we might have
 		 * other subscribers we need to generate this event.
 		 */
 		EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED);
 		LLE_WUNLOCK(lle);
 #ifdef INET
 		/* gratuitous ARP */
 		if ((laflags & LLE_PUB) && dst->sa_family == AF_INET)
 			arprequest(ifp,
 			    &((struct sockaddr_in *)dst)->sin_addr,
 			    &((struct sockaddr_in *)dst)->sin_addr,
 			    (u_char *)LLADDR(dl));
 #endif
 
 		break;
 
 	case RTM_DELETE:
 		return (lltable_delete_addr(llt, 0, dst));
 
 	default:
 		error = EINVAL;
 	}
 
 	return (error);
 }
 
 #ifdef DDB
 /*
  * Debugger-only view of an entry: the generic llentry followed directly by
  * a sockaddr.  The show routines cast entry pointers to this layout, i.e.
  * they assume the L3 address is stored right behind the llentry
  * (NOTE(review): confirm against the per-AF entry structures).
  */
 struct llentry_sa {
 	struct llentry		base;
 	struct sockaddr		l3_addr;
 };
 
 /*
  * Prints the fields of one entry on the debugger console, followed by its
  * L3 address formatted according to the sockaddr family (IPv4 and IPv6 are
  * decoded when compiled in; anything else prints the family number).
  */
 static void
 llatbl_lle_show(struct llentry_sa *la)
 {
 	struct llentry *lle;
 	uint8_t octet[6];
 
 	lle = &la->base;
 	db_printf("lle=%p\n", lle);
 	db_printf(" lle_next=%p\n", lle->lle_next.cle_next);
 	db_printf(" lle_lock=%p\n", &lle->lle_lock);
 	db_printf(" lle_tbl=%p\n", lle->lle_tbl);
 	db_printf(" lle_head=%p\n", lle->lle_head);
 	db_printf(" la_hold=%p\n", lle->la_hold);
 	db_printf(" la_numheld=%d\n", lle->la_numheld);
 	db_printf(" la_expire=%ju\n", (uintmax_t)lle->la_expire);
 	db_printf(" la_flags=0x%04x\n", lle->la_flags);
 	db_printf(" la_asked=%u\n", lle->la_asked);
 	db_printf(" la_preempt=%u\n", lle->la_preempt);
 	db_printf(" ln_state=%d\n", lle->ln_state);
 	db_printf(" ln_router=%u\n", lle->ln_router);
 	db_printf(" ln_ntick=%ju\n", (uintmax_t)lle->ln_ntick);
 	db_printf(" lle_refcnt=%d\n", lle->lle_refcnt);
 	/* Always shows exactly six bytes of the link-layer address. */
 	bcopy(lle->ll_addr, octet, sizeof(octet));
 	db_printf(" ll_addr=%02x:%02x:%02x:%02x:%02x:%02x\n",
 	    octet[0], octet[1], octet[2], octet[3], octet[4], octet[5]);
 	db_printf(" lle_timer=%p\n", &lle->lle_timer);
 
 	switch (la->l3_addr.sa_family) {
 #ifdef INET
 	case AF_INET:
 	{
 		struct sockaddr_in *sin;
 		char l3s[INET_ADDRSTRLEN];
 
 		sin = (struct sockaddr_in *)&la->l3_addr;
 		inet_ntoa_r(sin->sin_addr, l3s);
 		db_printf(" l3_addr=%s\n", l3s);
 		break;
 	}
 #endif
 #ifdef INET6
 	case AF_INET6:
 	{
 		struct sockaddr_in6 *sin6;
 		char l3s[INET6_ADDRSTRLEN];
 
 		sin6 = (struct sockaddr_in6 *)&la->l3_addr;
 		ip6_sprintf(l3s, &sin6->sin6_addr);
 		db_printf(" l3_addr=%s\n", l3s);
 		break;
 	}
 #endif
 	default:
 		db_printf(" l3_addr=N/A (af=%d)\n", la->l3_addr.sa_family);
 		break;
 	}
 }
 
 /*
  * DDB "show llentry <addr>": prints the entry at the given address, or a
  * usage line when no address was supplied.
  */
 DB_SHOW_COMMAND(llentry, db_show_llentry)
 {
 
 	if (!have_addr) {
 		db_printf("usage: show llentry <struct llentry *>\n");
 		return;
 	}
 
 	llatbl_lle_show((struct llentry_sa *)addr);
 }
 
 /*
  * Prints a one-line summary of @llt and then every entry in each of its
  * hash buckets.  Stops early once the debugger pager has been quit.
  */
 static void
 llatbl_llt_show(struct lltable *llt)
 {
 	int i;
 	struct llentry *lle;
 
 	db_printf("llt=%p llt_af=%d llt_ifp=%p\n",
 	    llt, llt->llt_af, llt->llt_ifp);
 
 	for (i = 0; i < llt->llt_hsize; i++) {
 		CK_LIST_FOREACH(lle, &llt->lle_head[i], lle_next) {
 
 			llatbl_lle_show((struct llentry_sa *)lle);
 			if (db_pager_quit)
 				return;
 		}
 	}
 }
 
 /*
  * DDB "show lltable <addr>": prints the table at the given address with all
  * of its entries, or a usage line when no address was supplied.
  */
 DB_SHOW_COMMAND(lltable, db_show_lltable)
 {
 
 	if (!have_addr) {
 		db_printf("usage: show lltable <struct lltable *>\n");
 		return;
 	}
 
 	llatbl_llt_show((struct lltable *)addr);
 }
 
 /*
  * DDB "show all lltables [addr]": lists every lltable of every VNET.
  * When a non-zero address argument is given, each table is additionally
  * dumped in full.  No list lock is taken here.
  */
 DB_SHOW_ALL_COMMAND(lltables, db_show_all_lltables)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 	struct lltable *llt;
 
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET_QUIET(vnet_iter);
 #ifdef VIMAGE
 		db_printf("vnet=%p\n", curvnet);
 #endif
 		SLIST_FOREACH(llt, &V_lltables, llt_link) {
 			db_printf("llt=%p llt_af=%d llt_ifp=%p(%s)\n",
 			    llt, llt->llt_af, llt->llt_ifp,
 			    (llt->llt_ifp != NULL) ?
 				llt->llt_ifp->if_xname : "?");
 			if (have_addr && addr != 0) /* verbose */
 				llatbl_llt_show(llt);
 			/* Restore the VNET context before bailing out. */
 			if (db_pager_quit) {
 				CURVNET_RESTORE();
 				return;
 			}
 		}
 		CURVNET_RESTORE();
 	}
 }
 #endif
Index: head/sys/net/route/nhop_ctl.c
===================================================================
--- head/sys/net/route/nhop_ctl.c	(revision 361703)
+++ head/sys/net/route/nhop_ctl.c	(revision 361704)
@@ -1,837 +1,838 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2020 Alexander V. Chernikov
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 #include "opt_inet.h"
 #include "opt_route.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/malloc.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/kernel.h>
 #include <sys/epoch.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
+#include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 #include <net/route/nhop_utils.h>
 #include <net/route/nhop.h>
 #include <net/route/nhop_var.h>
 #include <net/route/shared.h>
 #include <net/vnet.h>
 
 /*
  * This file contains core functionality for the nexthop ("nhop") route subsystem.
  * The business logic needed to create nexthop objects is implemented here.
  *
  * Nexthops in the original sense are the objects containing all the necessary
  * information to forward the packet to the selected destination.
  * In particular, nexthop is defined by a combination of
  *  ifp, ifa, aifp, mtu, gw addr(if set), nh_type, nh_family, mask of rt_flags and
  *    NHF_DEFAULT
  *
  * Additionally, each nexthop gets assigned its unique index (nexthop index).
  * It serves two purposes: first one is to ease the ability of userland programs to
  *  reference nexthops by their index. The second one allows lookup algorithms
  *  to store an index instead of a pointer (2 bytes vs 8) as a lookup result.
  * All nexthops are stored in the resizable hash table.
  *
  * Basically, this file revolves around supporting 3 functions:
  * 1) nhop_create_from_info / nhop_create_from_nhop, which contains all
  *  business logic on filling the nexthop fields based on the provided request.
  * 2) nhop_get(), which gets a usable referenced nexthop.
  *
  * Conventions:
  * 1) non-exported functions start with verb
  * 2) exported function starts with the subsystem prefix: "nhop"
  */
 
 /* Forward declarations of file-local helpers. */
 static int dump_nhop_entry(struct rib_head *rh, struct nhop_object *nh, struct sysctl_req *w);
 
 static struct nhop_priv *alloc_nhop_structure(void);
 static int get_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
     struct nhop_priv **pnh_priv);
 static int finalize_nhop(struct nh_control *ctl, struct rt_addrinfo *info,
     struct nhop_priv *nh_priv);
 static struct ifnet *get_aifp(const struct nhop_object *nh, int reference);
 static void fill_sdl_from_ifp(struct sockaddr_dl_short *sdl, const struct ifnet *ifp);
 
 static void destroy_nhop_epoch(epoch_context_t ctx);
 static void destroy_nhop(struct nhop_priv *nh_priv);
 
 static void print_nhop(const char *prefix, const struct nhop_object *nh);
 
 /* Compile-time checks pinning the nhop_object layout and size. */
 _Static_assert(__offsetof(struct nhop_object, nh_ifp) == 32,
     "nhop_object: wrong nh_ifp offset");
 _Static_assert(sizeof(struct nhop_object) <= 128,
     "nhop_object: size exceeds 128 bytes");
 
 static uma_zone_t nhops_zone;	/* Global zone for each and every nexthop */
 
 
 /*
  * Sizes of the public and private nexthop parts, each rounded up to a
  * multiple of two cache lines.  A zone item is the sum of both.
  */
 #define	NHOP_OBJECT_ALIGNED_SIZE	roundup2(sizeof(struct nhop_object), \
 							2 * CACHE_LINE_SIZE)
 #define	NHOP_PRIV_ALIGNED_SIZE		roundup2(sizeof(struct nhop_priv), \
 							2 * CACHE_LINE_SIZE)
 /*
  * Creates the global nexthop zone.  Each item is large enough to hold the
  * aligned nhop_object plus the aligned nhop_priv.
  */
 void
 nhops_init(void)
 {
 
 	nhops_zone = uma_zcreate("routing nhops",
 	    NHOP_OBJECT_ALIGNED_SIZE + NHOP_PRIV_ALIGNED_SIZE,
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 }
 
 /*
  * Fetches the interface of the source address used by the route.
  *
  * Normally this is simply the transmit interface (nh_ifp).  When the
  * transmit interface is a loopback and the stored gateway is an AF_LINK
  * sockaddr, the interface named by the gateway's sdl_index is looked up
  * instead, so that interface-address routes report the real interface
  * rather than the loopback.  This is needed to support link-local IPv6
  * loopback communications.  If that lookup fails, nh_ifp is used.
  *
  * If @reference is non-zero, the returned ifp is referenced.
  *
  * Returns found ifp.
  */
 static struct ifnet *
 get_aifp(const struct nhop_object *nh, int reference)
 {
 	struct ifnet *aifp;
 
 	aifp = NULL;
 	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0 &&
 	    nh->gw_sa.sa_family == AF_LINK) {
 		aifp = reference ? ifnet_byindex_ref(nh->gwl_sa.sdl_index) :
 		    ifnet_byindex(nh->gwl_sa.sdl_index);
 		if (aifp == NULL) {
 			DPRINTF("unable to get aifp for %s index %d",
 				if_name(nh->nh_ifp), nh->gwl_sa.sdl_index);
 		}
 	}
 
 	if (aifp != NULL)
 		return (aifp);
 
 	/* Fall back to the transmit interface. */
 	aifp = nh->nh_ifp;
 	if (reference)
 		if_ref(aifp);
 
 	return (aifp);
 }
 
 /*
  * Compares two nexthops for equality.
  *
  * Returns 1 when the first NHOP_END_CMP bytes of both nhop objects are
  * identical and nh_type and nh_family match; returns 0 otherwise.
  */
 int
 cmp_priv(const struct nhop_priv *_one, const struct nhop_priv *_two)
 {
 
 	return (memcmp(_one->nh, _two->nh, NHOP_END_CMP) == 0 &&
 	    _one->nh_type == _two->nh_type &&
 	    _one->nh_family == _two->nh_family);
 }
 
 /*
  * Conditionally sets @nh mtu data based on the @info data.
  *
  * Nothing happens unless RTV_MTU is set in info->rti_mflags.  Otherwise
  * nh_mtu is taken from the request, and RTF_FIXEDMTU is set for a non-zero
  * MTU or cleared for a zero one.
  */
 static void
 set_nhop_mtu_from_info(struct nhop_object *nh, const struct rt_addrinfo *info)
 {
 
 	if ((info->rti_mflags & RTV_MTU) == 0)
 		return;
 
 	if (info->rti_rmx->rmx_mtu == 0) {
 		/* User explicitly sets MTU to 0: assume rollback to default. */
 		nh->nh_priv->rt_flags &= ~RTF_FIXEDMTU;
 	} else {
 		/* MTU was explicitly provided by user: keep it. */
 		nh->nh_priv->rt_flags |= RTF_FIXEDMTU;
 	}
 	nh->nh_mtu = info->rti_rmx->rmx_mtu;
 }
 
 /*
  * Fills in the shortened link-level sockaddr version suitable to be stored
  * inside the nexthop gateway buffer.
  *
  * Only the family, length, interface index and interface type are written;
  * the remaining bytes of @sdl are left as they were.
  */
 static void
 fill_sdl_from_ifp(struct sockaddr_dl_short *sdl, const struct ifnet *ifp)
 {
 
 	sdl->sdl_family = AF_LINK;
 	sdl->sdl_len = sizeof(struct sockaddr_dl_short);
 	sdl->sdl_index = ifp->if_index;
 	sdl->sdl_type = ifp->if_type;
 }
 
 /*
  * Stores the gateway of @nh based on the request in @info.
  *
  * Gateway routes (RTF_GATEWAY) copy the supplied gateway sockaddr; a
  * sockaddr longer than struct sockaddr_in6 is rejected with ENOMEM.
  * Interface routes store a short AF_LINK sockaddr built from nh->nh_ifp.
  *
  * Returns 0 on success.
  */
 static int
 set_nhop_gw_from_info(struct nhop_object *nh, struct rt_addrinfo *info)
 {
 	struct sockaddr *gw;
 
 	if ((info->rti_flags & RTF_GATEWAY) == 0) {
 		/*
 		 * Interface route.  Currently the route.c code adds an sa
 		 * of type AF_LINK, which is 56 bytes long.  The only
 		 * meaningful data there is the interface index.  It is
 		 * used in the IPv6 loopback output, where we need to
 		 * preserve the original interface to maintain proper
 		 * scoping.  Despite the fact that the nexthop code stores
 		 * the original interface in a separate field (nh_aifp),
 		 * write an AF_LINK compatible sa with shorter total length.
 		 */
 		fill_sdl_from_ifp(&nh->gwl_sa, nh->nh_ifp);
 		return (0);
 	}
 
 	gw = info->rti_info[RTAX_GATEWAY];
 	if (gw->sa_len > sizeof(struct sockaddr_in6)) {
 		DPRINTF("nhop SA size too big: AF %d len %u",
 		    gw->sa_family, gw->sa_len);
 		return (ENOMEM);
 	}
 	memcpy(&nh->gw_sa, gw, gw->sa_len);
 
 	return (0);
 }
 
 /*
  * Fills the nexthop behind @nh_priv from the route request in @info:
  * route flags (masked by NHOP_RT_FLAG_MASK), address family of the
  * destination, nexthop flags, MTU, interface, interface address, source
  * interface and gateway.
  *
  * Returns 0 on success or the error from set_nhop_gw_from_info().
  */
 static int
 fill_nhop_from_info(struct nhop_priv *nh_priv, struct rt_addrinfo *info)
 {
 	int error, rt_flags;
 	struct nhop_object *nh;
 
 	nh = nh_priv->nh;
 
 	rt_flags = info->rti_flags & NHOP_RT_FLAG_MASK;
 
 	nh->nh_priv->rt_flags = rt_flags;
 	nh_priv->nh_family = info->rti_info[RTAX_DST]->sa_family;
 	nh_priv->nh_type = 0; // hook responsibility to set nhop type
 
 	nh->nh_flags = fib_rte_to_nh_flags(rt_flags);
 	set_nhop_mtu_from_info(nh, info);
 	nh->nh_ifp = info->rti_ifa->ifa_ifp;
 	nh->nh_ifa = info->rti_ifa;
 	/*
 	 * NOTE(review): get_aifp() inspects nh->gw_sa, which is only written
 	 * by set_nhop_gw_from_info() below - confirm that the gateway data
 	 * it sees at this point is what is intended.
 	 */
 	nh->nh_aifp = get_aifp(nh, 0);
 
 	/* Needs nh->nh_ifp (set above) for interface routes. */
 	if ((error = set_nhop_gw_from_info(nh, info)) != 0)
 		return (error);
 
 	/*
 	 * Note some of the remaining data is set by the
 	 * per-address-family pre-add hook.
 	 */
 
 	return (0);
 }
 
 /*
  * Creates a new nexthop based on the information in @info.
  *
  * Returns:
  * 0 on success, filling @nh_ret with the desired nexthop object ptr
  * errno otherwise
  */
 int
 nhop_create_from_info(struct rib_head *rnh, struct rt_addrinfo *info,
     struct nhop_object **nh_ret)
 {
 	struct nhop_priv *nh_priv;
 	int error;
 
 	NET_EPOCH_ASSERT();
 
 	nh_priv = alloc_nhop_structure();
 
 	error = fill_nhop_from_info(nh_priv, info);
 	if (error != 0) {
 		uma_zfree(nhops_zone, nh_priv->nh);
 		return (error);
 	}
 
 	error = get_nhop(rnh, info, &nh_priv);
 	if (error == 0)
 		*nh_ret = nh_priv->nh;
 
 	return (error);
 }
 
 /*
  * Gets linked nhop using the provided @pnh_priv nexthop data.
  * If linked nhop is found, returns it, freeing the provided one.
  * If there is no such nexthop, attaches the remaining data to the
  *  provided nexthop and links it.
  *
  * Returns 0 on success, storing referenced nexthop in @pnh_priv.
  * Otherwise, errno is returned and the provided nexthop is freed.
  */
 static int
 get_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
     struct nhop_priv **pnh_priv)
 {
 	const struct sockaddr *dst, *netmask;
 	struct nhop_priv *nh_priv, *tmp_priv;
 	int error;
 
 	nh_priv = *pnh_priv;
 
 	/* Give the protocols chance to augment the request data */
 	dst = info->rti_info[RTAX_DST];
 	netmask = info->rti_info[RTAX_NETMASK];
 
 	error = rnh->rnh_preadd(rnh->rib_fibnum, dst, netmask, nh_priv->nh);
 	if (error != 0) {
 		uma_zfree(nhops_zone, nh_priv->nh);
 		return (error);
 	}
 
 	tmp_priv = find_nhop(rnh->nh_control, nh_priv);
 	if (tmp_priv != NULL) {
 		/* Equal nexthop is already linked: use it instead */
 		uma_zfree(nhops_zone, nh_priv->nh);
 		*pnh_priv = tmp_priv;
 		return (0);
 	}
 
 	/*
 	 * Existing nexthop not found, need to create new one.
 	 * Note: multiple simultaneous get_nhop() requests
 	 *  can result in multiple equal nexthops existing in the
 	 *  nexthop table. This is not a problem until the
 	 *  relative number of such nexthops is significant, which
 	 *  is extremely unlikely.
 	 */
 
 	/* finalize_nhop() frees nh_priv in case of error */
 	return (finalize_nhop(rnh->nh_control, info, nh_priv));
 }
 
 /*
  * Applies the changes requested in @info to the nexthop @nh.
  * Used as a helper when handling route changes.
  *
  * Only the following parts of the nexthop can be altered:
  * 1) gateway data (gw, interface, blackhole/reject)
  * 2) route flags (FLAG[123],STATIC,BLACKHOLE,REJECT)
  * 3) route MTU
  *
  * Returns:
  * 0 on success, errno from set_nhop_gw_from_info() otherwise
  */
 static int
 alter_nhop_from_info(struct nhop_object *nh, struct rt_addrinfo *info)
 {
 	int error;
 
 	/* Update MTU if set in the request*/
 	set_nhop_mtu_from_info(nh, info);
 
 	/* XXX: allow only one of BLACKHOLE,REJECT,GATEWAY */
 
 	/* Replace the toggleable flags (FLAG1,STATIC,BLACKHOLE,REJECT). */
 	nh->nh_priv->rt_flags = (nh->nh_priv->rt_flags & ~RTF_FMASK) |
 	    (info->rti_flags & RTF_FMASK);
 
 	/* Consider gateway change */
 	if (info->rti_info[RTAX_GATEWAY] != NULL) {
 		error = set_nhop_gw_from_info(nh, info);
 		if (error != 0)
 			return (error);
 		/* Make RTF_GATEWAY follow the request */
 		nh->nh_priv->rt_flags =
 		    (nh->nh_priv->rt_flags & ~RTF_GATEWAY) |
 		    (info->rti_flags & RTF_GATEWAY);
 	}
 
 	/* Update datapath flags */
 	nh->nh_flags = fib_rte_to_nh_flags(nh->nh_priv->rt_flags);
 
 	if (info->rti_ifa != NULL)
 		nh->nh_ifa = info->rti_ifa;
 	if (info->rti_ifp != NULL)
 		nh->nh_ifp = info->rti_ifp;
 	nh->nh_aifp = get_aifp(nh, 0);
 
 	return (0);
 }
 
 /*
  * Creates new nexthop based on @nh_orig and augmentation data from @info.
  * Helper function used in the route changes, please see
  *   alter_nhop_from_info() comments for more details.
  *
  * Returns:
  * 0 on success, filling @nh_ret with the desired nexthop object
  * errno otherwise
  */
 int
 nhop_create_from_nhop(struct rib_head *rnh, const struct nhop_object *nh_orig,
     struct rt_addrinfo *info, struct nhop_object **pnh)
 {
 	struct nhop_priv *nh_priv;
 	struct nhop_object *nh;
 	int error;
 
 	NET_EPOCH_ASSERT();
 
 	nh_priv = alloc_nhop_structure();
 	nh = nh_priv->nh;
 
 	/* Start with copying data from original nexthop */
 	nh_priv->nh_family = nh_orig->nh_priv->nh_family;
 	nh_priv->rt_flags = nh_orig->nh_priv->rt_flags;
 	nh_priv->nh_type = nh_orig->nh_priv->nh_type;
 
 	nh->nh_ifp = nh_orig->nh_ifp;
 	nh->nh_ifa = nh_orig->nh_ifa;
 	nh->nh_aifp = nh_orig->nh_aifp;
 	nh->nh_mtu = nh_orig->nh_mtu;
 	nh->nh_flags = nh_orig->nh_flags;
 	memcpy(&nh->gw_sa, &nh_orig->gw_sa, nh_orig->gw_sa.sa_len);
 
 	error = alter_nhop_from_info(nh, info);
 	if (error != 0) {
 		uma_zfree(nhops_zone, nh_priv->nh);
 		return (error);
 	}
 
 	error = get_nhop(rnh, info, &nh_priv);
 	if (error == 0)
 		*pnh = nh_priv->nh;
 
 	return (error);
 }
 
 /*
  * Allocates memory for public/private nexthop structures.
  *
  * Both structures come from a single zeroed nhops_zone item: the private
  * part is placed NHOP_OBJECT_ALIGNED_SIZE bytes after the start of the
  * public one, and the two are cross-linked before returning.
  *
  * Returns pointer to nhop_priv or NULL.
  */
 static struct nhop_priv *
 alloc_nhop_structure(void)
 {
 	struct nhop_object *nh;
 	struct nhop_priv *nh_priv;
 
 	nh = uma_zalloc(nhops_zone, M_NOWAIT | M_ZERO);
 	if (nh == NULL)
 		return (NULL);
 	nh_priv = (struct nhop_priv *)((char *)nh + NHOP_OBJECT_ALIGNED_SIZE);
 
 	nh->nh_priv = nh_priv;
 	nh_priv->nh = nh;
 
 	return (nh_priv);
 }
 
 /*
  * Allocates/references the remaining bits of nexthop data and links
  *  it to the hash table.
  * Returns 0 if successful,
  *  errno otherwise. @nh_priv is freed in case of error.
  *
  * Error values visible here: ENOMEM when the packet counter cannot be
  *  allocated, ENOBUFS when link_nhop() reports failure (returns 0).
  */
 static int
 finalize_nhop(struct nh_control *ctl, struct rt_addrinfo *info,
     struct nhop_priv *nh_priv)
 {
 	struct nhop_object *nh;
 
 	nh = nh_priv->nh;
 
 	/* Allocate per-cpu packet counter */
 	nh->nh_pksent = counter_u64_alloc(M_NOWAIT);
 	if (nh->nh_pksent == NULL) {
 		/* Nothing is referenced yet, so the zone item is freed directly */
 		uma_zfree(nhops_zone, nh);
 		RTSTAT_INC(rts_nh_alloc_failure);
 		DPRINTF("nh_alloc_finalize failed");
 		return (ENOMEM);
 	}
 
 	/* Save vnet to ease destruction */
 	nh_priv->nh_vnet = curvnet;
 
 	/* Reference external objects and calculate (referenced) ifa */
 	if_ref(nh->nh_ifp);
 	ifa_ref(nh->nh_ifa);
 	nh->nh_aifp = get_aifp(nh, 1);
 	DPRINTF("AIFP: %p nh_ifp %p", nh->nh_aifp, nh->nh_ifp);
 
 	refcount_init(&nh_priv->nh_refcnt, 1);
 
 	/* Please see nhop_free() comments on the initial value */
 	refcount_init(&nh_priv->nh_linked, 2);
 
 	print_nhop("FINALIZE", nh);
 
 	if (link_nhop(ctl, nh_priv) == 0) {
 
 		/*
 		 * Adding nexthop to the datastructures
 		 *  failed. Call destructor w/o waiting for
 		 *  the epoch end, as nexthop is not used
 		 *  and return.
 		 */
 		DPRINTF("link_nhop failed!");
 		destroy_nhop(nh_priv);
 
 		return (ENOBUFS);
 	}
 
 	return (0);
 }
 
 /*
  * Writes a printable form of @sa into @buf (of @buflen bytes):
  * the address for AF_INET/AF_INET6, "if#<index>" for AF_LINK and
  * "af:<family>" for anything else.
  */
 static void
 print_nhop_sa(char *buf, size_t buflen, const struct sockaddr *sa)
 {
 
 	switch (sa->sa_family) {
 	case AF_INET:
 		inet_ntop(AF_INET,
 		    &((const struct sockaddr_in *)sa)->sin_addr, buf, buflen);
 		break;
 	case AF_INET6:
 		inet_ntop(AF_INET6,
 		    &((const struct sockaddr_in6 *)sa)->sin6_addr, buf,
 		    buflen);
 		break;
 	case AF_LINK:
 		snprintf(buf, buflen, "if#%d",
 		    ((const struct sockaddr_dl *)sa)->sdl_index);
 		break;
 	default:
 		snprintf(buf, buflen, "af:%d", sa->sa_family);
 		break;
 	}
 }
 
 /*
  * Emits a one-line debug description of @nh via DPRINTF(), tagged
  * with @prefix.
  */
 static void
 print_nhop(const char *prefix, const struct nhop_object *nh)
 {
 	char ifa_str[INET6_ADDRSTRLEN];
 	char gw_str[INET6_ADDRSTRLEN];
 
 	/* Render the interface (source) address and the gateway */
 	print_nhop_sa(ifa_str, sizeof(ifa_str), nh->nh_ifa->ifa_addr);
 	print_nhop_sa(gw_str, sizeof(gw_str), &nh->gw_sa);
 
 	DPRINTF("%s nhop priv %p: AF %d ifp %p %s addr %s src %p %s aifp %p %s mtu %d nh_flags %X",
 	    prefix, nh->nh_priv, nh->nh_priv->nh_family, nh->nh_ifp,
 	    if_name(nh->nh_ifp), gw_str, nh->nh_ifa, ifa_str, nh->nh_aifp,
 	    if_name(nh->nh_aifp), nh->nh_mtu, nh->nh_flags);
 }
 
 /*
  * Releases everything held by the nexthop behind @nh_priv
  * (interface references, ifa reference, packet counter) and
  * returns the nexthop memory to nhops_zone.
  */
 static void
 destroy_nhop(struct nhop_priv *nh_priv)
 {
 	struct nhop_object *nh = nh_priv->nh;
 
 	print_nhop("DEL", nh);
 
 	/* Drop external references */
 	if_rele(nh->nh_ifp);
 	if_rele(nh->nh_aifp);
 	ifa_free(nh->nh_ifa);
 
 	counter_u64_free(nh->nh_pksent);
 	uma_zfree(nhops_zone, nh);
 }
 
 /*
  * Epoch callback: recovers the nexthop from its embedded epoch
  * context @ctx and destroys it.
  */
 static void
 destroy_nhop_epoch(epoch_context_t ctx)
 {
 
 	destroy_nhop(__containerof(ctx, struct nhop_priv, nh_epoch_ctx));
 }
 
 int
 nhop_ref_object(struct nhop_object *nh)
 {
 
 	return (refcount_acquire_if_not_zero(&nh->nh_priv->nh_refcnt));
 }
 
 /*
  * Releases a reference on @nh. When the last reference is dropped,
  * the nexthop is unlinked (if still linked) and its destruction is
  * scheduled via epoch_call().
  */
 void
 nhop_free(struct nhop_object *nh)
 {
 	struct nh_control *ctl;
 	struct nhop_priv *nh_priv = nh->nh_priv;
 	struct epoch_tracker et;
 
 	/* Not the last reference: nothing more to do */
 	if (!refcount_release(&nh_priv->nh_refcnt))
 		return;
 
 	/*
 	 * There are only 2 places, where nh_linked can be decreased:
 	 *  rib destroy (nhops_destroy_rib) and this function.
 	 * nh_linked can never be increased.
 	 *
 	 * Hence, use initial value of 2 to make use of
 	 *  refcount_release_if_not_last().
 	 *
 	 * There can be two scenarios when calling this function:
 	 *
 	 * 1) nh_linked value is 2. This means that either
 	 *  nhops_destroy_rib() has not been called OR it is running,
 	 *  but we are guaranteed that nh_control won't be freed in
 	 *  this epoch. Hence, nexthop can be safely unlinked.
 	 *
 	 * 2) nh_linked value is 1. In that case, nhops_destroy_rib()
 	 *  has been called and nhop unlink can be skipped.
 	 */
 
 	NET_EPOCH_ENTER(et);
 	if (refcount_release_if_not_last(&nh_priv->nh_linked)) {
 		ctl = nh_priv->nh_control;
 		if (unlink_nhop(ctl, nh_priv) == NULL) {
 			/* Do not try to reclaim */
 			DPRINTF("Failed to unlink nexhop %p", nh_priv);
 			NET_EPOCH_EXIT(et);
 			return;
 		}
 	}
 	NET_EPOCH_EXIT(et);
 
 	/* Defer the actual destruction to destroy_nhop_epoch() */
 	epoch_call(net_epoch_preempt, destroy_nhop_epoch,
 	    &nh_priv->nh_epoch_ctx);
 }
 
 /*
  * Acquires a reference on @nh; forwards to nhop_ref_object().
  */
 int
 nhop_ref_any(struct nhop_object *nh)
 {
 	int acquired;
 
 	acquired = nhop_ref_object(nh);
 
 	return (acquired);
 }
 
 /*
  * Releases a reference on @nh; forwards to nhop_free().
  */
 void
 nhop_free_any(struct nhop_object *nh)
 {
 
 	nhop_free(nh);
 	return;
 }
 
 
 /* Helper functions */
 
 uint32_t
 nhop_get_idx(const struct nhop_object *nh)
 {
 
 	return (nh->nh_priv->nh_idx);
 }
 
 enum nhop_type
 nhop_get_type(const struct nhop_object *nh)
 {
 
 	return (nh->nh_priv->nh_type);
 }
 
 void
 nhop_set_type(struct nhop_object *nh, enum nhop_type nh_type)
 {
 
 	nh->nh_priv->nh_type = nh_type;
 }
 
 int
 nhop_get_rtflags(const struct nhop_object *nh)
 {
 
 	return (nh->nh_priv->rt_flags);
 }
 
 void
 nhop_set_rtflags(struct nhop_object *nh, int rt_flags)
 {
 
 	nh->nh_priv->rt_flags = rt_flags;
 }
 
 struct vnet *
 nhop_get_vnet(const struct nhop_object *nh)
 {
 
 	return (nh->nh_priv->nh_vnet);
 }
 
 /*
  * Walks all nexthops of @rh under the nexthop write lock and sets the
  * MTU of those using @ifp to @mtu. Nexthops flagged RTF_FIXEDMTU are
  * only touched when their current MTU is larger than @mtu.
  */
 void
 nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu)
 {
 	struct nh_control *ctl = rh->nh_control;
 	struct nhop_priv *nh_priv;
 	struct nhop_object *nh;
 
 	NHOPS_WLOCK(ctl);
 	CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) {
 		nh = nh_priv->nh;
 		if (nh->nh_ifp == ifp &&
 		    ((nh_priv->rt_flags & RTF_FIXEDMTU) == 0 ||
 		    nh->nh_mtu > mtu)) {
 			/* Update MTU directly */
 			nh->nh_mtu = mtu;
 		}
 	} CHT_SLIST_FOREACH_END;
 	NHOPS_WUNLOCK(ctl);
 
 }
 
 /*
  * Dumps a single entry to sysctl buffer.
  *
  * Layout:
  *  rt_msghdr - generic RTM header to allow users to skip non-understood messages
  *  nhop_external - nexthop description structure (with length)
  *  nhop_addrs - structure encapsulating GW/SRC sockaddrs
  *
  * The fixed-size part above is followed by the gateway sockaddr and
  *  then the source sockaddr; their offsets are recorded in nhop_addrs.
  *
  * Returns 0 on success or the first error reported by SYSCTL_OUT().
  */
 static int
 dump_nhop_entry(struct rib_head *rh, struct nhop_object *nh, struct sysctl_req *w)
 {
 	struct {
 		struct rt_msghdr	rtm;
 		struct nhop_external	nhe;
 		struct nhop_addrs	na;
 	} arpc;
 	struct nhop_external *pnhe;
 	struct sockaddr *gw_sa, *src_sa;
 	struct sockaddr_storage ss;
 	size_t addrs_len;
 	int error;
 
 	//DPRINTF("Dumping: head %p nh %p flags %X req %p\n", rh, nh, nh->nh_flags, w);
 
 	memset(&arpc, 0, sizeof(arpc));
 
 	/* Start with the fixed part; sockaddr lengths are added below */
 	arpc.rtm.rtm_msglen = sizeof(arpc);
 	arpc.rtm.rtm_version = RTM_VERSION;
 	arpc.rtm.rtm_type = RTM_GET;
 	//arpc.rtm.rtm_flags = RTF_UP;
 	arpc.rtm.rtm_flags = nh->nh_priv->rt_flags;
 
 	/* nhop_external */
 	pnhe = &arpc.nhe;
 	pnhe->nh_len = sizeof(struct nhop_external);
 	pnhe->nh_idx = nh->nh_priv->nh_idx;
 	pnhe->nh_fib = rh->rib_fibnum;
 	pnhe->ifindex = nh->nh_ifp->if_index;
 	pnhe->aifindex = nh->nh_aifp->if_index;
 	pnhe->nh_family = nh->nh_priv->nh_family;
 	pnhe->nh_type = nh->nh_priv->nh_type;
 	pnhe->nh_mtu = nh->nh_mtu;
 	pnhe->nh_flags = nh->nh_flags;
 
 	memcpy(pnhe->nh_prepend, nh->nh_prepend, sizeof(nh->nh_prepend));
 	pnhe->prepend_len = nh->nh_prepend_len;
 	pnhe->nh_refcount = nh->nh_priv->nh_refcnt;
 	pnhe->nh_pksent = counter_u64_fetch(nh->nh_pksent);
 
 	/* sockaddr container */
 	/* Gateway sockaddr goes right after the nhop_addrs header */
 	addrs_len = sizeof(struct nhop_addrs);
 	arpc.na.gw_sa_off = addrs_len;
 	gw_sa = (struct sockaddr *)&nh->gw4_sa;
 	addrs_len += gw_sa->sa_len;
 
 	src_sa = nh->nh_ifa->ifa_addr;
 	if (src_sa->sa_family == AF_LINK) {
 		/* Shorten structure */
 		memset(&ss, 0, sizeof(struct sockaddr_storage));
 		fill_sdl_from_ifp((struct sockaddr_dl_short *)&ss,
 		    nh->nh_ifa->ifa_ifp);
 		src_sa = (struct sockaddr *)&ss;
 	}
 	/* Source sockaddr follows the gateway one */
 	arpc.na.src_sa_off = addrs_len;
 	addrs_len += src_sa->sa_len;
 
 	/* Write total container length */
 	arpc.na.na_len = addrs_len;
 
 	/* Account for the two trailing sockaddrs in the message length */
 	arpc.rtm.rtm_msglen += arpc.na.na_len - sizeof(struct nhop_addrs);
 
 	/* Emit the fixed part, then gateway, then source; stop on error */
 	error = SYSCTL_OUT(w, &arpc, sizeof(arpc));
 	if (error == 0)
 		error = SYSCTL_OUT(w, gw_sa, gw_sa->sa_len);
 	if (error == 0)
 		error = SYSCTL_OUT(w, src_sa, src_sa->sa_len);
 
 	return (error);
 }
 
 /*
  * Dumps every nexthop of @rh into the sysctl request @w using
  * dump_nhop_entry(), holding the nexthop read lock for the whole walk.
  *
  * Returns 0 on success or the first error returned by dump_nhop_entry().
  */
 int
 nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
 {
 	struct nh_control *ctl;
 	struct nhop_priv *nh_priv;
 	int error;
 
 	ctl = rh->nh_control;
 
 	NHOPS_RLOCK(ctl);
 	DPRINTF("NHDUMP: count=%u", ctl->nh_head.items_count);
 	CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) {
 		error = dump_nhop_entry(rh, nh_priv->nh, w);
 		if (error != 0) {
 			/* Drop the lock before bailing out of the walk */
 			NHOPS_RUNLOCK(ctl);
 			return (error);
 		}
 	} CHT_SLIST_FOREACH_END;
 	NHOPS_RUNLOCK(ctl);
 
 	return (0);
 }
 
Index: head/sys/net/route/route_ctl.c
===================================================================
--- head/sys/net/route/route_ctl.c	(revision 361703)
+++ head/sys/net/route/route_ctl.c	(revision 361704)
@@ -1,584 +1,762 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2020 Alexander V. Chernikov
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_mpath.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/vnet.h>
 #include <net/route.h>
+#include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 #include <net/route/nhop_utils.h>
 #include <net/route/nhop.h>
 #include <net/route/nhop_var.h>
 #include <net/route/shared.h>
 #include <netinet/in.h>
 
 #ifdef RADIX_MPATH
 #include <net/radix_mpath.h>
 #endif
 
 #include <vm/uma.h>
 
 
 /*
  * This file contains control plane routing tables functions.
  *
  * All functions assumes they are called in net epoch.
  */
 
+static void rib_notify(struct rib_head *rnh, enum rib_subscription_type type,
+    struct rib_cmd_info *rc);
+
 static void rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info);
 
+/*
+ * Returns the routing table head for @fibnum and the address family of
+ * the destination in @info->rti_info[RTAX_DST], as reported by
+ * rt_tables_get_rnh(). Callers in this file treat a NULL result as
+ * EAFNOSUPPORT.
+ */
+static struct rib_head *
+get_rnh(uint32_t fibnum, const struct rt_addrinfo *info)
+{
+	struct sockaddr *dst;
+
+	/* Shared by add/delete/change, so report the actual function name */
+	KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__));
+
+	dst = info->rti_info[RTAX_DST];
+
+	return (rt_tables_get_rnh(fibnum, dst->sa_family));
+}
+
+/*
+ * Adds the route described by @info to the kernel table selected by
+ * @fibnum and the sa_family of @info->rti_info[RTAX_DST].
+ *
+ * Returns 0 on success, with the operation metadata stored in @rc.
+ * Returns EAFNOSUPPORT if there is no such table and EINVAL if a
+ * non-host route comes without a netmask.
+ */
 int
+rib_add_route(uint32_t fibnum, struct rt_addrinfo *info,
+    struct rib_cmd_info *rc)
+{
+	struct rib_head *rnh;
+
+	NET_EPOCH_ASSERT();
+
+	if ((rnh = get_rnh(fibnum, info)) == NULL)
+		return (EAFNOSUPPORT);
+
+	/*
+	 * Keep RTF_HOST and the netmask consistent: host routes carry
+	 * no netmask, all other routes must have one.
+	 */
+	if ((info->rti_flags & RTF_HOST) != 0)
+		info->rti_info[RTAX_NETMASK] = NULL;
+	else if (info->rti_info[RTAX_NETMASK] == NULL)
+		return (EINVAL);
+
+	bzero(rc, sizeof(*rc));
+	rc->rc_cmd = RTM_ADD;
+
+	return (add_route(rnh, info, rc));
+}
+
+/*
+ * Adds the route described by @info to @rnh, creating the nexthop and
+ * the rtentry, and notifies RIB_NOTIFY_IMMEDIATE subscribers with @rc.
+ *
+ * If the prefix already exists and RTF_PINNED is requested, the existing
+ * (non-pinned) route is unlinked and the addition is retried; @rc then
+ * describes an RTM_CHANGE, or an RTM_DELETE of the old route if the
+ * retry fails as well.
+ *
+ * Returns 0 on success, EEXIST if the route could not be linked,
+ * errno otherwise.
+ */
+int
 add_route(struct rib_head *rnh, struct rt_addrinfo *info,
-    struct rtentry **ret_nrt)
+    struct rib_cmd_info *rc)
 {
 	struct sockaddr *dst, *ndst, *gateway, *netmask;
 	struct rtentry *rt, *rt_old;
 	struct nhop_object *nh;
 	struct radix_node *rn;
 	struct ifaddr *ifa;
 	int error, flags;
 	struct epoch_tracker et;
 
 	dst = info->rti_info[RTAX_DST];
 	gateway = info->rti_info[RTAX_GATEWAY];
 	netmask = info->rti_info[RTAX_NETMASK];
 	flags = info->rti_flags;
 
 	if ((flags & RTF_GATEWAY) && !gateway)
 		return (EINVAL);
 	if (dst && gateway && (dst->sa_family != gateway->sa_family) &&
 	    (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK))
 		return (EINVAL);
 
 	if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb))
 		return (EINVAL);
 
 	if (info->rti_ifa == NULL) {
 		error = rt_getifa_fib(info, rnh->rib_fibnum);
 		if (error)
 			return (error);
 	} else {
 		ifa_ref(info->rti_ifa);
 	}
 
 	NET_EPOCH_ENTER(et);
 	error = nhop_create_from_info(rnh, info, &nh);
 	NET_EPOCH_EXIT(et);
 	if (error != 0) {
 		ifa_free(info->rti_ifa);
 		return (error);
 	}
 
 	rt = uma_zalloc(V_rtzone, M_NOWAIT);
 	if (rt == NULL) {
 		ifa_free(info->rti_ifa);
 		nhop_free(nh);
 		return (ENOBUFS);
 	}
 	rt->rt_flags = RTF_UP | flags;
 	rt->rt_nhop = nh;
 
 	/* Fill in dst */
 	memcpy(&rt->rt_dst, dst, dst->sa_len);
 	rt_key(rt) = &rt->rt_dst;
 
 	/*
 	 * point to the (possibly newly malloc'd) dest address.
 	 */
 	ndst = (struct sockaddr *)rt_key(rt);
 
 	/*
 	 * make sure it contains the value we want (masked if needed).
 	 */
 	if (netmask) {
 		rt_maskedcopy(dst, ndst, netmask);
 	} else
 		bcopy(dst, ndst, dst->sa_len);
 
 	/*
 	 * We use the ifa reference returned by rt_getifa_fib().
 	 * This moved from below so that rnh->rnh_addaddr() can
 	 * examine the ifa and  ifa->ifa_ifp if it so desires.
 	 */
 	ifa = info->rti_ifa;
 	rt->rt_weight = 1;
 
 	rt_setmetrics(info, rt);
+	rt_old = NULL;
 
 	RIB_WLOCK(rnh);
 	RT_LOCK(rt);
 #ifdef RADIX_MPATH
 	/* do not permit exactly the same dst/mask/gw pair */
 	if (rt_mpath_capable(rnh) &&
 		rt_mpath_conflict(rnh, rt, netmask)) {
 		RIB_WUNLOCK(rnh);
 
 		nhop_free(nh);
 		uma_zfree(V_rtzone, rt);
 		return (EEXIST);
 	}
 #endif
 
 	rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes);
 
-	if (rn != NULL && rt->rt_expire > 0)
-		tmproutes_update(rnh, rt);
+	if (rn != NULL) {
+		/* Most common usecase */
+		if (rt->rt_expire > 0)
+			tmproutes_update(rnh, rt);
 
-	rt_old = NULL;
-	if (rn == NULL && (info->rti_flags & RTF_PINNED) != 0) {
+		/* Finalize notification */
+		rnh->rnh_gen++;
 
+		rc->rc_rt = RNTORT(rn);
+		rc->rc_nh_new = nh;
+
+		rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
+	} else if ((info->rti_flags & RTF_PINNED) != 0) {
+
 		/*
 		 * Force removal and re-try addition
 		 * TODO: better multipath&pinned support
 		 */
 		struct sockaddr *info_dst = info->rti_info[RTAX_DST];
 		info->rti_info[RTAX_DST] = ndst;
 		/* Do not delete existing PINNED(interface) routes */
 		info->rti_flags &= ~RTF_PINNED;
 		rt_old = rt_unlinkrte(rnh, info, &error);
 		info->rti_flags |= RTF_PINNED;
 		info->rti_info[RTAX_DST] = info_dst;
-		if (rt_old != NULL)
+		if (rt_old != NULL) {
 			rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head,
 			    rt->rt_nodes);
+
+			/* Finalize notification */
+			rnh->rnh_gen++;
+
+			if (rn != NULL) {
+				rc->rc_cmd = RTM_CHANGE;
+				rc->rc_rt = RNTORT(rn);
+				rc->rc_nh_old = rt_old->rt_nhop;
+				rc->rc_nh_new = nh;
+			} else {
+				/*
+				 * The old route is gone and the new one
+				 * was not linked: report the removal of
+				 * @rt_old. @rn is NULL here and @nh is
+				 * freed below, so neither is exposed.
+				 */
+				rc->rc_cmd = RTM_DELETE;
+				rc->rc_rt = rt_old;
+				rc->rc_nh_old = rt_old->rt_nhop;
+				rc->rc_nh_new = NULL;
+			}
+			rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
+		}
 	}
 	RIB_WUNLOCK(rnh);
 
 	if (rt_old != NULL) {
 		rt_notifydelete(rt_old, info);
 		rtfree(rt_old);
 	}
 
 	/*
 	 * If it still failed to go into the tree,
 	 * then un-make it (this should be a function)
 	 */
 	if (rn == NULL) {
 		nhop_free(nh);
 		uma_zfree(V_rtzone, rt);
 		return (EEXIST);
 	}
 
 	/*
 	 * If this protocol has something to add to this then
 	 * allow it to do that as well.
 	 */
 	if (ifa->ifa_rtrequest)
 		ifa->ifa_rtrequest(RTM_ADD, rt, rt->rt_nhop, info);
 
-	/*
-	 * actually return a resultant rtentry
-	 */
-	if (ret_nrt)
-		*ret_nrt = rt;
-	rnh->rnh_gen++;		/* Routing table updated */
 	RT_UNLOCK(rt);
 
 	return (0);
 }
 
 
 /*
+ * Deletes the route described by @info from the kernel table selected by
+ * @fibnum and the sa_family of @info->rti_info[RTAX_DST].
+ *
+ * Returns 0 on success, with the operation metadata stored in @rc.
+ * Returns EAFNOSUPPORT if there is no such table.
+ */
+int
+rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
+{
+	struct rib_head *rnh;
+
+	NET_EPOCH_ASSERT();
+
+	if ((rnh = get_rnh(fibnum, info)) == NULL)
+		return (EAFNOSUPPORT);
+
+	bzero(rc, sizeof(*rc));
+	rc->rc_cmd = RTM_DELETE;
+
+	return (del_route(rnh, info, rc));
+}
+
+/*
  * Conditionally unlinks rtentry matching data inside @info from @rnh.
  * Returns unlinked, locked and referenced @rtentry on success,
  * Returns NULL and sets @perror to:
  * ESRCH - if prefix was not found,
  * EADDRINUSE - if trying to delete PINNED route without appropriate flag.
  * ENOENT - if supplied filter function returned 0 (not matched).
  */
 struct rtentry *
 rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, int *perror)
 {
 	struct sockaddr *dst, *netmask;
 	struct rtentry *rt;
 	struct radix_node *rn;
 
 	dst = info->rti_info[RTAX_DST];
 	netmask = info->rti_info[RTAX_NETMASK];
 
 	rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head);
 	if (rt == NULL) {
 		*perror = ESRCH;
 		return (NULL);
 	}
 
 	/* Pinned routes go away only if the request carries RTF_PINNED */
 	if ((info->rti_flags & RTF_PINNED) == 0) {
 		/* Check if target route can be deleted */
 		if (rt->rt_flags & RTF_PINNED) {
 			*perror = EADDRINUSE;
 			return (NULL);
 		}
 	}
 
 	if (info->rti_filter != NULL) {
 		if (info->rti_filter(rt, rt->rt_nhop, info->rti_filterdata)==0){
 			/* Not matched */
 			*perror = ENOENT;
 			return (NULL);
 		}
 
 		/*
 		 * Filter function requested rte deletion.
 		 * Ease the caller work by filling in remaining info
 		 * from that particular entry.
 		 */
 		info->rti_info[RTAX_GATEWAY] = &rt->rt_nhop->gw_sa;
 	}
 
 	/*
 	 * Remove the item from the tree and return it.
 	 * Complain if it is not there and do no more processing.
 	 */
 	*perror = ESRCH;
 	/*
 	 * With RADIX_MPATH the rnh_deladdr() call below is the body of
 	 * the "else" inside the #ifdef.
 	 */
 #ifdef RADIX_MPATH
 	if (rt_mpath_capable(rnh))
 		rn = rt_mpath_unlink(rnh, info, rt, perror);
 	else
 #endif
 	rn = rnh->rnh_deladdr(dst, netmask, &rnh->head);
 	if (rn == NULL)
 		return (NULL);
 
 	if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
 		panic ("rtrequest delete");
 
 	/* Lock the unlinked entry and mark it as no longer up */
 	rt = RNTORT(rn);
 	RT_LOCK(rt);
 	rt->rt_flags &= ~RTF_UP;
 
 	*perror = 0;
 
 	return (rt);
 }
 
 /*
  * Unlinks the route described by @info from @rnh, notifies
  * RIB_NOTIFY_IMMEDIATE subscribers with @rc and releases the entry.
  *
  * Returns 0 on success, EINVAL if the destination does not fit into
  * the temporary masking buffer, or the error set by rt_unlinkrte().
  */
 int
 del_route(struct rib_head *rnh, struct rt_addrinfo *info,
-    struct rtentry **ret_nrt)
+    struct rib_cmd_info *rc)
 {
 	struct sockaddr *dst_orig, *netmask;
 	struct sockaddr_storage mdst;
 	struct rtentry *rt;
 	int error;
 
 	dst_orig = info->rti_info[RTAX_DST];
 	netmask = info->rti_info[RTAX_NETMASK];
 
 	if (netmask) {
 		if (dst_orig->sa_len > sizeof(mdst))
 			return (EINVAL);
 		/*
 		 * rt_unlinkrte() takes the destination from @info, so the
 		 * masked copy has to be passed through it.
 		 */
 		rt_maskedcopy(dst_orig, (struct sockaddr *)&mdst, netmask);
 		info->rti_info[RTAX_DST] = (struct sockaddr *)&mdst;
 	}
 
 	RIB_WLOCK(rnh);
 	rt = rt_unlinkrte(rnh, info, &error);
 	/* Give the caller its original destination pointer back */
 	info->rti_info[RTAX_DST] = dst_orig;
+	if (rt != NULL) {
+		/* Finalize notification */
+		rnh->rnh_gen++;
+		rc->rc_rt = rt;
+		rc->rc_nh_old = rt->rt_nhop;
+		rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
+	}
 	RIB_WUNLOCK(rnh);
 	if (error != 0)
 		return (error);
 
 	rt_notifydelete(rt, info);
 
 	/*
 	 * The unlinked entry is reported to the caller via @rc,
 	 * the entry will be deleted after the end of the current epoch.
 	 */
-	if (ret_nrt)
-		*ret_nrt = rt;
-
 	rtfree(rt);
 
 	return (0);
 }
 
+/*
+ * Changes the route described by @info in the kernel table selected by
+ * @fibnum and the sa_family of @info->rti_info[RTAX_DST].
+ *
+ * Returns 0 on success, with the operation metadata stored in @rc.
+ * Returns EAFNOSUPPORT if there is no such table.
+ */
+int
+rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
+    struct rib_cmd_info *rc)
+{
+	struct rib_head *rnh;
+
+	NET_EPOCH_ASSERT();
+
+	if ((rnh = get_rnh(fibnum, info)) == NULL)
+		return (EAFNOSUPPORT);
+
+	bzero(rc, sizeof(*rc));
+	rc->rc_cmd = RTM_CHANGE;
+
+	return (change_route(rnh, info, rc));
+}
+
 /*
  * Performs a single attempt to change the route matching @info in @rnh.
  *
  * The route is looked up under the read lock, a new nexthop is built from
  * the current one plus @info with the lock dropped, and the route is then
  * looked up again under the write lock before the nexthop is swapped.
  *
  * Returns 0 on success (filling @rc), ESRCH if the route is not found,
  * EAGAIN if the route nexthop changed while the lock was dropped,
  * errno from rt_getifa_fib()/nhop_create_from_nhop() otherwise.
  */
 static int
 change_route_one(struct rib_head *rnh, struct rt_addrinfo *info,
-    struct rtentry **ret_nrt)
+    struct rib_cmd_info *rc)
 {
 	RIB_RLOCK_TRACKER;
 	struct rtentry *rt = NULL;
 	int error = 0;
 	int free_ifa = 0;
 	struct nhop_object *nh, *nh_orig;
 
 	RIB_RLOCK(rnh);
 	rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
 	    info->rti_info[RTAX_NETMASK], &rnh->head);
 
 	if (rt == NULL) {
 		RIB_RUNLOCK(rnh);
 		return (ESRCH);
 	}
 
 #ifdef RADIX_MPATH
 	/*
 	 * If we got multipath routes,
 	 * we require users to specify a matching RTAX_GATEWAY.
 	 */
 	if (rt_mpath_capable(rnh)) {
 		rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]);
 		if (rt == NULL) {
 			RIB_RUNLOCK(rnh);
 			return (ESRCH);
 		}
 	}
 #endif
 	/* Remember the current nexthop to detect concurrent changes later */
 	nh_orig = rt->rt_nhop;
 
 	RIB_RUNLOCK(rnh);
 
 	rt = NULL;
 	nh = NULL;
 
 	/*
 	 * New gateway could require new ifaddr, ifp;
 	 * flags may also be different; ifp may be specified
 	 * by ll sockaddr when protocol address is ambiguous
 	 */
 	if (((nh_orig->nh_flags & NHF_GATEWAY) &&
 	    info->rti_info[RTAX_GATEWAY] != NULL) ||
 	    info->rti_info[RTAX_IFP] != NULL ||
 	    (info->rti_info[RTAX_IFA] != NULL &&
 	     !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) {
 		error = rt_getifa_fib(info, rnh->rib_fibnum);
 		if (info->rti_ifa != NULL)
 			free_ifa = 1;
 
 		if (error != 0) {
 			if (free_ifa) {
 				ifa_free(info->rti_ifa);
 				info->rti_ifa = NULL;
 			}
 
 			return (error);
 		}
 	}
 
 	error = nhop_create_from_nhop(rnh, nh_orig, info, &nh);
 	if (free_ifa) {
 		ifa_free(info->rti_ifa);
 		info->rti_ifa = NULL;
 	}
 	if (error != 0)
 		return (error);
 
 	RIB_WLOCK(rnh);
 
 	/* Lookup rtentry once again and check if nexthop is still the same */
 	rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
 	    info->rti_info[RTAX_NETMASK], &rnh->head);
 
 	if (rt == NULL) {
 		RIB_WUNLOCK(rnh);
 		nhop_free(nh);
 		return (ESRCH);
 	}
 
 	/* Someone changed the route in between: let the caller retry */
 	if (rt->rt_nhop != nh_orig) {
 		RIB_WUNLOCK(rnh);
 		nhop_free(nh);
 		return (EAGAIN);
 	}
 
 	/* Proceed with the update */
 	RT_LOCK(rt);
 
 	/* Provide notification to the protocols.*/
 	if ((nh_orig->nh_ifa != nh->nh_ifa) && nh_orig->nh_ifa->ifa_rtrequest)
 		nh_orig->nh_ifa->ifa_rtrequest(RTM_DELETE, rt, nh_orig, info);
 
 	rt->rt_nhop = nh;
 	rt_setmetrics(info, rt);
 
 	/*
 	 * NOTE(review): this repeats the RTM_DELETE callback issued above
 	 * with the same arguments; confirm whether a notification for the
 	 * new nexthop's ifa was intended here instead.
 	 */
 	if ((nh_orig->nh_ifa != nh->nh_ifa) && nh_orig->nh_ifa->ifa_rtrequest)
 		nh_orig->nh_ifa->ifa_rtrequest(RTM_DELETE, rt, nh_orig, info);
 
-	if (ret_nrt != NULL)
-		*ret_nrt = rt;
+	/* Finalize notification */
+	rc->rc_rt = rt;
+	rc->rc_nh_old = nh_orig;
+	rc->rc_nh_new = rt->rt_nhop;
 
 	RT_UNLOCK(rt);
 
 	/* Update generation id to reflect rtable change */
 	rnh->rnh_gen++;
 
+	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
+
 	RIB_WUNLOCK(rnh);
 
 	/* The route no longer points to the old nexthop */
 	nhop_free(nh_orig);
 
 	return (0);
 }
 
 /*
  * Changes the route matching @info in @rnh, retrying the attempt up to
  * RIB_MAX_RETRIES times while change_route_one() returns EAGAIN.
  *
  * Returns EINVAL if RTF_GATEWAY is requested without a gateway,
  * otherwise the result of the last change_route_one() call.
  */
 int
 change_route(struct rib_head *rnh, struct rt_addrinfo *info,
-    struct rtentry **ret_nrt)
+    struct rib_cmd_info *rc)
 {
 	int error;
 
 	/* Check if updated gateway exists */
 	if ((info->rti_flags & RTF_GATEWAY) &&
 	    (info->rti_info[RTAX_GATEWAY] == NULL))
 		return (EINVAL);
 
 	/*
 	 * Route change is done in multiple steps, dropping and
 	 * reacquiring the lock in between. When multiple processes
 	 * change the same route, the route can be changed between
 	 * the steps. Address it by retrying the operation
 	 * multiple times before failing.
 	 */
 	for (int i = 0; i < RIB_MAX_RETRIES; i++) {
-		error = change_route_one(rnh, info, ret_nrt);
+		error = change_route_one(rnh, info, rc);
 		if (error != EAGAIN)
 			break;
 	}
 
 	return (error);
 }
 
 /*
  * Gives the protocol a chance to keep things in sync: invokes the
  * ifa_rtrequest callback (if any) of the ifa attached to the nexthop
  * of @rt with RTM_DELETE.
  */
 static void
 rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info)
 {
 	struct ifaddr *ifa = rt->rt_nhop->nh_ifa;
 
 	if (ifa == NULL || ifa->ifa_rtrequest == NULL)
 		return;
 
 	ifa->ifa_rtrequest(RTM_DELETE, rt, rt->rt_nhop, info);
 }
 
 /*
  * State shared between rib_walk_del() and its per-node callback
  * rt_checkdelroute().
  */
 struct rt_delinfo
 {
 	struct rt_addrinfo info;	/* request passed to rt_unlinkrte() */
 	struct rib_head *rnh;		/* table being walked */
 	struct rtentry *head;		/* unlinked entries, chained via rt_chain */
 };
 
 /*
  * Tree walker callback: tries to unlink @rn from the radix tree using
  * the request stored in @arg (struct rt_delinfo). Unlinked entries are
  * pushed onto the list in @arg. Always returns 0.
  */
 static int
 rt_checkdelroute(struct radix_node *rn, void *arg)
 {
 	struct rt_delinfo *di = (struct rt_delinfo *)arg;
 	struct rt_addrinfo *info = &di->info;
 	struct rtentry *rt = (struct rtentry *)rn;
 	int error = 0;
 
 	/* Describe this particular entry in the request */
 	info->rti_info[RTAX_DST] = rt_key(rt);
 	info->rti_info[RTAX_NETMASK] = rt_mask(rt);
 	info->rti_info[RTAX_GATEWAY] = &rt->rt_nhop->gw_sa;
 
 	rt = rt_unlinkrte(di->rnh, info, &error);
 	if (rt != NULL) {
 		/* Entry was unlinked. Add to the list */
 		rt->rt_chain = di->head;
 		di->head = rt;
 	}
 	/* Otherwise: either not allowed or not matched. Skip entry */
 
 	return (0);
 }
 
 /*
  * Walks the routing table given by @fibnum and @family and removes
  *  every element selected by @filter_f.
  * @fibnum: rtable id
  * @family: AF_ address family
  * @filter_f: function returning non-zero value for items to delete
  * @arg: data to pass to the @filter_f function
  * @report: true if rtsock notification is needed.
  */
 void
 rib_walk_del(u_int fibnum, int family, rt_filter_f_t *filter_f, void *arg, bool report)
 {
 	struct rt_delinfo di;
 	struct rib_head *rnh;
 	struct rtentry *rt;
 
 	if ((rnh = rt_tables_get_rnh(fibnum, family)) == NULL)
 		return;
 
 	bzero(&di, sizeof(di));
 	di.rnh = rnh;
 	di.info.rti_filter = filter_f;
 	di.info.rti_filterdata = arg;
 
 	/* Unlink the matching entries while holding the table lock */
 	RIB_WLOCK(rnh);
 	rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di);
 	RIB_WUNLOCK(rnh);
 
 	/* We might have something to reclaim. */
 	while ((rt = di.head) != NULL) {
 		di.head = rt->rt_chain;
 		rt->rt_chain = NULL;
 
 		/* TODO std rt -> rt_addrinfo export */
 		di.info.rti_info[RTAX_DST] = rt_key(rt);
 		di.info.rti_info[RTAX_NETMASK] = rt_mask(rt);
 
 		rt_notifydelete(rt, &di.info);
 
 		if (report)
 			rt_routemsg(RTM_DELETE, rt, rt->rt_nhop->nh_ifp, 0,
 			    fibnum);
 		rtfree(rt);
 	}
 }
 
+/*
+ * Calls every subscriber of @rnh registered with subscription type
+ * @type, passing the operation description @rc.
+ */
+static void
+rib_notify(struct rib_head *rnh, enum rib_subscription_type type,
+    struct rib_cmd_info *rc)
+{
+	struct rib_subscription *rs;
+
+	CK_STAILQ_FOREACH(rs, &rnh->rnh_subscribers, next) {
+		if (rs->type != type)
+			continue;
+		rs->func(rnh, rc, rs->arg);
+	}
+}
+
+/*
+ * Subscribes callback @f (called with @arg) to the changes of the routing
+ * table given by @fibnum and @family. @type selects the kind of
+ * notifications to receive, @waitok tells if the allocation may sleep.
+ *
+ * Returns the subscription pointer to be passed to rib_unsibscribe(),
+ * or NULL if the table does not exist or the allocation fails.
+ */
+struct rib_subscription *
+rib_subscribe(uint32_t fibnum, int family, rib_subscription_cb_t *f, void *arg,
+    enum rib_subscription_type type, int waitok)
+{
+	struct rib_head *rnh;
+	struct rib_subscription *rs;
+	/* malloc(9) needs exactly one of M_WAITOK and M_NOWAIT */
+	int flags = M_ZERO | (waitok ? M_WAITOK : M_NOWAIT);
+
+	NET_EPOCH_ASSERT();
+	KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__));
+	rnh = rt_tables_get_rnh(fibnum, family);
+	/* Same check as in rib_unsibscribe(): the table may not exist */
+	if (rnh == NULL)
+		return (NULL);
+
+	rs = malloc(sizeof(struct rib_subscription), M_RTABLE, flags);
+	if (rs == NULL)
+		return (NULL);
+
+	rs->func = f;
+	rs->arg = arg;
+	rs->type = type;
+
+	RIB_WLOCK(rnh);
+	CK_STAILQ_INSERT_TAIL(&rnh->rnh_subscribers, rs, next);
+	RIB_WUNLOCK(rnh);
+
+	return (rs);
+}
+
+/*
+ * Removes subscription @rs from the routing table given by @fibnum and
+ * @family and frees it.
+ *
+ * Returns 0 on success, ENOENT if the table does not exist.
+ */
+int
+rib_unsibscribe(uint32_t fibnum, int family, struct rib_subscription *rs)
+{
+	struct rib_head *rnh;
+
+	NET_EPOCH_ASSERT();
+	KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__));
+
+	if ((rnh = rt_tables_get_rnh(fibnum, family)) == NULL)
+		return (ENOENT);
+
+	/* Unlink under the table write lock, then release the memory */
+	RIB_WLOCK(rnh);
+	CK_STAILQ_REMOVE(&rnh->rnh_subscribers, rs, rib_subscription, next);
+	RIB_WUNLOCK(rnh);
+	free(rs, M_RTABLE);
+
+	return (0);
+}
 
Index: head/sys/net/route/route_ctl.h
===================================================================
--- head/sys/net/route/route_ctl.h	(nonexistent)
+++ head/sys/net/route/route_ctl.h	(revision 361704)
@@ -0,0 +1,90 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This header file contains public functions and structures used for
+ * routing table manipulations.
+ */
+
+#ifndef	_NET_ROUTE_ROUTE_CTL_H_
+#define	_NET_ROUTE_ROUTE_CTL_H_
+
+/*
+ * Describes one route add/delete/change command: the command code, the
+ * affected entry and the nexthops involved. Taken by the rib_*_route()
+ * calls and handed to subscription callbacks.
+ */
+struct rib_cmd_info {
+	uint8_t			rc_cmd;		/* RTM_{ADD,DELETE,CHANGE} */
+	uint8_t			spare[3];
+	uint32_t		rc_nh_weight;	/* new nhop weight */
+	struct rtentry		*rc_rt;		/* Target entry */
+	struct nhop_object	*rc_nh_old;	/* Target nhop OR mpath */
+	struct nhop_object	*rc_nh_new;	/* Target nhop OR mpath */
+};
+
+
+/*
+ * Route add/delete/change requests for the tables of @fibnum, described
+ * by @info.
+ * NOTE(review): @rc presumably receives the description of the performed
+ * change; confirm against the definitions.
+ */
+int rib_add_route(uint32_t fibnum, struct rt_addrinfo *info,
+  struct rib_cmd_info *rc);
+int rib_del_route(uint32_t fibnum, struct rt_addrinfo *info,
+  struct rib_cmd_info *rc);
+int rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
+  struct rib_cmd_info *rc);
+
+int rib_add_redirect(u_int fibnum, struct sockaddr *dst,
+  struct sockaddr *gateway, struct sockaddr *author, struct ifnet *ifp,
+  int flags, int expire_sec);
+
+/*
+ * Table iteration. rib_walk() calls @wa_f with @arg for every entry of the
+ * (@af, @fibnum) table while holding the table read lock, and does nothing
+ * when that table does not exist.
+ * NOTE(review): rib_walk_del() presumably deletes the entries selected by
+ * @filter_f and emits routing messages when @report is set; confirm against
+ * its definition.
+ */
+typedef int rt_walktree_f_t(struct rtentry *, void *);
+void rib_walk(int af, u_int fibnum, rt_walktree_f_t *wa_f, void *arg);
+void rib_walk_del(u_int fibnum, int family, rt_filter_f_t *filter_f,
+  void *arg, bool report);
+
+typedef void rt_setwarg_t(struct rib_head *, uint32_t, int, void *);
+void rt_foreach_fib_walk(int af, rt_setwarg_t *, rt_walktree_f_t *, void *);
+void rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg);
+
+/*
+ * Notification class of a subscription: a subscriber's callback is invoked
+ * only for notifications dispatched with the same type it registered with.
+ * NOTE(review): when each class is dispatched relative to the table change
+ * is not visible in this header; confirm against the rib_notify() callers.
+ */
+enum rib_subscription_type {
+	RIB_NOTIFY_IMMEDIATE,
+	RIB_NOTIFY_DELAYED
+};
+
+/*
+ * Subscriber callback: receives the table the notification is dispatched
+ * for, the change description and the opaque argument supplied when the
+ * subscription was created.
+ */
+typedef void rib_subscription_cb_t(struct rib_head *rnh, struct rib_cmd_info *rc,
+    void *arg);
+
+/*
+ * Single subscription record, linked into a table's subscribers list.
+ * Allocated by rib_subscribe() and released by rib_unsibscribe().
+ *  next: list linkage
+ *  func: callback to invoke
+ *  arg:  opaque argument passed to func
+ *  type: notification class the callback is invoked for
+ * CK_STAILQ_ENTRY is made available via <sys/ck.h>; this header does not
+ * include it itself, so includers have to pull it in first.
+ */
+struct rib_subscription {
+	CK_STAILQ_ENTRY(rib_subscription)	next;
+	rib_subscription_cb_t			*func;
+	void					*arg;
+	enum rib_subscription_type		type;
+};
+
+/*
+ * rib_subscribe() registers @f/@arg on the (@fibnum, @family) table and
+ * returns the subscription handle, or NULL on failure.
+ * rib_unsibscribe() (sic, the misspelling is the exported name) unlinks and
+ * frees @rs; it returns ENOENT when the table does not exist, 0 otherwise.
+ */
+struct rib_subscription *rib_subscribe(uint32_t fibnum, int family,
+    rib_subscription_cb_t *f, void *arg, enum rib_subscription_type type,
+    int waitok);
+int rib_unsibscribe(uint32_t fibnum, int family, struct rib_subscription *rs);
+
+
+#endif
+

Property changes on: head/sys/net/route/route_ctl.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: head/sys/net/route/route_ddb.c
===================================================================
--- head/sys/net/route/route_ddb.c	(revision 361703)
+++ head/sys/net/route/route_ddb.c	(revision 361704)
@@ -1,270 +1,271 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright 2019 Conrad Meyer <cem@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/ctype.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 
 #include <ddb/ddb.h>
 #include <ddb/db_lex.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 
 /*
  * Unfortunately, RTF_ values are expressed as raw masks rather than powers of
  * 2, so we cannot use them as nice C99 initializer indices below.
  */
 static const char * const rtf_flag_strings[] = {
 	"UP",
 	"GATEWAY",
 	"HOST",
 	"REJECT",
 	"DYNAMIC",
 	"MODIFIED",
 	"DONE",
 	"UNUSED_0x80",
 	"UNUSED_0x100",
 	"XRESOLVE",
 	"LLDATA",
 	"STATIC",
 	"BLACKHOLE",
 	"UNUSED_0x2000",
 	"PROTO2",
 	"PROTO1",
 	"UNUSED_0x10000",
 	"UNUSED_0x20000",
 	"PROTO3",
 	"FIXEDMTU",
 	"PINNED",
 	"LOCAL",
 	"BROADCAST",
 	"MULTICAST",
 	/* Big gap. */
 	[28] = "STICKY",
 	[30] = "RNH_LOCKED",
 	[31] = "GWFLAG_COMPAT",
 };
 
 static const char * __pure
 rt_flag_name(unsigned idx)
 {
 	if (idx >= nitems(rtf_flag_strings))
 		return ("INVALID_FLAG");
 	if (rtf_flag_strings[idx] == NULL)
 		return ("UNKNOWN");
 	return (rtf_flag_strings[idx]);
 }
 
 static void
 rt_dumpaddr_ddb(const char *name, const struct sockaddr *sa)
 {
 	char buf[INET6_ADDRSTRLEN], *res;
 
 	res = NULL;
 	if (sa == NULL)
 		res = "NULL";
 	else if (sa->sa_family == AF_INET) {
 		res = inet_ntop(AF_INET,
 		    &((const struct sockaddr_in *)sa)->sin_addr,
 		    buf, sizeof(buf));
 	} else if (sa->sa_family == AF_INET6) {
 		res = inet_ntop(AF_INET6,
 		    &((const struct sockaddr_in6 *)sa)->sin6_addr,
 		    buf, sizeof(buf));
 	} else if (sa->sa_family == AF_LINK) {
 		res = "on link";
 	}
 
 	if (res != NULL) {
 		db_printf("%s <%s> ", name, res);
 		return;
 	}
 
 	db_printf("%s <af:%d> ", name, sa->sa_family);
 }
 
 static int
 rt_dumpentry_ddb(struct radix_node *rn, void *arg __unused)
 {
 	struct sockaddr_storage ss;
 	struct rtentry *rt;
 	struct nhop_object *nh;
 	int flags, idx;
 
 	/* If RNTORT is important, put it in a header. */
 	rt = (void *)rn;
 	nh = (struct nhop_object *)rt->rt_nhop;
 
 	rt_dumpaddr_ddb("dst", rt_key(rt));
 	rt_dumpaddr_ddb("gateway", &rt->rt_nhop->gw_sa);
 	rt_dumpaddr_ddb("netmask", rtsock_fix_netmask(rt_key(rt), rt_mask(rt),
 	    &ss));
 	if ((nh->nh_ifp->if_flags & IFF_DYING) == 0) {
 		rt_dumpaddr_ddb("ifp", nh->nh_ifp->if_addr->ifa_addr);
 		rt_dumpaddr_ddb("ifa", nh->nh_ifa->ifa_addr);
 	}
 
 	db_printf("flags ");
 	flags = rt->rt_flags;
 	if (flags == 0)
 		db_printf("none");
 
 	while ((idx = ffs(flags)) > 0) {
 		idx--;
 
 		if (flags != rt->rt_flags)
 			db_printf(",");
 		db_printf("%s", rt_flag_name(idx));
 
 		flags &= ~(1ul << idx);
 	}
 
 	db_printf("\n");
 	return (0);
 }
 
 DB_SHOW_COMMAND(routetable, db_show_routetable_cmd)
 {
 	struct rib_head *rnh;
 	int error, i, lim;
 
 	if (have_addr)
 		i = lim = addr;
 	else {
 		i = 1;
 		lim = AF_MAX;
 	}
 
 	for (; i <= lim; i++) {
 		rnh = rt_tables_get_rnh(0, i);
 		if (rnh == NULL) {
 			if (have_addr) {
 				db_printf("%s: AF %d not supported?\n",
 				    __func__, i);
 				break;
 			}
 			continue;
 		}
 
 		if (!have_addr && i > 1)
 			db_printf("\n");
 
 		db_printf("Route table for AF %d%s%s%s:\n", i,
 		    (i == AF_INET || i == AF_INET6) ? " (" : "",
 		    (i == AF_INET) ? "INET" : (i == AF_INET6) ? "INET6" : "",
 		    (i == AF_INET || i == AF_INET6) ? ")" : "");
 
 		error = rnh->rnh_walktree(&rnh->head, rt_dumpentry_ddb, NULL);
 		if (error != 0)
 			db_printf("%s: walktree(%d): %d\n", __func__, i,
 			    error);
 	}
 }
 
 _DB_FUNC(_show, route, db_show_route_cmd, db_show_table, CS_OWN, NULL)
 {
 	char abuf[INET6_ADDRSTRLEN], *buf, *end;
 	struct rib_head *rh;
 	struct radix_node *rn;
 	void *dst_addrp;
 	struct rtentry *rt;
 	union {
 		struct sockaddr_in dest_sin;
 		struct sockaddr_in6 dest_sin6;
 	} u;
 	int af;
 
 	buf = db_get_line();
 
 	/* Remove whitespaces from both ends */
 	end = buf + strlen(buf) - 1;
 	for (; (end >= buf) && (*end=='\n' || isspace(*end)); end--)
 		*end = '\0';
 	while (isspace(*buf))
 		buf++;
 
 	/* Determine AF */
 	if (strchr(buf, ':') != NULL) {
 		af = AF_INET6;
 		u.dest_sin6.sin6_family = af;
 		u.dest_sin6.sin6_len = sizeof(struct sockaddr_in6);
 		dst_addrp = &u.dest_sin6.sin6_addr;
 	} else {
 		af = AF_INET;
 		u.dest_sin.sin_family = af;
 		u.dest_sin.sin_len = sizeof(struct sockaddr_in);
 		dst_addrp = &u.dest_sin.sin_addr;
 	}
 
 	if (inet_pton(af, buf, dst_addrp) != 1)
 		goto usage;
 
 	if (inet_ntop(af, dst_addrp, abuf, sizeof(abuf)) != NULL)
 		db_printf("Looking up route to destination '%s'\n", abuf);
 
 	rt = NULL;
 	CURVNET_SET(vnet0);
 
 	rh = rt_tables_get_rnh(RT_DEFAULT_FIB, af);
 
 	rn = rh->rnh_matchaddr(&u, &rh->head);
 	if (rn && ((rn->rn_flags & RNF_ROOT) == 0))
 		rt = (struct rtentry *)rn;
 
 	CURVNET_RESTORE();
 
 	if (rt == NULL) {
 		db_printf("Could not get route for that server.\n");
 		return;
 	}
 
 	rt_dumpentry_ddb((void *)rt, NULL);
 
 	return;
 usage:
 	db_printf("Usage: 'show route <address>'\n"
 	    "  Currently accepts only IPv4 and IPv6 addresses\n");
 	db_skip_to_eol();
 }
Index: head/sys/net/route/route_helpers.c
===================================================================
--- head/sys/net/route/route_helpers.c	(revision 361703)
+++ head/sys/net/route/route_helpers.c	(revision 361704)
@@ -1,134 +1,135 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2020 Alexander V. Chernikov
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_route.h"
 
 #include <sys/param.h>
 #include <sys/jail.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
 #include <sys/proc.h>
 #include <sys/domain.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
+#include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 #include <net/route/nhop_utils.h>
 #include <net/route/nhop.h>
 #include <net/route/nhop_var.h>
 #include <net/route/shared.h>
 #ifdef INET
 #include <netinet/in_fib.h>
 #endif
 #ifdef INET6
 #include <netinet6/in6_fib.h>
 #endif
 #include <net/vnet.h>
 
 /*
  * RIB helper functions.
  */
 
 /*
  * Calls @wa_f with @arg for each entry in the table specified by
  * @af and @fibnum.
  *
  * Table is traversed under read lock.
  */
 void
 rib_walk(int af, u_int fibnum, rt_walktree_f_t *wa_f, void *arg)
 {
 	RIB_RLOCK_TRACKER;
 	struct rib_head *rnh;
 
 	if ((rnh = rt_tables_get_rnh(fibnum, af)) == NULL)
 		return;
 
 	RIB_RLOCK(rnh);
 	rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg);
 	RIB_RUNLOCK(rnh);
 }
 
 /*
  * Wrapper for the control plane functions for performing af-agnostic
  *  lookups.
  * @fibnum: fib to perform the lookup.
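- * @dst: sockaddr with family and addr filled in. IPv6 addresses needs to be in
- *  deembedded from.
+ * @dst: sockaddr with family and addr filled in. IPv6 addresses need to be in
+ *  deembedded form.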
  * @flags: fib(9) flags.
  * @flowid: flow id for path selection in multipath use case.
  *
  * Returns nhop_object or NULL.
  *
  * Requires NET_EPOCH.
  *
  */
 struct nhop_object *
 rib_lookup(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags,
     uint32_t flowid)
 {
 	struct nhop_object *nh;
 
 	nh = NULL;
 
 	switch (dst->sa_family) {
 #ifdef INET
 	case AF_INET:
 	{
 		const struct sockaddr_in *a = (const struct sockaddr_in *)dst;
 		nh = fib4_lookup(fibnum, a->sin_addr, 0, flags, flowid);
 		break;
 	}
 #endif
 #ifdef INET6
 	case AF_INET6:
 	{
 		const struct sockaddr_in6 *a = (const struct sockaddr_in6*)dst;
 		nh = fib6_lookup(fibnum, &a->sin6_addr, a->sin6_scope_id,
 		    flags, flowid);
 		break;
 	}
 #endif
 	}
 
 	return (nh);
 }
 
Index: head/sys/net/route/route_temporal.c
===================================================================
--- head/sys/net/route/route_temporal.c	(revision 361703)
+++ head/sys/net/route/route_temporal.c	(revision 361704)
@@ -1,161 +1,163 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2020 Alexander V. Chernikov
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * This file contains code responsible for expiring temporal routes
  * (typically, redirect-originated) from the route tables.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/socket.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
+#include <sys/ck.h>
 #include <sys/rmlock.h>
 #include <sys/callout.h>
 
 #include <net/if.h>
 #include <net/route.h>
+#include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 #include <net/vnet.h>
 
 /*
  * Callback returning 1 for the expired routes.
  * Updates time of the next nearest route expiration as a side effect.
  */
 static int
 expire_route(const struct rtentry *rt, const struct nhop_object *nh, void *arg)
 {
 	time_t *next_callout;
 
 	if (rt->rt_expire == 0)
 		return (0);
 
 	if (rt->rt_expire <= time_uptime)
 		return (1);
 
 	next_callout = (time_t *)arg;
 
 	/*
 	 * Update next_callout to determine the next ts to
 	 * run the callback at.
 	 */
 	if (*next_callout == 0 || *next_callout > rt->rt_expire)
 		*next_callout = rt->rt_expire;
 
 	return (0);
 }
 
 /*
  * Per-rnh callout function traversing the tree and deleting
  * expired routes. Calculates next callout run by looking at
  * the rt_expire time for the remaining temporal routes.
  */
 static void
 expire_callout(void *arg)
 {
 	struct rib_head *rnh;
 	time_t next_expire;
 	int seconds;
 
 	rnh = (struct rib_head *)arg;
 
 	CURVNET_SET(rnh->rib_vnet);
 	next_expire = 0;
 
 	rib_walk_del(rnh->rib_fibnum, rnh->rib_family, expire_route,
 	    (void *)&next_expire, 1);
 
 	RIB_WLOCK(rnh);
 	if (next_expire > 0) {
 		seconds = (next_expire - time_uptime);
 		if (seconds < 0)
 			seconds = 0;
 		callout_reset_sbt(&rnh->expire_callout, SBT_1S * seconds,
 		    SBT_1MS * 500, expire_callout, rnh, 0);
 		rnh->next_expire = next_expire;
 	} else {
 		/*
 		 * Before resetting next_expire, check that tmproutes_update()
 		 * has not kicked in and scheduled another invocation.
 		 */
 		if (callout_pending(&rnh->expire_callout) == 0)
 			rnh->next_expire = 0;
 	}
 	RIB_WUNLOCK(rnh);
 	CURVNET_RESTORE();
 }
 
 /*
- * Function responsible for updating the time of the next calllout
+ * Function responsible for updating the time of the next callout
  * w.r.t. new temporal routes insertion.
  *
  * Called by the routing code upon adding new temporal route
  * to the tree. RIB_WLOCK must be held.
  */
 void
 tmproutes_update(struct rib_head *rnh, struct rtentry *rt)
 {
 	int seconds;
 
 	RIB_WLOCK_ASSERT(rnh);
 
 	if (rnh->next_expire == 0 || rnh->next_expire > rt->rt_expire) {
 		/*
 		 * Callback is not scheduled, is executing,
 		 * or is scheduled for a later time than we need.
 		 *
 		 * Schedule the one for the current @rt expiration time.
 		 */
 		seconds = (rt->rt_expire - time_uptime);
 		if (seconds < 0)
 			seconds = 0;
 		callout_reset_sbt(&rnh->expire_callout, SBT_1S * seconds,
 		    SBT_1MS * 500, expire_callout, rnh, 0);
 
 		rnh->next_expire = rt->rt_expire;
 	}
 }
 
 void
 tmproutes_init(struct rib_head *rh)
 {
 
 	callout_init(&rh->expire_callout, 1);
 }
 
 
 void
 tmproutes_destroy(struct rib_head *rh)
 {
 
 	callout_drain(&rh->expire_callout);
 }
 
Index: head/sys/net/route/route_var.h
===================================================================
--- head/sys/net/route/route_var.h	(revision 361703)
+++ head/sys/net/route/route_var.h	(revision 361704)
@@ -1,243 +1,246 @@
 /*-
  * Copyright (c) 2015-2016
  * 	Alexander V. Chernikov <melifaro@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _NET_ROUTE_VAR_H_
 #define _NET_ROUTE_VAR_H_
 
 #ifndef RNF_NORMAL
 #include <net/radix.h>
 #endif
+#include <sys/ck.h>
 #include <sys/epoch.h>
 #include <netinet/in.h>		/* struct sockaddr_in */
 #include <sys/counter.h>
 
 struct nh_control;
 typedef int rnh_preadd_entry_f_t(u_int fibnum, const struct sockaddr *addr,
 	const struct sockaddr *mask, struct nhop_object *nh);
 
 struct rib_head {
 	struct radix_head	head;
 	rn_matchaddr_f_t	*rnh_matchaddr;	/* longest match for sockaddr */
 	rn_addaddr_f_t		*rnh_addaddr;	/* add based on sockaddr*/
 	rn_deladdr_f_t		*rnh_deladdr;	/* remove based on sockaddr */
 	rn_lookup_f_t		*rnh_lookup;	/* exact match for sockaddr */
 	rn_walktree_t		*rnh_walktree;	/* traverse tree */
 	rn_walktree_from_t	*rnh_walktree_from; /* traverse tree below a */
 	rnh_preadd_entry_f_t	*rnh_preadd;	/* hook to alter record prior to insertion */
 	rt_gen_t		rnh_gen;	/* generation counter */
 	int			rnh_multipath;	/* multipath capable ? */
 	struct radix_node	rnh_nodes[3];	/* empty tree for common case */
 	struct rmlock		rib_lock;	/* config/data path lock */
 	struct radix_mask_head	rmhead;		/* masks radix head */
 	struct vnet		*rib_vnet;	/* vnet pointer */
 	int			rib_family;	/* AF of the rtable */
 	u_int			rib_fibnum;	/* fib number */
 	struct callout		expire_callout;	/* Callout for expiring dynamic routes */
 	time_t			next_expire;	/* Next expire run ts */
 	struct nh_control	*nh_control;	/* nexthop subsystem data */
+	CK_STAILQ_HEAD(, rib_subscription)	rnh_subscribers;/* notification subscribers */
 };
 
 #define	RIB_RLOCK_TRACKER	struct rm_priotracker _rib_tracker
 #define	RIB_LOCK_INIT(rh)	rm_init(&(rh)->rib_lock, "rib head lock")
 #define	RIB_LOCK_DESTROY(rh)	rm_destroy(&(rh)->rib_lock)
 #define	RIB_RLOCK(rh)		rm_rlock(&(rh)->rib_lock, &_rib_tracker)
 #define	RIB_RUNLOCK(rh)		rm_runlock(&(rh)->rib_lock, &_rib_tracker)
 #define	RIB_WLOCK(rh)		rm_wlock(&(rh)->rib_lock)
 #define	RIB_WUNLOCK(rh)		rm_wunlock(&(rh)->rib_lock)
 #define	RIB_LOCK_ASSERT(rh)	rm_assert(&(rh)->rib_lock, RA_LOCKED)
 #define	RIB_WLOCK_ASSERT(rh)	rm_assert(&(rh)->rib_lock, RA_WLOCKED)
 
 /* Constants */
 #define	RIB_MAX_RETRIES	3
 
 /* Macro for verifying fields in af-specific 'struct route' structures */
 #define CHK_STRUCT_FIELD_GENERIC(_s1, _f1, _s2, _f2)			\
 _Static_assert(sizeof(((_s1 *)0)->_f1) == sizeof(((_s2 *)0)->_f2),	\
 		"Fields " #_f1 " and " #_f2 " size differs");		\
 _Static_assert(__offsetof(_s1, _f1) == __offsetof(_s2, _f2),		\
 		"Fields " #_f1 " and " #_f2 " offset differs");
 
 #define _CHK_ROUTE_FIELD(_route_new, _field) \
 	CHK_STRUCT_FIELD_GENERIC(struct route, _field, _route_new, _field)
 
 #define CHK_STRUCT_ROUTE_FIELDS(_route_new)	\
 	_CHK_ROUTE_FIELD(_route_new, ro_nh)	\
 	_CHK_ROUTE_FIELD(_route_new, ro_lle)	\
 	_CHK_ROUTE_FIELD(_route_new, ro_prepend)\
 	_CHK_ROUTE_FIELD(_route_new, ro_plen)	\
 	_CHK_ROUTE_FIELD(_route_new, ro_flags)	\
 	_CHK_ROUTE_FIELD(_route_new, ro_mtu)	\
 	_CHK_ROUTE_FIELD(_route_new, spare)
 
 #define CHK_STRUCT_ROUTE_COMPAT(_ro_new, _dst_new)				\
 CHK_STRUCT_ROUTE_FIELDS(_ro_new);						\
 _Static_assert(__offsetof(struct route, ro_dst) == __offsetof(_ro_new, _dst_new),\
 		"ro_dst and " #_dst_new " are at different offset")
 
 struct rib_head *rt_tables_get_rnh(int fib, int family);
 void rt_mpath_init_rnh(struct rib_head *rnh);
 int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum);
 void rt_setmetrics(const struct rt_addrinfo *info, struct rtentry *rt);
 #ifdef RADIX_MPATH
 struct radix_node *rt_mpath_unlink(struct rib_head *rnh,
     struct rt_addrinfo *info, struct rtentry *rto, int *perror);
 #endif
+struct rib_cmd_info;
 int add_route(struct rib_head *rnh, struct rt_addrinfo *info,
-    struct rtentry **ret_nrt);
+    struct rib_cmd_info *rc);
 int del_route(struct rib_head *rnh, struct rt_addrinfo *info,
-    struct rtentry **ret_nrt);
+    struct rib_cmd_info *rc);
 int change_route(struct rib_head *, struct rt_addrinfo *,
-    struct rtentry **);
+    struct rib_cmd_info *rc);
 
 VNET_PCPUSTAT_DECLARE(struct rtstat, rtstat);
 #define	RTSTAT_ADD(name, val)	\
 	VNET_PCPUSTAT_ADD(struct rtstat, rtstat, name, (val))
 #define	RTSTAT_INC(name)	RTSTAT_ADD(name, 1)
 
 
 /*
  * Convert a 'struct radix_node *' to a 'struct rtentry *'.
  * The operation can be done safely (in this code) because a
  * 'struct rtentry' starts with two 'struct radix_node''s, the first
  * one representing leaf nodes in the routing tree, which is
  * what the code in radix.c passes us as a 'struct radix_node'.
  *
  * But because there are a lot of assumptions in this conversion,
  * do not cast explicitly, but always use the macro below.
  */
 #define RNTORT(p)	((struct rtentry *)(p))
 
 struct rtentry {
 	struct	radix_node rt_nodes[2];	/* tree glue, and other values */
 	/*
 	 * XXX struct rtentry must begin with a struct radix_node (or two!)
 	 * because the code does some casts of a 'struct radix_node *'
 	 * to a 'struct rtentry *'
 	 */
 #define	rt_key(r)	(*((struct sockaddr **)(&(r)->rt_nodes->rn_key)))
 #define	rt_mask(r)	(*((struct sockaddr **)(&(r)->rt_nodes->rn_mask)))
 #define	rt_key_const(r)		(*((const struct sockaddr * const *)(&(r)->rt_nodes->rn_key)))
 #define	rt_mask_const(r)	(*((const struct sockaddr * const *)(&(r)->rt_nodes->rn_mask)))
 
 	/*
-	 * 2 radix_node structurs above consists of 2x6 pointers, leaving
+	 * The 2 radix_node structures above consist of 2x6 pointers, leaving
 	 * 4 pointers (32 bytes) of the second cache line on amd64.
 	 *
 	 */
 	struct nhop_object	*rt_nhop;	/* nexthop data */
 	union {
 		/*
 		 * Destination address storage.
 		 * sizeof(struct sockaddr_in6) == 28, however
 		 * the dataplane-relevant part (e.g. address) lies
 		 * at offset 8..24, making the address not crossing
 		 * cacheline boundary.
 		 */
 		struct sockaddr_in	rt_dst4;
 		struct sockaddr_in6	rt_dst6;
 		struct sockaddr		rt_dst;
 		char			rt_dstb[28];
 	};
 
 	int		rt_flags;	/* up/down?, host/net */
 	u_long		rt_weight;	/* absolute weight */ 
 	u_long		rt_expire;	/* lifetime for route, e.g. redirect */
 #define	rt_endzero	rt_mtx
 	struct mtx	rt_mtx;		/* mutex for routing entry */
 	struct rtentry	*rt_chain;	/* pointer to next rtentry to delete */
 	struct epoch_context	rt_epoch_ctx;	/* net epoch tracker */
 };
 
 #define	RT_LOCK_INIT(_rt) \
 	mtx_init(&(_rt)->rt_mtx, "rtentry", NULL, MTX_DEF | MTX_DUPOK | MTX_NEW)
 #define	RT_LOCK(_rt)		mtx_lock(&(_rt)->rt_mtx)
 #define	RT_UNLOCK(_rt)		mtx_unlock(&(_rt)->rt_mtx)
 #define	RT_LOCK_DESTROY(_rt)	mtx_destroy(&(_rt)->rt_mtx)
 #define	RT_LOCK_ASSERT(_rt)	mtx_assert(&(_rt)->rt_mtx, MA_OWNED)
 #define	RT_UNLOCK_COND(_rt)	do {				\
 	if (mtx_owned(&(_rt)->rt_mtx))				\
 		mtx_unlock(&(_rt)->rt_mtx);			\
 } while (0)
 
 /*
  * With the split between the routing entry and the nexthop,
  *  rt_flags has to be split between these 2 entries. As rtentry
  *  mostly contains prefix data and is thought to be generic enough
  *  so one can transparently change the nexthop pointer w/o requiring
  *  any other rtentry changes, most of rt_flags shifts to the particular nexthop.
  * /
  *
  * RTF_UP: rtentry, as an indication that it is linked.
  * RTF_HOST: rtentry, nhop. The latter indication is needed for the datapath
  * RTF_DYNAMIC: nhop, to make rtentry generic.
  * RTF_MODIFIED: nhop, to make rtentry generic. (legacy)
  * -- "native" path (nhop) properties:
  * RTF_GATEWAY, RTF_STATIC, RTF_PROTO1, RTF_PROTO2, RTF_PROTO3, RTF_FIXEDMTU,
  *  RTF_PINNED, RTF_REJECT, RTF_BLACKHOLE, RTF_BROADCAST
  */
 
 /* Nexthop rt flags mask */
 #define	NHOP_RT_FLAG_MASK	(RTF_GATEWAY | RTF_HOST | RTF_REJECT | RTF_DYNAMIC | \
     RTF_MODIFIED | RTF_STATIC | RTF_BLACKHOLE | RTF_PROTO1 | RTF_PROTO2 | \
     RTF_PROTO3 | RTF_FIXEDMTU | RTF_PINNED | RTF_BROADCAST)
 
 /* rtentry rt flag mask */
 #define	RTE_RT_FLAG_MASK	(RTF_UP | RTF_HOST)
 
 /* Nexthop selection */
 #define	_NH2MP(_nh)	((struct nhgrp_object *)(_nh))
 #define	_SELECT_NHOP(_nh, _flowid)	\
 	(_NH2MP(_nh))->nhops[(_flowid) % (_NH2MP(_nh))->mp_size]
 #define	_RT_SELECT_NHOP(_nh, _flowid)	\
 	((!NH_IS_MULTIPATH(_nh)) ? (_nh) : _SELECT_NHOP(_nh, _flowid))
 #define	RT_SELECT_NHOP(_rt, _flowid)	_RT_SELECT_NHOP((_rt)->rt_nhop, _flowid)
  
 /* rte<>nhop translation */
 static inline uint16_t
 fib_rte_to_nh_flags(int rt_flags)
 {
 	uint16_t res;
 
 	res = (rt_flags & RTF_REJECT) ? NHF_REJECT : 0;
 	res |= (rt_flags & RTF_HOST) ? NHF_HOST : 0;
 	res |= (rt_flags & RTF_BLACKHOLE) ? NHF_BLACKHOLE : 0;
 	res |= (rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) ? NHF_REDIRECT : 0;
 	res |= (rt_flags & RTF_BROADCAST) ? NHF_BROADCAST : 0;
 	res |= (rt_flags & RTF_GATEWAY) ? NHF_GATEWAY : 0;
 
 	return (res);
 }
 
 void tmproutes_update(struct rib_head *rnh, struct rtentry *rt);
 void tmproutes_init(struct rib_head *rh);
 void tmproutes_destroy(struct rib_head *rh);
 
 #endif
Index: head/sys/net/route.c
===================================================================
--- head/sys/net/route.c	(revision 361703)
+++ head/sys/net/route.c	(revision 361704)
@@ -1,1527 +1,1536 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1980, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)route.c	8.3.1.1 (Berkeley) 2/23/95
  * $FreeBSD$
  */
 /************************************************************************
  * Note: In this file a 'fib' is a "forwarding information base"	*
  * Which is the new name for an in kernel routing (next hop) table.	*
  ***********************************************************************/
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_mrouting.h"
 #include "opt_mpath.h"
 #include "opt_route.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
 #include <sys/proc.h>
 #include <sys/domain.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
+#include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 #include <net/route/nhop.h>
 #include <net/route/shared.h>
 #include <net/vnet.h>
 
 #ifdef RADIX_MPATH
 #include <net/radix_mpath.h>
 #endif
 
 #include <netinet/in.h>
 #include <netinet/ip_mroute.h>
 
 #include <vm/uma.h>
 
 #define	RT_MAXFIBS	UINT16_MAX
 
 /* Kernel config default option. */
 #ifdef ROUTETABLES
 #if ROUTETABLES <= 0
 #error "ROUTETABLES defined too low"
 #endif
 #if ROUTETABLES > RT_MAXFIBS
 #error "ROUTETABLES defined too big"
 #endif
 #define	RT_NUMFIBS	ROUTETABLES
 #endif /* ROUTETABLES */
 /* Initialize to default if not otherwise set. */
 #ifndef	RT_NUMFIBS
 #define	RT_NUMFIBS	1
 #endif
 
 /* This is read-only.. */
 u_int rt_numfibs = RT_NUMFIBS;
 SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RDTUN, &rt_numfibs, 0, "");
 
 /*
  * By default add routes to all fibs for new interfaces.
  * Once this is set to 0 then only allocate routes on interface
  * changes for the FIB of the caller when adding a new set of addresses
  * to an interface.  XXX this is a shotgun aproach to a problem that needs
  * a more fine grained solution.. that will come.
  * XXX also has the problems getting the FIB from curthread which will not
  * always work given the fib can be overridden and prefixes can be added
  * from the network stack context.
  */
 VNET_DEFINE(u_int, rt_add_addr_allfibs) = 1;
 SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RWTUN | CTLFLAG_VNET,
     &VNET_NAME(rt_add_addr_allfibs), 0, "");
 
 VNET_PCPUSTAT_DEFINE(struct rtstat, rtstat);
 
 VNET_PCPUSTAT_SYSINIT(rtstat);
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(rtstat);
 #endif
 
 VNET_DEFINE(struct rib_head *, rt_tables);
 #define	V_rt_tables	VNET(rt_tables)
 
 
 VNET_DEFINE(uma_zone_t, rtzone);		/* Routing table UMA zone. */
 #define	V_rtzone	VNET(rtzone)
 
 EVENTHANDLER_LIST_DEFINE(rt_addrmsg);
 
 static int rt_ifdelroute(const struct rtentry *rt, const struct nhop_object *,
     void *arg);
 static void destroy_rtentry_epoch(epoch_context_t ctx);
 static int rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info,
     int flags);
 
 /*
  * handler for net.my_fibnum
  */
 static int
 sysctl_my_fibnum(SYSCTL_HANDLER_ARGS)
 {
         int fibnum;
         int error;
  
         fibnum = curthread->td_proc->p_fibnum;
         error = sysctl_handle_int(oidp, &fibnum, 0, req);
         return (error);
 }
 
 SYSCTL_PROC(_net, OID_AUTO, my_fibnum,
     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     &sysctl_my_fibnum, "I",
     "default FIB of caller");
 
 static __inline struct rib_head **
 rt_tables_get_rnh_ptr(int table, int fam)
 {
 	struct rib_head **rnh;
 
 	KASSERT(table >= 0 && table < rt_numfibs,
 	    ("%s: table out of bounds (0 <= %d < %d)", __func__, table,
 	     rt_numfibs));
 	KASSERT(fam >= 0 && fam < (AF_MAX + 1),
 	    ("%s: fam out of bounds (0 <= %d < %d)", __func__, fam, AF_MAX+1));
 
 	/* rnh is [fib=0][af=0]. */
 	rnh = (struct rib_head **)V_rt_tables;
 	/* Get the offset to the requested table and fam. */
 	rnh += table * (AF_MAX+1) + fam;
 
 	return (rnh);
 }
 
 struct rib_head *
 rt_tables_get_rnh(int table, int fam)
 {
 
 	return (*rt_tables_get_rnh_ptr(table, fam));
 }
 
 u_int
 rt_tables_get_gen(int table, int fam)
 {
 	struct rib_head *rnh;
 
 	rnh = *rt_tables_get_rnh_ptr(table, fam);
 	KASSERT(rnh != NULL, ("%s: NULL rib_head pointer table %d fam %d",
 	    __func__, table, fam));
 	return (rnh->rnh_gen);
 }
 
 
 /*
  * route initialization must occur before ip6_init2(), which happenas at
  * SI_ORDER_MIDDLE.
  */
 static void
 route_init(void)
 {
 
 	/* whack the tunable ints into  line. */
 	if (rt_numfibs > RT_MAXFIBS)
 		rt_numfibs = RT_MAXFIBS;
 	if (rt_numfibs == 0)
 		rt_numfibs = 1;
 	nhops_init();
 }
 SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, NULL);
 
 static int
 rtentry_zinit(void *mem, int size, int how)
 {
 	struct rtentry *rt = mem;
 
 	RT_LOCK_INIT(rt);
 
 	return (0);
 }
 
 static void
 rtentry_zfini(void *mem, int size)
 {
 	struct rtentry *rt = mem;
 
 	RT_LOCK_DESTROY(rt);
 }
 
 static int
 rtentry_ctor(void *mem, int size, void *arg, int how)
 {
 	struct rtentry *rt = mem;
 
 	bzero(rt, offsetof(struct rtentry, rt_endzero));
 	rt->rt_chain = NULL;
 
 	return (0);
 }
 
 static void
 rtentry_dtor(void *mem, int size, void *arg)
 {
 	struct rtentry *rt = mem;
 
 	RT_UNLOCK_COND(rt);
 }
 
 static void
 vnet_route_init(const void *unused __unused)
 {
 	struct domain *dom;
 	struct rib_head **rnh;
 	int table;
 	int fam;
 
 	V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) *
 	    sizeof(struct rib_head *), M_RTABLE, M_WAITOK|M_ZERO);
 
 	V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry),
 	    rtentry_ctor, rtentry_dtor,
 	    rtentry_zinit, rtentry_zfini, UMA_ALIGN_PTR, 0);
 	for (dom = domains; dom; dom = dom->dom_next) {
 		if (dom->dom_rtattach == NULL)
 			continue;
 
 		for  (table = 0; table < rt_numfibs; table++) {
 			fam = dom->dom_family;
 			if (table != 0 && fam != AF_INET6 && fam != AF_INET)
 				break;
 
 			rnh = rt_tables_get_rnh_ptr(table, fam);
 			if (rnh == NULL)
 				panic("%s: rnh NULL", __func__);
 			dom->dom_rtattach((void **)rnh, 0, table);
 		}
 	}
 }
 VNET_SYSINIT(vnet_route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
     vnet_route_init, 0);
 
 #ifdef VIMAGE
 static void
 vnet_route_uninit(const void *unused __unused)
 {
 	int table;
 	int fam;
 	struct domain *dom;
 	struct rib_head **rnh;
 
 	for (dom = domains; dom; dom = dom->dom_next) {
 		if (dom->dom_rtdetach == NULL)
 			continue;
 
 		for (table = 0; table < rt_numfibs; table++) {
 			fam = dom->dom_family;
 
 			if (table != 0 && fam != AF_INET6 && fam != AF_INET)
 				break;
 
 			rnh = rt_tables_get_rnh_ptr(table, fam);
 			if (rnh == NULL)
 				panic("%s: rnh NULL", __func__);
 			dom->dom_rtdetach((void **)rnh, 0);
 		}
 	}
 
 	/*
 	 * dom_rtdetach calls rt_table_destroy(), which
 	 *  schedules deletion for all rtentries, nexthops and control
 	 *  structures. Wait for the destruction callbacks to fire.
 	 * Note that this should result in freeing all rtentries, but
 	 *  nexthops deletions will be scheduled for the next epoch run
 	 *  and will be completed after vnet teardown.
 	 */
 	epoch_drain_callbacks(net_epoch_preempt);
 
 	free(V_rt_tables, M_RTABLE);
 	uma_zdestroy(V_rtzone);
 }
 VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
     vnet_route_uninit, 0);
 #endif
 
 struct rib_head *
 rt_table_init(int offset, int family, u_int fibnum)
 {
 	struct rib_head *rh;
 
 	rh = malloc(sizeof(struct rib_head), M_RTABLE, M_WAITOK | M_ZERO);
 
 	/* TODO: These details should be hidded inside radix.c */
 	/* Init masks tree */
 	rn_inithead_internal(&rh->head, rh->rnh_nodes, offset);
 	rn_inithead_internal(&rh->rmhead.head, rh->rmhead.mask_nodes, 0);
 	rh->head.rnh_masks = &rh->rmhead;
 
 	/* Save metadata associated with this routing table. */
 	rh->rib_family = family;
 	rh->rib_fibnum = fibnum;
 #ifdef VIMAGE
 	rh->rib_vnet = curvnet;
 #endif
 
 	tmproutes_init(rh);
 
 	/* Init locks */
 	RIB_LOCK_INIT(rh);
 
 	nhops_init_rib(rh);
 
+	/* Init subscription system */
+	CK_STAILQ_INIT(&rh->rnh_subscribers);
+
 	/* Finally, set base callbacks */
 	rh->rnh_addaddr = rn_addroute;
 	rh->rnh_deladdr = rn_delete;
 	rh->rnh_matchaddr = rn_match;
 	rh->rnh_lookup = rn_lookup;
 	rh->rnh_walktree = rn_walktree;
 	rh->rnh_walktree_from = rn_walktree_from;
 
 	return (rh);
 }
 
 static int
 rt_freeentry(struct radix_node *rn, void *arg)
 {
 	struct radix_head * const rnh = arg;
 	struct radix_node *x;
 
 	x = (struct radix_node *)rn_delete(rn + 2, NULL, rnh);
 	if (x != NULL)
 		R_Free(x);
 	return (0);
 }
 
 void
 rt_table_destroy(struct rib_head *rh)
 {
 
 	tmproutes_destroy(rh);
 
 	rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head);
 
 	nhops_destroy_rib(rh);
 
 	/* Assume table is already empty */
 	RIB_LOCK_DESTROY(rh);
 	free(rh, M_RTABLE);
 }
 
 
 #ifndef _SYS_SYSPROTO_H_
 struct setfib_args {
 	int     fibnum;
 };
 #endif
 int
 sys_setfib(struct thread *td, struct setfib_args *uap)
 {
 	if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs)
 		return EINVAL;
 	td->td_proc->p_fibnum = uap->fibnum;
 	return (0);
 }
 
 /*
  * Remove a reference count from an rtentry.
  * If the count gets low enough, take it out of the routing table
  */
 void
 rtfree(struct rtentry *rt)
 {
 
 	KASSERT(rt != NULL,("%s: NULL rt", __func__));
 
 	RT_LOCK_ASSERT(rt);
 
 	RT_UNLOCK(rt);
 	epoch_call(net_epoch_preempt, destroy_rtentry_epoch,
 	    &rt->rt_epoch_ctx);
 }
 
 static void
 destroy_rtentry(struct rtentry *rt)
 {
 
 	/*
 	 * At this moment rnh, nh_control may be already freed.
 	 * nhop interface may have been migrated to a different vnet.
 	 * Use vnet stored in the nexthop to delete the entry.
 	 */
 	CURVNET_SET(nhop_get_vnet(rt->rt_nhop));
 
 	/* Unreference nexthop */
 	nhop_free(rt->rt_nhop);
 
 	uma_zfree(V_rtzone, rt);
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Epoch callback indicating rtentry is safe to destroy
  */
 static void
 destroy_rtentry_epoch(epoch_context_t ctx)
 {
 	struct rtentry *rt;
 
 	rt = __containerof(ctx, struct rtentry, rt_epoch_ctx);
 
 	destroy_rtentry(rt);
 }
 
 /*
  * Adds a temporal redirect entry to the routing table.
  * @fibnum: fib number
  * @dst: destination to install redirect to
  * @gateway: gateway to go via
  * @author: sockaddr of originating router, can be NULL
  * @ifp: interface to use for the redirected route
  * @flags: set of flags to add. Allowed: RTF_GATEWAY
  * @lifetime_sec: time in seconds to expire this redirect.
  *
  * Retuns 0 on success, errno otherwise.
  */
 int
 rib_add_redirect(u_int fibnum, struct sockaddr *dst, struct sockaddr *gateway,
     struct sockaddr *author, struct ifnet *ifp, int flags, int lifetime_sec)
 {
 	struct rtentry *rt;
 	int error;
 	struct rt_addrinfo info;
 	struct rt_metrics rti_rmx;
 	struct ifaddr *ifa;
 
 	NET_EPOCH_ASSERT();
 
 	if (rt_tables_get_rnh(fibnum, dst->sa_family) == NULL)
 		return (EAFNOSUPPORT);
 
 	/* Verify the allowed flag mask. */
 	KASSERT(((flags & ~(RTF_GATEWAY)) == 0),
 	    ("invalid redirect flags: %x", flags));
 
 	/* Get the best ifa for the given interface and gateway. */
 	if ((ifa = ifaof_ifpforaddr(gateway, ifp)) == NULL)
 		return (ENETUNREACH);
 	ifa_ref(ifa);
 	
 	bzero(&info, sizeof(info));
 	info.rti_info[RTAX_DST] = dst;
 	info.rti_info[RTAX_GATEWAY] = gateway;
 	info.rti_ifa = ifa;
 	info.rti_ifp = ifp;
 	info.rti_flags = flags | RTF_HOST | RTF_DYNAMIC;
 
 	/* Setup route metrics to define expire time. */
 	bzero(&rti_rmx, sizeof(rti_rmx));
 	/* Set expire time as absolute. */
 	rti_rmx.rmx_expire = lifetime_sec + time_second;
 	info.rti_mflags |= RTV_EXPIRE;
 	info.rti_rmx = &rti_rmx;
 
 	error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum);
 	ifa_free(ifa);
 
 	if (error != 0) {
 		/* TODO: add per-fib redirect stats. */
 		return (error);
 	}
 
 	RT_LOCK(rt);
 	flags = rt->rt_flags;
 	RT_UNLOCK(rt);
 
 	RTSTAT_INC(rts_dynamic);
 
 	/* Send notification of a route addition to userland. */
 	bzero(&info, sizeof(info));
 	info.rti_info[RTAX_DST] = dst;
 	info.rti_info[RTAX_GATEWAY] = gateway;
 	info.rti_info[RTAX_AUTHOR] = author;
 	rt_missmsg_fib(RTM_REDIRECT, &info, flags, error, fibnum);
 
 	return (0);
 }
 
 /*
  * Routing table ioctl interface.
  */
 int
 rtioctl_fib(u_long req, caddr_t data, u_int fibnum)
 {
 
 	/*
 	 * If more ioctl commands are added here, make sure the proper
 	 * super-user checks are being performed because it is possible for
 	 * prison-root to make it this far if raw sockets have been enabled
 	 * in jails.
 	 */
 #ifdef INET
 	/* Multicast goop, grrr... */
 	return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP;
 #else /* INET */
 	return ENXIO;
 #endif /* INET */
 }
 
 struct ifaddr *
 ifa_ifwithroute(int flags, const struct sockaddr *dst,
     const struct sockaddr *gateway, u_int fibnum)
 {
 	struct ifaddr *ifa;
 
 	NET_EPOCH_ASSERT();
 	if ((flags & RTF_GATEWAY) == 0) {
 		/*
 		 * If we are adding a route to an interface,
 		 * and the interface is a pt to pt link
 		 * we should search for the destination
 		 * as our clue to the interface.  Otherwise
 		 * we can use the local address.
 		 */
 		ifa = NULL;
 		if (flags & RTF_HOST)
 			ifa = ifa_ifwithdstaddr(dst, fibnum);
 		if (ifa == NULL)
 			ifa = ifa_ifwithaddr(gateway);
 	} else {
 		/*
 		 * If we are adding a route to a remote net
 		 * or host, the gateway may still be on the
 		 * other end of a pt to pt link.
 		 */
 		ifa = ifa_ifwithdstaddr(gateway, fibnum);
 	}
 	if (ifa == NULL)
 		ifa = ifa_ifwithnet(gateway, 0, fibnum);
 	if (ifa == NULL) {
 		struct nhop_object *nh;
 
 		nh = rib_lookup(fibnum, gateway, NHR_NONE, 0);
 
 		/*
 		 * dismiss a gateway that is reachable only
 		 * through the default router
 		 */
 		if ((nh == NULL) || (nh->nh_flags & NHF_DEFAULT))
 			return (NULL);
 		ifa = nh->nh_ifa;
 	}
 	if (ifa->ifa_addr->sa_family != dst->sa_family) {
 		struct ifaddr *oifa = ifa;
 		ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp);
 		if (ifa == NULL)
 			ifa = oifa;
 	}
 
 	return (ifa);
 }
 
 /*
  * Do appropriate manipulations of a routing tree given
  * all the bits of info needed
  */
 int
 rtrequest_fib(int req,
 	struct sockaddr *dst,
 	struct sockaddr *gateway,
 	struct sockaddr *netmask,
 	int flags,
 	struct rtentry **ret_nrt,
 	u_int fibnum)
 {
 	struct rt_addrinfo info;
 
 	if (dst->sa_len == 0)
 		return(EINVAL);
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_flags = flags;
 	info.rti_info[RTAX_DST] = dst;
 	info.rti_info[RTAX_GATEWAY] = gateway;
 	info.rti_info[RTAX_NETMASK] = netmask;
 	return rtrequest1_fib(req, &info, ret_nrt, fibnum);
 }
 
 
 /*
  * Copy most of @rt data into @info.
  *
  * If @flags contains NHR_COPY, copies dst,netmask and gw to the
  * pointers specified by @info structure. Assume such pointers
  * are zeroed sockaddr-like structures with sa_len field initialized
  * to reflect size of the provided buffer. if no NHR_COPY is specified,
  * point dst,netmask and gw @info fields to appropriate @rt values.
  *
  * if @flags contains NHR_REF, do refcouting on rt_ifp and rt_ifa.
  *
  * Returns 0 on success.
  */
 int
 rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, int flags)
 {
 	struct rt_metrics *rmx;
 	struct sockaddr *src, *dst;
 	struct nhop_object *nh;
 	int sa_len;
 
 	if (flags & NHR_COPY) {
 		/* Copy destination if dst is non-zero */
 		src = rt_key(rt);
 		dst = info->rti_info[RTAX_DST];
 		sa_len = src->sa_len;
 		if (dst != NULL) {
 			if (src->sa_len > dst->sa_len)
 				return (ENOMEM);
 			memcpy(dst, src, src->sa_len);
 			info->rti_addrs |= RTA_DST;
 		}
 
 		/* Copy mask if set && dst is non-zero */
 		src = rt_mask(rt);
 		dst = info->rti_info[RTAX_NETMASK];
 		if (src != NULL && dst != NULL) {
 
 			/*
 			 * Radix stores different value in sa_len,
 			 * assume rt_mask() to have the same length
 			 * as rt_key()
 			 */
 			if (sa_len > dst->sa_len)
 				return (ENOMEM);
 			memcpy(dst, src, src->sa_len);
 			info->rti_addrs |= RTA_NETMASK;
 		}
 
 		/* Copy gateway is set && dst is non-zero */
 		src = &rt->rt_nhop->gw_sa;
 		dst = info->rti_info[RTAX_GATEWAY];
 		if ((rt->rt_flags & RTF_GATEWAY) && src != NULL && dst != NULL){
 			if (src->sa_len > dst->sa_len)
 				return (ENOMEM);
 			memcpy(dst, src, src->sa_len);
 			info->rti_addrs |= RTA_GATEWAY;
 		}
 	} else {
 		info->rti_info[RTAX_DST] = rt_key(rt);
 		info->rti_addrs |= RTA_DST;
 		if (rt_mask(rt) != NULL) {
 			info->rti_info[RTAX_NETMASK] = rt_mask(rt);
 			info->rti_addrs |= RTA_NETMASK;
 		}
 		if (rt->rt_flags & RTF_GATEWAY) {
 			info->rti_info[RTAX_GATEWAY] = &rt->rt_nhop->gw_sa;
 			info->rti_addrs |= RTA_GATEWAY;
 		}
 	}
 
 	nh = rt->rt_nhop;
 	rmx = info->rti_rmx;
 	if (rmx != NULL) {
 		info->rti_mflags |= RTV_MTU;
 		rmx->rmx_mtu = nh->nh_mtu;
 	}
 
 	info->rti_flags = rt->rt_flags | nhop_get_rtflags(nh);
 	info->rti_ifp = nh->nh_ifp;
 	info->rti_ifa = nh->nh_ifa;
 	if (flags & NHR_REF) {
 		if_ref(info->rti_ifp);
 		ifa_ref(info->rti_ifa);
 	}
 
 	return (0);
 }
 
 /*
  * Lookups up route entry for @dst in RIB database for fib @fibnum.
  * Exports entry data to @info using rt_exportinfo().
  *
  * If @flags contains NHR_REF, refcouting is performed on rt_ifp and rt_ifa.
  * All references can be released later by calling rib_free_info().
  *
  * Returns 0 on success.
  * Returns ENOENT for lookup failure, ENOMEM for export failure.
  */
 int
 rib_lookup_info(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags,
     uint32_t flowid, struct rt_addrinfo *info)
 {
 	RIB_RLOCK_TRACKER;
 	struct rib_head *rh;
 	struct radix_node *rn;
 	struct rtentry *rt;
 	int error;
 
 	KASSERT((fibnum < rt_numfibs), ("rib_lookup_rte: bad fibnum"));
 	rh = rt_tables_get_rnh(fibnum, dst->sa_family);
 	if (rh == NULL)
 		return (ENOENT);
 
 	RIB_RLOCK(rh);
 	rn = rh->rnh_matchaddr(__DECONST(void *, dst), &rh->head);
 	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
 		rt = RNTORT(rn);
 		/* Ensure route & ifp is UP */
 		if (RT_LINK_IS_UP(rt->rt_nhop->nh_ifp)) {
 			flags = (flags & NHR_REF) | NHR_COPY;
 			error = rt_exportinfo(rt, info, flags);
 			RIB_RUNLOCK(rh);
 
 			return (error);
 		}
 	}
 	RIB_RUNLOCK(rh);
 
 	return (ENOENT);
 }
 
 /*
  * Releases all references acquired by rib_lookup_info() when
  * called with NHR_REF flags.
  */
 void
 rib_free_info(struct rt_addrinfo *info)
 {
 
 	ifa_free(info->rti_ifa);
 	if_rele(info->rti_ifp);
 }
 
 /*
  * Iterates over all existing fibs in system calling
  *  @setwa_f function prior to traversing each fib.
  *  Calls @wa_f function for each element in current fib.
  * If af is not AF_UNSPEC, iterates over fibs in particular
  * address family.
  */
 void
 rt_foreach_fib_walk(int af, rt_setwarg_t *setwa_f, rt_walktree_f_t *wa_f,
     void *arg)
 {
 	struct rib_head *rnh;
 	uint32_t fibnum;
 	int i;
 
 	for (fibnum = 0; fibnum < rt_numfibs; fibnum++) {
 		/* Do we want some specific family? */
 		if (af != AF_UNSPEC) {
 			rnh = rt_tables_get_rnh(fibnum, af);
 			if (rnh == NULL)
 				continue;
 			if (setwa_f != NULL)
 				setwa_f(rnh, fibnum, af, arg);
 
 			RIB_WLOCK(rnh);
 			rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f,arg);
 			RIB_WUNLOCK(rnh);
 			continue;
 		}
 
 		for (i = 1; i <= AF_MAX; i++) {
 			rnh = rt_tables_get_rnh(fibnum, i);
 			if (rnh == NULL)
 				continue;
 			if (setwa_f != NULL)
 				setwa_f(rnh, fibnum, i, arg);
 
 			RIB_WLOCK(rnh);
 			rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f,arg);
 			RIB_WUNLOCK(rnh);
 		}
 	}
 }
 
 /*
  * Iterates over all existing fibs in system and deletes each element
  *  for which @filter_f function returns non-zero value.
  * If @family is not AF_UNSPEC, iterates over fibs in particular
  * address family.
  */
 void
 rt_foreach_fib_walk_del(int family, rt_filter_f_t *filter_f, void *arg)
 {
 	u_int fibnum;
 	int i, start, end;
 
 	for (fibnum = 0; fibnum < rt_numfibs; fibnum++) {
 		/* Do we want some specific family? */
 		if (family != AF_UNSPEC) {
 			start = family;
 			end = family;
 		} else {
 			start = 1;
 			end = AF_MAX;
 		}
 
 		for (i = start; i <= end; i++) {
 			if (rt_tables_get_rnh(fibnum, i) == NULL)
 				continue;
 
 			rib_walk_del(fibnum, i, filter_f, arg, 0);
 		}
 	}
 }
 
 /*
  * Delete Routes for a Network Interface
  *
  * Called for each routing entry via the rnh->rnh_walktree() call above
  * to delete all route entries referencing a detaching network interface.
  *
  * Arguments:
  *	rt	pointer to rtentry
  *	nh	pointer to nhop
  *	arg	argument passed to rnh->rnh_walktree() - detaching interface
  *
  * Returns:
  *	0	successful
  *	errno	failed - reason indicated
  */
 static int
 rt_ifdelroute(const struct rtentry *rt, const struct nhop_object *nh, void *arg)
 {
 	struct ifnet	*ifp = arg;
 
 	if (nh->nh_ifp != ifp)
 		return (0);
 
 	/*
 	 * Protect (sorta) against walktree recursion problems
 	 * with cloned routes
 	 */
 	if ((rt->rt_flags & RTF_UP) == 0)
 		return (0);
 
 	return (1);
 }
 
 /*
  * Delete all remaining routes using this interface
  * Unfortuneatly the only way to do this is to slog through
  * the entire routing table looking for routes which point
  * to this interface...oh well...
  */
 void
 rt_flushifroutes_af(struct ifnet *ifp, int af)
 {
 	KASSERT((af >= 1 && af <= AF_MAX), ("%s: af %d not >= 1 and <= %d",
 	    __func__, af, AF_MAX));
 
 	rt_foreach_fib_walk_del(af, rt_ifdelroute, ifp);
 }
 
 void
 rt_flushifroutes(struct ifnet *ifp)
 {
 
 	rt_foreach_fib_walk_del(AF_UNSPEC, rt_ifdelroute, ifp);
 }
 
 /*
  * Look up rt_addrinfo for a specific fib.  Note that if rti_ifa is defined,
  * it will be referenced so the caller must free it.
  *
  * Assume basic consistency checks are executed by callers:
  * RTAX_DST exists, if RTF_GATEWAY is set, RTAX_GATEWAY exists as well.
  */
 int
 rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum)
 {
 	const struct sockaddr *dst, *gateway, *ifpaddr, *ifaaddr;
 	struct epoch_tracker et;
 	int needref, error, flags;
 
 	dst = info->rti_info[RTAX_DST];
 	gateway = info->rti_info[RTAX_GATEWAY];
 	ifpaddr = info->rti_info[RTAX_IFP];
 	ifaaddr = info->rti_info[RTAX_IFA];
 	flags = info->rti_flags;
 
 	/*
 	 * ifp may be specified by sockaddr_dl
 	 * when protocol address is ambiguous.
 	 */
 	error = 0;
 	needref = (info->rti_ifa == NULL);
 	NET_EPOCH_ENTER(et);
 
 	/* If we have interface specified by the ifindex in the address, use it */
 	if (info->rti_ifp == NULL && ifpaddr != NULL &&
 	    ifpaddr->sa_family == AF_LINK) {
 	    const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)ifpaddr;
 	    if (sdl->sdl_index != 0)
 		    info->rti_ifp = ifnet_byindex(sdl->sdl_index);
 	}
 	/*
 	 * If we have source address specified, try to find it
 	 * TODO: avoid enumerating all ifas on all interfaces.
 	 */
 	if (info->rti_ifa == NULL && ifaaddr != NULL)
 		info->rti_ifa = ifa_ifwithaddr(ifaaddr);
 	if (info->rti_ifa == NULL) {
 		const struct sockaddr *sa;
 
 		/*
 		 * Most common use case for the userland-supplied routes.
 		 *
 		 * Choose sockaddr to select ifa.
 		 * -- if ifp is set --
 		 * Order of preference:
 		 * 1) IFA address
 		 * 2) gateway address
 		 *   Note: for interface routes link-level gateway address 
 		 *     is specified to indicate the interface index without
 		 *     specifying RTF_GATEWAY. In this case, ignore gateway
 		 *   Note: gateway AF may be different from dst AF. In this case,
 		 *   ignore gateway
 		 * 3) final destination.
 		 * 4) if all of these fails, try to get at least link-level ifa.
 		 * -- else --
 		 * try to lookup gateway or dst in the routing table to get ifa
 		 */
 		if (info->rti_info[RTAX_IFA] != NULL)
 			sa = info->rti_info[RTAX_IFA];
 		else if ((info->rti_flags & RTF_GATEWAY) != 0 &&
 		    gateway->sa_family == dst->sa_family)
 			sa = gateway;
 		else
 			sa = dst;
 		if (info->rti_ifp != NULL) {
 			info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp);
 			/* Case 4 */
 			if (info->rti_ifa == NULL && gateway != NULL)
 				info->rti_ifa = ifaof_ifpforaddr(gateway, info->rti_ifp);
 		} else if (dst != NULL && gateway != NULL)
 			info->rti_ifa = ifa_ifwithroute(flags, dst, gateway,
 							fibnum);
 		else if (sa != NULL)
 			info->rti_ifa = ifa_ifwithroute(flags, sa, sa,
 							fibnum);
 	}
 	if (needref && info->rti_ifa != NULL) {
 		if (info->rti_ifp == NULL)
 			info->rti_ifp = info->rti_ifa->ifa_ifp;
 		ifa_ref(info->rti_ifa);
 	} else
 		error = ENETUNREACH;
 	NET_EPOCH_EXIT(et);
 	return (error);
 }
 
 void
 rt_updatemtu(struct ifnet *ifp)
 {
 	struct rib_head *rnh;
 	int mtu;
 	int i, j;
 
 	/*
 	 * Try to update rt_mtu for all routes using this interface
 	 * Unfortunately the only way to do this is to traverse all
 	 * routing tables in all fibs/domains.
 	 */
 	for (i = 1; i <= AF_MAX; i++) {
 		mtu = if_getmtu_family(ifp, i);
 		for (j = 0; j < rt_numfibs; j++) {
 			rnh = rt_tables_get_rnh(j, i);
 			if (rnh == NULL)
 				continue;
 			nhops_update_ifmtu(rnh, ifp, mtu);
 		}
 	}
 }
 
 
 #if 0
 int p_sockaddr(char *buf, int buflen, struct sockaddr *s);
 int rt_print(char *buf, int buflen, struct rtentry *rt);
 
 int
 p_sockaddr(char *buf, int buflen, struct sockaddr *s)
 {
 	void *paddr = NULL;
 
 	switch (s->sa_family) {
 	case AF_INET:
 		paddr = &((struct sockaddr_in *)s)->sin_addr;
 		break;
 	case AF_INET6:
 		paddr = &((struct sockaddr_in6 *)s)->sin6_addr;
 		break;
 	}
 
 	if (paddr == NULL)
 		return (0);
 
 	if (inet_ntop(s->sa_family, paddr, buf, buflen) == NULL)
 		return (0);
 	
 	return (strlen(buf));
 }
 
 int
 rt_print(char *buf, int buflen, struct rtentry *rt)
 {
 	struct sockaddr *addr, *mask;
 	int i = 0;
 
 	addr = rt_key(rt);
 	mask = rt_mask(rt);
 
 	i = p_sockaddr(buf, buflen, addr);
 	if (!(rt->rt_flags & RTF_HOST)) {
 		buf[i++] = '/';
 		i += p_sockaddr(buf + i, buflen - i, mask);
 	}
 
 	if (rt->rt_flags & RTF_GATEWAY) {
 		buf[i++] = '>';
 		i += p_sockaddr(buf + i, buflen - i, &rt->rt_nhop->gw_sa);
 	}
 
 	return (i);
 }
 #endif
 
 #ifdef RADIX_MPATH
 /*
  * Deletes key for single-path routes, unlinks rtentry with
  * gateway specified in @info from multi-path routes.
  *
  * Returnes unlinked entry. In case of failure, returns NULL
  * and sets @perror to ESRCH.
  */
 struct radix_node *
 rt_mpath_unlink(struct rib_head *rnh, struct rt_addrinfo *info,
     struct rtentry *rto, int *perror)
 {
 	/*
 	 * if we got multipath routes, we require users to specify
 	 * a matching RTAX_GATEWAY.
 	 */
 	struct rtentry *rt; // *rto = NULL;
 	struct radix_node *rn;
 	struct sockaddr *gw;
 
 	gw = info->rti_info[RTAX_GATEWAY];
 	rt = rt_mpath_matchgate(rto, gw);
 	if (rt == NULL) {
 		*perror = ESRCH;
 		return (NULL);
 	}
 
 	/*
 	 * this is the first entry in the chain
 	 */
 	if (rto == rt) {
 		rn = rn_mpath_next((struct radix_node *)rt);
 		/*
 		 * there is another entry, now it's active
 		 */
 		if (rn) {
 			rto = RNTORT(rn);
 			RT_LOCK(rto);
 			rto->rt_flags |= RTF_UP;
 			RT_UNLOCK(rto);
 		} else if (rt->rt_flags & RTF_GATEWAY) {
 			/*
 			 * For gateway routes, we need to 
 			 * make sure that we we are deleting
 			 * the correct gateway. 
 			 * rt_mpath_matchgate() does not 
 			 * check the case when there is only
 			 * one route in the chain.  
 			 */
 			if (gw &&
 			    (rt->rt_nhop->gw_sa.sa_len != gw->sa_len ||
 				memcmp(&rt->rt_nhop->gw_sa, gw, gw->sa_len))) {
 				*perror = ESRCH;
 				return (NULL);
 			}
 		}
 
 		/*
 		 * use the normal delete code to remove
 		 * the first entry
 		 */
 		rn = rnh->rnh_deladdr(info->rti_info[RTAX_DST],
 					info->rti_info[RTAX_NETMASK],
 					&rnh->head);
 		*perror = 0;
 		return (rn);
 	}
 		
 	/*
 	 * if the entry is 2nd and on up
 	 */
 	if (rt_mpath_deldup(rto, rt) == 0)
 		panic ("rtrequest1: rt_mpath_deldup");
 	*perror = 0;
 	rn = (struct radix_node *)rt;
 	return (rn);
 }
 #endif
 
 int
 rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
 				u_int fibnum)
 {
 	const struct sockaddr *dst;
 	struct rib_head *rnh;
+	struct rib_cmd_info rc;
 	int error;
 
 	KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum"));
 	KASSERT((info->rti_flags & RTF_RNH_LOCKED) == 0, ("rtrequest1_fib: locked"));
 	NET_EPOCH_ASSERT();
 
 	dst = info->rti_info[RTAX_DST];
 
 	switch (dst->sa_family) {
 	case AF_INET6:
 	case AF_INET:
 		/* We support multiple FIBs. */
 		break;
 	default:
 		fibnum = RT_DEFAULT_FIB;
 		break;
 	}
 
 	/*
 	 * Find the correct routing tree to use for this Address Family
 	 */
 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
 	if (rnh == NULL)
 		return (EAFNOSUPPORT);
 
 	/*
 	 * If we are adding a host route then we don't want to put
 	 * a netmask in the tree, nor do we want to clone it.
 	 */
 	if (info->rti_flags & RTF_HOST)
 		info->rti_info[RTAX_NETMASK] = NULL;
 
+	bzero(&rc, sizeof(struct rib_cmd_info));
 	error = 0;
 	switch (req) {
 	case RTM_DELETE:
-		error = del_route(rnh, info, ret_nrt);
+		error = del_route(rnh, info, &rc);
 		break;
 	case RTM_RESOLVE:
 		/*
 		 * resolve was only used for route cloning
 		 * here for compat
 		 */
 		break;
 	case RTM_ADD:
-		error = add_route(rnh, info, ret_nrt);
+		error = add_route(rnh, info, &rc);
 		break;
 	case RTM_CHANGE:
-		error = change_route(rnh, info, ret_nrt);
+		error = change_route(rnh, info, &rc);
 		break;
 	default:
 		error = EOPNOTSUPP;
 	}
+
+	if (ret_nrt != NULL)
+		*ret_nrt = rc.rc_rt;
 
 	return (error);
 }
 
 void
 rt_setmetrics(const struct rt_addrinfo *info, struct rtentry *rt)
 {
 
 	if (info->rti_mflags & RTV_WEIGHT)
 		rt->rt_weight = info->rti_rmx->rmx_weight;
 	/* Kernel -> userland timebase conversion. */
 	if (info->rti_mflags & RTV_EXPIRE)
 		rt->rt_expire = info->rti_rmx->rmx_expire ?
 		    info->rti_rmx->rmx_expire - time_second + time_uptime : 0;
 }
 
 void
 rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask)
 {
 	u_char *cp1 = (u_char *)src;
 	u_char *cp2 = (u_char *)dst;
 	u_char *cp3 = (u_char *)netmask;
 	u_char *cplim = cp2 + *cp3;
 	u_char *cplim2 = cp2 + *cp1;
 
 	*cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
 	cp3 += 2;
 	if (cplim > cplim2)
 		cplim = cplim2;
 	while (cp2 < cplim)
 		*cp2++ = *cp1++ & *cp3++;
 	if (cp2 < cplim2)
 		bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2));
 }
 
 /*
  * Set up a routing table entry, normally
  * for an interface.
  *
  * Issues the request `cmd' for the route derived from `ifa' in a single
  * fib or in a range of fibs:
  *  - with RTF_HOST in `flags' the destination is ifa_dstaddr and no
  *    netmask is used; otherwise it is ifa_addr with ifa_netmask;
  *  - address families other than AF_INET/AF_INET6 are forced to
  *    RT_DEFAULT_FIB;
  *  - fibnum == RT_ALL_FIBS walks every fib, except for RTM_ADD when
  *    V_rt_add_addr_allfibs is 0, where only the interface's own fib
  *    (if_fib) is used.
  *
  * Returns EINVAL when the destination has sa_len == 0.  For RTM_DELETE
  * the result is 0 if at least one fib produced a route, otherwise
  * EHOSTUNREACH (RTF_HOST) or ENETUNREACH.  For any other cmd the most
  * recent non-zero error seen in the loop is returned, else the error of
  * the final iteration.
  */
 #define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */
 static inline  int
 rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum)
 {
 	RIB_RLOCK_TRACKER;
 	struct epoch_tracker et;
 	struct sockaddr *dst;
 	struct sockaddr *netmask;
 	struct rtentry *rt = NULL;
 	struct rt_addrinfo info;
 	int error = 0;
 	int startfib, endfib;
 	/* Scratch space: masked dst for RTM_DELETE, link-level gateway for RTM_ADD. */
 	char tempbuf[_SOCKADDR_TMPSIZE];
 	int didwork = 0;
 	int a_failure = 0;
 	struct sockaddr_dl_short *sdl = NULL;
 	struct rib_head *rnh;
 
 	if (flags & RTF_HOST) {
 		dst = ifa->ifa_dstaddr;
 		netmask = NULL;
 	} else {
 		dst = ifa->ifa_addr;
 		netmask = ifa->ifa_netmask;
 	}
 	if (dst->sa_len == 0)
 		return(EINVAL);
 	switch (dst->sa_family) {
 	case AF_INET6:
 	case AF_INET:
 		/* We support multiple FIBs. */
 		break;
 	default:
 		fibnum = RT_DEFAULT_FIB;
 		break;
 	}
 	if (fibnum == RT_ALL_FIBS) {
 		if (V_rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD)
 			startfib = endfib = ifa->ifa_ifp->if_fib;
 		else {
 			startfib = 0;
 			endfib = rt_numfibs - 1;
 		}
 	} else {
 		KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum"));
 		startfib = fibnum;
 		endfib = fibnum;
 	}
 
 	/*
 	 * If it's a delete, check that if it exists,
 	 * it's on the correct interface or we might scrub
 	 * a route to another ifa which would
 	 * be confusing at best and possibly worse.
 	 */
 	if (cmd == RTM_DELETE) {
 		/*
 		 * It's a delete, so it should already exist..
 		 * If it's a net, mask off the host bits
 		 * (Assuming we have a mask)
 		 * XXX this is kinda inet specific..
 		 */
 		if (netmask != NULL) {
 			/*
 			 * NOTE(review): rt_maskedcopy() writes dst->sa_len
 			 * bytes; nothing here checks that against
 			 * _SOCKADDR_TMPSIZE -- confirm callers bound it.
 			 */
 			rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask);
 			dst = (struct sockaddr *)tempbuf;
 		}
 	} else if (cmd == RTM_ADD) {
 		/* Build an AF_LINK gateway describing the interface. */
 		sdl = (struct sockaddr_dl_short *)tempbuf;
 		bzero(sdl, sizeof(struct sockaddr_dl_short));
 		sdl->sdl_family = AF_LINK;
 		sdl->sdl_len = sizeof(struct sockaddr_dl_short);
 		sdl->sdl_type = ifa->ifa_ifp->if_type;
 		sdl->sdl_index = ifa->ifa_ifp->if_index;
         }
 	/*
 	 * Now go through all the requested tables (fibs) and do the
 	 * requested action. Realistically, this will either be fib 0
 	 * for protocols that don't do multiple tables or all the
 	 * tables for those that do.
 	 */
 	for ( fibnum = startfib; fibnum <= endfib; fibnum++) {
 		if (cmd == RTM_DELETE) {
 			struct radix_node *rn;
 			/*
 			 * Look up an rtentry that is in the routing tree and
 			 * contains the correct info.
 			 */
 			rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
 			if (rnh == NULL)
 				/* this table doesn't exist but others might */
 				continue;
 			RIB_RLOCK(rnh);
 			rn = rnh->rnh_lookup(dst, netmask, &rnh->head);
 #ifdef RADIX_MPATH
 			/*
 			 * NOTE(review): every value stored to `error' in this
 			 * block is overwritten by the unconditional assignment
 			 * after the #endif, so the rt_mpath_matchgate() result
 			 * currently has no effect on the decision below --
 			 * confirm whether an `else' was intended here.
 			 */
 			if (rt_mpath_capable(rnh)) {
 
 				if (rn == NULL) 
 					error = ESRCH;
 				else {
 					rt = RNTORT(rn);
 					/*
 					 * for interface route the gateway
 					 * gateway is sockaddr_dl, so
 					 * rt_mpath_matchgate must use the
 					 * interface address
 					 */
 					rt = rt_mpath_matchgate(rt,
 					    ifa->ifa_addr);
 					if (rt == NULL) 
 						error = ESRCH;
 				}
 			}
 #endif
 			/*
 			 * Skip this fib unless a non-root node was found
 			 * whose nexthop belongs to this ifa.
 			 */
 			error = (rn == NULL ||
 			    (rn->rn_flags & RNF_ROOT) ||
 			    RNTORT(rn)->rt_nhop->nh_ifa != ifa);
 			RIB_RUNLOCK(rnh);
 			if (error) {
 				/* this is only an error if bad on ALL tables */
 				continue;
 			}
 		}
 		/*
 		 * Do the actual request
 		 */
 		bzero((caddr_t)&info, sizeof(info));
 		info.rti_ifa = ifa;
 		info.rti_flags = flags |
 		    (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED;
 		info.rti_info[RTAX_DST] = dst;
 		/* 
 		 * doing this for compatibility reasons
 		 */
 		if (cmd == RTM_ADD)
 			info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)sdl;
 		else
 			info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr;
 		info.rti_info[RTAX_NETMASK] = netmask;
 		/* The request and the notification run inside one net epoch section. */
 		NET_EPOCH_ENTER(et);
 		error = rtrequest1_fib(cmd, &info, &rt, fibnum);
 		if (error == 0 && rt != NULL) {
 			/*
 			 * notify any listening routing agents of the change
 			 */
 
 			/* TODO: interface routes/aliases */
 			rt_newaddrmsg_fib(cmd, ifa, rt, fibnum);
 			didwork = 1;
 		}
 		NET_EPOCH_EXIT(et);
 		/* Remember the latest failure; later fibs are still attempted. */
 		if (error)
 			a_failure = error;
 	}
 	if (cmd == RTM_DELETE) {
 		if (didwork) {
 			error = 0;
 		} else {
 			/* we only give an error if it wasn't in any table */
 			error = ((flags & RTF_HOST) ?
 			    EHOSTUNREACH : ENETUNREACH);
 		}
 	} else {
 		if (a_failure) {
 			/* return an error if any of them failed */
 			error = a_failure;
 		}
 	}
 	return (error);
 }
 
 /*
  * Set up a routing table entry, normally
  * for an interface.
  */
 int
 rtinit(struct ifaddr *ifa, int cmd, int flags)
 {
 	struct sockaddr *dst;
 	int fib = RT_DEFAULT_FIB;
 
 	if (flags & RTF_HOST) {
 		dst = ifa->ifa_dstaddr;
 	} else {
 		dst = ifa->ifa_addr;
 	}
 
 	switch (dst->sa_family) {
 	case AF_INET6:
 	case AF_INET:
 		/* We do support multiple FIBs. */
 		fib = RT_ALL_FIBS;
 		break;
 	}
 	return (rtinit1(ifa, cmd, flags, fib));
 }
 
 /*
  * Announce interface address arrival/withdraw
  *
  * cmd must be RTM_ADD or RTM_DELETE and fibnum must be RT_ALL_FIBS or a
  * value in [0, rt_numfibs); both are enforced with KASSERT only.
  * The rt_addrmsg event handlers are invoked first, then the message is
  * handed to rtsock_addrmsg(), whose return value is passed back.
  *
  * Returns 0 on success.
  */
 int
 rt_addrmsg(int cmd, struct ifaddr *ifa, int fibnum)
 {
 
 	KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE,
 	    ("unexpected cmd %d", cmd));
 	KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs),
 	    ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs));
 
 	EVENTHANDLER_DIRECT_INVOKE(rt_addrmsg, ifa, cmd);
 	return (rtsock_addrmsg(cmd, ifa, fibnum));
 }
 
 /*
  * Announce kernel-originated route addition/removal to rtsock based on @rt data.
  * cmd: RTM_ cmd (RTM_ADD or RTM_DELETE)
  * @rt: valid rtentry; its key must not be NULL
  * @ifp: target route interface
  * @rti_addrs: passed through unchanged to rtsock_routemsg()
  * @fibnum: fib id or RT_ALL_FIBS
  *
  * The argument checks are KASSERTs only.  The return value is that of
  * rtsock_routemsg().
  *
  * Returns 0 on success.
  */
 int
 rt_routemsg(int cmd, struct rtentry *rt, struct ifnet *ifp, int rti_addrs,
     int fibnum)
 {
 
 	KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE,
 	    ("unexpected cmd %d", cmd));
 
 	KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs),
 	    ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs));
 
 	KASSERT(rt_key(rt) != NULL, (":%s: rt_key must be supplied", __func__));
 
 	/*
 	 * Forward the caller's rti_addrs instead of a hard-coded 0 so the
 	 * parameter is not silently ignored.
 	 */
 	return (rtsock_routemsg(cmd, rt, ifp, rti_addrs, fibnum));
 }
 
 /*
  * Announce kernel-originated route addition/removal/change to rtsock
  * based on @info data.
  * cmd: RTM_ cmd (RTM_ADD, RTM_DELETE or RTM_CHANGE)
  * @info: addrinfo structure with valid data; RTAX_DST must be set.
  * @fibnum: fib id or RT_ALL_FIBS
  *
  * The argument checks are KASSERTs only.  The return value is that of
  * rtsock_routemsg_info().
  *
  * Returns 0 on success.
  */
 int
 rt_routemsg_info(int cmd, struct rt_addrinfo *info, int fibnum)
 {
 
 	KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE || cmd == RTM_CHANGE,
 	    ("unexpected cmd %d", cmd));
 	
 	KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs),
 	    ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs));
 
 	KASSERT(info->rti_info[RTAX_DST] != NULL, (":%s: RTAX_DST must be supplied", __func__));
 
 	return (rtsock_routemsg_info(cmd, info, fibnum));
 }
 
 
 /*
  * This is called to generate messages from the routing socket
  * indicating a network interface has had addresses associated with it.
  *
  * cmd must be RTM_ADD or RTM_DELETE.  For RTM_ADD the address message is
  * sent before the route message; for RTM_DELETE the route message goes
  * first.  The route message is skipped when rt is NULL.  Return values
  * of the two announcers are not examined.
  */
 void
 rt_newaddrmsg_fib(int cmd, struct ifaddr *ifa, struct rtentry *rt, int fibnum)
 {
 
 	/* cmd is a signed int: print it with %d like the sibling asserts. */
 	KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE,
 	    ("unexpected cmd %d", cmd));
 	KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs),
 	    ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs));
 
 	if (cmd == RTM_ADD)
 		rt_addrmsg(cmd, ifa, fibnum);
 	if (rt != NULL)
 		rt_routemsg(cmd, rt, ifa->ifa_ifp, 0, fibnum);
 	if (cmd != RTM_ADD)
 		rt_addrmsg(cmd, ifa, fibnum);
 }
 
Index: head/sys/net/route.h
===================================================================
--- head/sys/net/route.h	(revision 361703)
+++ head/sys/net/route.h	(revision 361704)
@@ -1,436 +1,426 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1980, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)route.h	8.4 (Berkeley) 1/9/95
  * $FreeBSD$
  */
 
 #ifndef _NET_ROUTE_H_
 #define _NET_ROUTE_H_
 
 #include <net/vnet.h>
 
 /*
  * Kernel resident routing tables.
  *
  * The routing tables are initialized when interface addresses
  * are set by making entries for all directly connected interfaces.
  */
 
 /*
  * Struct route consists of a destination address (ro_dst),
  * a nexthop pointer (ro_nh), a link-layer entry pointer (ro_lle) and a
  * link-layer prepend data pointer along with its length.
  */
 struct route {
 	struct	nhop_object *ro_nh;
 	struct	llentry *ro_lle;
 	/*
 	 * ro_prepend and ro_plen are only used for bpf to pass in a
 	 * preformed header.  They are not cacheable.
 	 */
 	char		*ro_prepend;
 	uint16_t	ro_plen;
 	uint16_t	ro_flags;
 	uint16_t	ro_mtu;	/* saved ro_rt mtu */
 	uint16_t	spare;
 	struct	sockaddr ro_dst;
 };
 
 #define	RT_L2_ME_BIT		2	/* dst L2 addr is our address */
 #define	RT_MAY_LOOP_BIT		3	/* dst may require loop copy */
 #define	RT_HAS_HEADER_BIT	4	/* mbuf already have its header prepended */
 
 #define	RT_L2_ME		(1 << RT_L2_ME_BIT)		/* 0x0004 */
 #define	RT_MAY_LOOP		(1 << RT_MAY_LOOP_BIT)		/* 0x0008 */
 #define	RT_HAS_HEADER		(1 << RT_HAS_HEADER_BIT)	/* 0x0010 */
 
 #define	RT_REJECT		0x0020		/* Destination is reject */
 #define	RT_BLACKHOLE		0x0040		/* Destination is blackhole */
 #define	RT_HAS_GW		0x0080		/* Destination has GW  */
 #define	RT_LLE_CACHE		0x0100		/* Cache link layer  */
 
 /* Per-route metrics; embedded in struct rt_msghdr as rtm_rmx. */
 struct rt_metrics {
 	u_long	rmx_locks;	/* Kernel must leave these values alone */
 	u_long	rmx_mtu;	/* MTU for this path */
 	u_long	rmx_hopcount;	/* max hops expected */
 	u_long	rmx_expire;	/* lifetime for route, e.g. redirect */
 	u_long	rmx_recvpipe;	/* inbound delay-bandwidth product */
 	u_long	rmx_sendpipe;	/* outbound delay-bandwidth product */
 	u_long	rmx_ssthresh;	/* outbound gateway buffer limit */
 	u_long	rmx_rtt;	/* estimated round trip time */
 	u_long	rmx_rttvar;	/* estimated rtt variance */
 	u_long	rmx_pksent;	/* packets sent using this route */
 	u_long	rmx_weight;	/* route weight */
 	u_long	rmx_nhidx;	/* route nexthop index */
 	u_long	rmx_filler[2];	/* will be used for T/TCP later */
 };
 
 /*
  * rmx_rtt and rmx_rttvar are stored as microseconds;
  * RTTTOPRHZ(rtt) converts to a value suitable for use
  * by a protocol slowtimo counter.
  */
 #define	RTM_RTTUNIT	1000000	/* units for rtt, rttvar, as units per sec */
 #define	RTTTOPRHZ(r)	((r) / (RTM_RTTUNIT / PR_SLOWHZ))
 
 /* lle state is exported in rmx_state rt_metrics field */
 #define	rmx_state	rmx_weight
 
 /*
  * Keep a generation count of routing table, incremented on route addition,
  * so we can invalidate caches.  This is accessed without a lock, as precision
  * is not required.
  */
 typedef volatile u_int rt_gen_t;	/* tree generation (for adds) */
 #define RT_GEN(fibnum, af)	rt_tables_get_gen(fibnum, af)
 
 #define	RT_DEFAULT_FIB	0	/* Explicitly mark fib=0 restricted cases */
 #define	RT_ALL_FIBS	-1	/* Announce event for every fib */
 #ifdef _KERNEL
 extern u_int rt_numfibs;	/* number of usable routing tables */
 VNET_DECLARE(u_int, rt_add_addr_allfibs); /* Announce interfaces to all fibs */
 #define	V_rt_add_addr_allfibs	VNET(rt_add_addr_allfibs)
 #endif
 
 /*
  * We distinguish between routes to hosts and routes to networks,
  * preferring the former if available.  For each route we infer
  * the interface to use from the gateway address supplied when
  * the route was entered.  Routes that forward packets through
  * gateways are marked so that the output routines know to address the
  * gateway rather than the ultimate destination.
  */
 #define	RTF_UP		0x1		/* route usable */
 #define	RTF_GATEWAY	0x2		/* destination is a gateway */
 #define	RTF_HOST	0x4		/* host entry (net otherwise) */
 #define	RTF_REJECT	0x8		/* host or net unreachable */
 #define	RTF_DYNAMIC	0x10		/* created dynamically (by redirect) */
 #define	RTF_MODIFIED	0x20		/* modified dynamically (by redirect) */
 #define RTF_DONE	0x40		/* message confirmed */
 /*			0x80		   unused, was RTF_DELCLONE */
 /*			0x100		   unused, was RTF_CLONING */
 #define RTF_XRESOLVE	0x200		/* external daemon resolves name */
 #define RTF_LLINFO	0x400		/* DEPRECATED - exists ONLY for backward 
 					   compatibility */
 #define RTF_LLDATA	0x400		/* used by apps to add/del L2 entries */
 #define RTF_STATIC	0x800		/* manually added */
 #define RTF_BLACKHOLE	0x1000		/* just discard pkts (during updates) */
 #define RTF_PROTO2	0x4000		/* protocol specific routing flag */
 #define RTF_PROTO1	0x8000		/* protocol specific routing flag */
 /*			0x10000		   unused, was RTF_PRCLONING */
 /*			0x20000		   unused, was RTF_WASCLONED */
 #define RTF_PROTO3	0x40000		/* protocol specific routing flag */
 #define	RTF_FIXEDMTU	0x80000		/* MTU was explicitly specified */
 #define RTF_PINNED	0x100000	/* route is immutable */
 #define	RTF_LOCAL	0x200000 	/* route represents a local address */
 #define	RTF_BROADCAST	0x400000	/* route represents a bcast address */
 #define	RTF_MULTICAST	0x800000	/* route represents a mcast address */
 					/* 0x8000000 and up unassigned */
 #define	RTF_STICKY	 0x10000000	/* always route dst->src */
 
 #define	RTF_RNH_LOCKED	 0x40000000	/* radix node head is locked */
 
 #define	RTF_GWFLAG_COMPAT 0x80000000	/* a compatibility bit for interacting
 					   with existing routing apps */
 
 /* Mask of RTF flags that are allowed to be modified by RTM_CHANGE. */
 #define RTF_FMASK	\
 	(RTF_PROTO1 | RTF_PROTO2 | RTF_PROTO3 | RTF_BLACKHOLE | \
 	 RTF_REJECT | RTF_STATIC | RTF_STICKY)
 
 /*
  * fib_ nexthop API flags.
  */
 
 /* Consumer-visible nexthop info flags */
 #define	NHF_REJECT		0x0010	/* RTF_REJECT */
 #define	NHF_BLACKHOLE		0x0020	/* RTF_BLACKHOLE */
 #define	NHF_REDIRECT		0x0040	/* RTF_DYNAMIC|RTF_MODIFIED */
 #define	NHF_DEFAULT		0x0080	/* Default route */
 #define	NHF_BROADCAST		0x0100	/* RTF_BROADCAST */
 #define	NHF_GATEWAY		0x0200	/* RTF_GATEWAY */
 #define	NHF_HOST		0x0400	/* RTF_HOST */
 
 /* Nexthop request flags */
 #define	NHR_NONE		0x00	/* empty flags field */
 #define	NHR_IFAIF		0x01	/* Return ifa_ifp interface */
 #define	NHR_REF			0x02	/* For future use */
 
 /* uRPF */
 #define	NHR_NODEFAULT		0x04	/* do not consider default route */
 
 /* Control plane route request flags */
 #define	NHR_COPY		0x100	/* Copy rte data */
 
 /*
  * Routing statistics.
  * All counters are 64-bit.
  */
 struct rtstat {
 	uint64_t rts_badredirect;	/* bogus redirect calls */
 	uint64_t rts_dynamic;		/* routes created by redirects */
 	uint64_t rts_newgateway;	/* routes modified by redirects */
 	uint64_t rts_unreach;		/* lookups which failed */
 	uint64_t rts_wildcard;		/* lookups satisfied by a wildcard */
 	uint64_t rts_nh_idx_alloc_failure;	/* nexthop index alloc failure */
 	uint64_t rts_nh_alloc_failure;	/* nexthop allocation failure */
 };
 
 /*
  * Structures for routing messages.
  *
  * struct rt_msghdr is the header referred to as format (1) in the
  * message type list in this file.
  */
 struct rt_msghdr {
 	u_short	rtm_msglen;	/* to skip over non-understood messages */
 	u_char	rtm_version;	/* future binary compatibility */
 	u_char	rtm_type;	/* message type */
 	u_short	rtm_index;	/* index for associated ifp */
 	u_short _rtm_spare1;	/* NOTE(review): looks like an unused spare -- confirm */
 	int	rtm_flags;	/* flags, incl. kern & message, e.g. DONE */
 	int	rtm_addrs;	/* bitmask identifying sockaddrs in msg */
 	pid_t	rtm_pid;	/* identify sender */
 	int	rtm_seq;	/* for sender to identify action */
 	int	rtm_errno;	/* why failed */
 	int	rtm_fmask;	/* bitmask used in RTM_CHANGE message */
 	u_long	rtm_inits;	/* which metrics we are initializing */
 	struct	rt_metrics rtm_rmx; /* metrics themselves */
 };
 
 #define RTM_VERSION	5	/* Up the ante and ignore older versions */
 
 /*
  * Message types.
  *
  * The format for each message is annotated below using the following
  * identifiers:
  *
  * (1) struct rt_msghdr
  * (2) struct ifa_msghdr
  * (3) struct if_msghdr
  * (4) struct ifma_msghdr
  * (5) struct if_announcemsghdr
  *
  */
 #define	RTM_ADD		0x1	/* (1) Add Route */
 #define	RTM_DELETE	0x2	/* (1) Delete Route */
 #define	RTM_CHANGE	0x3	/* (1) Change Metrics or flags */
 #define	RTM_GET		0x4	/* (1) Report Metrics */
 #define	RTM_LOSING	0x5	/* (1) Kernel Suspects Partitioning */
 #define	RTM_REDIRECT	0x6	/* (1) Told to use different route */
 #define	RTM_MISS	0x7	/* (1) Lookup failed on this address */
 #define	RTM_LOCK	0x8	/* (1) fix specified metrics */
 		    /*	0x9  */
 		    /*	0xa  */
 #define	RTM_RESOLVE	0xb	/* (1) req to resolve dst to LL addr */
 #define	RTM_NEWADDR	0xc	/* (2) address being added to iface */
 #define	RTM_DELADDR	0xd	/* (2) address being removed from iface */
 #define	RTM_IFINFO	0xe	/* (3) iface going up/down etc. */
 #define	RTM_NEWMADDR	0xf	/* (4) mcast group membership being added to if */
 #define	RTM_DELMADDR	0x10	/* (4) mcast group membership being deleted */
 #define	RTM_IFANNOUNCE	0x11	/* (5) iface arrival/departure */
 #define	RTM_IEEE80211	0x12	/* (5) IEEE80211 wireless event */
 
 /*
  * Bitmask values for rtm_inits and rmx_locks.
  */
 #define RTV_MTU		0x1	/* init or lock _mtu */
 #define RTV_HOPCOUNT	0x2	/* init or lock _hopcount */
 #define RTV_EXPIRE	0x4	/* init or lock _expire */
 #define RTV_RPIPE	0x8	/* init or lock _recvpipe */
 #define RTV_SPIPE	0x10	/* init or lock _sendpipe */
 #define RTV_SSTHRESH	0x20	/* init or lock _ssthresh */
 #define RTV_RTT		0x40	/* init or lock _rtt */
 #define RTV_RTTVAR	0x80	/* init or lock _rttvar */
 #define RTV_WEIGHT	0x100	/* init or lock _weight */
 
 /*
  * Bitmask values for rtm_addrs.
  */
 #define RTA_DST		0x1	/* destination sockaddr present */
 #define RTA_GATEWAY	0x2	/* gateway sockaddr present */
 #define RTA_NETMASK	0x4	/* netmask sockaddr present */
 #define RTA_GENMASK	0x8	/* cloning mask sockaddr present */
 #define RTA_IFP		0x10	/* interface name sockaddr present */
 #define RTA_IFA		0x20	/* interface addr sockaddr present */
 #define RTA_AUTHOR	0x40	/* sockaddr for author of redirect */
 #define RTA_BRD		0x80	/* for NEWADDR, broadcast or p-p dest addr */
 
 /*
  * Index offsets for sockaddr array for alternate internal encoding.
  */
 #define RTAX_DST	0	/* destination sockaddr present */
 #define RTAX_GATEWAY	1	/* gateway sockaddr present */
 #define RTAX_NETMASK	2	/* netmask sockaddr present */
 #define RTAX_GENMASK	3	/* cloning mask sockaddr present */
 #define RTAX_IFP	4	/* interface name sockaddr present */
 #define RTAX_IFA	5	/* interface addr sockaddr present */
 #define RTAX_AUTHOR	6	/* sockaddr for author of redirect */
 #define RTAX_BRD	7	/* for NEWADDR, broadcast or p-p dest addr */
 #define RTAX_MAX	8	/* size of array to allocate */
 
 struct rtentry;
 struct nhop_object;
 typedef int rt_filter_f_t(const struct rtentry *, const struct nhop_object *,
     void *);
 
 /*
  * Route request/description passed between the routing code and rtsock.
  * rti_info[] is indexed by the RTAX_ constants.
  */
 struct rt_addrinfo {
 	/* NOTE(review): presumably an RTA_ bitmask of the sockaddrs present, not RTF_ flags -- confirm */
 	int	rti_addrs;			/* Route RTA_ address bitmask */
 	int	rti_flags;			/* Route RTF_ flags */
 	struct	sockaddr *rti_info[RTAX_MAX];	/* Sockaddr data */
 	struct	ifaddr *rti_ifa;		/* value of rt_ifa addr */
 	struct	ifnet *rti_ifp;			/* route interface */
 	rt_filter_f_t	*rti_filter;		/* filter function */
 	void	*rti_filterdata;		/* filter parameters */
 	u_long	rti_mflags;			/* metrics RTV_ flags */
 	u_long	rti_spare;			/* Will be used for fib */
 	struct	rt_metrics *rti_rmx;		/* Pointer to route metrics */
 };
 
 /*
  * This macro returns the size of a struct sockaddr when passed
  * through a routing socket. Basically we round up sa_len to
  * a multiple of sizeof(long), with a minimum of sizeof(long).
  * The case sa_len == 0 should only apply to empty structures.
  *
  * The argument is expanded twice, so it must not have side effects.
  */
 #define SA_SIZE(sa)						\
     (  (((struct sockaddr *)(sa))->sa_len == 0) ?		\
 	sizeof(long)		:				\
 	1 + ( (((struct sockaddr *)(sa))->sa_len - 1) | (sizeof(long) - 1) ) )
 
 /*
  * Non-zero when sockaddrs a and b carry the same sa_len and their first
  * sa_len bytes compare equal with bcmp().  Both arguments are expanded
  * more than once, so they must not have side effects.
  */
 #define	sa_equal(a, b) (	\
     (((const struct sockaddr *)(a))->sa_len == ((const struct sockaddr *)(b))->sa_len) && \
     (bcmp((a), (b), ((const struct sockaddr *)(b))->sa_len) == 0))
 
 #ifdef _KERNEL
 
 /*
  * True when ifp does not set IFCAP_LINKSTATE in if_capabilities, or when
  * its if_link_state equals LINK_STATE_UP.
  */
 #define RT_LINK_IS_UP(ifp)	(!((ifp)->if_capabilities & IFCAP_LINKSTATE) \
 				 || (ifp)->if_link_state == LINK_STATE_UP)
 
 /*
  * If _ro holds a nexthop in ro_nh, pass it to NH_FREE() and reset the
  * pointer to NULL.  ro_lle is left untouched.
  */
 #define	RO_NHFREE(_ro) do {					\
 	if ((_ro)->ro_nh) {					\
 		NH_FREE((_ro)->ro_nh);				\
 		(_ro)->ro_nh = NULL;				\
 	}							\
 } while (0)
 
 /*
  * Drop both cached pointers of ro: ro_lle goes to LLE_FREE() and ro_nh to
  * NH_FREE(), each only when non-NULL, and each is reset to NULL afterwards.
  */
 #define	RO_INVALIDATE_CACHE(ro) do {					\
 		if ((ro)->ro_lle != NULL) {				\
 			LLE_FREE((ro)->ro_lle);				\
 			(ro)->ro_lle = NULL;				\
 		}							\
 		if ((ro)->ro_nh != NULL) {				\
 			NH_FREE((ro)->ro_nh);				\
 			(ro)->ro_nh = NULL;				\
 		}							\
 	} while (0)
 
 /*
  * Validate a cached route based on a supplied cookie.  If there is an
  * out-of-date cache, simply free it.  Update the generation number
  * for the new allocation.
  *
  * The current generation is read with RT_GEN() for the family of
  * ro->ro_dst; when it differs from *cookiep the cache is dropped with
  * RO_INVALIDATE_CACHE() and *cookiep is set to the current value.
  * The macro declares a local named `cookie', so arguments must not
  * reference a variable of that name.
  */
 #define NH_VALIDATE(ro, cookiep, fibnum) do {				\
 	rt_gen_t cookie = RT_GEN(fibnum, (ro)->ro_dst.sa_family);	\
 	if (*(cookiep) != cookie) {					\
 		RO_INVALIDATE_CACHE(ro);				\
 		*(cookiep) = cookie;					\
 	}								\
 } while (0)
 
 struct ifmultiaddr;
 struct rib_head;
 
 void	 rt_ieee80211msg(struct ifnet *, int, void *, size_t);
 void	 rt_ifannouncemsg(struct ifnet *, int);
 void	 rt_ifmsg(struct ifnet *);
 void	 rt_missmsg(int, struct rt_addrinfo *, int, int);
 void	 rt_missmsg_fib(int, struct rt_addrinfo *, int, int, int);
 void	 rt_newaddrmsg_fib(int, struct ifaddr *, struct rtentry *, int);
 int	 rt_addrmsg(int, struct ifaddr *, int);
 int	 rt_routemsg(int, struct rtentry *, struct ifnet *ifp, int, int);
 int	 rt_routemsg_info(int, struct rt_addrinfo *, int);
 void	 rt_newmaddrmsg(int, struct ifmultiaddr *);
 void 	 rt_maskedcopy(struct sockaddr *, struct sockaddr *, struct sockaddr *);
 struct rib_head *rt_table_init(int, int, u_int);
 void	rt_table_destroy(struct rib_head *);
 u_int	rt_tables_get_gen(int table, int fam);
 
 int	rtsock_addrmsg(int, struct ifaddr *, int);
 int	rtsock_routemsg(int, struct rtentry *, struct ifnet *ifp, int, int);
 int	rtsock_routemsg_info(int, struct rt_addrinfo *, int);
 
 struct sockaddr *rtsock_fix_netmask(const struct sockaddr *dst,
 	    const struct sockaddr *smask, struct sockaddr_storage *dmask);
 /*
  * Note the following locking behavior:
  *
  *    rtfree() and RTFREE_LOCKED() require a locked rtentry
  *
  *    RTFREE() uses an unlocked entry.
  */
 
 void	 rtfree(struct rtentry *);
 void	 rtfree_func(struct rtentry *);
 void	rt_updatemtu(struct ifnet *);
 
-typedef int rt_walktree_f_t(struct rtentry *, void *);
-typedef void rt_setwarg_t(struct rib_head *, uint32_t, int, void *);
-void	rib_walk_del(u_int fibnum, int family, rt_filter_f_t *filter_f,
-	    void *arg, bool report);
-void	rt_foreach_fib_walk(int af, rt_setwarg_t *, rt_walktree_f_t *, void *);
-void	rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg);
 void	rt_flushifroutes_af(struct ifnet *, int);
 void	rt_flushifroutes(struct ifnet *ifp);
 
 /* XXX MRT COMPAT VERSIONS THAT SET UNIVERSE to 0 */
 /* These are used by old code not yet converted to use multiple FIBs */
 int	 rtinit(struct ifaddr *, int, int);
 
 /* XXX MRT NEW VERSIONS THAT USE FIBs
  * For now the protocol independent versions are the same as the AF_INET ones
  * but this will change.
  */
 int	 rtioctl_fib(u_long, caddr_t, u_int);
 int	 rtrequest_fib(int, struct sockaddr *,
 	    struct sockaddr *, struct sockaddr *, int, struct rtentry **, u_int);
 int	 rtrequest1_fib(int, struct rt_addrinfo *, struct rtentry **, u_int);
 int	rib_lookup_info(uint32_t, const struct sockaddr *, uint32_t, uint32_t,
 	    struct rt_addrinfo *);
 void	rib_free_info(struct rt_addrinfo *info);
-int	rib_add_redirect(u_int fibnum, struct sockaddr *dst,
-	   struct sockaddr *gateway, struct sockaddr *author, struct ifnet *ifp,
-	   int flags, int expire_sec);
 
 /* New API */
-void	rib_walk(int af, u_int fibnum, rt_walktree_f_t *wa_f, void *arg);
 struct nhop_object *rib_lookup(uint32_t fibnum, const struct sockaddr *dst,
 	    uint32_t flags, uint32_t flowid);
 #endif
 
 #endif
Index: head/sys/netinet/in_rmx.c
===================================================================
--- head/sys/netinet/in_rmx.c	(revision 361703)
+++ head/sys/netinet/in_rmx.c	(revision 361704)
@@ -1,201 +1,202 @@
 /*-
  * Copyright 1994, 1995 Massachusetts Institute of Technology
  *
  * Permission to use, copy, modify, and distribute this software and
  * its documentation for any purpose and without fee is hereby
  * granted, provided that both the above copyright notice and this
  * permission notice appear in all copies, that both the above
  * copyright notice and this permission notice appear in all
  * supporting documentation, and that the name of M.I.T. not be used
  * in advertising or publicity pertaining to distribution of the
  * software without specific, written prior permission.  M.I.T. makes
  * no representations about the suitability of this software for any
  * purpose.  It is provided "as is" without express or implied
  * warranty.
  *
  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
  * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mpath.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/socket.h>
 #include <sys/mbuf.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
+#include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 #include <net/route/nhop.h>
 #include <net/route/shared.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_var.h>
 
 extern int	in_inithead(void **head, int off, u_int fibnum);
 #ifdef VIMAGE
 extern int	in_detachhead(void **head, int off);
 #endif
 
 /*
  * AF_INET pre-add hook (installed as rnh_preadd by in_inithead()).
  *
  * Adjusts nexthop nh for the route addr/mask:
  *  - a host route whose destination is a broadcast address on nh_ifp is
  *    flagged RTF_BROADCAST / NHF_BROADCAST;
  *  - nh_mtu is set to the interface MTU when unset or larger than it;
  *  - a non-host route with an all-zero mask gets NHF_DEFAULT;
  *  - a nexthop with no type yet gets the basic IPv4 type, chosen by
  *    NHF_GATEWAY.
  * fibnum is not used.  Always returns 0.
  */
 static int
 rib4_preadd(u_int fibnum, const struct sockaddr *addr, const struct sockaddr *mask,
     struct nhop_object *nh)
 {
 	const struct sockaddr_in *dst4 = (const struct sockaddr_in *)addr;
 	const struct sockaddr_in *netmask4 = (const struct sockaddr_in *)mask;
 	uint16_t basic_type;
 	int rtflags;
 
 	/* XXX: RTF_LOCAL && RTF_MULTICAST */
 
 	rtflags = nhop_get_rtflags(nh);
 
 	/*
 	 * Backward compatibility: explicitly added host routes to a
 	 * broadcast address are still marked as broadcast, a leftover of
 	 * the route-cloning days that some active-active load balancing
 	 * setups may rely on.
 	 */
 	if ((rtflags & RTF_HOST) != 0 &&
 	    in_broadcast(dst4->sin_addr, nh->nh_ifp)) {
 		rtflags |= RTF_BROADCAST;
 		nhop_set_rtflags(nh, rtflags);
 		nh->nh_flags |= NHF_BROADCAST;
 	}
 
 	/* Inherit the interface MTU when unset, clamp to it when too large. */
 	if (nh->nh_mtu == 0 || nh->nh_mtu > nh->nh_ifp->if_mtu)
 		nh->nh_mtu = nh->nh_ifp->if_mtu;
 
 	/* Ensure that default route nhop has special flag */
 	if ((rtflags & RTF_HOST) == 0 && netmask4 != NULL &&
 	    netmask4->sin_addr.s_addr == 0)
 		nh->nh_flags |= NHF_DEFAULT;
 
 	/* Set nhop type to basic per-AF nhop */
 	if (nhop_get_type(nh) == 0) {
 		basic_type = (nh->nh_flags & NHF_GATEWAY) ?
 		    NH_TYPE_IPV4_ETHER_NHOP : NH_TYPE_IPV4_ETHER_RSLV;
 		nhop_set_type(nh, basic_type);
 	}
 
 	return (0);
 }
 
 /*
  * Initialize our routing tree.
  *
  * Creates the AF_INET table for fibnum with rt_table_init(), installs
  * rib4_preadd as its rnh_preadd hook (plus the multipath setup under
  * RADIX_MPATH) and stores the table in *head.  The off argument is not
  * used.
  *
  * Returns 1 on success, 0 when rt_table_init() returns NULL.
  */
 int
 in_inithead(void **head, int off, u_int fibnum)
 {
 	struct rib_head *rh;
 
 	rh = rt_table_init(32, AF_INET, fibnum);
 	if (rh == NULL)
 		return (0);
 
 	rh->rnh_preadd = rib4_preadd;
 #ifdef	RADIX_MPATH
 	rt_mpath_init_rnh(rh);
 #endif
 	*head = (void *)rh;
 
 	return (1);
 }
 
 #ifdef VIMAGE
 /*
  * Tear down the routing table stored in *head by passing it to
  * rt_table_destroy().  The off argument is not used.  Always returns 1.
  */
 int
 in_detachhead(void **head, int off)
 {
 
 	rt_table_destroy((struct rib_head *)(*head));
 	return (1);
 }
 #endif
 
 /*
  * This zaps old routes when the interface goes down or interface
  * address is deleted.  In the latter case, it deletes static routes
  * that point to this address.  If we don't do this, we may end up
  * using the old address in the future.  The ones we always want to
  * get rid of are things like ARP entries, since the user might down
  * the interface, walk over to a completely different network, and
  * plug back in.
  *
  * struct in_ifadown_arg carries the parameters of one such sweep from
  * in_ifadown() to the in_ifadownkill() filter.
  */
 struct in_ifadown_arg {
 	struct ifaddr *ifa;	/* address whose routes are being removed */
 	int del;		/* non-zero: RTF_STATIC routes are removed too */
 };
 
 static int
 in_ifadownkill(const struct rtentry *rt, const struct nhop_object *nh,
     void *xap)
 {
 	struct in_ifadown_arg *ap = xap;
 
 	if (nh->nh_ifa != ap->ifa)
 		return (0);
 
 	if ((nhop_get_rtflags(nh) & RTF_STATIC) != 0 && ap->del == 0)
 		return (0);
 
 	return (1);
 }
 
 void
 in_ifadown(struct ifaddr *ifa, int delete)
 {
 	struct in_ifadown_arg arg;
 
 	KASSERT(ifa->ifa_addr->sa_family == AF_INET,
 	    ("%s: wrong family", __func__));
 
 	arg.ifa = ifa;
 	arg.del = delete;
 
 	rt_foreach_fib_walk_del(AF_INET, in_ifadownkill, &arg);
 	ifa->ifa_flags &= ~IFA_ROUTE;		/* XXXlocking? */
 }
 
Index: head/sys/netinet/ip_icmp.c
===================================================================
--- head/sys/netinet/ip_icmp.c	(revision 361703)
+++ head/sys/netinet/ip_icmp.c	(revision 361704)
@@ -1,1141 +1,1142 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_icmp.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/route.h>
+#include <net/route/route_ctl.h>
 #include <net/route/nhop.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/sctp.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
 #include <netinet/icmp_var.h>
 
 
 #ifdef INET
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 #endif /* INET */
 
 /*
  * ICMP routines: error generation, receive packet processing, and
  * routines to turnaround packets back to the originator, and
  * host table maintenance routines.
  */
 /* Rate limit handed to counter_ratecheck() by badport_bandlim(). */
 VNET_DEFINE_STATIC(int, icmplim) = 200;
 #define	V_icmplim			VNET(icmplim)
 SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmplim), 0,
 	"Maximum number of ICMP responses per second");
 
 /* Gates the log() message emitted by badport_bandlim(). */
 VNET_DEFINE_STATIC(int, icmplim_output) = 1;
 #define	V_icmplim_output		VNET(icmplim_output)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmplim_output), 0,
 	"Enable logging of ICMP response rate limiting");
 
 #ifdef INET
 VNET_PCPUSTAT_DEFINE(struct icmpstat, icmpstat);
 VNET_PCPUSTAT_SYSINIT(icmpstat);
 SYSCTL_VNET_PCPUSTAT(_net_inet_icmp, ICMPCTL_STATS, stats, struct icmpstat,
     icmpstat, "ICMP statistics (struct icmpstat, netinet/icmp_var.h)");
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(icmpstat);
 #endif /* VIMAGE */
 
 /* Checked by the ICMP_MASKREQ case of icmp_input(). */
 VNET_DEFINE_STATIC(int, icmpmaskrepl) = 0;
 #define	V_icmpmaskrepl			VNET(icmpmaskrepl)
 SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmpmaskrepl), 0,
 	"Reply to ICMP Address Mask Request packets");
 
 /* When nonzero, icmp_input() replies with this value as the mask. */
 VNET_DEFINE_STATIC(u_int, icmpmaskfake) = 0;
 #define	V_icmpmaskfake			VNET(icmpmaskfake)
 SYSCTL_UINT(_net_inet_icmp, OID_AUTO, maskfake, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmpmaskfake), 0,
 	"Fake reply to ICMP Address Mask Request packets");
 
 /* Checked by the ICMP_REDIRECT case of icmp_input(); not static. */
 VNET_DEFINE(int, drop_redirect) = 0;
 #define	V_drop_redirect			VNET(drop_redirect)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(drop_redirect), 0,
 	"Ignore ICMP redirects");
 
 VNET_DEFINE_STATIC(int, log_redirect) = 0;
 #define	V_log_redirect			VNET(log_redirect)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(log_redirect), 0,
 	"Log ICMP redirects to the console");
 
 /* Passed to rib_add_redirect() by icmp_input(). */
 VNET_DEFINE_STATIC(int, redirtimeout) = 60 * 10; /* 10 minutes */
 #define	V_redirtimeout			VNET(redirtimeout)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, redirtimeout, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(redirtimeout), 0,
 	"Delay in seconds before expiring redirect route");
 
 /* Interface name looked up with ifunit() by icmp_reflect(). */
 VNET_DEFINE_STATIC(char, reply_src[IFNAMSIZ]);
 #define	V_reply_src			VNET(reply_src)
 SYSCTL_STRING(_net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(reply_src), IFNAMSIZ,
 	"ICMP reply source for non-local packets");
 
 /* Consulted by icmp_reflect() during source address selection. */
 VNET_DEFINE_STATIC(int, icmp_rfi) = 0;
 #define	V_icmp_rfi			VNET(icmp_rfi)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, reply_from_interface, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmp_rfi), 0,
 	"ICMP reply from incoming interface for non-local packets");
 /* Router requirements RFC 1812 section 4.3.2.3 requires 576 - 28. */
 VNET_DEFINE_STATIC(int, icmp_quotelen) = 548;
 #define	V_icmp_quotelen			VNET(icmp_quotelen)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, quotelen, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmp_quotelen), 0,
 	"Number of bytes from original packet to quote in ICMP reply");
 
 /* Checked by icmp_input() for both ICMP_ECHO and ICMP_TSTAMP. */
 VNET_DEFINE_STATIC(int, icmpbmcastecho) = 0;
 #define	V_icmpbmcastecho		VNET(icmpbmcastecho)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmpbmcastecho), 0,
 	"Reply to multicast ICMP Echo Request and Timestamp packets");
 
 VNET_DEFINE_STATIC(int, icmptstamprepl) = 1;
 #define	V_icmptstamprepl		VNET(icmptstamprepl)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, tstamprepl, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmptstamprepl), 0,
 	"Respond to ICMP Timestamp packets");
 
 /* When set, icmp_error() copies the mbuf tag chain to the reply. */
 VNET_DEFINE_STATIC(int, error_keeptags) = 0;
 #define	V_error_keeptags		VNET(error_keeptags)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, error_keeptags, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(error_keeptags), 0,
 	"ICMP error response keeps copy of mbuf_tags of original packet");
 
 /* Run-time switch for the debugging printf()s under ICMPPRINTFS. */
 #ifdef ICMPPRINTFS
 int	icmpprintfs = 0;
 #endif
 
 static void	icmp_reflect(struct mbuf *);
 static void	icmp_send(struct mbuf *, struct mbuf *);
 static int	icmp_verify_redirect_gateway(struct sockaddr_in *,
     struct sockaddr_in *, struct sockaddr_in *, u_int);
 
 extern	struct protosw inetsw[];
 
 /*
  * Kernel module interface for updating icmpstat.  The argument is an index
  * into icmpstat treated as an array of u_long.  While this encodes the
  * general layout of icmpstat into the caller, it doesn't encode its
  * location, so that future changes to add, for example, per-CPU stats
  * support won't cause binary compatibility problems for kernel modules.
  *
  * The selected counter is incremented by one with counter_u64_add();
  * statnum is not range-checked here.
  */
 void
 kmod_icmpstat_inc(int statnum)
 {
 
 	counter_u64_add(VNET(icmpstat)[statnum], 1);
 }
 
 /*
  * Generate an error packet of type error
  * in response to bad packet ip.
  *
  * n	 offending packet; the function finishes with m_freem(n) on every
  *	 path, so the caller must not touch it afterwards.
  * type	 ICMP type of the message to build (asserted <= ICMP_MAXTYPE).
  * code	 ICMP code; for ICMP_PARAMPROB it is stored in icmp_pptr instead
  *	 and the code field is sent as 0.
  * dest	 stored as the gateway address when type is ICMP_REDIRECT.
  * mtu	 when nonzero, stored as the next-hop MTU for
  *	 ICMP_UNREACH/ICMP_UNREACH_NEEDFRAG.
  *
  * The reply is handed to icmp_reflect(); nothing is returned.
  */
 void
 icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu)
 {
 	struct ip *oip, *nip;
 	struct icmp *icp;
 	struct mbuf *m;
 	unsigned icmplen, icmpelen, nlen, oiphlen;
 
 	KASSERT((u_int)type <= ICMP_MAXTYPE, ("%s: illegal ICMP type",
 	    __func__));
 
 	if (type != ICMP_REDIRECT)
 		ICMPSTAT_INC(icps_error);
 	/*
 	 * Don't send error:
 	 *  if the original packet was encrypted.
 	 *  if not the first fragment of message.
 	 *  in response to a multicast or broadcast packet.
 	 *  if the old packet protocol was an ICMP error message.
 	 */
 	if (n->m_flags & M_DECRYPTED)
 		goto freeit;
 	if (n->m_flags & (M_BCAST|M_MCAST))
 		goto freeit;
 
 	/* Drop if IP header plus 8 bytes is not contiguous in first mbuf. */
 	if (n->m_len < sizeof(struct ip) + ICMP_MINLEN)
 		goto freeit;
 	oip = mtod(n, struct ip *);
 	oiphlen = oip->ip_hl << 2;
 	if (n->m_len < oiphlen + ICMP_MINLEN)
 		goto freeit;
 #ifdef ICMPPRINTFS
 	if (icmpprintfs)
 		printf("icmp_error(%p, %x, %d)\n", oip, type, code);
 #endif
 	/* Any bit other than MF/DF set in ip_off: drop. */
 	if (oip->ip_off & htons(~(IP_MF|IP_DF)))
 		goto freeit;
 	if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT &&
 	    !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip +
 		oiphlen))->icmp_type)) {
 		ICMPSTAT_INC(icps_oldicmp);
 		goto freeit;
 	}
 	/*
 	 * Calculate length to quote from original packet and
 	 * prevent the ICMP mbuf from overflowing.
 	 * Unfortunately this is non-trivial since ip_forward()
 	 * sends us truncated packets.
 	 */
 	nlen = m_length(n, NULL);
 	if (oip->ip_p == IPPROTO_TCP) {
 		struct tcphdr *th;
 		int tcphlen;
 
 		if (oiphlen + sizeof(struct tcphdr) > n->m_len &&
 		    n->m_next == NULL)
 			goto stdreply;
 		if (n->m_len < oiphlen + sizeof(struct tcphdr) &&
 		    (n = m_pullup(n, oiphlen + sizeof(struct tcphdr))) == NULL)
 			goto freeit;
 		/* n may have changed in m_pullup(); reload the header pointer. */
 		oip = mtod(n, struct ip *);
 		th = mtodo(n, oiphlen);
 		tcphlen = th->th_off << 2;
 		if (tcphlen < sizeof(struct tcphdr))
 			goto freeit;
 		if (ntohs(oip->ip_len) < oiphlen + tcphlen)
 			goto freeit;
 		if (oiphlen + tcphlen > n->m_len && n->m_next == NULL)
 			goto stdreply;
 		if (n->m_len < oiphlen + tcphlen &&
 		    (n = m_pullup(n, oiphlen + tcphlen)) == NULL)
 			goto freeit;
 		oip = mtod(n, struct ip *);
 		icmpelen = max(tcphlen, min(V_icmp_quotelen,
 		    ntohs(oip->ip_len) - oiphlen));
 	} else if (oip->ip_p == IPPROTO_SCTP) {
 		struct sctphdr *sh;
 		struct sctp_chunkhdr *ch;
 
 		if (ntohs(oip->ip_len) < oiphlen + sizeof(struct sctphdr))
 			goto stdreply;
 		if (oiphlen + sizeof(struct sctphdr) > n->m_len &&
 		    n->m_next == NULL)
 			goto stdreply;
 		if (n->m_len < oiphlen + sizeof(struct sctphdr) &&
 		    (n = m_pullup(n, oiphlen + sizeof(struct sctphdr))) == NULL)
 			goto freeit;
 		oip = mtod(n, struct ip *);
 		icmpelen = max(sizeof(struct sctphdr),
 		    min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen));
 		sh = mtodo(n, oiphlen);
 		if (ntohl(sh->v_tag) == 0 &&
 		    ntohs(oip->ip_len) >= oiphlen +
 		    sizeof(struct sctphdr) + 8 &&
 		    (n->m_len >= oiphlen + sizeof(struct sctphdr) + 8 ||
 		     n->m_next != NULL)) {
 			if (n->m_len < oiphlen + sizeof(struct sctphdr) + 8 &&
 			    (n = m_pullup(n, oiphlen +
 			    sizeof(struct sctphdr) + 8)) == NULL)
 				goto freeit;
 			oip = mtod(n, struct ip *);
 			sh = mtodo(n, oiphlen);
 			ch = (struct sctp_chunkhdr *)(sh + 1);
 			if (ch->chunk_type == SCTP_INITIATION) {
 				icmpelen = max(sizeof(struct sctphdr) + 8,
 				    min(V_icmp_quotelen, ntohs(oip->ip_len) -
 				    oiphlen));
 			}
 		}
 	} else
 stdreply:	icmpelen = max(8, min(V_icmp_quotelen, ntohs(oip->ip_len) -
 		    oiphlen));
 
 	icmplen = min(oiphlen + icmpelen, nlen);
 	if (icmplen < sizeof(struct ip))
 		goto freeit;
 
 	/* Use a plain header mbuf when the reply fits, a cluster otherwise. */
 	if (MHLEN > sizeof(struct ip) + ICMP_MINLEN + icmplen)
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 	else
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 	if (m == NULL)
 		goto freeit;
 #ifdef MAC
 	mac_netinet_icmp_reply(n, m);
 #endif
 	icmplen = min(icmplen, M_TRAILINGSPACE(m) -
 	    sizeof(struct ip) - ICMP_MINLEN);
 	m_align(m, sizeof(struct ip) + ICMP_MINLEN + icmplen);
 	/* Point at the ICMP header for now; the IP header is added below. */
 	m->m_data += sizeof(struct ip);
 	m->m_len = ICMP_MINLEN + icmplen;
 
 	/* XXX MRT  make the outgoing packet use the same FIB
 	 * that was associated with the incoming packet
 	 */
 	M_SETFIB(m, M_GETFIB(n));
 	icp = mtod(m, struct icmp *);
 	ICMPSTAT_INC(icps_outhist[type]);
 	icp->icmp_type = type;
 	if (type == ICMP_REDIRECT)
 		icp->icmp_gwaddr.s_addr = dest;
 	else {
 		icp->icmp_void = 0;
 		/*
 		 * The following assignments assume an overlay with the
 		 * just zeroed icmp_void field.
 		 */
 		if (type == ICMP_PARAMPROB) {
 			icp->icmp_pptr = code;
 			code = 0;
 		} else if (type == ICMP_UNREACH &&
 			code == ICMP_UNREACH_NEEDFRAG && mtu) {
 			icp->icmp_nextmtu = htons(mtu);
 		}
 	}
 	icp->icmp_code = code;
 
 	/*
 	 * Copy the quotation into ICMP message and
 	 * convert quoted IP header back to network representation.
 	 */
 	m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip);
 	nip = &icp->icmp_ip;
 
 	/*
 	 * Set up ICMP message mbuf and copy old IP header (without options)
 	 * in front of ICMP message.
 	 * If the original mbuf was meant to bypass the firewall, the error
 	 * reply should bypass as well.
 	 */
 	m->m_flags |= n->m_flags & M_SKIP_FIREWALL;
 	KASSERT(M_LEADINGSPACE(m) >= sizeof(struct ip),
 	    ("insufficient space for ip header"));
 	m->m_data -= sizeof(struct ip);
 	m->m_len += sizeof(struct ip);
 	m->m_pkthdr.len = m->m_len;
 	m->m_pkthdr.rcvif = n->m_pkthdr.rcvif;
 	nip = mtod(m, struct ip *);
 	bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip));
 	nip->ip_len = htons(m->m_len);
 	nip->ip_v = IPVERSION;
 	nip->ip_hl = 5;
 	nip->ip_p = IPPROTO_ICMP;
 	nip->ip_tos = 0;
 	nip->ip_off = 0;
 
 	if (V_error_keeptags)
 		m_tag_copy_chain(m, n, M_NOWAIT);
 
 	icmp_reflect(m);
 
 	/* Reached on success as well: the original packet is always released. */
 freeit:
 	m_freem(n);
 }
 
 /*
  * Process a received ICMP message.
  *
  * mp	 in/out mbuf pointer; cleared on entry and only set again just
  *	 before the packet is passed to rip_input().
  * offp	 offset of the ICMP header, i.e. the IP header length.
  * proto forwarded unchanged to rip_input().
  *
  * Every return path returns IPPROTO_DONE.  Depending on the message type
  * the packet is delivered to a protocol's pr_ctlinput handler, turned
  * into a reply via icmp_reflect(), used to install a redirect route,
  * passed to rip_input(), or freed.
  */
 int
 icmp_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct icmp *icp;
 	struct in_ifaddr *ia;
 	struct mbuf *m = *mp;
 	struct ip *ip = mtod(m, struct ip *);
 	struct sockaddr_in icmpsrc, icmpdst, icmpgw;
 	int hlen = *offp;
 	int icmplen = ntohs(ip->ip_len) - *offp;
 	int i, code;
 	void (*ctlfunc)(int, struct sockaddr *, void *);
 	int fibnum;
 
 	NET_EPOCH_ASSERT();
 
 	*mp = NULL;
 
 	/*
 	 * Locate icmp structure in mbuf, and check
 	 * that not corrupted and of at least minimum length.
 	 */
 #ifdef ICMPPRINTFS
 	if (icmpprintfs) {
 		char srcbuf[INET_ADDRSTRLEN];
 		char dstbuf[INET_ADDRSTRLEN];
 
 		printf("icmp_input from %s to %s, len %d\n",
 		    inet_ntoa_r(ip->ip_src, srcbuf),
 		    inet_ntoa_r(ip->ip_dst, dstbuf), icmplen);
 	}
 #endif
 	if (icmplen < ICMP_MINLEN) {
 		ICMPSTAT_INC(icps_tooshort);
 		goto freeit;
 	}
 	i = hlen + min(icmplen, ICMP_ADVLENMIN);
 	if (m->m_len < i && (m = m_pullup(m, i)) == NULL)  {
 		ICMPSTAT_INC(icps_tooshort);
 		return (IPPROTO_DONE);
 	}
 	ip = mtod(m, struct ip *);
 	/* Temporarily skip the IP header so in_cksum() covers only ICMP. */
 	m->m_len -= hlen;
 	m->m_data += hlen;
 	icp = mtod(m, struct icmp *);
 	if (in_cksum(m, icmplen)) {
 		ICMPSTAT_INC(icps_checksum);
 		goto freeit;
 	}
 	m->m_len += hlen;
 	m->m_data -= hlen;
 
 #ifdef ICMPPRINTFS
 	if (icmpprintfs)
 		printf("icmp_input, type %d code %d\n", icp->icmp_type,
 		    icp->icmp_code);
 #endif
 
 	/*
 	 * Message type specific processing.
 	 */
 	if (icp->icmp_type > ICMP_MAXTYPE)
 		goto raw;
 
 	/* Initialize */
 	bzero(&icmpsrc, sizeof(icmpsrc));
 	icmpsrc.sin_len = sizeof(struct sockaddr_in);
 	icmpsrc.sin_family = AF_INET;
 	bzero(&icmpdst, sizeof(icmpdst));
 	icmpdst.sin_len = sizeof(struct sockaddr_in);
 	icmpdst.sin_family = AF_INET;
 	bzero(&icmpgw, sizeof(icmpgw));
 	icmpgw.sin_len = sizeof(struct sockaddr_in);
 	icmpgw.sin_family = AF_INET;
 
 	ICMPSTAT_INC(icps_inhist[icp->icmp_type]);
 	code = icp->icmp_code;
 	switch (icp->icmp_type) {
 
 	case ICMP_UNREACH:
 		/* Translate the ICMP code into the matching PRC_* code. */
 		switch (code) {
 			case ICMP_UNREACH_NET:
 			case ICMP_UNREACH_HOST:
 			case ICMP_UNREACH_SRCFAIL:
 			case ICMP_UNREACH_NET_UNKNOWN:
 			case ICMP_UNREACH_HOST_UNKNOWN:
 			case ICMP_UNREACH_ISOLATED:
 			case ICMP_UNREACH_TOSNET:
 			case ICMP_UNREACH_TOSHOST:
 			case ICMP_UNREACH_HOST_PRECEDENCE:
 			case ICMP_UNREACH_PRECEDENCE_CUTOFF:
 				code = PRC_UNREACH_NET;
 				break;
 
 			case ICMP_UNREACH_NEEDFRAG:
 				code = PRC_MSGSIZE;
 				break;
 
 			/*
 			 * RFC 1122, Sections 3.2.2.1 and 4.2.3.9.
 			 * Treat subcodes 2,3 as immediate RST
 			 */
 			case ICMP_UNREACH_PROTOCOL:
 				code = PRC_UNREACH_PROTOCOL;
 				break;
 			case ICMP_UNREACH_PORT:
 				code = PRC_UNREACH_PORT;
 				break;
 
 			case ICMP_UNREACH_NET_PROHIB:
 			case ICMP_UNREACH_HOST_PROHIB:
 			case ICMP_UNREACH_FILTER_PROHIB:
 				code = PRC_UNREACH_ADMIN_PROHIB;
 				break;
 
 			default:
 				goto badcode;
 		}
 		goto deliver;
 
 	case ICMP_TIMXCEED:
 		if (code > 1)
 			goto badcode;
 		code += PRC_TIMXCEED_INTRANS;
 		goto deliver;
 
 	case ICMP_PARAMPROB:
 		if (code > 1)
 			goto badcode;
 		code = PRC_PARAMPROB;
 	deliver:
 		/*
 		 * Problem with datagram; advise higher level routines.
 		 */
 		if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) ||
 		    icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) {
 			ICMPSTAT_INC(icps_badlen);
 			goto freeit;
 		}
 		/* Discard ICMP's in response to multicast packets */
 		if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr)))
 			goto badcode;
 #ifdef ICMPPRINTFS
 		if (icmpprintfs)
 			printf("deliver to protocol %d\n", icp->icmp_ip.ip_p);
 #endif
 		icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
 		/*
 		 * XXX if the packet contains [IPv4 AH TCP], we can't make a
 		 * notification to TCP layer.
 		 */
 		i = sizeof(struct ip) + min(icmplen, ICMP_ADVLENPREF(icp));
 		ip_stripoptions(m);
 		if (m->m_len < i && (m = m_pullup(m, i)) == NULL) {
 			/* This should actually not happen */
 			ICMPSTAT_INC(icps_tooshort);
 			return (IPPROTO_DONE);
 		}
 		ip = mtod(m, struct ip *);
 		icp = (struct icmp *)(ip + 1);
 		/*
 		 * The upper layer handler can rely on:
 		 * - The outer IP header has no options.
 		 * - The outer IP header, the ICMP header, the inner IP header,
 		 *   and the first n bytes of the inner payload are contiguous.
 		 *   n is at least 8, but might be larger based on
 		 *   ICMP_ADVLENPREF. See its definition in ip_icmp.h.
 		 */
 		ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput;
 		if (ctlfunc)
 			(*ctlfunc)(code, (struct sockaddr *)&icmpsrc,
 				   (void *)&icp->icmp_ip);
 		break;
 
 	badcode:
 		ICMPSTAT_INC(icps_badcode);
 		break;
 
 	case ICMP_ECHO:
 		if (!V_icmpbmcastecho
 		    && (m->m_flags & (M_MCAST | M_BCAST)) != 0) {
 			ICMPSTAT_INC(icps_bmcastecho);
 			break;
 		}
 		if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0)
 			goto freeit;
 		icp->icmp_type = ICMP_ECHOREPLY;
 		goto reflect;
 
 	case ICMP_TSTAMP:
 		if (V_icmptstamprepl == 0)
 			break;
 		if (!V_icmpbmcastecho
 		    && (m->m_flags & (M_MCAST | M_BCAST)) != 0) {
 			ICMPSTAT_INC(icps_bmcasttstamp);
 			break;
 		}
 		if (icmplen < ICMP_TSLEN) {
 			ICMPSTAT_INC(icps_badlen);
 			break;
 		}
 		if (badport_bandlim(BANDLIM_ICMP_TSTAMP) < 0)
 			goto freeit;
 		icp->icmp_type = ICMP_TSTAMPREPLY;
 		icp->icmp_rtime = iptime();
 		icp->icmp_ttime = icp->icmp_rtime;	/* bogus, do later! */
 		goto reflect;
 
 	case ICMP_MASKREQ:
 		if (V_icmpmaskrepl == 0)
 			break;
 		/*
 		 * We are not able to respond with all ones broadcast
 		 * unless we receive it over a point-to-point interface.
 		 */
 		if (icmplen < ICMP_MASKLEN)
 			break;
 		switch (ip->ip_dst.s_addr) {
 
 		case INADDR_BROADCAST:
 		case INADDR_ANY:
 			icmpdst.sin_addr = ip->ip_src;
 			break;
 
 		default:
 			icmpdst.sin_addr = ip->ip_dst;
 		}
 		ia = (struct in_ifaddr *)ifaof_ifpforaddr(
 			    (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif);
 		if (ia == NULL)
 			break;
 		if (ia->ia_ifp == NULL)
 			break;
 		icp->icmp_type = ICMP_MASKREPLY;
 		if (V_icmpmaskfake == 0)
 			icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr;
 		else
 			icp->icmp_mask = V_icmpmaskfake;
 		if (ip->ip_src.s_addr == 0) {
 			if (ia->ia_ifp->if_flags & IFF_BROADCAST)
 			    ip->ip_src = satosin(&ia->ia_broadaddr)->sin_addr;
 			else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT)
 			    ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr;
 		}
 		/* Shared tail for the ECHO, TSTAMP and MASKREQ replies. */
 reflect:
 		ICMPSTAT_INC(icps_reflect);
 		ICMPSTAT_INC(icps_outhist[icp->icmp_type]);
 		icmp_reflect(m);
 		return (IPPROTO_DONE);
 
 	case ICMP_REDIRECT:
 		if (V_log_redirect) {
 			u_long src, dst, gw;
 
 			src = ntohl(ip->ip_src.s_addr);
 			dst = ntohl(icp->icmp_ip.ip_dst.s_addr);
 			gw = ntohl(icp->icmp_gwaddr.s_addr);
 			printf("icmp redirect from %d.%d.%d.%d: "
 			       "%d.%d.%d.%d => %d.%d.%d.%d\n",
 			       (int)(src >> 24), (int)((src >> 16) & 0xff),
 			       (int)((src >> 8) & 0xff), (int)(src & 0xff),
 			       (int)(dst >> 24), (int)((dst >> 16) & 0xff),
 			       (int)((dst >> 8) & 0xff), (int)(dst & 0xff),
 			       (int)(gw >> 24), (int)((gw >> 16) & 0xff),
 			       (int)((gw >> 8) & 0xff), (int)(gw & 0xff));
 		}
 		/*
 		 * RFC1812 says we must ignore ICMP redirects if we
 		 * are acting as router.
 		 */
 		if (V_drop_redirect || V_ipforwarding)
 			break;
 		if (code > 3)
 			goto badcode;
 		if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) ||
 		    icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) {
 			ICMPSTAT_INC(icps_badlen);
 			break;
 		}
 		/*
 		 * Short circuit routing redirects to force
 		 * immediate change in the kernel's routing
 		 * tables.  The message is also handed to anyone
 		 * listening on a raw socket (e.g. the routing
 		 * daemon for use in updating its tables).
 		 */
 		icmpgw.sin_addr = ip->ip_src;
 		icmpdst.sin_addr = icp->icmp_gwaddr;
 #ifdef	ICMPPRINTFS
 		if (icmpprintfs) {
 			char dstbuf[INET_ADDRSTRLEN];
 			char gwbuf[INET_ADDRSTRLEN];
 
 			printf("redirect dst %s to %s\n",
 			       inet_ntoa_r(icp->icmp_ip.ip_dst, dstbuf),
 			       inet_ntoa_r(icp->icmp_gwaddr, gwbuf));
 		}
 #endif
 		icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
 
 		/*
 		 * RFC 1122 says network (code 0,2) redirects SHOULD
 		 * be treated identically to the host redirects.
 		 * Given that, ignore network masks.
 		 */
 
 		/*
 		 * Variable values:
 		 * icmpsrc: route destination
 		 * icmpdst: route gateway
 		 * icmpgw: message source
 		 */
 
 		if (icmp_verify_redirect_gateway(&icmpgw, &icmpsrc, &icmpdst,
 		    M_GETFIB(m)) != 0) {
 			/* TODO: increment bad redirects here */
 			break;
 		}
 
 		/* The redirect is added to every FIB, not just the packet's. */
 		for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) {
 			rib_add_redirect(fibnum, (struct sockaddr *)&icmpsrc,
 			    (struct sockaddr *)&icmpdst,
 			    (struct sockaddr *)&icmpgw, m->m_pkthdr.rcvif,
 			    RTF_GATEWAY, V_redirtimeout);
 		}
 		pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc);
 		break;
 
 	/*
 	 * No kernel processing for the following;
 	 * just fall through to send to raw listener.
 	 */
 	case ICMP_ECHOREPLY:
 	case ICMP_ROUTERADVERT:
 	case ICMP_ROUTERSOLICIT:
 	case ICMP_TSTAMPREPLY:
 	case ICMP_IREQREPLY:
 	case ICMP_MASKREPLY:
 	case ICMP_SOURCEQUENCH:
 	default:
 		break;
 	}
 
 	/* Hand the (possibly re-pulled) mbuf back through *mp for rip_input(). */
 raw:
 	*mp = m;
 	rip_input(mp, offp, proto);
 	return (IPPROTO_DONE);
 
 freeit:
 	m_freem(m);
 	return (IPPROTO_DONE);
 }
 
 /*
  * Reflect the ip packet back to the source
  *
  * The destination becomes the old source address.  A new source address
  * is picked by the sequence of checks below, ending with a route lookup
  * towards the new destination.  Packets with a multicast, experimental
  * or zero-net source, and packets for which no source could be selected
  * because the route lookup failed, are freed and counted.  Selected IP
  * options (RR, TS, SECURITY) are copied into a separate options mbuf
  * before the packet is passed to icmp_send().
  */
 static void
 icmp_reflect(struct mbuf *m)
 {
 	struct rm_priotracker in_ifa_tracker;
 	struct ip *ip = mtod(m, struct ip *);
 	struct ifaddr *ifa;
 	struct ifnet *ifp;
 	struct in_ifaddr *ia;
 	struct in_addr t;
 	struct nhop_object *nh;
 	struct mbuf *opts = NULL;
 	int optlen = (ip->ip_hl << 2) - sizeof(struct ip);
 
 	NET_EPOCH_ASSERT();
 
 	if (IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
 	    IN_EXPERIMENTAL(ntohl(ip->ip_src.s_addr)) ||
 	    IN_ZERONET(ntohl(ip->ip_src.s_addr)) ) {
 		m_freem(m);	/* Bad return address */
 		ICMPSTAT_INC(icps_badaddr);
 		goto done;	/* Ip_output() will check for broadcast */
 	}
 
 	/* t holds the original destination while a reply source is chosen. */
 	t = ip->ip_dst;
 	ip->ip_dst = ip->ip_src;
 
 	/*
 	 * Source selection for ICMP replies:
 	 *
 	 * If the incoming packet was addressed directly to one of our
 	 * own addresses, use dst as the src for the reply.
 	 */
 	IN_IFADDR_RLOCK(&in_ifa_tracker);
 	LIST_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash) {
 		if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) {
 			t = IA_SIN(ia)->sin_addr;
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 			goto match;
 		}
 	}
 	IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 
 	/*
 	 * If the incoming packet was addressed to one of our broadcast
 	 * addresses, use the first non-broadcast address which corresponds
 	 * to the incoming interface.
 	 */
 	ifp = m->m_pkthdr.rcvif;
 	if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) {
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			ia = ifatoia(ifa);
 			if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
 			    t.s_addr) {
 				t = IA_SIN(ia)->sin_addr;
 				goto match;
 			}
 		}
 	}
 	/*
 	 * If the packet was transiting through us, use the address of
 	 * the interface the packet came through in.  If that interface
 	 * doesn't have a suitable IP address, the normal selection
 	 * criteria apply.
 	 */
 	if (V_icmp_rfi && ifp != NULL) {
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			ia = ifatoia(ifa);
 			t = IA_SIN(ia)->sin_addr;
 			goto match;
 		}
 	}
 	/*
 	 * If the incoming packet was not addressed directly to us, use
 	 * designated interface for icmp replies specified by sysctl
 	 * net.inet.icmp.reply_src (default not set). Otherwise continue
 	 * with normal source selection.
 	 */
 	if (V_reply_src[0] != '\0' && (ifp = ifunit(V_reply_src))) {
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			ia = ifatoia(ifa);
 			t = IA_SIN(ia)->sin_addr;
 			goto match;
 		}
 	}
 	/*
 	 * If the packet was transiting through us, use the address of
 	 * the interface that is the closest to the packet source.
 	 * When we don't have a route back to the packet source, stop here
 	 * and drop the packet.
 	 */
 	nh = fib4_lookup(M_GETFIB(m), ip->ip_dst, 0, NHR_NONE, 0);
 	if (nh == NULL) {
 		m_freem(m);
 		ICMPSTAT_INC(icps_noroute);
 		goto done;
 	}
 	t = IA_SIN(ifatoia(nh->nh_ifa))->sin_addr;
 match:
 #ifdef MAC
 	mac_netinet_icmp_replyinplace(m);
 #endif
 	ip->ip_src = t;
 	ip->ip_ttl = V_ip_defttl;
 
 	if (optlen > 0) {
 		u_char *cp;
 		int opt, cnt;
 		u_int len;
 
 		/*
 		 * Retrieve any source routing from the incoming packet;
 		 * add on any record-route or timestamp options.
 		 */
 		cp = (u_char *) (ip + 1);
 		if ((opts = ip_srcroute(m)) == NULL &&
 		    (opts = m_gethdr(M_NOWAIT, MT_DATA))) {
 			opts->m_len = sizeof(struct in_addr);
 			mtod(opts, struct in_addr *)->s_addr = 0;
 		}
 		if (opts) {
 #ifdef ICMPPRINTFS
 		    if (icmpprintfs)
 			    printf("icmp_reflect optlen %d rt %d => ",
 				optlen, opts->m_len);
 #endif
 		    /* Walk the options; stop at EOL or a malformed length. */
 		    for (cnt = optlen; cnt > 0; cnt -= len, cp += len) {
 			    opt = cp[IPOPT_OPTVAL];
 			    if (opt == IPOPT_EOL)
 				    break;
 			    if (opt == IPOPT_NOP)
 				    len = 1;
 			    else {
 				    if (cnt < IPOPT_OLEN + sizeof(*cp))
 					    break;
 				    len = cp[IPOPT_OLEN];
 				    if (len < IPOPT_OLEN + sizeof(*cp) ||
 				        len > cnt)
 					    break;
 			    }
 			    /*
 			     * Should check for overflow, but it "can't happen"
 			     */
 			    if (opt == IPOPT_RR || opt == IPOPT_TS ||
 				opt == IPOPT_SECURITY) {
 				    bcopy((caddr_t)cp,
 					mtod(opts, caddr_t) + opts->m_len, len);
 				    opts->m_len += len;
 			    }
 		    }
 		    /* Terminate & pad, if necessary */
 		    cnt = opts->m_len % 4;
 		    if (cnt) {
 			    for (; cnt < 4; cnt++) {
 				    *(mtod(opts, caddr_t) + opts->m_len) =
 					IPOPT_EOL;
 				    opts->m_len++;
 			    }
 		    }
 #ifdef ICMPPRINTFS
 		    if (icmpprintfs)
 			    printf("%d\n", opts->m_len);
 #endif
 		}
 		ip_stripoptions(m);
 	}
 	m_tag_delete_nonpersistent(m);
 	m->m_flags &= ~(M_BCAST|M_MCAST);
 	icmp_send(m, opts);
 	/* The options mbuf, if one was built, is released here in all cases. */
 done:
 	if (opts)
 		(void)m_free(opts);
 }
 
 /*
  * Decide whether an ICMP redirect may be acted upon (RFC 1122).
  *
  * @src: sockaddr with address of redirect originator
  * @dst: sockaddr with destination in question
  * @gateway: new proposed gateway
  * @fibnum: FIB used for the interface and route lookups
  *
  * Returns 0 when the redirect is acceptable, otherwise one of
  * ENETUNREACH, EHOSTUNREACH, EINVAL or EEXIST.
  */
 static int
 icmp_verify_redirect_gateway(struct sockaddr_in *src, struct sockaddr_in *dst,
     struct sockaddr_in *gateway, u_int fibnum)
 {
 	struct sockaddr *gw_sa = (struct sockaddr *)gateway;
 	struct nhop_object *cur;
 	struct ifaddr *gw_ifa;
 
 	NET_EPOCH_ASSERT();
 
 	/* Verify the gateway is directly reachable. */
 	gw_ifa = ifa_ifwithnet(gw_sa, 0, fibnum);
 	if (gw_ifa == NULL)
 		return (ENETUNREACH);
 
 	/* TODO: fib-aware. */
 	if (ifa_ifwithaddr_check(gw_sa) != 0)
 		return (EHOSTUNREACH);
 
 	/* Current nexthop towards the destination being redirected. */
 	cur = fib4_lookup(fibnum, dst->sin_addr, 0, NHR_NONE, 0);
 	if (cur == NULL)
 		return (EINVAL);
 
 	/*
 	 * If the redirect isn't from our current router for this dst,
 	 * it's either old or wrong.  If it redirects us to ourselves,
 	 * we have a routing loop, perhaps as a result of an interface
 	 * going down recently.
 	 */
 	if (!sa_equal((struct sockaddr *)src, &cur->gw_sa) ||
 	    (cur->nh_ifa != gw_ifa &&
 	    gw_ifa->ifa_addr->sa_family != AF_LINK))
 		return (EINVAL);
 
 	/*
 	 * Ignore the redirect if a host route already exists or if the
 	 * prefix is directly reachable.
 	 */
 	if ((cur->nh_flags & NHF_HOST) != 0 ||
 	    (cur->nh_flags & NHF_GATEWAY) == 0)
 		return (EEXIST);
 
 	return (0);
 }
 
 
 /*
  * Finish an outgoing ICMP packet and pass it to ip_output().
  *
  * The ICMP checksum is computed over the bytes that follow the IP header
  * (ip_len minus the header length), the receive interface is cleared and
  * the packet is sent together with the optional IP options mbuf `opts'.
  */
 static void
 icmp_send(struct mbuf *m, struct mbuf *opts)
 {
 	struct ip *ip;
 	struct icmp *icp;
 	int iphlen;
 
 	ip = mtod(m, struct ip *);
 	iphlen = ip->ip_hl << 2;
 
 	/* Step over the IP header so that in_cksum() starts at the ICMP part. */
 	m->m_len -= iphlen;
 	m->m_data += iphlen;
 	icp = mtod(m, struct icmp *);
 	icp->icmp_cksum = 0;
 	icp->icmp_cksum = in_cksum(m, ntohs(ip->ip_len) - iphlen);
 
 	/* Expose the IP header again. */
 	m->m_len += iphlen;
 	m->m_data -= iphlen;
 
 	m->m_pkthdr.rcvif = NULL;
 #ifdef ICMPPRINTFS
 	if (icmpprintfs) {
 		char srcbuf[INET_ADDRSTRLEN];
 		char dstbuf[INET_ADDRSTRLEN];
 
 		printf("icmp_send dst %s src %s\n",
 		    inet_ntoa_r(ip->ip_dst, dstbuf),
 		    inet_ntoa_r(ip->ip_src, srcbuf));
 	}
 #endif
 	(void) ip_output(m, opts, NULL, 0, NULL, NULL);
 }
 
 /*
  * Milliseconds elapsed since 00:00 UTC of the current day, as reported
  * by getmicrotime(), converted with htonl() for use on the wire.
  */
 uint32_t
 iptime(void)
 {
 	struct timeval now;
 	u_long msec;
 
 	getmicrotime(&now);
 
 	/* Whole seconds into the day, then the sub-second part. */
 	msec = (now.tv_sec % (24*60*60)) * 1000;
 	msec += now.tv_usec / 1000;
 
 	return (htonl(msec));
 }
 
 /*
  * Step to a neighbouring MTU plateau (table from RFC 1191).
  *
  * With dir >= 0 the largest plateau strictly below mtu is returned.
  * With dir < 0 the smallest plateau strictly above mtu is returned; an
  * mtu equal to the top plateau maps to itself.  In either direction 0 is
  * returned when no such plateau exists.
  */
 int
 ip_next_mtu(int mtu, int dir)
 {
 	static int mtutab[] = {
 		65535, 32000, 17914, 8166, 4352, 2002, 1492, 1280, 1006, 508,
 		296, 68, 0
 	};
 	const int nplateaus = (sizeof mtutab) / (sizeof mtutab[0]);
 	int idx;
 
 	if (dir < 0) {
 		/* The table is descending: scan upwards from its small end. */
 		idx = nplateaus;
 		while (idx-- > 0) {
 			if (mtutab[idx] > mtu)
 				return (mtutab[idx]);
 		}
 		/* Nothing is larger; only the top plateau maps to itself. */
 		return (mtu == mtutab[0] ? mtutab[0] : 0);
 	}
 
 	/* Scan downwards from the large end for the first smaller entry. */
 	for (idx = 0; idx < nplateaus; idx++) {
 		if (mtutab[idx] < mtu)
 			return (mtutab[idx]);
 	}
 	return (0);
 }
 #endif /* INET */
 
 
 /*
  * badport_bandlim() - check for ICMP bandwidth limit
  *
  *	Return 0 if it is ok to send an ICMP error response, -1 if we have
  *	hit our bandwidth limit and it is not ok.
  *
  *	If icmplim is <= 0, the feature is disabled and 0 is returned.
  *
  *	For now we separate the TCP and UDP subsystems w/ different 'which'
  *	values.  We may eventually remove this separation (and simplify the
  *	code further).
  *
  *	Note that the printing of the error message is delayed so we can
  *	properly print the icmp error rate that the system was trying to do
  *	(i.e. 22000/100 pps, etc...).  This can cause long delays in printing
  *	the 'final' error, but it doesn't make sense to solve the printing
  *	delay with more complex code.
  */
 /* One rate-limit bucket: a label for the log message plus its counter. */
 struct icmp_rate {
 	const char *descr;	/* text printed by badport_bandlim() */
 	struct counter_rate cr;	/* state passed to counter_ratecheck() */
 };
 /*
  * Per-vnet buckets, indexed by the `which' argument of badport_bandlim().
  * NOTE(review): the initializer order presumably has to follow the
  * BANDLIM_* constants defined elsewhere -- confirm when adding entries.
  */
 VNET_DEFINE_STATIC(struct icmp_rate, icmp_rates[BANDLIM_MAX]) = {
 	{ "icmp unreach response" },
 	{ "icmp ping response" },
 	{ "icmp tstamp response" },
 	{ "closed port RST response" },
 	{ "open port RST response" },
 	{ "icmp6 unreach response" },
 	{ "sctp ootb response" }
 };
 #define	V_icmp_rates	VNET(icmp_rates)
 
 /*
  * Allocate the counter of every rate-limit bucket (M_WAITOK) and stamp
  * each bucket with the current value of ticks.  Registered below as a
  * VNET_SYSINIT.
  */
 static void
 icmp_bandlimit_init(void)
 {
 
 	for (int i = 0; i < BANDLIM_MAX; i++) {
 		V_icmp_rates[i].cr.cr_rate = counter_u64_alloc(M_WAITOK);
 		V_icmp_rates[i].cr.cr_ticks = ticks;
 	}
 }
 VNET_SYSINIT(icmp_bandlimit, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY,
     icmp_bandlimit_init, NULL);
 
 /*
  * Counterpart of icmp_bandlimit_init(): free the counter of every
  * rate-limit bucket.  Registered below as a VNET_SYSUNINIT.
  */
 static void
 icmp_bandlimit_uninit(void)
 {
 
 	for (int i = 0; i < BANDLIM_MAX; i++)
 		counter_u64_free(V_icmp_rates[i].cr.cr_rate);
 }
 VNET_SYSUNINIT(icmp_bandlimit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
     icmp_bandlimit_uninit, NULL);
 
 /*
  * Check the rate-limit bucket selected by `which'.
  *
  * Returns 0 when a response may be sent and -1 when the limit has been
  * hit.  The check is skipped (0 is returned) when icmplim is <= 0 or
  * when `which' is BANDLIM_UNLIMITED.  A non-positive limit must never
  * reach counter_ratecheck(): it would be treated as a real limit rather
  * than as "disabled".
  */
 int
 badport_bandlim(int which)
 {
 	int64_t pps;
 
 	if (V_icmplim <= 0 || which == BANDLIM_UNLIMITED)
 		return (0);
 
 	KASSERT(which >= 0 && which < BANDLIM_MAX,
 	    ("%s: which %d", __func__, which));
 
 	pps = counter_ratecheck(&V_icmp_rates[which].cr, V_icmplim);
 	if (pps == -1)
 		return (-1);
 	/* A positive value is the rate to report in the log message. */
 	if (pps > 0 && V_icmplim_output)
 		log(LOG_NOTICE, "Limiting %s from %jd to %d packets/sec\n",
 			V_icmp_rates[which].descr, (intmax_t )pps, V_icmplim);
 	return (0);
 }
Index: head/sys/netinet6/icmp6.c
===================================================================
--- head/sys/netinet6/icmp6.c	(revision 361703)
+++ head/sys/netinet6/icmp6.c	(revision 361704)
@@ -1,2806 +1,2807 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: icmp6.c,v 1.211 2001/04/04 05:56:20 itojun Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_icmp.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #define	MBUF_PRIVATE	/* XXXRW: Optimisation tries to avoid M_EXT mbufs */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/domain.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 #include <sys/time.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_llatbl.h>
 #include <net/if_types.h>
 #include <net/route.h>
+#include <net/route/route_ctl.h>
 #include <net/route/nhop.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet/tcp_var.h>
 
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_ifattach.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6protosw.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/mld6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/send.h>
 
 extern struct domain inet6domain;
 
 /*
  * Per-VNET ICMPv6 statistics.  kmod_icmp6stat_inc() below indexes this
  * object as an array of counters.
  */
 VNET_PCPUSTAT_DEFINE(struct icmp6stat, icmp6stat);
 VNET_PCPUSTAT_SYSINIT(icmp6stat);
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(icmp6stat);
 #endif /* VIMAGE */
 
 /*
  * Per-VNET state.  The VNET_DECLARE()d objects are defined in other files;
  * the two VNET_DEFINE_STATIC() objects are private to this file.
  * NOTE(review): icmp6errpps_count and icmp6errppslim_last are presumably
  * the state used by icmp6_ratelimit() -- confirm against its body.
  */
 VNET_DECLARE(struct inpcbinfo, ripcbinfo);
 VNET_DECLARE(struct inpcbhead, ripcb);
 VNET_DECLARE(int, icmp6errppslim);
 VNET_DEFINE_STATIC(int, icmp6errpps_count) = 0;
 VNET_DEFINE_STATIC(struct timeval, icmp6errppslim_last);
 VNET_DECLARE(int, icmp6_nodeinfo);
 
 /* Shorthand accessors for the current VNET's instance of each object. */
 #define	V_ripcbinfo			VNET(ripcbinfo)
 #define	V_ripcb				VNET(ripcb)
 #define	V_icmp6errppslim		VNET(icmp6errppslim)
 #define	V_icmp6errpps_count		VNET(icmp6errpps_count)
 #define	V_icmp6errppslim_last		VNET(icmp6errppslim_last)
 #define	V_icmp6_nodeinfo		VNET(icmp6_nodeinfo)
 
 /* Forward declarations of the file-local helpers. */
 static void icmp6_errcount(int, int);
 static int icmp6_rip6_input(struct mbuf **, int);
 static void icmp6_reflect(struct mbuf *, size_t);
 static int icmp6_ratelimit(const struct in6_addr *, const int, const int);
 static const char *icmp6_redirect_diag(struct in6_addr *,
 	struct in6_addr *, struct in6_addr *);
 static struct mbuf *ni6_input(struct mbuf *, int, struct prison *);
 static struct mbuf *ni6_nametodns(const char *, int, int);
 static int ni6_dnsmatch(const char *, int, const char *, int);
 static int ni6_addrs(struct icmp6_nodeinfo *, struct mbuf *,
 			  struct ifnet **, struct in6_addr *);
 static int ni6_store_addrs(struct icmp6_nodeinfo *, struct icmp6_nodeinfo *,
 				struct ifnet *, int);
 static int icmp6_notify_error(struct mbuf **, int, int, int);
 
 /*
  * Kernel module interface for updating icmp6stat.  The argument is an index
  * into icmp6stat treated as an array of u_quad_t.  While this encodes the
  * general layout of icmp6stat into the caller, it doesn't encode its
  * location, so that future changes to add, for example, per-CPU stats
  * support won't cause binary compatibility problems for kernel modules.
  *
  * The counter selected by statnum in the current VNET's icmp6stat is
  * incremented by one.  statnum is used as-is: no range check is performed
  * here, so the caller is responsible for passing a valid index.
  */
 void
 kmod_icmp6stat_inc(int statnum)
 {
 
 	counter_u64_add(VNET(icmp6stat)[statnum], 1);
 }
 
 /*
  * Account for an outgoing ICMPv6 error in the per-type/per-code output
  * statistics.  Exactly one counter is bumped per call; any type/code pair
  * without a dedicated counter is recorded as "unknown".
  */
 static void
 icmp6_errcount(int type, int code)
 {
 
 	if (type == ICMP6_DST_UNREACH) {
 		if (code == ICMP6_DST_UNREACH_NOROUTE) {
 			ICMP6STAT_INC(icp6s_odst_unreach_noroute);
 			return;
 		}
 		if (code == ICMP6_DST_UNREACH_ADMIN) {
 			ICMP6STAT_INC(icp6s_odst_unreach_admin);
 			return;
 		}
 		if (code == ICMP6_DST_UNREACH_BEYONDSCOPE) {
 			ICMP6STAT_INC(icp6s_odst_unreach_beyondscope);
 			return;
 		}
 		if (code == ICMP6_DST_UNREACH_ADDR) {
 			ICMP6STAT_INC(icp6s_odst_unreach_addr);
 			return;
 		}
 		if (code == ICMP6_DST_UNREACH_NOPORT) {
 			ICMP6STAT_INC(icp6s_odst_unreach_noport);
 			return;
 		}
 	} else if (type == ICMP6_PACKET_TOO_BIG) {
 		/* Packet Too Big is counted regardless of the code. */
 		ICMP6STAT_INC(icp6s_opacket_too_big);
 		return;
 	} else if (type == ICMP6_TIME_EXCEEDED) {
 		if (code == ICMP6_TIME_EXCEED_TRANSIT) {
 			ICMP6STAT_INC(icp6s_otime_exceed_transit);
 			return;
 		}
 		if (code == ICMP6_TIME_EXCEED_REASSEMBLY) {
 			ICMP6STAT_INC(icp6s_otime_exceed_reassembly);
 			return;
 		}
 	} else if (type == ICMP6_PARAM_PROB) {
 		if (code == ICMP6_PARAMPROB_HEADER) {
 			ICMP6STAT_INC(icp6s_oparamprob_header);
 			return;
 		}
 		if (code == ICMP6_PARAMPROB_NEXTHEADER) {
 			ICMP6STAT_INC(icp6s_oparamprob_nextheader);
 			return;
 		}
 		if (code == ICMP6_PARAMPROB_OPTION) {
 			ICMP6STAT_INC(icp6s_oparamprob_option);
 			return;
 		}
 	} else if (type == ND_REDIRECT) {
 		/* Redirects are counted regardless of the code. */
 		ICMP6STAT_INC(icp6s_oredirect);
 		return;
 	}
 
 	/* Unlisted type, or a listed type with an unlisted code. */
 	ICMP6STAT_INC(icp6s_ounknown);
 }
 
 /*
  * A wrapper function for icmp6_error() necessary when the erroneous packet
  * may not contain enough scope zone information.
  *
  * The scope zone of the source and destination addresses in the packet's
  * IPv6 header is set from ifp, and the packet is then passed on to
  * icmp6_error() together with type, code and param.
  *
  * Like icmp6_error(), this function always consumes m: the chain is either
  * handed to icmp6_error() or freed here, so the caller must not touch it
  * after the call.
  */
 void
 icmp6_error2(struct mbuf *m, int type, int code, int param,
     struct ifnet *ifp)
 {
 	struct ip6_hdr *ip6;
 
 	/* Without an interface there is no zone to recover; drop the packet. */
 	if (ifp == NULL)
 		goto freeit;
 
 	if (m->m_len < sizeof(struct ip6_hdr)) {
 		m = m_pullup(m, sizeof(struct ip6_hdr));
 		if (m == NULL) {
 			/* m_pullup() has already released the chain. */
 			IP6STAT_INC(ip6s_exthdrtoolong);
 			return;
 		}
 	}
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/* Source first, then destination; give up on the first failure. */
 	if (in6_setscope(&ip6->ip6_src, ifp, NULL) != 0 ||
 	    in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0)
 		goto freeit;
 
 	icmp6_error(m, type, code, param);
 	return;
 
   freeit:
 	/* No error can be generated; do not leak the offending packet. */
 	m_freem(m);
 }
 
 /*
  * Generate an error packet of type error in response to bad IP6 packet.
  *
  * m is the offending packet, starting at its IPv6 header; type and code
  * become the ICMPv6 type/code of the error and param is stored (in network
  * byte order) in the header's 32-bit parameter field.
  *
  * The mbuf chain is always consumed: it is either freed here (directly via
  * the freeit label, or by a failed m_pullup()/M_PREPEND()) or handed to
  * icmp6_reflect() with the new IPv6 + ICMPv6 headers prepended.
  */
 void
 icmp6_error(struct mbuf *m, int type, int code, int param)
 {
 	struct ip6_hdr *oip6, *nip6;
 	struct icmp6_hdr *icmp6;
 	u_int preplen;
 	int off;
 	int nxt;
 
 	/* Counted even if the error ends up being suppressed below. */
 	ICMP6STAT_INC(icp6s_error);
 
 	/* count per-type-code statistics */
 	icmp6_errcount(type, code);
 
 #ifdef M_DECRYPTED	/*not openbsd*/
 	if (m->m_flags & M_DECRYPTED) {
 		ICMP6STAT_INC(icp6s_canterror);
 		goto freeit;
 	}
 #endif
 
 	/* Make the IPv6 header of the offending packet contiguous. */
 	if (m->m_len < sizeof(struct ip6_hdr)) {
 		m = m_pullup(m, sizeof(struct ip6_hdr));
 		if (m == NULL) {
 			IP6STAT_INC(ip6s_exthdrtoolong);
 			return;
 		}
 	}
 	oip6 = mtod(m, struct ip6_hdr *);
 
 	/*
 	 * If the destination address of the erroneous packet is a multicast
 	 * address, or the packet was sent using link-layer multicast,
 	 * we should basically suppress sending an error (RFC 2463, Section
 	 * 2.4).
 	 * We have two exceptions (the item e.2 in that section):
 	 * - the Packet Too Big message can be sent for path MTU discovery.
 	 * - the Parameter Problem Message that can be allowed an icmp6 error
 	 *   in the option type field.  This check has been done in
 	 *   ip6_unknown_opt(), so we can just check the type and code.
 	 */
 	if ((m->m_flags & (M_BCAST|M_MCAST) ||
 	     IN6_IS_ADDR_MULTICAST(&oip6->ip6_dst)) &&
 	    (type != ICMP6_PACKET_TOO_BIG &&
 	     (type != ICMP6_PARAM_PROB ||
 	      code != ICMP6_PARAMPROB_OPTION)))
 		goto freeit;
 
 	/*
 	 * RFC 2463, 2.4 (e.5): source address check.
 	 * XXX: the case of anycast source?
 	 */
 	if (IN6_IS_ADDR_UNSPECIFIED(&oip6->ip6_src) ||
 	    IN6_IS_ADDR_MULTICAST(&oip6->ip6_src))
 		goto freeit;
 
 	/*
 	 * If we are about to send ICMPv6 against ICMPv6 error/redirect,
 	 * don't do it.
 	 */
 	nxt = -1;
 	off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt);
 	if (off >= 0 && nxt == IPPROTO_ICMPV6) {
 		struct icmp6_hdr *icp;
 
 		if (m->m_len < off + sizeof(struct icmp6_hdr)) {
 			m = m_pullup(m, off + sizeof(struct icmp6_hdr));
 			if (m == NULL) {
 				IP6STAT_INC(ip6s_exthdrtoolong);
 				return;
 			}
 		}
 		/* The pullup may have moved the data; refresh the pointer. */
 		oip6 = mtod(m, struct ip6_hdr *);
 		icp = (struct icmp6_hdr *)(mtod(m, caddr_t) + off);
 
 		if (icp->icmp6_type < ICMP6_ECHO_REQUEST ||
 		    icp->icmp6_type == ND_REDIRECT) {
 			/*
 			 * ICMPv6 error
 			 * Special case: for redirect (which is
 			 * informational) we must not send icmp6 error.
 			 */
 			ICMP6STAT_INC(icp6s_canterror);
 			goto freeit;
 		} else {
 			/* ICMPv6 informational - send the error */
 		}
 	} else {
 		/* non-ICMPv6 - send the error */
 	}
 
 	/* Finally, do rate limitation check. */
 	if (icmp6_ratelimit(&oip6->ip6_src, type, code)) {
 		ICMP6STAT_INC(icp6s_toofreq);
 		goto freeit;
 	}
 
 	/*
 	 * OK, ICMP6 can be generated.
 	 */
 
 	/*
 	 * Cap the amount of the offending packet that is quoted.  The
 	 * adjustment count is zero or negative here; see mbuf(9) for how
 	 * m_adj() treats a negative count.
 	 */
 	if (m->m_pkthdr.len >= ICMPV6_PLD_MAXLEN)
 		m_adj(m, ICMPV6_PLD_MAXLEN - m->m_pkthdr.len);
 
 	preplen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr);
 	M_PREPEND(m, preplen, M_NOWAIT);	/* FIB is also copied over. */
 	if (m == NULL) {
 		nd6log((LOG_DEBUG, "ENOBUFS in icmp6_error %d\n", __LINE__));
 		return;
 	}
 
 	/*
 	 * Seed the new outer header with the offending packet's addresses
 	 * (still carrying their scope), then clear the scope from the
 	 * quoted header.
 	 */
 	nip6 = mtod(m, struct ip6_hdr *);
 	nip6->ip6_src  = oip6->ip6_src;
 	nip6->ip6_dst  = oip6->ip6_dst;
 
 	in6_clearscope(&oip6->ip6_src);
 	in6_clearscope(&oip6->ip6_dst);
 
 	/* The ICMPv6 header sits directly behind the new IPv6 header. */
 	icmp6 = (struct icmp6_hdr *)(nip6 + 1);
 	icmp6->icmp6_type = type;
 	icmp6->icmp6_code = code;
 	icmp6->icmp6_pptr = htonl((u_int32_t)param);
 
 	ICMP6STAT_INC(icp6s_outhist[type]);
 	icmp6_reflect(m, sizeof(struct ip6_hdr)); /* header order: IPv6 - ICMPv6 */
 
 	return;
 
   freeit:
 	/*
 	 * If we can't tell whether or not we can generate ICMP6, free it.
 	 */
 	m_freem(m);
 }
 
 /*
  * Process a received ICMP6 message.
  *
  * *mp is the received packet and *offp the offset of the ICMPv6 header
  * within it; proto is not used by this function.
  *
  * The message is length- and checksum-validated, counted, and dispatched
  * on its type: error messages go to icmp6_notify_error(), echo and node
  * information requests are answered through icmp6_reflect(), and MLD and
  * neighbor discovery messages are passed to their input routines.  Unless
  * the packet was dropped or consumed along the way, it is finally handed
  * to icmp6_rip6_input() for delivery to sockets.
  *
  * Always returns IPPROTO_DONE.  On return *mp holds whatever mbuf is left
  * (NULL when the packet was freed or consumed).
  */
 int
 icmp6_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m, *n;
 	struct ifnet *ifp;
 	struct ip6_hdr *ip6, *nip6;
 	struct icmp6_hdr *icmp6, *nicmp6;
 	char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 	int code, error, icmp6len, ip6len, noff, off, sum;
 
 	NET_EPOCH_ASSERT();
 
 	m = *mp;
 	off = *offp;
 
 	/* Make the fixed ICMPv6 header contiguous with what precedes it. */
 	if (m->m_len < off + sizeof(struct icmp6_hdr)) {
 		m = m_pullup(m, off + sizeof(struct icmp6_hdr));
 		if (m == NULL) {
 			IP6STAT_INC(ip6s_exthdrtoolong);
 			*mp = m;
 			return (IPPROTO_DONE);
 		}
 	}
 
 	/*
 	 * Locate icmp6 structure in mbuf, and check
 	 * that not corrupted and of at least minimum length
 	 */
 
 	icmp6len = m->m_pkthdr.len - off;
 	if (icmp6len < sizeof(struct icmp6_hdr)) {
 		ICMP6STAT_INC(icp6s_tooshort);
 		goto freeit;
 	}
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	ifp = m->m_pkthdr.rcvif;
 	/*
 	 * Check multicast group membership.
 	 * Note: SSM filters are not applied for ICMPv6 traffic.
 	 */
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		struct in6_multi	*inm;
 
 		inm = in6m_lookup(ifp, &ip6->ip6_dst);
 		if (inm == NULL) {
 			IP6STAT_INC(ip6s_notmember);
 			in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
 			goto freeit;
 		}
 	}
 
 	/* Calculate the checksum. */
 	icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off);
 	code = icmp6->icmp6_code;
 	if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) {
 		nd6log((LOG_ERR,
 		    "ICMP6 checksum error(%d|%x) %s\n",
 		    icmp6->icmp6_type, sum,
 		    ip6_sprintf(ip6bufs, &ip6->ip6_src)));
 		ICMP6STAT_INC(icp6s_checksum);
 		goto freeit;
 	}
 
 	/* Input statistics: per-type histogram and per-interface counters. */
 	ICMP6STAT_INC(icp6s_inhist[icmp6->icmp6_type]);
 	icmp6_ifstat_inc(ifp, ifs6_in_msg);
 	if (icmp6->icmp6_type < ICMP6_INFOMSG_MASK)
 		icmp6_ifstat_inc(ifp, ifs6_in_error);
 
 	/*
 	 * Dispatch on the message type.  For the error types "code" is
 	 * rewritten from the ICMPv6 code into the PRC_* value that is later
 	 * passed to icmp6_notify_error() at the deliver label.  The deliver,
 	 * badcode and badlen labels live inside the switch, after the
 	 * default case, and are reached by goto from the cases above.
 	 */
 	ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen);
 	switch (icmp6->icmp6_type) {
 	case ICMP6_DST_UNREACH:
 		icmp6_ifstat_inc(ifp, ifs6_in_dstunreach);
 		switch (code) {
 		case ICMP6_DST_UNREACH_NOROUTE:
 		case ICMP6_DST_UNREACH_ADDR:	/* PRC_HOSTDEAD is a DOS */
 			code = PRC_UNREACH_NET;
 			break;
 		case ICMP6_DST_UNREACH_ADMIN:
 			icmp6_ifstat_inc(ifp, ifs6_in_adminprohib);
 			code = PRC_UNREACH_ADMIN_PROHIB;
 			break;
 		case ICMP6_DST_UNREACH_BEYONDSCOPE:
 			/* I mean "source address was incorrect." */
 			code = PRC_PARAMPROB;
 			break;
 		case ICMP6_DST_UNREACH_NOPORT:
 			code = PRC_UNREACH_PORT;
 			break;
 		default:
 			goto badcode;
 		}
 		goto deliver;
 		break;
 
 	case ICMP6_PACKET_TOO_BIG:
 		icmp6_ifstat_inc(ifp, ifs6_in_pkttoobig);
 
 		/* validation is made in icmp6_mtudisc_update */
 
 		code = PRC_MSGSIZE;
 
 		/*
 		 * Updating the path MTU will be done after examining
 		 * intermediate extension headers.
 		 */
 		goto deliver;
 		break;
 
 	case ICMP6_TIME_EXCEEDED:
 		icmp6_ifstat_inc(ifp, ifs6_in_timeexceed);
 		switch (code) {
 		case ICMP6_TIME_EXCEED_TRANSIT:
 			code = PRC_TIMXCEED_INTRANS;
 			break;
 		case ICMP6_TIME_EXCEED_REASSEMBLY:
 			code = PRC_TIMXCEED_REASS;
 			break;
 		default:
 			goto badcode;
 		}
 		goto deliver;
 		break;
 
 	case ICMP6_PARAM_PROB:
 		icmp6_ifstat_inc(ifp, ifs6_in_paramprob);
 		switch (code) {
 		case ICMP6_PARAMPROB_NEXTHEADER:
 			code = PRC_UNREACH_PROTOCOL;
 			break;
 		case ICMP6_PARAMPROB_HEADER:
 		case ICMP6_PARAMPROB_OPTION:
 			code = PRC_PARAMPROB;
 			break;
 		default:
 			goto badcode;
 		}
 		goto deliver;
 		break;
 
 	case ICMP6_ECHO_REQUEST:
 		/*
 		 * Build the echo reply from a copy (n) of the request; the
 		 * original m continues on to socket delivery below.
 		 */
 		icmp6_ifstat_inc(ifp, ifs6_in_echo);
 		if (code != 0)
 			goto badcode;
 		if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) {
 			/* Give up remote */
 			break;
 		}
 		if (!M_WRITABLE(n)
 		 || n->m_len < off + sizeof(struct icmp6_hdr)) {
 			/*
 			 * The headers of the copy cannot be edited in place:
 			 * put fresh IPv6 + ICMPv6 headers in a new mbuf and
 			 * chain the rest of the copy (n0) behind it.
 			 */
 			struct mbuf *n0 = n;
 			int n0len;
 
 			CTASSERT(sizeof(*nip6) + sizeof(*nicmp6) <= MHLEN);
 			n = m_gethdr(M_NOWAIT, n0->m_type);
 			if (n == NULL) {
 				/* Give up remote */
 				m_freem(n0);
 				break;
 			}
 
 			m_move_pkthdr(n, n0);	/* FIB copied. */
 			n0len = n0->m_pkthdr.len;	/* save for use below */
 			/*
 			 * Copy IPv6 and ICMPv6 only.
 			 */
 			nip6 = mtod(n, struct ip6_hdr *);
 			bcopy(ip6, nip6, sizeof(struct ip6_hdr));
 			nicmp6 = (struct icmp6_hdr *)(nip6 + 1);
 			bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr));
 			noff = sizeof(struct ip6_hdr);
 			/* new mbuf contains only ipv6+icmpv6 headers */
 			n->m_len = noff + sizeof(struct icmp6_hdr);
 			/*
 			 * Adjust mbuf.  ip6_plen will be adjusted in
 			 * ip6_output().
 			 */
 			m_adj(n0, off + sizeof(struct icmp6_hdr));
 			/* recalculate complete packet size */
 			n->m_pkthdr.len = n0len + (noff - off);
 			n->m_next = n0;
 		} else {
 			/*
 			 * NOTE(review): this condition cannot be true here;
 			 * the enclosing if already took the other branch
 			 * whenever n->m_len was too short, and nicmp6 is a
 			 * struct icmp6_hdr pointer, so the sizes match.
 			 */
 			if (n->m_len < off + sizeof(*nicmp6)) {
 				n = m_pullup(n, off + sizeof(*nicmp6));
 				if (n == NULL) {
 					IP6STAT_INC(ip6s_exthdrtoolong);
 					break;
 				}
 			}
 			nicmp6 = (struct icmp6_hdr *)(mtod(n, caddr_t) + off);
 			noff = off;
 		}
 		if (n) {
 			nicmp6->icmp6_type = ICMP6_ECHO_REPLY;
 			nicmp6->icmp6_code = 0;
 			ICMP6STAT_INC(icp6s_reflect);
 			ICMP6STAT_INC(icp6s_outhist[ICMP6_ECHO_REPLY]);
 			icmp6_reflect(n, noff);
 		}
 		break;
 
 	case ICMP6_ECHO_REPLY:
 		icmp6_ifstat_inc(ifp, ifs6_in_echoreply);
 		if (code != 0)
 			goto badcode;
 		break;
 
 	case MLD_LISTENER_QUERY:
 	case MLD_LISTENER_REPORT:
 	case MLD_LISTENER_DONE:
 	case MLDV2_LISTENER_REPORT:
 		/*
 		 * Drop MLD traffic which is not link-local, has a hop limit
 		 * of greater than 1 hop, or which does not have the
 		 * IPv6 HBH Router Alert option.
 		 * As IPv6 HBH options are stripped in ip6_input() we must
 		 * check an mbuf header flag.
 		 * XXX Should we also sanity check that these messages
 		 * were directed to a link-local multicast prefix?
 		 */
 		if ((ip6->ip6_hlim != 1) || (m->m_flags & M_RTALERT_MLD) == 0)
 			goto freeit;
 		/*
 		 * A non-zero return ends processing without freeing m here;
 		 * presumably mld_input() has disposed of it -- confirm.
 		 */
 		if (mld_input(&m, off, icmp6len) != 0) {
 			*mp = NULL;
 			return (IPPROTO_DONE);
 		}
 		/* m stays. */
 		break;
 
 	case ICMP6_WRUREQUEST:	/* ICMP6_FQDN_QUERY */
 	    {
 		enum { WRU, FQDN } mode;
 		struct prison *pr;
 
 		/* Node information queries are disabled. */
 		if (!V_icmp6_nodeinfo)
 			break;
 
 		/* The length of the message decides which flavour it is. */
 		if (icmp6len == sizeof(struct icmp6_hdr) + 4)
 			mode = WRU;
 		else if (icmp6len >= sizeof(struct icmp6_nodeinfo))
 			mode = FQDN;
 		else
 			goto badlen;
 
 		/*
 		 * Find the prison whose vnet matches the receiving
 		 * interface; fall back to the current thread's prison.
 		 */
 		pr = NULL;
 		sx_slock(&allprison_lock);
 		TAILQ_FOREACH(pr, &allprison, pr_list)
 			if (pr->pr_vnet == ifp->if_vnet)
 				break; 
 		sx_sunlock(&allprison_lock);
 		if (pr == NULL)
 			pr = curthread->td_ucred->cr_prison;
 		if (mode == FQDN) {
 			if (m->m_len < off + sizeof(struct icmp6_nodeinfo)) {
 				m = m_pullup(m, off +
 				    sizeof(struct icmp6_nodeinfo));
 				if (m == NULL) {
 					IP6STAT_INC(ip6s_exthdrtoolong);
 					*mp = m;
 					return (IPPROTO_DONE);
 				}
 			}
 			/* ni6_input() works on a copy and returns the reply. */
 			n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 			if (n)
 				n = ni6_input(n, off, pr);
 			/* XXX meaningless if n == NULL */
 			noff = sizeof(struct ip6_hdr);
 		} else {
 			u_char *p;
 			int maxhlen, hlen;
 
 			/*
 			 * XXX: this combination of flags is pointless,
 			 * but should we keep this for compatibility?
 			 */
 			if ((V_icmp6_nodeinfo & (ICMP6_NODEINFO_FQDNOK |
 			    ICMP6_NODEINFO_TMPADDROK)) !=
 			    (ICMP6_NODEINFO_FQDNOK | ICMP6_NODEINFO_TMPADDROK))
 				break;
 
 			if (code != 0)
 				goto badcode;
 
 			CTASSERT(sizeof(*nip6) + sizeof(*nicmp6) + 4 <= MHLEN);
 			n = m_gethdr(M_NOWAIT, m->m_type);
 			if (n == NULL) {
 				/* Give up remote */
 				break;
 			}
 			if (!m_dup_pkthdr(n, m, M_NOWAIT)) {
 				/*
 				 * Previous code did a blind M_COPY_PKTHDR
 				 * and said "just for rcvif".  If true, then
 				 * we could tolerate the dup failing (due to
 				 * the deep copy of the tag chain).  For now
 				 * be conservative and just fail.
 				 */
 				m_free(n);
 				n = NULL;
 				break;
 			}
 			/*
 			 * Copy IPv6 and ICMPv6 only.
 			 */
 			nip6 = mtod(n, struct ip6_hdr *);
 			bcopy(ip6, nip6, sizeof(struct ip6_hdr));
 			nicmp6 = (struct icmp6_hdr *)(nip6 + 1);
 			bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr));
 			p = (u_char *)(nicmp6 + 1);
 			bzero(p, 4);
 
 			/*
 			 * Append as much of the prison's hostname as fits
 			 * behind the headers and the 4 zeroed bytes.
 			 */
 			maxhlen = M_TRAILINGSPACE(n) -
 			    (sizeof(*nip6) + sizeof(*nicmp6) + 4);
 			mtx_lock(&pr->pr_mtx);
 			hlen = strlen(pr->pr_hostname);
 			if (maxhlen > hlen)
 				maxhlen = hlen;
 			/* meaningless TTL */
 			bcopy(pr->pr_hostname, p + 4, maxhlen);
 			mtx_unlock(&pr->pr_mtx);
 			noff = sizeof(struct ip6_hdr);
 			n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) +
 				sizeof(struct icmp6_hdr) + 4 + maxhlen;
 			nicmp6->icmp6_type = ICMP6_WRUREPLY;
 			nicmp6->icmp6_code = 0;
 		}
 		if (n) {
 			ICMP6STAT_INC(icp6s_reflect);
 			ICMP6STAT_INC(icp6s_outhist[ICMP6_WRUREPLY]);
 			icmp6_reflect(n, noff);
 		}
 		break;
 	    }
 
 	case ICMP6_WRUREPLY:
 		if (code != 0)
 			goto badcode;
 		break;
 
 	/*
 	 * The five neighbor discovery cases below share one shape: validate
 	 * code and length, offer the packet to the SEND hook if one is
 	 * registered (a zero return means the hook took the packet, so m is
 	 * cleared before leaving through freeit), then pass the original m
 	 * to the ND input routine and continue with a copy for socket
 	 * delivery.  If the copy could not be made there is nothing left to
 	 * deliver.
 	 */
 	case ND_ROUTER_SOLICIT:
 		icmp6_ifstat_inc(ifp, ifs6_in_routersolicit);
 		if (code != 0)
 			goto badcode;
 		if (icmp6len < sizeof(struct nd_router_solicit))
 			goto badlen;
 		if (send_sendso_input_hook != NULL) {
 			if (m->m_len < off + icmp6len) {
 				m = m_pullup(m, off + icmp6len);
 				if (m == NULL) {
 					IP6STAT_INC(ip6s_exthdrtoolong);
 					*mp = NULL;
 					return (IPPROTO_DONE);
 				}
 			}
 			error = send_sendso_input_hook(m, ifp, SND_IN, ip6len);
 			if (error == 0) {
 				m = NULL;
 				goto freeit;
 			}
 		}
 		n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 		nd6_rs_input(m, off, icmp6len);
 		m = n;
 		if (m == NULL)
 			goto freeit;
 		break;
 
 	case ND_ROUTER_ADVERT:
 		icmp6_ifstat_inc(ifp, ifs6_in_routeradvert);
 		if (code != 0)
 			goto badcode;
 		if (icmp6len < sizeof(struct nd_router_advert))
 			goto badlen;
 		if (send_sendso_input_hook != NULL) {
 			error = send_sendso_input_hook(m, ifp, SND_IN, ip6len);
 			if (error == 0) {
 				m = NULL;
 				goto freeit;
 			}
 		}
 		n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 		nd6_ra_input(m, off, icmp6len);
 		m = n;
 		if (m == NULL)
 			goto freeit;
 		break;
 
 	case ND_NEIGHBOR_SOLICIT:
 		icmp6_ifstat_inc(ifp, ifs6_in_neighborsolicit);
 		if (code != 0)
 			goto badcode;
 		if (icmp6len < sizeof(struct nd_neighbor_solicit))
 			goto badlen;
 		if (send_sendso_input_hook != NULL) {
 			error = send_sendso_input_hook(m, ifp, SND_IN, ip6len);
 			if (error == 0) {
 				m = NULL;
 				goto freeit;
 			}
 		}
 		n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 		nd6_ns_input(m, off, icmp6len);
 		m = n;
 		if (m == NULL)
 			goto freeit;
 		break;
 
 	case ND_NEIGHBOR_ADVERT:
 		icmp6_ifstat_inc(ifp, ifs6_in_neighboradvert);
 		if (code != 0)
 			goto badcode;
 		if (icmp6len < sizeof(struct nd_neighbor_advert))
 			goto badlen;
 		if (send_sendso_input_hook != NULL) {
 			error = send_sendso_input_hook(m, ifp, SND_IN, ip6len);
 			if (error == 0) {
 				m = NULL;
 				goto freeit;
 			}
 		}
 		n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 		nd6_na_input(m, off, icmp6len);
 		m = n;
 		if (m == NULL)
 			goto freeit;
 		break;
 
 	case ND_REDIRECT:
 		icmp6_ifstat_inc(ifp, ifs6_in_redirect);
 		if (code != 0)
 			goto badcode;
 		if (icmp6len < sizeof(struct nd_redirect))
 			goto badlen;
 		if (send_sendso_input_hook != NULL) {
 			error = send_sendso_input_hook(m, ifp, SND_IN, ip6len);
 			if (error == 0) {
 				m = NULL;
 				goto freeit;
 			}
 		}
 		n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 		icmp6_redirect_input(m, off);
 		m = n;
 		if (m == NULL)
 			goto freeit;
 		break;
 
 	case ICMP6_ROUTER_RENUMBERING:
 		/* Only validated here; no further processing is done. */
 		if (code != ICMP6_ROUTER_RENUMBERING_COMMAND &&
 		    code != ICMP6_ROUTER_RENUMBERING_RESULT)
 			goto badcode;
 		if (icmp6len < sizeof(struct icmp6_router_renum))
 			goto badlen;
 		break;
 
 	default:
 		nd6log((LOG_DEBUG,
 		    "icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%d)\n",
 		    icmp6->icmp6_type, ip6_sprintf(ip6bufs, &ip6->ip6_src),
 		    ip6_sprintf(ip6bufd, &ip6->ip6_dst),
 		    ifp ? ifp->if_index : 0));
 		if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST) {
 			/* ICMPv6 error: MUST deliver it by spec... */
 			code = PRC_NCMDS;
 			/* deliver */
 		} else {
 			/* ICMPv6 informational: MUST not deliver */
 			break;
 		}
 		/* Unknown error types fall through into deliver. */
 	deliver:
 		if (icmp6_notify_error(&m, off, icmp6len, code) != 0) {
 			/* In this case, m should've been freed. */
 			*mp = NULL;
 			return (IPPROTO_DONE);
 		}
 		break;
 
 	/* Bad code/length is only counted; the packet is still delivered. */
 	badcode:
 		ICMP6STAT_INC(icp6s_badcode);
 		break;
 
 	badlen:
 		ICMP6STAT_INC(icp6s_badlen);
 		break;
 	}
 
 	/* deliver the packet to appropriate sockets */
 	icmp6_rip6_input(&m, *offp);
 
 	*mp = m;
 	return (IPPROTO_DONE);
 
  freeit:
 	/* m may be NULL here when the packet was handed off above. */
 	m_freem(m);
 	*mp = NULL;
 	return (IPPROTO_DONE);
 }
 
 /*
  * Notify the upper-layer protocol about a received ICMPv6 error.
  *
  * *mp is the received ICMPv6 packet, off the offset of its ICMPv6 header,
  * icmp6len the length of the ICMPv6 message and code the PRC_* value that
  * is passed on to the protocol's pr_ctlinput handler.
  *
  * The IPv6 packet quoted inside the error is parsed: its extension headers
  * are walked to find the upper-layer protocol and, when a type 0 routing
  * header with segments left is present, the final destination.  For Packet
  * Too Big messages icmp6_mtudisc_update() is called before the handler.
  *
  * Returns 0 with *mp set to the (possibly pulled-up) mbuf, or -1 with *mp
  * set to NULL when the packet was freed here or an m_pullup() failed.
  */
 static int
 icmp6_notify_error(struct mbuf **mp, int off, int icmp6len, int code)
 {
 	struct mbuf *m;
 	struct icmp6_hdr *icmp6;
 	struct ip6_hdr *eip6;
 	u_int32_t notifymtu;
 	struct sockaddr_in6 icmp6src, icmp6dst;
 
 	m = *mp;
 
 	/* The error must quote at least a complete IPv6 header. */
 	if (icmp6len < sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr)) {
 		ICMP6STAT_INC(icp6s_tooshort);
 		goto freeit;
 	}
 
 	if (m->m_len < off + sizeof(*icmp6) + sizeof(struct ip6_hdr)) {
 		m = m_pullup(m, off + sizeof(*icmp6) + sizeof(struct ip6_hdr));
 		if (m == NULL) {
 			IP6STAT_INC(ip6s_exthdrtoolong);
 			*mp = m;
 			return (-1);
 		}
 	}
 	icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off);
 	eip6 = (struct ip6_hdr *)(icmp6 + 1);
 
 	/* Detect the upper level protocol */
 	{
 		void (*ctlfunc)(int, struct sockaddr *, void *);
 		u_int8_t nxt = eip6->ip6_nxt;
 		/* eoff: offset of the header being examined in the quoted packet */
 		int eoff = off + sizeof(struct icmp6_hdr) +
 		    sizeof(struct ip6_hdr);
 		struct ip6ctlparam ip6cp;
 		struct in6_addr *finaldst = NULL;
 		int icmp6type = icmp6->icmp6_type;
 		struct ip6_frag *fh;
 		struct ip6_rthdr *rth;
 		struct ip6_rthdr0 *rth0;
 		int rthlen;
 
 		/*
 		 * Each m_pullup() below may move the data, so pointers into
 		 * the mbuf are re-derived after every pullup and again at
 		 * the notify label.
 		 */
 		while (1) { /* XXX: should avoid infinite loop explicitly? */
 			struct ip6_ext *eh;
 
 			switch (nxt) {
 			case IPPROTO_HOPOPTS:
 			case IPPROTO_DSTOPTS:
 			case IPPROTO_AH:
 				if (m->m_len < eoff + sizeof(struct ip6_ext)) {
 					m = m_pullup(m, eoff +
 					    sizeof(struct ip6_ext));
 					if (m == NULL) {
 						IP6STAT_INC(ip6s_exthdrtoolong);
 						*mp = m;
 						return (-1);
 					}
 				}
 				eh = (struct ip6_ext *)
 				    (mtod(m, caddr_t) + eoff);
 				/* AH: 4-byte units (+2); the others: 8-byte units (+1). */
 				if (nxt == IPPROTO_AH)
 					eoff += (eh->ip6e_len + 2) << 2;
 				else
 					eoff += (eh->ip6e_len + 1) << 3;
 				nxt = eh->ip6e_nxt;
 				break;
 			case IPPROTO_ROUTING:
 				/*
 				 * When the erroneous packet contains a
 				 * routing header, we should examine the
 				 * header to determine the final destination.
 				 * Otherwise, we can't properly update
 				 * information that depends on the final
 				 * destination (e.g. path MTU).
 				 */
 				if (m->m_len < eoff + sizeof(*rth)) {
 					m = m_pullup(m, eoff + sizeof(*rth));
 					if (m == NULL) {
 						IP6STAT_INC(ip6s_exthdrtoolong);
 						*mp = m;
 						return (-1);
 					}
 				}
 				rth = (struct ip6_rthdr *)
 				    (mtod(m, caddr_t) + eoff);
 				rthlen = (rth->ip6r_len + 1) << 3;
 				/*
 				 * XXX: currently there is no
 				 * officially defined type other
 				 * than type-0.
 				 * Note that if the segment left field
 				 * is 0, all intermediate hops must
 				 * have been passed.
 				 */
 				if (rth->ip6r_segleft &&
 				    rth->ip6r_type == IPV6_RTHDR_TYPE_0) {
 					int hops;
 
 					if (m->m_len < eoff + rthlen) {
 						m = m_pullup(m, eoff + rthlen);
 						if (m == NULL) {
 							IP6STAT_INC(
 							    ip6s_exthdrtoolong);
 							*mp = m;
 							return (-1);
 						}
 					}
 					rth0 = (struct ip6_rthdr0 *)
 					    (mtod(m, caddr_t) + eoff);
 					/* just ignore a bogus header */
 					/* The last listed address is the final destination. */
 					if ((rth0->ip6r0_len % 2) == 0 &&
 					    (hops = rth0->ip6r0_len/2))
 						finaldst = (struct in6_addr *)(rth0 + 1) + (hops - 1);
 				}
 				eoff += rthlen;
 				nxt = rth->ip6r_nxt;
 				break;
 			case IPPROTO_FRAGMENT:
 				if (m->m_len < eoff + sizeof(struct ip6_frag)) {
 					m = m_pullup(m, eoff +
 					    sizeof(struct ip6_frag));
 					if (m == NULL) {
 						IP6STAT_INC(ip6s_exthdrtoolong);
 						*mp = m;
 						return (-1);
 					}
 				}
 				fh = (struct ip6_frag *)(mtod(m, caddr_t) +
 				    eoff);
 				/*
 				 * Data after a fragment header is meaningless
 				 * unless it is the first fragment, but
 				 * we'll go to the notify label for path MTU
 				 * discovery.
 				 */
 				if (fh->ip6f_offlg & IP6F_OFF_MASK)
 					goto notify;
 
 				eoff += sizeof(struct ip6_frag);
 				nxt = fh->ip6f_nxt;
 				break;
 			default:
 				/*
 				 * This case includes ESP and the No Next
 				 * Header.  In such cases going to the notify
 				 * label does not have any meaning
 				 * (i.e. ctlfunc will be NULL), but we go
 				 * anyway since we might have to update
 				 * path MTU information.
 				 */
 				goto notify;
 			}
 		}
 	  notify:
 		icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off);
 
 		/*
 		 * retrieve parameters from the inner IPv6 header, and convert
 		 * them into sockaddr structures.
 		 * XXX: there is no guarantee that the source or destination
 		 * addresses of the inner packet are in the same scope as
 		 * the addresses of the icmp packet.  But there is no other
 		 * way to determine the zone.
 		 */
 		eip6 = (struct ip6_hdr *)(icmp6 + 1);
 
 		/* Destination: the routing header's final hop if one was found. */
 		bzero(&icmp6dst, sizeof(icmp6dst));
 		icmp6dst.sin6_len = sizeof(struct sockaddr_in6);
 		icmp6dst.sin6_family = AF_INET6;
 		if (finaldst == NULL)
 			icmp6dst.sin6_addr = eip6->ip6_dst;
 		else
 			icmp6dst.sin6_addr = *finaldst;
 		if (in6_setscope(&icmp6dst.sin6_addr, m->m_pkthdr.rcvif, NULL))
 			goto freeit;
 		bzero(&icmp6src, sizeof(icmp6src));
 		icmp6src.sin6_len = sizeof(struct sockaddr_in6);
 		icmp6src.sin6_family = AF_INET6;
 		icmp6src.sin6_addr = eip6->ip6_src;
 		if (in6_setscope(&icmp6src.sin6_addr, m->m_pkthdr.rcvif, NULL))
 			goto freeit;
 		icmp6src.sin6_flowinfo =
 		    (eip6->ip6_flow & IPV6_FLOWLABEL_MASK);
 
 		/* Bundle what was learned for the handlers called below. */
 		if (finaldst == NULL)
 			finaldst = &eip6->ip6_dst;
 		ip6cp.ip6c_m = m;
 		ip6cp.ip6c_icmp6 = icmp6;
 		ip6cp.ip6c_ip6 = (struct ip6_hdr *)(icmp6 + 1);
 		ip6cp.ip6c_off = eoff;
 		ip6cp.ip6c_finaldst = finaldst;
 		ip6cp.ip6c_src = &icmp6src;
 		ip6cp.ip6c_nxt = nxt;
 
 		if (icmp6type == ICMP6_PACKET_TOO_BIG) {
 			notifymtu = ntohl(icmp6->icmp6_mtu);
 			ip6cp.ip6c_cmdarg = (void *)&notifymtu;
 			icmp6_mtudisc_update(&ip6cp, 1);	/*XXX*/
 		}
 
 		/* Call the upper-layer protocol's control input, if it has one. */
 		ctlfunc = (void (*)(int, struct sockaddr *, void *))
 		    (inet6sw[ip6_protox[nxt]].pr_ctlinput);
 		if (ctlfunc) {
 			(void) (*ctlfunc)(code, (struct sockaddr *)&icmp6dst,
 			    &ip6cp);
 		}
 	}
 	*mp = m;
 	return (0);
 
   freeit:
 	m_freem(m);
 	*mp = NULL;
 	return (-1);
 }
 
 /*
  * Record the path MTU advertised by an ICMPv6 Packet Too Big message.
  *
  * ip6cp carries the received mbuf, its ICMPv6 header and the final
  * destination of the quoted packet.  Nothing is recorded unless validated
  * is non-zero and the advertised MTU is not abnormally small.  The TCP host
  * cache entry for the destination is updated only when the resulting MTU
  * is below what tcp_maxmtu6() reports for it.
  */
 void
 icmp6_mtudisc_update(struct ip6ctlparam *ip6cp, int validated)
 {
 	struct in_conninfo inc;
 	struct mbuf *m;		/* will be necessary for scope issue */
 	u_int mtu;
 
 	m = ip6cp->ip6c_m;
 	mtu = ntohl(ip6cp->ip6c_icmp6->icmp6_mtu);
 
 	/*
 	 * RFC2460 section 5, last paragraph: even though the minimum link
 	 * MTU for IPv6 is IPV6_MMTU, we may see ICMPv6 too big with
 	 * mtu < IPV6_MMTU due to a packet translator in the middle, so such
 	 * values are deliberately not rejected here.  See ip6_output() and
 	 * ip6_getpmtu() "alwaysfrag" case for special handling.
 	 *
 	 * We do reject ICMPv6 too big with an abnormally small value
 	 * (XXX what is the good definition of "abnormally small"?), and
 	 * anything the caller has not validated.
 	 */
 	if (mtu < sizeof(struct ip6_hdr) + sizeof(struct ip6_frag) + 8 ||
 	    !validated)
 		return;
 
 	/*
 	 * In case the suggested mtu is less than IPV6_MMTU, we
 	 * only need to remember that it was for above mentioned
 	 * "alwaysfrag" case.
 	 * Try to be as close to the spec as possible.
 	 */
 	if (mtu < IPV6_MMTU)
 		mtu = IPV6_MMTU - 8;
 
 	/* Describe the destination for the host cache lookup. */
 	bzero(&inc, sizeof(inc));
 	inc.inc_fibnum = M_GETFIB(m);
 	inc.inc_flags |= INC_ISIPV6;
 	inc.inc6_faddr = *ip6cp->ip6c_finaldst;
 	if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL))
 		return;
 
 	/* Only ever lower the recorded MTU. */
 	if (mtu >= tcp_maxmtu6(&inc, NULL))
 		return;
 
 	tcp_hc_updatemtu(&inc, mtu);
 	ICMP6STAT_INC(icp6s_pmtuchg);
 }
 
 /*
  * Process a Node Information Query packet, based on
  * draft-ietf-ipngwg-icmp-name-lookups-07.
  *
  * Spec incompatibilities:
  * - IPv6 Subject address handling
  * - IPv4 Subject address handling support missing
  * - Proxy reply (answer even if it's not for me)
  * - joins NI group address at in6_ifattach() time only, does not cope
  *   with hostname changes by sethostname(3)
  *
  * Arguments:
  *   m   - the received packet, starting at the IPv6 header.
  *   off - byte offset from the start of the mbuf data to the
  *         struct icmp6_nodeinfo header.
  *   pr  - prison whose pr_hostname is used (under pr_mtx) both to validate
  *         an FQDN subject and to build an FQDN reply.
  *
  * Returns a newly allocated reply mbuf whose ni_type is ICMP6_NI_REPLY, or
  * NULL when the query is dropped or an allocation fails.  Every return path
  * in this function calls m_freem() on m, so the caller must not touch the
  * query mbuf afterwards.
  */
 static struct mbuf *
 ni6_input(struct mbuf *m, int off, struct prison *pr)
 {
 	struct icmp6_nodeinfo *ni6, *nni6;
 	struct mbuf *n = NULL;
 	u_int16_t qtype;
 	int subjlen;
 	int replylen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo);
 	struct ni_reply_fqdn *fqdn;
 	int addrs;		/* for NI_QTYPE_NODEADDR */
 	struct ifnet *ifp = NULL; /* for NI_QTYPE_NODEADDR */
 	struct in6_addr in6_subj; /* subject address */
 	struct ip6_hdr *ip6;
 	int oldfqdn = 0;	/* if 1, return pascal string (03 draft) */
 	char *subj = NULL;
 	struct in6_ifaddr *ia6 = NULL;
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off);
 
 	/*
 	 * Validate IPv6 source address.
 	 * The default configuration MUST be to refuse answering queries from
 	 * global-scope addresses according to RFC 4620.
 	 * Notes:
 	 *  - it's not very clear what "refuse" means; this implementation
 	 *    simply drops it.
 	 *  - it's not very easy to identify global-scope (unicast) addresses
 	 *    since there are many prefixes for them.  It should be safer
 	 *    and in practice sufficient to check "all" but loopback and
 	 *    link-local (note that site-local unicast was deprecated and
 	 *    ULA is defined as global scope-wise)
 	 */
 	if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_GLOBALOK) == 0 &&
 	    !IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) &&
 	    !IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src))
 		goto bad;
 
 	/*
 	 * Validate IPv6 destination address.
 	 *
 	 * The Responder must discard the Query without further processing
 	 * unless it is one of the Responder's unicast or anycast addresses, or
 	 * a link-local scope multicast address which the Responder has joined.
 	 * [RFC 4620, Section 5.]
 	 */
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		if (!IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst))
 			goto bad;
 		/* else it's a link-local multicast, fine */
 	} else {		/* unicast or anycast */
 		ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */);
 		if (ia6 == NULL)
 			goto bad; /* XXX impossible */
 
 		if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) &&
 		    !(V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK)) {
 			/* drop the reference taken by the lookup above */
 			ifa_free(&ia6->ia_ifa);
 			nd6log((LOG_DEBUG, "ni6_input: ignore node info to "
 				"a temporary address in %s:%d",
 			       __FILE__, __LINE__));
 			goto bad;
 		}
 		ifa_free(&ia6->ia_ifa);
 	}
 
 	/* validate query Subject field. */
 	qtype = ntohs(ni6->ni_qtype);
 	/* bytes following the fixed Node Information header */
 	subjlen = m->m_pkthdr.len - off - sizeof(struct icmp6_nodeinfo);
 	switch (qtype) {
 	case NI_QTYPE_NOOP:
 	case NI_QTYPE_SUPTYPES:
 		/* 07 draft */
 		if (ni6->ni_code == ICMP6_NI_SUBJ_FQDN && subjlen == 0)
 			break;
 		/* FALLTHROUGH */
 	case NI_QTYPE_FQDN:
 	case NI_QTYPE_NODEADDR:
 	case NI_QTYPE_IPV4ADDR:
 		switch (ni6->ni_code) {
 		case ICMP6_NI_SUBJ_IPV6:
 #if ICMP6_NI_SUBJ_IPV6 != 0
 		case 0:
 #endif
 			/*
 			 * backward compatibility - try to accept 03 draft
 			 * format, where no Subject is present.
 			 */
 			if (qtype == NI_QTYPE_FQDN && ni6->ni_code == 0 &&
 			    subjlen == 0) {
 				oldfqdn++;
 				break;
 			}
 #if ICMP6_NI_SUBJ_IPV6 != 0
 			if (ni6->ni_code != ICMP6_NI_SUBJ_IPV6)
 				goto bad;
 #endif
 
 			if (subjlen != sizeof(struct in6_addr))
 				goto bad;
 
 			/*
 			 * Validate Subject address.
 			 *
 			 * Not sure what exactly "address belongs to the node"
 			 * means in the spec, is it just unicast, or what?
 			 *
 			 * At this moment we consider Subject address as
 			 * "belong to the node" if the Subject address equals
 			 * to the IPv6 destination address; validation for
 			 * IPv6 destination address should have done enough
 			 * check for us.
 			 *
 			 * We do not do proxy at this moment.
 			 */
 			m_copydata(m, off + sizeof(struct icmp6_nodeinfo),
 			    subjlen, (caddr_t)&in6_subj);
 			if (in6_setscope(&in6_subj, m->m_pkthdr.rcvif, NULL))
 				goto bad;
 
 			subj = (char *)&in6_subj;
 			if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &in6_subj))
 				break;
 
 			/*
 			 * XXX if we are to allow other cases, we should really
 			 * be careful about scope here.
 			 * basically, we should disallow queries toward IPv6
 			 * destination X with subject Y,
 			 * if scope(X) > scope(Y).
 			 * if we allow scope(X) > scope(Y), it will result in
 			 * information leakage across scope boundary.
 			 */
 			goto bad;
 
 		case ICMP6_NI_SUBJ_FQDN:
 			/*
 			 * Validate Subject name with gethostname(3).
 			 *
 			 * The behavior may need some debate, since:
 			 * - we are not sure if the node has FQDN as
 			 *   hostname (returned by gethostname(3)).
 			 * - the code does wildcard match for truncated names.
 			 *   however, we are not sure if we want to perform
 			 *   wildcard match, if gethostname(3) side has
 			 *   truncated hostname.
 			 */
 			mtx_lock(&pr->pr_mtx);
 			n = ni6_nametodns(pr->pr_hostname,
 			    strlen(pr->pr_hostname), 0);
 			mtx_unlock(&pr->pr_mtx);
 			/* need a single, non-empty mbuf to compare against */
 			if (!n || n->m_next || n->m_len == 0)
 				goto bad;
 			/* make the whole Subject contiguous before matching */
 			if (m->m_len < off + sizeof(struct icmp6_nodeinfo) +
 			    subjlen) {
 				m = m_pullup(m, off +
 				    sizeof(struct icmp6_nodeinfo) + subjlen);
 				if (m == NULL) {
 					IP6STAT_INC(ip6s_exthdrtoolong);
 					goto bad;
 				}
 			}
 			/* ip6 possibly invalid but not used after. */
 			ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off);
 			subj = (char *)(mtod(m, caddr_t) + off +
 			    sizeof(struct icmp6_nodeinfo));
 			if (!ni6_dnsmatch(subj, subjlen, mtod(n, const char *),
 			    n->m_len)) {
 				goto bad;
 			}
 			/* n only held our encoded hostname; it is reused below */
 			m_freem(n);
 			n = NULL;
 			break;
 
 		case ICMP6_NI_SUBJ_IPV4:	/* XXX: to be implemented? */
 		default:
 			goto bad;
 		}
 		break;
 	}
 
 	/* refuse based on configuration.  XXX ICMP6_NI_REFUSED? */
 	switch (qtype) {
 	case NI_QTYPE_FQDN:
 		if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_FQDNOK) == 0)
 			goto bad;
 		break;
 	case NI_QTYPE_NODEADDR:
 	case NI_QTYPE_IPV4ADDR:
 		if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_NODEADDROK) == 0)
 			goto bad;
 		break;
 	}
 
 	/* guess reply length */
 	switch (qtype) {
 	case NI_QTYPE_NOOP:
 		break;		/* no reply data */
 	case NI_QTYPE_SUPTYPES:
 		replylen += sizeof(u_int32_t);
 		break;
 	case NI_QTYPE_FQDN:
 		/* XXX will append an mbuf */
 		replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen);
 		break;
 	case NI_QTYPE_NODEADDR:
 		addrs = ni6_addrs(ni6, m, &ifp, (struct in6_addr *)subj);
 		/* each address is returned together with a 32-bit TTL */
 		if ((replylen += addrs * (sizeof(struct in6_addr) +
 		    sizeof(u_int32_t))) > MCLBYTES)
 			replylen = MCLBYTES; /* XXX: will truncate pkt later */
 		break;
 	case NI_QTYPE_IPV4ADDR:
 		/* unsupported - should respond with unknown Qtype? */
 		break;
 	default:
 		/*
 		 * XXX: We must return a reply with the ICMP6 code
 		 * `unknown Qtype' in this case.  However we regard the case
 		 * as an FQDN query for backward compatibility.
 		 * Older versions set a random value to this field,
 		 * so it rarely varies in the defined qtypes.
 		 * But the mechanism is not reliable...
 		 * maybe we should obsolete older versions.
 		 */
 		qtype = NI_QTYPE_FQDN;
 		/* XXX will append an mbuf */
 		replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen);
 		oldfqdn++;
 		break;
 	}
 
 	/* Allocate an mbuf to reply. */
 	if (replylen > MCLBYTES) {
 		/*
 		 * XXX: should we try to allocate more? But MCLBYTES
 		 * is probably much larger than IPV6_MMTU...
 		 */
 		goto bad;
 	}
 	if (replylen > MHLEN)
 		n = m_getcl(M_NOWAIT, m->m_type, M_PKTHDR);
 	else
 		n = m_gethdr(M_NOWAIT, m->m_type);
 	if (n == NULL) {
 		m_freem(m);
 		return (NULL);
 	}
 	m_move_pkthdr(n, m); /* just for recvif and FIB */
 	n->m_pkthdr.len = n->m_len = replylen;
 
 	/* copy mbuf header and IPv6 + Node Information base headers */
 	bcopy(mtod(m, caddr_t), mtod(n, caddr_t), sizeof(struct ip6_hdr));
 	nni6 = (struct icmp6_nodeinfo *)(mtod(n, struct ip6_hdr *) + 1);
 	bcopy((caddr_t)ni6, (caddr_t)nni6, sizeof(struct icmp6_nodeinfo));
 
 	/* qtype dependent procedure */
 	switch (qtype) {
 	case NI_QTYPE_NOOP:
 		nni6->ni_code = ICMP6_NI_SUCCESS;
 		nni6->ni_flags = 0;
 		break;
 	case NI_QTYPE_SUPTYPES:
 	{
 		u_int32_t v;
 		nni6->ni_code = ICMP6_NI_SUCCESS;
 		nni6->ni_flags = htons(0x0000);	/* raw bitmap */
 		/* supports NOOP, SUPTYPES, FQDN, and NODEADDR */
 		v = (u_int32_t)htonl(0x0000000f);
 		bcopy(&v, nni6 + 1, sizeof(u_int32_t));
 		break;
 	}
 	case NI_QTYPE_FQDN:
 		nni6->ni_code = ICMP6_NI_SUCCESS;
 		fqdn = (struct ni_reply_fqdn *)(mtod(n, caddr_t) +
 		    sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo));
 		nni6->ni_flags = 0; /* XXX: meaningless TTL */
 		fqdn->ni_fqdn_ttl = 0;	/* ditto. */
 		/*
 		 * XXX do we really have FQDN in hostname?
 		 */
 		mtx_lock(&pr->pr_mtx);
 		n->m_next = ni6_nametodns(pr->pr_hostname,
 		    strlen(pr->pr_hostname), oldfqdn);
 		mtx_unlock(&pr->pr_mtx);
 		if (n->m_next == NULL)
 			goto bad;
 		/* XXX we assume that n->m_next is not a chain */
 		if (n->m_next->m_next != NULL)
 			goto bad;
 		/* account for the appended name mbuf */
 		n->m_pkthdr.len += n->m_next->m_len;
 		break;
 	case NI_QTYPE_NODEADDR:
 	{
 		int lenlim, copied;
 
 		nni6->ni_code = ICMP6_NI_SUCCESS;
 		/*
 		 * Shrink to the headers only so that M_TRAILINGSPACE()
 		 * reports the room left for addresses.
 		 */
 		n->m_pkthdr.len = n->m_len =
 		    sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo);
 		lenlim = M_TRAILINGSPACE(n);
 		copied = ni6_store_addrs(ni6, nni6, ifp, lenlim);
 		/* XXX: reset mbuf length */
 		n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) +
 		    sizeof(struct icmp6_nodeinfo) + copied;
 		break;
 	}
 	default:
 		break;		/* XXX impossible! */
 	}
 
 	nni6->ni_type = ICMP6_NI_REPLY;
 	m_freem(m);
 	return (n);
 
   bad:
 	/* m may be NULL here after a failed m_pullup() */
 	m_freem(m);
 	if (n)
 		m_freem(n);
 	return (NULL);
 }
 
 /*
  * Build an mbuf holding a DNS-encoded copy of a host name.  No compression
  * support.
  *
  * name    - host name bytes (need not be NUL-terminated within namelen)
  * namelen - number of bytes in name
  * old     - if non-zero, emit a pascal string (length octet followed by
  *           the raw name) instead of DNS labels
  *
  * XXX names with less than 2 dots (like "foo" or "foo.section") will be
  * treated as truncated name (two \0 at the end).  this is a wild guess.
  *
  * Returns the new mbuf, or NULL if allocation fails, a label is empty or
  * longer than 63 octets, or the encoding does not fit into the mbuf.
  */
 static struct mbuf *
 ni6_nametodns(const char *name, int namelen, int old)
 {
 	struct mbuf *m;
 	const char *end, *label, *scan;
 	char *out, *limit;
 	int dots, lablen, nterm;
 
 	/*
 	 * The pascal form needs namelen + 1 bytes.  For the DNS form we ask
 	 * for a cluster: MAXHOSTNAMELEN is usually 256.
 	 */
 	if ((old ? namelen + 1 : MCLBYTES) > MLEN)
 		m = m_getcl(M_NOWAIT, MT_DATA, 0);
 	else
 		m = m_get(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (NULL);
 
 	if (old) {
 		m->m_len = namelen + 1;
 		*mtod(m, char *) = namelen;
 		bcopy(name, mtod(m, char *) + 1, namelen);
 		return (m);
 	}
 
 	m->m_len = 0;
 	out = mtod(m, char *);
 	limit = out + M_TRAILINGSPACE(m);
 
 	/* if not certain about my name, return empty buffer */
 	if (namelen == 0)
 		return (m);
 
 	end = name + namelen;
 
 	/*
 	 * guess if it looks like shortened hostname, or FQDN.
 	 * shortened hostname needs two trailing "\0".
 	 */
 	dots = 0;
 	for (scan = name; scan < end; scan++)
 		if (*scan == '.')
 			dots++;
 	nterm = (dots < 2) ? 2 : 1;
 
 	label = name;
 	while (out < limit && label < end) {
 		/* find the end of the current label */
 		for (scan = label; scan < end && *scan != '\0' && *scan != '.';
 		    scan++)
 			continue;
 		lablen = scan - label;
 
 		/*
 		 * DNS label length restriction, RFC1035 page 8.
 		 * The empty label is refused as well, so that "foo..bar"
 		 * does not produce a 0-length label.  The result must also
 		 * fit into the mbuf.
 		 */
 		if (lablen <= 0 || lablen >= 64 || out + lablen + 1 >= limit)
 			goto fail;
 
 		*out++ = lablen;
 		bcopy(label, out, lablen);
 		out += lablen;
 
 		/* step over the separating dot, if any */
 		label = scan;
 		if (label < end && *label == '.')
 			label++;
 	}
 
 	/* termination */
 	if (out + nterm >= limit)
 		goto fail;
 	while (nterm-- > 0)
 		*out++ = '\0';
 	m->m_len = out - mtod(m, char *);
 	return (m);
 
  fail:
 	m_freem(m);
 	return (NULL);
 }
 
 /*
  * Check if two DNS-encoded strings match.  Takes care of the truncated
  * form (with \0\0 at the end).  No compression support.
  * XXX upper/lowercase match (see RFC2065)
  *
  * a, alen - first encoded name and its length in bytes
  * b, blen - second encoded name and its length in bytes
  *
  * Returns 1 when the names are byte-identical, or when they agree label by
  * label up to the point where one of them ends in the truncated form;
  * returns 0 otherwise, including for malformed input (negative length,
  * missing terminator, compression pointers, labels running past the end).
  */
 static int
 ni6_dnsmatch(const char *a, int alen, const char *b, int blen)
 {
 	const char *a0, *b0;
 	int l;
 
 	/*
 	 * The lengths are derived from packet sizes.  A negative value must
 	 * never reach bcmp(), where it would turn into a huge size.
 	 */
 	if (alen < 0 || blen < 0)
 		return 0;
 
 	/* simplest case - need validation? */
 	if (alen == blen && bcmp(a, b, alen) == 0)
 		return 1;
 
 	a0 = a;
 	b0 = b;
 
 	/* termination is mandatory */
 	if (alen < 2 || blen < 2)
 		return 0;
 	if (a0[alen - 1] != '\0' || b0[blen - 1] != '\0')
 		return 0;
 	/* from here on the lengths exclude the final terminator */
 	alen--;
 	blen--;
 
 	/* the loop condition guarantees one readable length octet each */
 	while (a - a0 < alen && b - b0 < blen) {
 		if ((signed char)a[0] < 0 || (signed char)b[0] < 0)
 			return 0;
 		/* we don't support compression yet */
 		if (a[0] >= 64 || b[0] >= 64)
 			return 0;
 
 		/* truncated case */
 		if (a[0] == 0 && a - a0 == alen - 1)
 			return 1;
 		if (b[0] == 0 && b - b0 == blen - 1)
 			return 1;
 		/* an empty label anywhere else is malformed */
 		if (a[0] == 0 || b[0] == 0)
 			return 0;
 
 		if (a[0] != b[0])
 			return 0;
 		l = a[0];
 		/* the label body must lie within both names */
 		if (a - a0 + 1 + l > alen || b - b0 + 1 + l > blen)
 			return 0;
 		if (bcmp(a + 1, b + 1, l) != 0)
 			return 0;
 
 		a += 1 + l;
 		b += 1 + l;
 	}
 
 	/* both names must have been consumed completely */
 	if (a - a0 == alen && b - b0 == blen)
 		return 1;
 	else
 		return 0;
 }
 
 /*
  * Calculate the number of addresses to be returned in the node info reply.
  *
  * ni6  - the query header; ni_flags selects the scopes wanted and ni_code
  *        tells the kind of subject
  * m    - the query packet (not used here)
  * ifpp - set to the interface owning the subject address when, without
  *        the "all" flag, that address is found
  * subj - the IPv6 subject address; only consulted when the "all" flag is
  *        not set
  *
  * Returns the count for the subject's interface when it was found, or
  * otherwise the count over all interfaces in V_ifnet.
  */
 static int
 ni6_addrs(struct icmp6_nodeinfo *ni6, struct mbuf *m, struct ifnet **ifpp,
     struct in6_addr *subj)
 {
 	struct ifnet *ifp;
 	struct in6_ifaddr *ifa6;
 	struct ifaddr *ifa;
 	int niflags, wantall, scopeflag;
 	int total, addrsofif, iffound;
 
 	niflags = ni6->ni_flags;
 
 	NET_EPOCH_ASSERT();
 
 	wantall = (niflags & NI_NODEADDR_FLAG_ALL) != 0;
 	total = 0;
 	iffound = 0;
 
 	/*
 	 * XXX: we only support IPv6 subject address for this Qtype, and
 	 * a missing subject must be impossible...
 	 */
 	if (!wantall &&
 	    (ni6->ni_code != ICMP6_NI_SUBJ_IPV6 || subj == NULL))
 		return (0);
 
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		addrsofif = 0;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET6)
 				continue;
 			ifa6 = (struct in6_ifaddr *)ifa;
 
 			/* is this the interface the subject lives on? */
 			if (!wantall &&
 			    IN6_ARE_ADDR_EQUAL(subj, &ifa6->ia_addr.sin6_addr))
 				iffound = 1;
 
 			/*
 			 * IPv4-mapped addresses can only be returned by a
 			 * Node Information proxy, since they represent
 			 * addresses of IPv4-only nodes, which perforce do
 			 * not implement this protocol.
 			 * [icmp-name-lookups-07, Section 5.4]
 			 * So we don't support NI_NODEADDR_FLAG_COMPAT in
 			 * this function at this moment.
 			 */
 
 			/*
 			 * Map the address scope to the query flag that has
 			 * to be set for it; other scopes are never counted.
 			 * What do we have to do about ::1?
 			 */
 			switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) {
 			case IPV6_ADDR_SCOPE_LINKLOCAL:
 				scopeflag = NI_NODEADDR_FLAG_LINKLOCAL;
 				break;
 			case IPV6_ADDR_SCOPE_SITELOCAL:
 				scopeflag = NI_NODEADDR_FLAG_SITELOCAL;
 				break;
 			case IPV6_ADDR_SCOPE_GLOBAL:
 				scopeflag = NI_NODEADDR_FLAG_GLOBAL;
 				break;
 			default:
 				scopeflag = 0;
 				break;
 			}
 			if ((niflags & scopeflag) == 0)
 				continue;
 
 			/*
 			 * check if anycast is okay.
 			 * XXX: just experimental.  not in the spec.
 			 */
 			if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 &&
 			    (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0)
 				continue; /* we need only unicast addresses */
 
 			/* temporary addresses only when configured so */
 			if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
 			    (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0)
 				continue;
 
 			addrsofif++;
 		}
 		if (iffound) {
 			*ifpp = ifp;
 			return (addrsofif);
 		}
 
 		total += addrsofif;
 	}
 
 	return (total);
 }
 
 /*
  * Store the (TTL, address) pairs of a NI_QTYPE_NODEADDR reply.
  *
  * ni6   - the query header; its ni_flags select which addresses qualify
  * nni6  - the reply header; the pairs are written right behind it and
  *         NI_NODEADDR_FLAG_TRUNCATE is set in it when space runs out
  * ifp0  - if non-NULL, only this interface is searched; otherwise all
  *         interfaces in V_ifnet are searched, provided the query has
  *         NI_NODEADDR_FLAG_ALL set
  * resid - number of bytes available behind nni6
  *
  * Addresses are emitted in two passes: non-deprecated ones first, then the
  * deprecated ones, starting at the first interface that had one.
  *
  * Returns the number of bytes stored.
  */
 static int
 ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6,
     struct ifnet *ifp0, int resid)
 {
 	struct ifnet *ifp;
 	struct in6_ifaddr *ifa6;
 	struct ifaddr *ifa;
 	struct ifnet *ifp_dep = NULL;
 	int copied = 0, allow_deprecated = 0;
 	u_char *cp = (u_char *)(nni6 + 1);
 	int niflags = ni6->ni_flags;
 	u_int32_t ltime;
 
 	NET_EPOCH_ASSERT();
 
 	if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL))
 		return (0);	/* needless to copy */
 
 	ifp = ifp0 ? ifp0 : CK_STAILQ_FIRST(&V_ifnet);
   again:
 
 	for (; ifp; ifp = CK_STAILQ_NEXT(ifp, if_link)) {
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET6)
 				continue;
 			ifa6 = (struct in6_ifaddr *)ifa;
 
 			if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) != 0 &&
 			    allow_deprecated == 0) {
 				/*
 				 * preferred addresses should be put before
 				 * deprecated addresses.
 				 */
 
 				/* record the interface for later search */
 				if (ifp_dep == NULL)
 					ifp_dep = ifp;
 
 				continue;
 			} else if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) == 0 &&
 			    allow_deprecated != 0)
 				continue; /* we now collect deprecated addrs */
 
 			/* What do we have to do about ::1? */
 			switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) {
 			case IPV6_ADDR_SCOPE_LINKLOCAL:
 				if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0)
 					continue;
 				break;
 			case IPV6_ADDR_SCOPE_SITELOCAL:
 				if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0)
 					continue;
 				break;
 			case IPV6_ADDR_SCOPE_GLOBAL:
 				if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0)
 					continue;
 				break;
 			default:
 				continue;
 			}
 
 			/*
 			 * check if anycast is okay.
 			 * XXX: just experimental.  not in the spec.
 			 */
 			if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 &&
 			    (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0)
 				continue;
 			if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
 			    (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) {
 				continue;
 			}
 
 			/* now we can copy the address */
 			if (resid < sizeof(struct in6_addr) +
 			    sizeof(u_int32_t)) {
 				/*
 				 * No room for another pair: give up copying.
 				 * Set the truncate flag and return.
 				 */
 				nni6->ni_flags |= NI_NODEADDR_FLAG_TRUNCATE;
 				return (copied);
 			}
 
 			/*
 			 * Set the TTL of the address.
 			 * The TTL value should be one of the following
 			 * according to the specification:
 			 *
 			 * 1. The remaining lifetime of a DHCP lease on the
 			 *    address, or
 			 * 2. The remaining Valid Lifetime of a prefix from
 			 *    which the address was derived through Stateless
 			 *    Autoconfiguration.
 			 *
 			 * Note that we currently do not support stateful
 			 * address configuration by DHCPv6, so the former
 			 * case can't happen.
 			 */
 			if (ifa6->ia6_lifetime.ia6t_expire == 0)
 				ltime = ND6_INFINITE_LIFETIME;
 			else {
 				if (ifa6->ia6_lifetime.ia6t_expire >
 				    time_uptime)
 					ltime = htonl(ifa6->ia6_lifetime.ia6t_expire - time_uptime);
 				else
 					ltime = 0;
 			}
 
 			/* the TTL precedes the address in each pair */
 			bcopy(&ltime, cp, sizeof(u_int32_t));
 			cp += sizeof(u_int32_t);
 
 			/* copy the address itself */
 			bcopy(&ifa6->ia_addr.sin6_addr, cp,
 			    sizeof(struct in6_addr));
 			in6_clearscope((struct in6_addr *)cp); /* XXX */
 			cp += sizeof(struct in6_addr);
 
 			resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t));
 			copied += (sizeof(struct in6_addr) + sizeof(u_int32_t));
 		}
 		if (ifp0)	/* we need search only on the specified IF */
 			break;
 	}
 
 	/* second pass: restart where the first deprecated address was seen */
 	if (allow_deprecated == 0 && ifp_dep != NULL) {
 		ifp = ifp_dep;
 		allow_deprecated = 1;
 
 		goto again;
 	}
 
 	return (copied);
 }
 
 /*
  * Deliver a received ICMPv6 packet to the raw sockets that want it.
  *
  * Every PCB on V_ripcb that is an IPv6 ICMPv6 raw socket, matches the
  * packet's addresses (when the socket has them bound/connected) and whose
  * ICMPv6 filter does not block the message type gets the packet appended
  * to its receive buffer.  All matching sockets but the last receive a copy;
  * the last one receives the original mbuf.
  *
  * mp  - in: the packet; out: always set to NULL, the packet is consumed
  * off - offset of the ICMPv6 header from the start of the packet; the
  *       headers before it are stripped prior to delivery
  *
  * Always returns IPPROTO_DONE.
  *
  * XXX almost dup'ed code with rip6_input.
  */
 static int
 icmp6_rip6_input(struct mbuf **mp, int off)
 {
 	struct mbuf *m = *mp;
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	struct inpcb *inp;
 	struct inpcb *last = NULL;
 	struct sockaddr_in6 fromsa;
 	struct icmp6_hdr *icmp6;
 	struct mbuf *opts = NULL;
 
 	NET_EPOCH_ASSERT();
 
 	/* This is assumed to be safe; icmp6_input() does a pullup. */
 	icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off);
 
 	/*
 	 * XXX: the address may have embedded scope zone ID, which should be
 	 * hidden from applications.
 	 */
 	bzero(&fromsa, sizeof(fromsa));
 	fromsa.sin6_family = AF_INET6;
 	fromsa.sin6_len = sizeof(struct sockaddr_in6);
 	fromsa.sin6_addr = ip6->ip6_src;
 	if (sa6_recoverscope(&fromsa)) {
 		m_freem(m);
 		*mp = NULL;
 		return (IPPROTO_DONE);
 	}
 
 	/*
 	 * Delivery lags one match behind: when a new match is found the
 	 * previous one ("last", still read-locked) is served with a copy.
 	 */
 	CK_LIST_FOREACH(inp, &V_ripcb, inp_list) {
 		if ((inp->inp_vflag & INP_IPV6) == 0)
 			continue;
 		if (inp->inp_ip_p != IPPROTO_ICMPV6)
 			continue;
 		if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
 		   !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst))
 			continue;
 		if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
 		   !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src))
 			continue;
 		INP_RLOCK(inp);
 		if (__predict_false(inp->inp_flags2 & INP_FREED)) {
 			INP_RUNLOCK(inp);
 			continue;
 		}
 		if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type,
 		    inp->in6p_icmp6filt)) {
 			INP_RUNLOCK(inp);
 			continue;
 		}
 		if (last != NULL) {
 			struct	mbuf *n = NULL;
 
 			/*
 			 * Recent network drivers tend to allocate a single
 			 * mbuf cluster, rather than to make a couple of
 			 * mbufs without clusters.  Also, since the IPv6 code
 			 * path tries to avoid m_pullup(), it is highly
 			 * probable that we still have an mbuf cluster here
 			 * even though the necessary length can be stored in an
 			 * mbuf's internal buffer.
 			 * Meanwhile, the default size of the receive socket
 			 * buffer for raw sockets is not so large.  This means
 			 * the possibility of packet loss is relatively higher
 			 * than before.  To avoid this scenario, we copy the
 			 * received data to a separate mbuf that does not use
 			 * a cluster, if possible.
 			 * XXX: it is better to copy the data after stripping
 			 * intermediate headers.
 			 */
 			if ((m->m_flags & M_EXT) && m->m_next == NULL &&
 			    m->m_len <= MHLEN) {
 				n = m_get(M_NOWAIT, m->m_type);
 				if (n != NULL) {
 					if (m_dup_pkthdr(n, m, M_NOWAIT)) {
 						bcopy(m->m_data, n->m_data,
 						      m->m_len);
 						n->m_len = m->m_len;
 					} else {
 						m_free(n);
 						n = NULL;
 					}
 				}
 			}
 			/* fall back to an ordinary copy of the chain */
 			if (n != NULL ||
 			    (n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) {
 				if (last->inp_flags & INP_CONTROLOPTS)
 					ip6_savecontrol(last, n, &opts);
 				/* strip intermediate headers */
 				m_adj(n, off);
 				SOCKBUF_LOCK(&last->inp_socket->so_rcv);
 				if (sbappendaddr_locked(
 				    &last->inp_socket->so_rcv,
 				    (struct sockaddr *)&fromsa, n, opts)
 				    == 0) {
 					/* should notify about lost packet */
 					m_freem(n);
 					if (opts) {
 						m_freem(opts);
 					}
 					SOCKBUF_UNLOCK(
 					    &last->inp_socket->so_rcv);
 				} else
 					sorwakeup_locked(last->inp_socket);
 				/* opts was either queued or freed above */
 				opts = NULL;
 			}
 			INP_RUNLOCK(last);
 		}
 		/* inp stays read-locked until it has been served */
 		last = inp;
 	}
 	if (last != NULL) {
 		/* the final match receives the original packet */
 		if (last->inp_flags & INP_CONTROLOPTS)
 			ip6_savecontrol(last, m, &opts);
 		/* strip intermediate headers */
 		m_adj(m, off);
 
 		/* avoid using mbuf clusters if possible (see above) */
 		if ((m->m_flags & M_EXT) && m->m_next == NULL &&
 		    m->m_len <= MHLEN) {
 			struct mbuf *n;
 
 			n = m_get(M_NOWAIT, m->m_type);
 			if (n != NULL) {
 				if (m_dup_pkthdr(n, m, M_NOWAIT)) {
 					bcopy(m->m_data, n->m_data, m->m_len);
 					n->m_len = m->m_len;
 
 					m_freem(m);
 					m = n;
 				} else {
 					m_freem(n);
 					n = NULL;
 				}
 			}
 		}
 		SOCKBUF_LOCK(&last->inp_socket->so_rcv);
 		if (sbappendaddr_locked(&last->inp_socket->so_rcv,
 		    (struct sockaddr *)&fromsa, m, opts) == 0) {
 			m_freem(m);
 			if (opts)
 				m_freem(opts);
 			SOCKBUF_UNLOCK(&last->inp_socket->so_rcv);
 		} else
 			sorwakeup_locked(last->inp_socket);
 		INP_RUNLOCK(last);
 	} else {
 		/* nobody wanted it: drop and undo the delivery count */
 		m_freem(m);
 		IP6STAT_DEC(ip6s_delivered);
 	}
 	*mp = NULL;
 	return (IPPROTO_DONE);
 }
 
 /*
  * Reflect the ip6 packet back to the source.
  * OFF points to the icmp6 header, counted from the top of the mbuf.
  *
  * Headers between the IPv6 and the ICMPv6 header are stripped, the IPv6
  * header is rewritten in place (addresses swapped, flow label cleared,
  * hop limit and next header set), the ICMPv6 checksum is recomputed and
  * the packet is handed to ip6_output().
  *
  * The mbuf is consumed: it is either passed to ip6_output() or freed at
  * the "bad" label.  When m_pullup() fails the function returns without
  * freeing; presumably m_pullup() has already released the chain then
  * (confirm against mbuf(9)).
  */
 static void
 icmp6_reflect(struct mbuf *m, size_t off)
 {
 	struct in6_addr src6, *srcp;
 	struct ip6_hdr *ip6;
 	struct icmp6_hdr *icmp6;
 	struct in6_ifaddr *ia = NULL;
 	struct ifnet *outif = NULL;
 	int plen;
 	int type, code, hlim;
 
 	/* too short to reflect */
 	if (off < sizeof(struct ip6_hdr)) {
 		nd6log((LOG_DEBUG,
 		    "sanity fail: off=%lx, sizeof(ip6)=%lx in %s:%d\n",
 		    (u_long)off, (u_long)sizeof(struct ip6_hdr),
 		    __FILE__, __LINE__));
 		goto bad;
 	}
 
 	/*
 	 * If there are extra headers between IPv6 and ICMPv6, strip
 	 * off that header first.
 	 */
 #ifdef DIAGNOSTIC
 	if (sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) > MHLEN)
 		panic("assumption failed in icmp6_reflect");
 #endif
 	if (off > sizeof(struct ip6_hdr)) {
 		size_t l;
 		struct ip6_hdr nip6;
 
 		/*
 		 * Save the IPv6 header, trim the intermediate headers from
 		 * the front and put the saved header back in front of the
 		 * ICMPv6 header.
 		 */
 		l = off - sizeof(struct ip6_hdr);
 		m_copydata(m, 0, sizeof(nip6), (caddr_t)&nip6);
 		m_adj(m, l);
 		l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr);
 		if (m->m_len < l) {
 			if ((m = m_pullup(m, l)) == NULL)
 				return;
 		}
 		bcopy((caddr_t)&nip6, mtod(m, caddr_t), sizeof(nip6));
 	} else /* off == sizeof(struct ip6_hdr) */ {
 		size_t l;
 		l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr);
 		if (m->m_len < l) {
 			if ((m = m_pullup(m, l)) == NULL)
 				return;
 		}
 	}
 	/* payload length covered by the ICMPv6 checksum below */
 	plen = m->m_pkthdr.len - sizeof(struct ip6_hdr);
 	ip6 = mtod(m, struct ip6_hdr *);
 	ip6->ip6_nxt = IPPROTO_ICMPV6;
 	icmp6 = (struct icmp6_hdr *)(ip6 + 1);
 	type = icmp6->icmp6_type; /* keep type for statistics */
 	code = icmp6->icmp6_code; /* ditto. */
 	hlim = 0;
 	srcp = NULL;
 
 	/*
 	 * If the incoming packet was addressed directly to us (i.e. unicast),
 	 * use dst as the src for the reply.
 	 * The IN6_IFF_NOTREADY case should be VERY rare, but is possible
 	 * (for example) when we encounter an error while forwarding procedure
 	 * destined to a duplicated address of ours.
 	 */
 	if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */);
 		if (ia != NULL && !(ia->ia6_flags &
 		    (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY))) {
 			/* copy the address; the reference is dropped below */
 			src6 = ia->ia_addr.sin6_addr;
 			srcp = &src6;
 
 			if (m->m_pkthdr.rcvif != NULL) {
 				/* XXX: This may not be the outgoing interface */
 				hlim = ND_IFINFO(m->m_pkthdr.rcvif)->chlim;
 			} else
 				hlim = V_ip6_defhlim;
 		}
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 	}
 
 	if (srcp == NULL) {
 		int error;
 		struct in6_addr dst6;
 		uint32_t scopeid;
 
 		/*
 		 * This case matches to multicasts, our anycast, or unicasts
 		 * that we do not own.  Select a source address based on the
 		 * source address of the erroneous packet.
 		 */
 		in6_splitscope(&ip6->ip6_src, &dst6, &scopeid);
 		error = in6_selectsrc_addr(M_GETFIB(m), &dst6,
 		    scopeid, NULL, &src6, &hlim);
 
 		if (error) {
 			char ip6buf[INET6_ADDRSTRLEN];
 			nd6log((LOG_DEBUG,
 			    "icmp6_reflect: source can't be determined: "
 			    "dst=%s, error=%d\n",
 			    ip6_sprintf(ip6buf, &ip6->ip6_dst), error));
 			goto bad;
 		}
 		srcp = &src6;
 	}
 	/*
 	 * ip6_input() drops a packet if its src is multicast.
 	 * So, the src is never multicast.
 	 */
 	ip6->ip6_dst = ip6->ip6_src;
 	ip6->ip6_src = *srcp;
 	ip6->ip6_flow = 0;
 	/* clearing ip6_flow also cleared the version bits; restore them */
 	ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
 	ip6->ip6_vfc |= IPV6_VERSION;
 	ip6->ip6_nxt = IPPROTO_ICMPV6;
 	ip6->ip6_hlim = hlim;
 
 	/* the checksum field must be zero while it is being computed */
 	icmp6->icmp6_cksum = 0;
 	icmp6->icmp6_cksum = in6_cksum(m, IPPROTO_ICMPV6,
 	    sizeof(struct ip6_hdr), plen);
 
 	/*
 	 * XXX option handling
 	 */
 
 	m->m_flags &= ~(M_BCAST|M_MCAST);
 	m->m_pkthdr.rcvif = NULL;
 	ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL);
 	if (outif)
 		icmp6_ifoutstat_inc(outif, type, code);
 
 	return;
 
  bad:
 	m_freem(m);
 	return;
 }
 
 /*
  * Fast timeout entry point for ICMPv6.  All it does is forward the tick
  * to the MLD code via mld_fasttimo().
  * NOTE(review): presumably installed as the protocol's fast timer hook;
  * confirm against the protocol switch table.
  */
 void
 icmp6_fasttimo(void)
 {
 
 	mld_fasttimo();
 }
 
 /*
  * Slow timeout entry point for ICMPv6.  All it does is forward the tick
  * to the MLD code via mld_slowtimo().
  * NOTE(review): presumably installed as the protocol's slow timer hook;
  * confirm against the protocol switch table.
  */
 void
 icmp6_slowtimo(void)
 {
 
 	mld_slowtimo();
 }
 
 /*
  * Render the three addresses of a redirect as "(src=... dst=... tgt=...)"
  * for use in log messages.
  *
  * src6 - source address of the redirect packet
  * dst6 - redirect destination address
  * tgt6 - redirect target address
  *
  * The returned string lives in a function-static buffer: it is overwritten
  * by the next call and must not be freed.
  */
 static const char *
 icmp6_redirect_diag(struct in6_addr *src6, struct in6_addr *dst6,
     struct in6_addr *tgt6)
 {
 	static char buf[1024];
 	char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN],
 	    ip6buft[INET6_ADDRSTRLEN];
 
 	/* each address gets its own scratch buffer */
 	snprintf(buf, sizeof(buf),
 	    "(src=%s dst=%s tgt=%s)",
 	    ip6_sprintf(ip6bufs, src6),
 	    ip6_sprintf(ip6bufd, dst6),
 	    ip6_sprintf(ip6buft, tgt6));
 
 	return (buf);
 }
 
 void
 icmp6_redirect_input(struct mbuf *m, int off)
 {
 	struct ifnet *ifp;
 	struct ip6_hdr *ip6;
 	struct nd_redirect *nd_rd;
 	struct in6_addr src6, redtgt6, reddst6;
 	union nd_opts ndopts;
 	char ip6buf[INET6_ADDRSTRLEN];
 	char *lladdr;
 	int icmp6len, is_onlink, is_router, lladdrlen;
 
 	M_ASSERTPKTHDR(m);
 	KASSERT(m->m_pkthdr.rcvif != NULL, ("%s: no rcvif", __func__));
 
 	/* XXX if we are router, we don't update route by icmp6 redirect */
 	if (V_ip6_forwarding)
 		goto freeit;
 	if (!V_icmp6_rediraccept)
 		goto freeit;
 
 	/* RFC 6980: Nodes MUST silently ignore fragments */
 	if(m->m_flags & M_FRAGMENTED)
 		goto freeit;
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	icmp6len = ntohs(ip6->ip6_plen);
 	if (m->m_len < off + icmp6len) {
 		m = m_pullup(m, off + icmp6len);
 		if (m == NULL) {
 			IP6STAT_INC(ip6s_exthdrtoolong);
 			return;
 		}
 	}
 	ip6 = mtod(m, struct ip6_hdr *);
 	nd_rd = (struct nd_redirect *)((caddr_t)ip6 + off);
 
 	ifp = m->m_pkthdr.rcvif;
 	redtgt6 = nd_rd->nd_rd_target;
 	reddst6 = nd_rd->nd_rd_dst;
 
 	if (in6_setscope(&redtgt6, ifp, NULL) ||
 	    in6_setscope(&reddst6, ifp, NULL)) {
 		goto freeit;
 	}
 
 	/* validation */
 	src6 = ip6->ip6_src;
 	if (!IN6_IS_ADDR_LINKLOCAL(&src6)) {
 		nd6log((LOG_ERR,
 		    "ICMP6 redirect sent from %s rejected; "
 		    "must be from linklocal\n",
 		    ip6_sprintf(ip6buf, &src6)));
 		goto bad;
 	}
 	if (ip6->ip6_hlim != 255) {
 		nd6log((LOG_ERR,
 		    "ICMP6 redirect sent from %s rejected; "
 		    "hlim=%d (must be 255)\n",
 		    ip6_sprintf(ip6buf, &src6), ip6->ip6_hlim));
 		goto bad;
 	}
     {
 	/* ip6->ip6_src must be equal to gw for icmp6->icmp6_reddst */
 	struct nhop6_basic nh6;
 	struct in6_addr kdst;
 	uint32_t scopeid;
 
 	in6_splitscope(&reddst6, &kdst, &scopeid);
 	if (fib6_lookup_nh_basic(ifp->if_fib, &kdst, scopeid, 0, 0,&nh6)==0){
 		if ((nh6.nh_flags & NHF_GATEWAY) == 0) {
 			nd6log((LOG_ERR,
 			    "ICMP6 redirect rejected; no route "
 			    "with inet6 gateway found for redirect dst: %s\n",
 			    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 			goto bad;
 		}
 
 		/*
 		 * Embed scope zone id into next hop address, since
 		 * fib6_lookup_nh_basic() returns address without embedded
 		 * scope zone id.
 		 */
 		if (in6_setscope(&nh6.nh_addr, m->m_pkthdr.rcvif, NULL))
 			goto freeit;
 
 		if (IN6_ARE_ADDR_EQUAL(&src6, &nh6.nh_addr) == 0) {
 			nd6log((LOG_ERR,
 			    "ICMP6 redirect rejected; "
 			    "not equal to gw-for-src=%s (must be same): "
 			    "%s\n",
 			    ip6_sprintf(ip6buf, &nh6.nh_addr),
 			    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 			goto bad;
 		}
 	} else {
 		nd6log((LOG_ERR,
 		    "ICMP6 redirect rejected; "
 		    "no route found for redirect dst: %s\n",
 		    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 		goto bad;
 	}
     }
 	if (IN6_IS_ADDR_MULTICAST(&reddst6)) {
 		nd6log((LOG_ERR,
 		    "ICMP6 redirect rejected; "
 		    "redirect dst must be unicast: %s\n",
 		    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 		goto bad;
 	}
 
 	is_router = is_onlink = 0;
 	if (IN6_IS_ADDR_LINKLOCAL(&redtgt6))
 		is_router = 1;	/* router case */
 	if (bcmp(&redtgt6, &reddst6, sizeof(redtgt6)) == 0)
 		is_onlink = 1;	/* on-link destination case */
 	if (!is_router && !is_onlink) {
 		nd6log((LOG_ERR,
 		    "ICMP6 redirect rejected; "
 		    "neither router case nor onlink case: %s\n",
 		    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 		goto bad;
 	}
 
 	icmp6len -= sizeof(*nd_rd);
 	nd6_option_init(nd_rd + 1, icmp6len, &ndopts);
 	if (nd6_options(&ndopts) < 0) {
 		nd6log((LOG_INFO, "%s: invalid ND option, rejected: %s\n",
 		    __func__, icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 		/* nd6_options have incremented stats */
 		goto freeit;
 	}
 
 	lladdr = NULL;
 	lladdrlen = 0;
 	if (ndopts.nd_opts_tgt_lladdr) {
 		lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1);
 		lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3;
 	}
 
 	if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
 		nd6log((LOG_INFO, "%s: lladdrlen mismatch for %s "
 		    "(if %d, icmp6 packet %d): %s\n",
 		    __func__, ip6_sprintf(ip6buf, &redtgt6),
 		    ifp->if_addrlen, lladdrlen - 2,
 		    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 		goto bad;
 	}
 
 	/* Validation passed. */
 
 	/* RFC 2461 8.3 */
 	nd6_cache_lladdr(ifp, &redtgt6, lladdr, lladdrlen, ND_REDIRECT,
 	    is_onlink ? ND_REDIRECT_ONLINK : ND_REDIRECT_ROUTER);
 
 	/*
 	 * Install a gateway route in the better-router case or an interface
 	 * route in the on-link-destination case.
 	 */
 	{
 		struct sockaddr_in6 sdst;
 		struct sockaddr_in6 sgw;
 		struct sockaddr_in6 ssrc;
 		struct sockaddr *gw;
 		int rt_flags;
 		u_int fibnum;
 
 		bzero(&sdst, sizeof(sdst));
 		bzero(&ssrc, sizeof(ssrc));
 		sdst.sin6_family = ssrc.sin6_family = AF_INET6;
 		sdst.sin6_len = ssrc.sin6_len = sizeof(struct sockaddr_in6);
 		bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr));
 		bcopy(&src6, &ssrc.sin6_addr, sizeof(struct in6_addr));
 		rt_flags = 0;
 		if (is_router) {
 			bzero(&sgw, sizeof(sgw));
 			sgw.sin6_family = AF_INET6;
 			sgw.sin6_len = sizeof(struct sockaddr_in6);
 			bcopy(&redtgt6, &sgw.sin6_addr,
 				sizeof(struct in6_addr));
 			gw = (struct sockaddr *)&sgw;
 			rt_flags |= RTF_GATEWAY;
 		} else
 			gw = ifp->if_addr->ifa_addr;
 		for (fibnum = 0; fibnum < rt_numfibs; fibnum++)
 			rib_add_redirect(fibnum, (struct sockaddr *)&sdst, gw,
 			    (struct sockaddr *)&ssrc, ifp, rt_flags,
 			    V_icmp6_redirtimeout);
 	}
 	/* finally update cached route in each socket via pfctlinput */
     {
 	struct sockaddr_in6 sdst;
 
 	bzero(&sdst, sizeof(sdst));
 	sdst.sin6_family = AF_INET6;
 	sdst.sin6_len = sizeof(struct sockaddr_in6);
 	bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr));
 	pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&sdst);
     }
 
  freeit:
 	m_freem(m);
 	return;
 
  bad:
 	ICMP6STAT_INC(icp6s_badredirect);
 	m_freem(m);
 }
 
 void
 icmp6_redirect_output(struct mbuf *m0, struct nhop_object *nh)
 {
 	struct ifnet *ifp;	/* my outgoing interface */
 	struct in6_addr *ifp_ll6;
 	struct in6_addr *router_ll6;
 	struct ip6_hdr *sip6;	/* m0 as struct ip6_hdr */
 	struct mbuf *m = NULL;	/* newly allocated one */
 	struct m_tag *mtag;
 	struct ip6_hdr *ip6;	/* m as struct ip6_hdr */
 	struct nd_redirect *nd_rd;
 	struct llentry *ln = NULL;
 	size_t maxlen;
 	u_char *p;
 	struct ifnet *outif = NULL;
 	struct sockaddr_in6 src_sa;
 
 	icmp6_errcount(ND_REDIRECT, 0);
 
 	/* if we are not router, we don't send icmp6 redirect */
 	if (!V_ip6_forwarding)
 		goto fail;
 
 	/* sanity check */
 	if (!m0 || !nh || !(NH_IS_VALID(nh)) || !(ifp = nh->nh_ifp))
 		goto fail;
 
 	/*
 	 * Address check:
 	 *  the source address must identify a neighbor, and
 	 *  the destination address must not be a multicast address
 	 *  [RFC 2461, sec 8.2]
 	 */
 	sip6 = mtod(m0, struct ip6_hdr *);
 	bzero(&src_sa, sizeof(src_sa));
 	src_sa.sin6_family = AF_INET6;
 	src_sa.sin6_len = sizeof(src_sa);
 	src_sa.sin6_addr = sip6->ip6_src;
 	if (nd6_is_addr_neighbor(&src_sa, ifp) == 0)
 		goto fail;
 	if (IN6_IS_ADDR_MULTICAST(&sip6->ip6_dst))
 		goto fail;	/* what should we do here? */
 
 	/* rate limit */
 	if (icmp6_ratelimit(&sip6->ip6_src, ND_REDIRECT, 0))
 		goto fail;
 
 	/*
 	 * Since we are going to append up to 1280 bytes (= IPV6_MMTU),
 	 * we almost always ask for an mbuf cluster for simplicity.
 	 * (MHLEN < IPV6_MMTU is almost always true)
 	 */
 #if IPV6_MMTU >= MCLBYTES
 # error assumption failed about IPV6_MMTU and MCLBYTES
 #endif
 	m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 	if (m == NULL)
 		goto fail;
 	M_SETFIB(m, M_GETFIB(m0));
 	maxlen = M_TRAILINGSPACE(m);
 	maxlen = min(IPV6_MMTU, maxlen);
 	/* just for safety */
 	if (maxlen < sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) +
 	    ((sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7)) {
 		goto fail;
 	}
 
 	{
 		/* get ip6 linklocal address for ifp(my outgoing interface). */
 		struct in6_ifaddr *ia;
 		if ((ia = in6ifa_ifpforlinklocal(ifp,
 						 IN6_IFF_NOTREADY|
 						 IN6_IFF_ANYCAST)) == NULL)
 			goto fail;
 		ifp_ll6 = &ia->ia_addr.sin6_addr;
 		/* XXXRW: reference released prematurely. */
 		ifa_free(&ia->ia_ifa);
 	}
 
 	/* get ip6 linklocal address for the router. */
 	if (nh->nh_flags & NHF_GATEWAY) {
 		struct sockaddr_in6 *sin6;
 		sin6 = &nh->gw6_sa;
 		router_ll6 = &sin6->sin6_addr;
 		if (!IN6_IS_ADDR_LINKLOCAL(router_ll6))
 			router_ll6 = (struct in6_addr *)NULL;
 	} else
 		router_ll6 = (struct in6_addr *)NULL;
 
 	/* ip6 */
 	ip6 = mtod(m, struct ip6_hdr *);
 	ip6->ip6_flow = 0;
 	ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
 	ip6->ip6_vfc |= IPV6_VERSION;
 	/* ip6->ip6_plen will be set later */
 	ip6->ip6_nxt = IPPROTO_ICMPV6;
 	ip6->ip6_hlim = 255;
 	/* ip6->ip6_src must be linklocal addr for my outgoing if. */
 	bcopy(ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr));
 	bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr));
 
 	/* ND Redirect */
 	nd_rd = (struct nd_redirect *)(ip6 + 1);
 	nd_rd->nd_rd_type = ND_REDIRECT;
 	nd_rd->nd_rd_code = 0;
 	nd_rd->nd_rd_reserved = 0;
 	if (nh->nh_flags & NHF_GATEWAY) {
 		/*
 		 * nd_rd->nd_rd_target must be a link-local address in
 		 * better router cases.
 		 */
 		if (!router_ll6)
 			goto fail;
 		bcopy(router_ll6, &nd_rd->nd_rd_target,
 		    sizeof(nd_rd->nd_rd_target));
 		bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst,
 		    sizeof(nd_rd->nd_rd_dst));
 	} else {
 		/* make sure redtgt == reddst */
 		bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_target,
 		    sizeof(nd_rd->nd_rd_target));
 		bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst,
 		    sizeof(nd_rd->nd_rd_dst));
 	}
 
 	p = (u_char *)(nd_rd + 1);
 
 	if (!router_ll6)
 		goto nolladdropt;
 
 	{
 		/* target lladdr option */
 		int len;
 		struct nd_opt_hdr *nd_opt;
 		char *lladdr;
 
 		ln = nd6_lookup(router_ll6, 0, ifp);
 		if (ln == NULL)
 			goto nolladdropt;
 
 		len = sizeof(*nd_opt) + ifp->if_addrlen;
 		len = (len + 7) & ~7;	/* round by 8 */
 		/* safety check */
 		if (len + (p - (u_char *)ip6) > maxlen) 			
 			goto nolladdropt;
 
 		if (ln->la_flags & LLE_VALID) {
 			nd_opt = (struct nd_opt_hdr *)p;
 			nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
 			nd_opt->nd_opt_len = len >> 3;
 			lladdr = (char *)(nd_opt + 1);
 			bcopy(ln->ll_addr, lladdr, ifp->if_addrlen);
 			p += len;
 		}
 	}
 nolladdropt:
 	if (ln != NULL)
 		LLE_RUNLOCK(ln);
 		
 	m->m_pkthdr.len = m->m_len = p - (u_char *)ip6;
 
 	/* just to be safe */
 #ifdef M_DECRYPTED	/*not openbsd*/
 	if (m0->m_flags & M_DECRYPTED)
 		goto noredhdropt;
 #endif
 	if (p - (u_char *)ip6 > maxlen)
 		goto noredhdropt;
 
 	{
 		/* redirected header option */
 		int len;
 		struct nd_opt_rd_hdr *nd_opt_rh;
 
 		/*
 		 * compute the maximum size for icmp6 redirect header option.
 		 * XXX room for auth header?
 		 */
 		len = maxlen - (p - (u_char *)ip6);
 		len &= ~7;
 
 		/* This is just for simplicity. */
 		if (m0->m_pkthdr.len != m0->m_len) {
 			if (m0->m_next) {
 				m_freem(m0->m_next);
 				m0->m_next = NULL;
 			}
 			m0->m_pkthdr.len = m0->m_len;
 		}
 
 		/*
 		 * Redirected header option spec (RFC2461 4.6.3) talks nothing
 		 * about padding/truncate rule for the original IP packet.
 		 * From the discussion on IPv6imp in Feb 1999,
 		 * the consensus was:
 		 * - "attach as much as possible" is the goal
 		 * - pad if not aligned (original size can be guessed by
 		 *   original ip6 header)
 		 * Following code adds the padding if it is simple enough,
 		 * and truncates if not.
 		 */
 		if (m0->m_next || m0->m_pkthdr.len != m0->m_len)
 			panic("assumption failed in %s:%d", __FILE__,
 			    __LINE__);
 
 		if (len - sizeof(*nd_opt_rh) < m0->m_pkthdr.len) {
 			/* not enough room, truncate */
 			m0->m_pkthdr.len = m0->m_len = len -
 			    sizeof(*nd_opt_rh);
 		} else {
 			/* enough room, pad or truncate */
 			size_t extra;
 
 			extra = m0->m_pkthdr.len % 8;
 			if (extra) {
 				/* pad if easy enough, truncate if not */
 				if (8 - extra <= M_TRAILINGSPACE(m0)) {
 					/* pad */
 					m0->m_len += (8 - extra);
 					m0->m_pkthdr.len += (8 - extra);
 				} else {
 					/* truncate */
 					m0->m_pkthdr.len -= extra;
 					m0->m_len -= extra;
 				}
 			}
 			len = m0->m_pkthdr.len + sizeof(*nd_opt_rh);
 			m0->m_pkthdr.len = m0->m_len = len -
 			    sizeof(*nd_opt_rh);
 		}
 
 		nd_opt_rh = (struct nd_opt_rd_hdr *)p;
 		bzero(nd_opt_rh, sizeof(*nd_opt_rh));
 		nd_opt_rh->nd_opt_rh_type = ND_OPT_REDIRECTED_HEADER;
 		nd_opt_rh->nd_opt_rh_len = len >> 3;
 		p += sizeof(*nd_opt_rh);
 		m->m_pkthdr.len = m->m_len = p - (u_char *)ip6;
 
 		/* connect m0 to m */
 		m_tag_delete_chain(m0, NULL);
 		m0->m_flags &= ~M_PKTHDR;
 		m->m_next = m0;
 		m->m_pkthdr.len = m->m_len + m0->m_len;
 		m0 = NULL;
 	}
 noredhdropt:;
 	if (m0) {
 		m_freem(m0);
 		m0 = NULL;
 	}
 
 	/* XXX: clear embedded link IDs in the inner header */
 	in6_clearscope(&sip6->ip6_src);
 	in6_clearscope(&sip6->ip6_dst);
 	in6_clearscope(&nd_rd->nd_rd_target);
 	in6_clearscope(&nd_rd->nd_rd_dst);
 
 	ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr));
 
 	nd_rd->nd_rd_cksum = 0;
 	nd_rd->nd_rd_cksum = in6_cksum(m, IPPROTO_ICMPV6,
 	    sizeof(*ip6), ntohs(ip6->ip6_plen));
 
         if (send_sendso_input_hook != NULL) {
 		mtag = m_tag_get(PACKET_TAG_ND_OUTGOING, sizeof(unsigned short),
 			M_NOWAIT);
 		if (mtag == NULL)
 			goto fail;
 		*(unsigned short *)(mtag + 1) = nd_rd->nd_rd_type;
 		m_tag_prepend(m, mtag);
 	}
 
 	/* send the packet to outside... */
 	ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL);
 	if (outif) {
 		icmp6_ifstat_inc(outif, ifs6_out_msg);
 		icmp6_ifstat_inc(outif, ifs6_out_redirect);
 	}
 	ICMP6STAT_INC(icp6s_outhist[ND_REDIRECT]);
 
 	return;
 
 fail:
 	if (m)
 		m_freem(m);
 	if (m0)
 		m_freem(m0);
 }
 
 /*
  * ICMPv6 socket option processing.
  */
 int
 icmp6_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int error = 0;
 	int optlen;
 	struct inpcb *inp = sotoinpcb(so);
 	int level, op, optname;
 
 	if (sopt) {
 		level = sopt->sopt_level;
 		op = sopt->sopt_dir;
 		optname = sopt->sopt_name;
 		optlen = sopt->sopt_valsize;
 	} else
 		level = op = optname = optlen = 0;
 
 	if (level != IPPROTO_ICMPV6) {
 		return EINVAL;
 	}
 
 	switch (op) {
 	case PRCO_SETOPT:
 		switch (optname) {
 		case ICMP6_FILTER:
 		    {
 			struct icmp6_filter ic6f;
 
 			if (optlen != sizeof(ic6f)) {
 				error = EMSGSIZE;
 				break;
 			}
 			error = sooptcopyin(sopt, &ic6f, optlen, optlen);
 			if (error == 0) {
 				INP_WLOCK(inp);
 				*inp->in6p_icmp6filt = ic6f;
 				INP_WUNLOCK(inp);
 			}
 			break;
 		    }
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 
 	case PRCO_GETOPT:
 		switch (optname) {
 		case ICMP6_FILTER:
 		    {
 			struct icmp6_filter ic6f;
 
 			INP_RLOCK(inp);
 			ic6f = *inp->in6p_icmp6filt;
 			INP_RUNLOCK(inp);
 			error = sooptcopyout(sopt, &ic6f, sizeof(ic6f));
 			break;
 		    }
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Rate limit check for outgoing ICMPv6 messages.
  *
  * Returns 0 when the packet may be sent and 1 when the packet-per-second
  * limit says it SHOULD NOT be sent.
  *
  * XXX per-destination/type check necessary?
  *
  * dst, type and code are accepted for interface completeness but are not
  * consulted at this moment; the decision is global.
  */
 static int
 icmp6_ratelimit(const struct in6_addr *dst, const int type,
     const int code)
 {
 
 	/* PPS limit: a non-zero result means we are still within budget. */
 	if (ppsratecheck(&V_icmp6errppslim_last, &V_icmp6errpps_count,
 	    V_icmp6errppslim))
 		return 0;	/* okay to send */
 
 	/* The packet is subject to rate limit */
 	return 1;
 }
Index: head/sys/netinet6/in6_rmx.c
===================================================================
--- head/sys/netinet6/in6_rmx.c	(revision 361703)
+++ head/sys/netinet6/in6_rmx.c	(revision 361704)
@@ -1,189 +1,190 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: in6_rmx.c,v 1.11 2001/07/26 06:53:16 jinmei Exp $
  */
 
 /*-
  * Copyright 1994, 1995 Massachusetts Institute of Technology
  *
  * Permission to use, copy, modify, and distribute this software and
  * its documentation for any purpose and without fee is hereby
  * granted, provided that both the above copyright notice and this
  * permission notice appear in all copies, that both the above
  * copyright notice and this permission notice appear in all
  * supporting documentation, and that the name of M.I.T. not be used
  * in advertising or publicity pertaining to distribution of the
  * software without specific, written prior permission.  M.I.T. makes
  * no representations about the suitability of this software for any
  * purpose.  It is provided "as is" without express or implied
  * warranty.
  *
  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
  * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mpath.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/mbuf.h>
 #include <sys/rwlock.h>
 #include <sys/syslog.h>
 #include <sys/callout.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
+#include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 #include <net/route/nhop.h>
 #include <net/route/shared.h>
 
 #include <netinet/in.h>
 #include <netinet/ip_var.h>
 #include <netinet/in_var.h>
 
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 
 #include <netinet/icmp6.h>
 #include <netinet6/nd6.h>
 
 #include <netinet/tcp.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 
 extern int	in6_inithead(void **head, int off, u_int fibnum);
 #ifdef VIMAGE
 extern int	in6_detachhead(void **head, int off);
 #endif
 
 /*
  * Pre-add hook installed on IPv6 routing tables by in6_inithead().
  * Normalizes the nexthop before a route is added: clamps the MTU, marks
  * default-route nexthops and fills in the nexthop type if unset.
  * fibnum and addr are not used.  Always returns 0.
  */
 static int
 rib6_preadd(u_int fibnum, const struct sockaddr *addr, const struct sockaddr *mask,
     struct nhop_object *nh)
 {
 	const struct sockaddr_in6 *mask6;
 	uint16_t nh_type;
 
 	/* XXX: RTF_LOCAL */
 
 	/*
 	 * Route MTU: an unset (zero) or too large value is replaced by
 	 * the link MTU of the nexthop interface.
 	 */
 	if (nh->nh_mtu == 0 || nh->nh_mtu > IN6_LINKMTU(nh->nh_ifp))
 		nh->nh_mtu = IN6_LINKMTU(nh->nh_ifp);
 
 	/* A non-host route with an all-zero mask is the default route. */
 	mask6 = (const struct sockaddr_in6 *)mask;
 	if ((nhop_get_rtflags(nh) & RTF_HOST) == 0 && mask6 != NULL &&
 	    IN6_IS_ADDR_UNSPECIFIED(&mask6->sin6_addr))
 		nh->nh_flags |= NHF_DEFAULT;
 
 	/* Leave an already assigned nexthop type alone. */
 	if (nhop_get_type(nh) != 0)
 		return (0);
 
 	nh_type = (nh->nh_flags & NHF_GATEWAY) ?
 	    NH_TYPE_IPV6_ETHER_NHOP : NH_TYPE_IPV6_ETHER_RSLV;
 	nhop_set_type(nh, nh_type);
 
 	return (0);
 }
 
 /*
  * Initialize our routing tree for the given FIB.
  *
  * On success stores the new table in *head and returns 1; returns 0 and
  * leaves *head untouched when the table cannot be created.  The off
  * argument is not used.
  */
 int
 in6_inithead(void **head, int off, u_int fibnum)
 {
 	struct rib_head *rh;
 
 	/* The key offset is passed in bits: byte offset of sin6_addr * 8. */
 	rh = rt_table_init(offsetof(struct sockaddr_in6, sin6_addr) << 3,
 	    AF_INET6, fibnum);
 	if (rh != NULL) {
 		rh->rnh_preadd = rib6_preadd;
 #ifdef	RADIX_MPATH
 		rt_mpath_init_rnh(rh);
 #endif
 		*head = (void *)rh;
 		return (1);
 	}
 
 	return (0);
 }
 
 #ifdef VIMAGE
 /*
  * Tear down the routing table stored in *head.  The off argument is not
  * used.  Always returns 1.
  */
 int
 in6_detachhead(void **head, int off)
 {
 	struct rib_head *rh;
 
 	rh = (struct rib_head *)(*head);
 	rt_table_destroy(rh);
 
 	return (1);
 }
 #endif
 
 /*
  * Extended API for IPv6 FIB support.
  *
  * Forwards the request unchanged to rtrequest_fib() and returns its result.
  */
 int
 in6_rtrequest(int req, struct sockaddr *dst, struct sockaddr *gw,
     struct sockaddr *mask, int flags, struct rtentry **ret_nrt, u_int fibnum)
 {
 	int error;
 
 	error = rtrequest_fib(req, dst, gw, mask, flags, ret_nrt, fibnum);
 
 	return (error);
 }
 
Index: head/sys/netinet6/nd6_rtr.c
===================================================================
--- head/sys/netinet6/nd6_rtr.c	(revision 361703)
+++ head/sys/netinet6/nd6_rtr.c	(revision 361704)
@@ -1,2600 +1,2601 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: nd6_rtr.c,v 1.111 2001/04/27 01:37:15 jinmei Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/refcount.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/errno.h>
 #include <sys/rmlock.h>
 #include <sys/rwlock.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/queue.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 #include <net/radix.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <net/if_llatbl.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/in6_ifattach.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/scope6_var.h>
 
 static struct nd_defrouter *defrtrlist_update(struct nd_defrouter *);
 static int prelist_update(struct nd_prefixctl *, struct nd_defrouter *,
     struct mbuf *, int);
 static int nd6_prefix_onlink(struct nd_prefix *);
 
 TAILQ_HEAD(nd6_drhead, nd_defrouter);
 VNET_DEFINE_STATIC(struct nd6_drhead, nd6_defrouter);
 #define	V_nd6_defrouter			VNET(nd6_defrouter)
 
 VNET_DECLARE(int, nd6_recalc_reachtm_interval);
 #define	V_nd6_recalc_reachtm_interval	VNET(nd6_recalc_reachtm_interval)
 
 VNET_DEFINE_STATIC(struct ifnet *, nd6_defifp);
 VNET_DEFINE(int, nd6_defifindex);
 #define	V_nd6_defifp			VNET(nd6_defifp)
 
 VNET_DEFINE(int, ip6_use_tempaddr) = 0;
 
 VNET_DEFINE(int, ip6_desync_factor);
 VNET_DEFINE(u_int32_t, ip6_temp_preferred_lifetime) = DEF_TEMP_PREFERRED_LIFETIME;
 VNET_DEFINE(u_int32_t, ip6_temp_valid_lifetime) = DEF_TEMP_VALID_LIFETIME;
 
 VNET_DEFINE(int, ip6_temp_regen_advance) = TEMPADDR_REGEN_ADVANCE;
 
 #ifdef EXPERIMENTAL
 VNET_DEFINE(int, nd6_ignore_ipv6_only_ra) = 1;
 #endif
 
 SYSCTL_DECL(_net_inet6_icmp6);
 
 /* RTPREF_MEDIUM has to be 0! */
 #define RTPREF_HIGH	1
 #define RTPREF_MEDIUM	0
 #define RTPREF_LOW	(-1)
 #define RTPREF_RESERVED	(-2)
 #define RTPREF_INVALID	(-3)	/* internal */
 
 /*
  * Take an additional reference on a default router entry.
  * The matching release is defrouter_rele().
  */
 static void
 defrouter_ref(struct nd_defrouter *dr)
 {
 
 	refcount_acquire(&dr->refcnt);
 }
 
 /*
  * Drop one reference on a default router entry and free the entry once
  * refcount_release() reports that the last reference is gone.
  */
 void
 defrouter_rele(struct nd_defrouter *dr)
 {
 
 	if (!refcount_release(&dr->refcnt))
 		return;		/* still referenced elsewhere */
 
 	free(dr, M_IP6NDP);
 }
 
 /*
  * Take a router off the global default router list.  When the caller
  * passes a queue in drq, the entry is appended to it; with a NULL drq the
  * entry is simply left unlinked.  The ND6 write lock must be held.
  */
 static void
 defrouter_unlink(struct nd_defrouter *dr, struct nd6_drhead *drq)
 {
 
 	ND6_WLOCK_ASSERT();
 
 	TAILQ_REMOVE(&V_nd6_defrouter, dr, dr_entry);
 	/* Bump the generation id to record that the list has changed. */
 	V_nd6_list_genid++;
 
 	if (drq == NULL)
 		return;
 	TAILQ_INSERT_TAIL(drq, dr, dr_entry);
 }
 
 /*
  * Receive Router Solicitation Message - just for routers.
  * Router solicitation/advertisement is mostly managed by a userland
  * program (rtadvd), so there is no nd6_ra_output() counterpart here; all
  * this function does is validate the message and record the sender's
  * link-layer address in the neighbor cache.
  *
  * m        - the received packet; always consumed.
  * off      - offset of the ICMPv6 header from the start of the IPv6 header.
  * icmp6len - length of the ICMPv6 message.
  *
  * Based on RFC 2461
  */
 void
 nd6_rs_input(struct mbuf *m, int off, int icmp6len)
 {
 	struct ifnet *ifp;
 	struct ip6_hdr *ip6;
 	struct nd_router_solicit *nd_rs;
 	struct in6_addr saddr6;
 	union nd_opts ndopts;
 	char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 	char *lladdr;
 	int lladdrlen;
 
 	ifp = m->m_pkthdr.rcvif;
 
 	/*
 	 * Accept RS only when V_ip6_forwarding=1 and the interface has
 	 * no ND6_IFF_ACCEPT_RTADV.
 	 */
 	if (!V_ip6_forwarding)
 		goto freeit;
 	if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV)
 		goto freeit;
 
 	/* RFC 6980: Nodes MUST silently ignore fragments */
 	if (m->m_flags & M_FRAGMENTED)
 		goto freeit;
 
 	/* Sanity checks */
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (ip6->ip6_hlim != 255) {
 		nd6log((LOG_ERR,
 		    "%s: invalid hlim (%d) from %s to %s on %s\n", __func__,
 		    ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src),
 		    ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp)));
 		goto bad;
 	}
 
 	/*
 	 * Don't update the neighbor cache, if src = ::.
 	 * This indicates that the src has no IP address assigned yet.
 	 */
 	saddr6 = ip6->ip6_src;
 	if (IN6_IS_ADDR_UNSPECIFIED(&saddr6))
 		goto freeit;
 
 	/* Make the whole ICMPv6 message contiguous before parsing it. */
 	if (m->m_len < off + icmp6len &&
 	    (m = m_pullup(m, off + icmp6len)) == NULL) {
 		IP6STAT_INC(ip6s_exthdrtoolong);
 		return;
 	}
 	/* The data pointer may have moved; re-derive the header pointers. */
 	ip6 = mtod(m, struct ip6_hdr *);
 	nd_rs = (struct nd_router_solicit *)((caddr_t)ip6 + off);
 
 	icmp6len -= sizeof(*nd_rs);
 	nd6_option_init(nd_rs + 1, icmp6len, &ndopts);
 	if (nd6_options(&ndopts) < 0) {
 		nd6log((LOG_INFO,
 		    "%s: invalid ND option, ignored\n", __func__));
 		/* nd6_options have incremented stats */
 		goto freeit;
 	}
 
 	/* Pick up the source link-layer address option, if present. */
 	if (ndopts.nd_opts_src_lladdr) {
 		lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1);
 		lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3;
 	} else {
 		lladdr = NULL;
 		lladdrlen = 0;
 	}
 
 	/*
 	 * The option length must equal the interface address length plus
 	 * the 2-byte option header, rounded up to a multiple of 8.
 	 */
 	if (lladdr && lladdrlen != ((ifp->if_addrlen + 2 + 7) & ~7)) {
 		nd6log((LOG_INFO,
 		    "%s: lladdrlen mismatch for %s (if %d, RS packet %d)\n",
 		    __func__, ip6_sprintf(ip6bufs, &saddr6),
 		    ifp->if_addrlen, lladdrlen - 2));
 		goto bad;
 	}
 
 	nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, ND_ROUTER_SOLICIT, 0);
 
  freeit:
 	m_freem(m);
 	return;
 
  bad:
 	ICMP6STAT_INC(icp6s_badrs);
 	m_freem(m);
 }
 
 #ifdef EXPERIMENTAL
 /*
  * An initial update routine for draft-ietf-6man-ipv6only-flag.
  * Iterate over all default routers known for the given interface and
  * check whether every one of them advertises the "S" (IPv6-Only) flag.
  * If they all do, set the ND6_IFF_IPV6_ONLY interface flag; otherwise
  * clear it.  With INET compiled in, the flag is not set while the
  * interface still has a forwardable IPv4 address configured.
  *
  * Does nothing when V_nd6_ignore_ipv6_only_ra is non-zero.
  */
 static void
 defrtr_ipv6_only_ifp(struct ifnet *ifp)
 {
 	struct nd_defrouter *dr;
 	bool ipv6_only, ipv6_only_old;
 #ifdef INET
 	struct epoch_tracker et;
 	struct ifaddr *ifa;
 	bool has_ipv4_addr;
 #endif
 
 	if (V_nd6_ignore_ipv6_only_ra != 0)
 		return;
 
 	/*
 	 * Start from "IPv6-only" and drop to false as soon as one router on
 	 * this interface lacks the flag.  Note that the result stays true
 	 * when the list holds no router for this interface.
 	 */
 	ipv6_only = true;
 	ND6_RLOCK();
 	TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry)
 		if (dr->ifp == ifp &&
 		    (dr->raflags & ND_RA_FLAG_IPV6_ONLY) == 0)
 			ipv6_only = false;
 	ND6_RUNLOCK();
 
 	/* Read the current state of the interface flag. */
 	IF_AFDATA_WLOCK(ifp);
 	ipv6_only_old = ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY;
 	IF_AFDATA_WUNLOCK(ifp);
 
 	/* If nothing changed, we have an early exit. */
 	if (ipv6_only == ipv6_only_old)
 		return;
 
 #ifdef INET
 	/*
 	 * Should we want to set the IPV6-ONLY flag, check if the
 	 * interface has a non-0/0 and non-link-local IPv4 address
 	 * configured on it.  If it has we will assume working
 	 * IPv4 operations and will clear the interface flag.
 	 */
 	has_ipv4_addr = false;
 	if (ipv6_only) {
 		/* The address list is walked inside a net epoch section. */
 		NET_EPOCH_ENTER(et);
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			if (in_canforward(
 			    satosin(ifa->ifa_addr)->sin_addr)) {
 				has_ipv4_addr = true;
 				break;
 			}
 		}
 		NET_EPOCH_EXIT(et);
 	}
 	if (ipv6_only && has_ipv4_addr) {
 		log(LOG_NOTICE, "%s rcvd RA w/ IPv6-Only flag set but has IPv4 "
 		    "configured, ignoring IPv6-Only flag.\n", ifp->if_xname);
 		ipv6_only = false;
 	}
 #endif
 
 	/* Commit the new state to the interface flags. */
 	IF_AFDATA_WLOCK(ifp);
 	if (ipv6_only)
 		ND_IFINFO(ifp)->flags |= ND6_IFF_IPV6_ONLY;
 	else
 		ND_IFINFO(ifp)->flags &= ~ND6_IFF_IPV6_ONLY;
 	IF_AFDATA_WUNLOCK(ifp);
 
 #ifdef notyet
 	/* Send notification of flag change. */
 #endif
 }
 
 /*
  * Unconditionally clear the ND6_IFF_IPV6_ONLY flag on an interface.
  * Called from nd6_ifnet_link_event() when the link goes down.
  * NOTE(review): "ipf" in the name looks like a transposition of "ifp";
  * renaming would also require changing the caller.
  */
 static void
 defrtr_ipv6_only_ipf_down(struct ifnet *ifp)
 {
 
 	IF_AFDATA_WLOCK(ifp);
 	ND_IFINFO(ifp)->flags &= ~ND6_IFF_IPV6_ONLY;
 	IF_AFDATA_WUNLOCK(ifp);
 }
 #endif	/* EXPERIMENTAL */
 
 /*
  * Link state change handler for ND6.
  *
  * arg       - unused.
  * ifp       - interface whose link state changed.
  * linkstate - new link state.
  *
  * Without EXPERIMENTAL this is a no-op.  With EXPERIMENTAL, a transition
  * to LINK_STATE_DOWN clears the interface's IPv6-Only flag.
  */
 void
 nd6_ifnet_link_event(void *arg __unused, struct ifnet *ifp, int linkstate)
 {
 
 	/*
 	 * XXX-BZ we might want to trigger re-evaluation of our default router
 	 * availability. E.g., on link down the default router might be
 	 * unreachable but a different interface might still have connectivity.
 	 */
 
 #ifdef EXPERIMENTAL
 	if (linkstate == LINK_STATE_DOWN)
 		defrtr_ipv6_only_ipf_down(ifp);
 #endif
 }
 
 /*
  * Receive Router Advertisement Message.
  *
  * Based on RFC 2461
  * TODO: on-link bit on prefix information
  * TODO: ND_RA_FLAG_{OTHER,MANAGED} processing
  */
 void
 nd6_ra_input(struct mbuf *m, int off, int icmp6len)
 {
 	struct ifnet *ifp;
 	struct nd_ifinfo *ndi;
 	struct ip6_hdr *ip6;
 	struct nd_router_advert *nd_ra;
 	struct in6_addr saddr6;
 	struct nd_defrouter *dr;
 	union nd_opts ndopts;
 	char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 	int mcast;
 
 	/*
 	 * We only accept RAs only when the per-interface flag
 	 * ND6_IFF_ACCEPT_RTADV is on the receiving interface.
 	 */
 	ifp = m->m_pkthdr.rcvif;
 	ndi = ND_IFINFO(ifp);
 	if (!(ndi->flags & ND6_IFF_ACCEPT_RTADV))
 		goto freeit;
 
 	/* RFC 6980: Nodes MUST silently ignore fragments */
 	if(m->m_flags & M_FRAGMENTED)
 		goto freeit;
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (ip6->ip6_hlim != 255) {
 		nd6log((LOG_ERR,
 		    "%s: invalid hlim (%d) from %s to %s on %s\n", __func__,
 		    ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src),
 		    ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp)));
 		goto bad;
 	}
 
 	saddr6 = ip6->ip6_src;
 	if (!IN6_IS_ADDR_LINKLOCAL(&saddr6)) {
 		nd6log((LOG_ERR,
 		    "%s: src %s is not link-local\n", __func__,
 		    ip6_sprintf(ip6bufs, &saddr6)));
 		goto bad;
 	}
 
 	if (m->m_len < off + icmp6len) {
 		m = m_pullup(m, off + icmp6len);
 		if (m == NULL) {
 			IP6STAT_INC(ip6s_exthdrtoolong);
 			return;
 		}
 	}
 	ip6 = mtod(m, struct ip6_hdr *);
 	nd_ra = (struct nd_router_advert *)((caddr_t)ip6 + off);
 
 	icmp6len -= sizeof(*nd_ra);
 	nd6_option_init(nd_ra + 1, icmp6len, &ndopts);
 	if (nd6_options(&ndopts) < 0) {
 		nd6log((LOG_INFO,
 		    "%s: invalid ND option, ignored\n", __func__));
 		/* nd6_options have incremented stats */
 		goto freeit;
 	}
 
 	mcast = 0;
 	dr = NULL;
     {
 	struct nd_defrouter dr0;
 	u_int32_t advreachable = nd_ra->nd_ra_reachable;
 
 	/* remember if this is a multicasted advertisement */
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
 		mcast = 1;
 
 	bzero(&dr0, sizeof(dr0));
 	dr0.rtaddr = saddr6;
 	dr0.raflags = nd_ra->nd_ra_flags_reserved;
 	/*
 	 * Effectively-disable routes from RA messages when
 	 * ND6_IFF_NO_RADR enabled on the receiving interface or
 	 * (ip6.forwarding == 1 && ip6.rfc6204w3 != 1).
 	 */
 	if (ndi->flags & ND6_IFF_NO_RADR)
 		dr0.rtlifetime = 0;
 	else if (V_ip6_forwarding && !V_ip6_rfc6204w3)
 		dr0.rtlifetime = 0;
 	else
 		dr0.rtlifetime = ntohs(nd_ra->nd_ra_router_lifetime);
 	dr0.expire = time_uptime + dr0.rtlifetime;
 	dr0.ifp = ifp;
 	/* unspecified or not? (RFC 2461 6.3.4) */
 	if (advreachable) {
 		advreachable = ntohl(advreachable);
 		if (advreachable <= MAX_REACHABLE_TIME &&
 		    ndi->basereachable != advreachable) {
 			ndi->basereachable = advreachable;
 			ndi->reachable = ND_COMPUTE_RTIME(ndi->basereachable);
 			ndi->recalctm = V_nd6_recalc_reachtm_interval; /* reset */
 		}
 	}
 	if (nd_ra->nd_ra_retransmit)
 		ndi->retrans = ntohl(nd_ra->nd_ra_retransmit);
 	if (nd_ra->nd_ra_curhoplimit) {
 		if (ndi->chlim < nd_ra->nd_ra_curhoplimit)
 			ndi->chlim = nd_ra->nd_ra_curhoplimit;
 		else if (ndi->chlim != nd_ra->nd_ra_curhoplimit) {
 			log(LOG_ERR, "RA with a lower CurHopLimit sent from "
 			    "%s on %s (current = %d, received = %d). "
 			    "Ignored.\n", ip6_sprintf(ip6bufs, &ip6->ip6_src),
 			    if_name(ifp), ndi->chlim, nd_ra->nd_ra_curhoplimit);
 		}
 	}
 	dr = defrtrlist_update(&dr0);
 #ifdef EXPERIMENTAL
 	defrtr_ipv6_only_ifp(ifp);
 #endif
     }
 
 	/*
 	 * prefix
 	 */
 	if (ndopts.nd_opts_pi) {
 		struct nd_opt_hdr *pt;
 		struct nd_opt_prefix_info *pi = NULL;
 		struct nd_prefixctl pr;
 
 		for (pt = (struct nd_opt_hdr *)ndopts.nd_opts_pi;
 		     pt <= (struct nd_opt_hdr *)ndopts.nd_opts_pi_end;
 		     pt = (struct nd_opt_hdr *)((caddr_t)pt +
 						(pt->nd_opt_len << 3))) {
 			if (pt->nd_opt_type != ND_OPT_PREFIX_INFORMATION)
 				continue;
 			pi = (struct nd_opt_prefix_info *)pt;
 
 			if (pi->nd_opt_pi_len != 4) {
 				nd6log((LOG_INFO,
 				    "%s: invalid option len %d for prefix "
 				    "information option, ignored\n", __func__,
 				    pi->nd_opt_pi_len));
 				continue;
 			}
 
 			if (128 < pi->nd_opt_pi_prefix_len) {
 				nd6log((LOG_INFO,
 				    "%s: invalid prefix len %d for prefix "
 				    "information option, ignored\n", __func__,
 				    pi->nd_opt_pi_prefix_len));
 				continue;
 			}
 
 			if (IN6_IS_ADDR_MULTICAST(&pi->nd_opt_pi_prefix)
 			 || IN6_IS_ADDR_LINKLOCAL(&pi->nd_opt_pi_prefix)) {
 				nd6log((LOG_INFO,
 				    "%s: invalid prefix %s, ignored\n",
 				    __func__, ip6_sprintf(ip6bufs,
 					&pi->nd_opt_pi_prefix)));
 				continue;
 			}
 
 			bzero(&pr, sizeof(pr));
 			pr.ndpr_prefix.sin6_family = AF_INET6;
 			pr.ndpr_prefix.sin6_len = sizeof(pr.ndpr_prefix);
 			pr.ndpr_prefix.sin6_addr = pi->nd_opt_pi_prefix;
 			pr.ndpr_ifp = (struct ifnet *)m->m_pkthdr.rcvif;
 
 			pr.ndpr_raf_onlink = (pi->nd_opt_pi_flags_reserved &
 			    ND_OPT_PI_FLAG_ONLINK) ? 1 : 0;
 			pr.ndpr_raf_auto = (pi->nd_opt_pi_flags_reserved &
 			    ND_OPT_PI_FLAG_AUTO) ? 1 : 0;
 			pr.ndpr_plen = pi->nd_opt_pi_prefix_len;
 			pr.ndpr_vltime = ntohl(pi->nd_opt_pi_valid_time);
 			pr.ndpr_pltime = ntohl(pi->nd_opt_pi_preferred_time);
 			(void)prelist_update(&pr, dr, m, mcast);
 		}
 	}
 	if (dr != NULL) {
 		defrouter_rele(dr);
 		dr = NULL;
 	}
 
 	/*
 	 * MTU
 	 */
 	if (ndopts.nd_opts_mtu && ndopts.nd_opts_mtu->nd_opt_mtu_len == 1) {
 		u_long mtu;
 		u_long maxmtu;
 
 		mtu = (u_long)ntohl(ndopts.nd_opts_mtu->nd_opt_mtu_mtu);
 
 		/* lower bound */
 		if (mtu < IPV6_MMTU) {
 			nd6log((LOG_INFO, "%s: bogus mtu option mtu=%lu sent "
 			    "from %s, ignoring\n", __func__,
 			    mtu, ip6_sprintf(ip6bufs, &ip6->ip6_src)));
 			goto skip;
 		}
 
 		/* upper bound */
 		maxmtu = (ndi->maxmtu && ndi->maxmtu < ifp->if_mtu)
 		    ? ndi->maxmtu : ifp->if_mtu;
 		if (mtu <= maxmtu) {
 			int change = (ndi->linkmtu != mtu);
 
 			ndi->linkmtu = mtu;
 			if (change) {
 				/* in6_maxmtu may change */
 				in6_setmaxmtu();
 				rt_updatemtu(ifp);
 			}
 		} else {
 			nd6log((LOG_INFO, "%s: bogus mtu=%lu sent from %s; "
 			    "exceeds maxmtu %lu, ignoring\n", __func__,
 			    mtu, ip6_sprintf(ip6bufs, &ip6->ip6_src), maxmtu));
 		}
 	}
 
  skip:
 
 	/*
 	 * Source link layer address
 	 */
     {
 	char *lladdr = NULL;
 	int lladdrlen = 0;
 
 	if (ndopts.nd_opts_src_lladdr) {
 		lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1);
 		lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3;
 	}
 
 	if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
 		nd6log((LOG_INFO,
 		    "%s: lladdrlen mismatch for %s (if %d, RA packet %d)\n",
 		    __func__, ip6_sprintf(ip6bufs, &saddr6),
 		    ifp->if_addrlen, lladdrlen - 2));
 		goto bad;
 	}
 
 	nd6_cache_lladdr(ifp, &saddr6, lladdr,
 	    lladdrlen, ND_ROUTER_ADVERT, 0);
 
 	/*
 	 * Installing a link-layer address might change the state of the
 	 * router's neighbor cache, which might also affect our on-link
 	 * detection of advertised prefixes.
 	 */
 	pfxlist_onlink_check();
     }
 
  freeit:
 	m_freem(m);
 	return;
 
  bad:
 	ICMP6STAT_INC(icp6s_badra);
 	m_freem(m);
 }
 
 /* PFXRTR */
 /*
  * Search the advertising-router list of prefix `pr' for the entry that
  * refers to default router `dr'.
  *
  * Returns the matching nd_pfxrouter, or NULL if `dr' is not on the list.
  * No reference is taken on the result.  The ND6 lock must be held.
  */
 static struct nd_pfxrouter *
 pfxrtr_lookup(struct nd_prefix *pr, struct nd_defrouter *dr)
 {
 	struct nd_pfxrouter *pfr;
 
 	ND6_LOCK_ASSERT();
 
 	LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) {
 		if (pfr->router == dr)
 			return (pfr);
 	}
 	return (NULL);
 }
 
 /*
  * Record `dr' as a router advertising prefix `pr', unless it is already
  * recorded.  The new list entry holds its own reference on `dr'.
  *
  * Must be called without the ND6 lock held.  The allocation is M_NOWAIT;
  * if it fails the association is silently not recorded.  When an entry is
  * actually inserted, the on-link state of the prefix list is re-evaluated.
  */
 static void
 pfxrtr_add(struct nd_prefix *pr, struct nd_defrouter *dr)
 {
 	struct nd_pfxrouter *pfr;
 	bool present, inserted;
 
 	ND6_UNLOCK_ASSERT();
 
 	/* Cheap pre-check under the read lock, before allocating. */
 	ND6_RLOCK();
 	present = (pfxrtr_lookup(pr, dr) != NULL);
 	ND6_RUNLOCK();
 	if (present)
 		return;
 
 	pfr = malloc(sizeof(*pfr), M_IP6NDP, M_NOWAIT | M_ZERO);
 	if (pfr == NULL)
 		return;
 	defrouter_ref(dr);
 	pfr->router = dr;
 
 	/*
 	 * The lock was dropped for the allocation, so look again under the
 	 * write lock before linking the entry.
 	 */
 	ND6_WLOCK();
 	inserted = (pfxrtr_lookup(pr, dr) == NULL);
 	if (inserted) {
 		LIST_INSERT_HEAD(&pr->ndpr_advrtrs, pfr, pfr_entry);
 	} else {
 		/* Somebody else added the entry first; undo our work. */
 		defrouter_rele(dr);
 		free(pfr, M_IP6NDP);
 	}
 	ND6_WUNLOCK();
 
 	if (inserted)
 		pfxlist_onlink_check();
 }
 
 /*
  * Unlink `pfr' from its prefix's advertising-router list, drop the router
  * reference it holds and free it.  The ND6 write lock must be held.
  */
 static void
 pfxrtr_del(struct nd_pfxrouter *pfr)
 {
 	struct nd_defrouter *rtr;
 
 	ND6_WLOCK_ASSERT();
 
 	LIST_REMOVE(pfr, pfr_entry);
 	rtr = pfr->router;
 	defrouter_rele(rtr);
 	free(pfr, M_IP6NDP);
 }
 
 
 /* Default router list processing sub routines. */
 static void
 defrouter_addreq(struct nd_defrouter *new)
 {
 	struct sockaddr_in6 def, mask, gate;
 	struct rtentry *newrt = NULL;
 	unsigned int fibnum;
 	int error;
 
 	bzero(&def, sizeof(def));
 	bzero(&mask, sizeof(mask));
 	bzero(&gate, sizeof(gate));
 
 	def.sin6_len = mask.sin6_len = gate.sin6_len =
 	    sizeof(struct sockaddr_in6);
 	def.sin6_family = gate.sin6_family = AF_INET6;
 	gate.sin6_addr = new->rtaddr;
 	fibnum = new->ifp->if_fib;
 
 	error = in6_rtrequest(RTM_ADD, (struct sockaddr *)&def,
 	    (struct sockaddr *)&gate, (struct sockaddr *)&mask,
 	    RTF_GATEWAY, &newrt, fibnum);
 	if (newrt != NULL)
 		rt_routemsg(RTM_ADD, newrt, new->ifp, 0, fibnum);
 	if (error == 0)
 		new->installed = 1;
 }
 
 /*
  * Remove the default route for a given router.
  * This is just a subroutine function for defrouter_select_fib(), and
  * should not be called from anywhere else.
  */
 static void
 defrouter_delreq(struct nd_defrouter *dr)
 {
 	struct sockaddr_in6 def, mask, gate;
 	struct rtentry *oldrt = NULL;
 	struct epoch_tracker et;
 	unsigned int fibnum;
 
 	bzero(&def, sizeof(def));
 	bzero(&mask, sizeof(mask));
 	bzero(&gate, sizeof(gate));
 
 	def.sin6_len = mask.sin6_len = gate.sin6_len =
 	    sizeof(struct sockaddr_in6);
 	def.sin6_family = gate.sin6_family = AF_INET6;
 	gate.sin6_addr = dr->rtaddr;
 	fibnum = dr->ifp->if_fib;
 
 	NET_EPOCH_ENTER(et);
 	in6_rtrequest(RTM_DELETE, (struct sockaddr *)&def,
 	    (struct sockaddr *)&gate,
 	    (struct sockaddr *)&mask, RTF_GATEWAY, &oldrt, fibnum);
 	if (oldrt != NULL)
 		rt_routemsg(RTM_DELETE, oldrt, dr->ifp, 0, fibnum);
 	NET_EPOCH_EXIT(et);
 
 	dr->installed = 0;
 }
 
 static void
 defrouter_del(struct nd_defrouter *dr)
 {
 	struct nd_defrouter *deldr = NULL;
 	struct nd_prefix *pr;
 	struct nd_pfxrouter *pfxrtr;
 
 	ND6_UNLOCK_ASSERT();
 
 	/*
 	 * Flush all the routing table entries that use the router
 	 * as a next hop.
 	 */
 	if (ND_IFINFO(dr->ifp)->flags & ND6_IFF_ACCEPT_RTADV)
 		rt6_flush(&dr->rtaddr, dr->ifp);
 
 #ifdef EXPERIMENTAL
 	defrtr_ipv6_only_ifp(dr->ifp);
 #endif
 
 	if (dr->installed) {
 		deldr = dr;
 		defrouter_delreq(dr);
 	}
 
 	/*
 	 * Also delete all the pointers to the router in each prefix lists.
 	 */
 	ND6_WLOCK();
 	LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 		if ((pfxrtr = pfxrtr_lookup(pr, dr)) != NULL)
 			pfxrtr_del(pfxrtr);
 	}
 	ND6_WUNLOCK();
 
 	pfxlist_onlink_check();
 
 	/*
 	 * If the router is the primary one, choose a new one.
 	 * Note that defrouter_select_fib() will remove the current
          * gateway from the routing table.
 	 */
 	if (deldr)
 		defrouter_select_fib(deldr->ifp->if_fib);
 
 	/*
 	 * Release the list reference.
 	 */
 	defrouter_rele(dr);
 }
 
 
 struct nd_defrouter *
 defrouter_lookup_locked(const struct in6_addr *addr, struct ifnet *ifp)
 {
 	struct nd_defrouter *dr;
 
 	ND6_LOCK_ASSERT();
 	TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry)
 		if (dr->ifp == ifp && IN6_ARE_ADDR_EQUAL(addr, &dr->rtaddr)) {
 			defrouter_ref(dr);
 			return (dr);
 		}
 	return (NULL);
 }
 
 /*
  * Unlocked front end to defrouter_lookup_locked(): takes the ND6 read
  * lock around the lookup.  A non-NULL result carries a reference that the
  * caller must release.
  */
 struct nd_defrouter *
 defrouter_lookup(const struct in6_addr *addr, struct ifnet *ifp)
 {
 	struct nd_defrouter *found;
 
 	ND6_RLOCK();
 	found = defrouter_lookup_locked(addr, ifp);
 	ND6_RUNLOCK();
 
 	return (found);
 }
 
 /*
  * Remove all default routes from default router list.
  */
 void
 defrouter_reset(void)
 {
 	struct nd_defrouter *dr, **dra;
 	int count, i;
 
 	count = i = 0;
 
 	/*
 	 * We can't delete routes with the ND lock held, so make a copy of the
 	 * current default router list and use that when deleting routes.
 	 */
 	ND6_RLOCK();
 	TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry)
 		count++;
 	ND6_RUNLOCK();
 
 	dra = malloc(count * sizeof(*dra), M_TEMP, M_WAITOK | M_ZERO);
 
 	ND6_RLOCK();
 	TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry) {
 		if (i == count)
 			break;
 		defrouter_ref(dr);
 		dra[i++] = dr;
 	}
 	ND6_RUNLOCK();
 
 	for (i = 0; i < count && dra[i] != NULL; i++) {
 		defrouter_delreq(dra[i]);
 		defrouter_rele(dra[i]);
 	}
 	free(dra, M_TEMP);
 
 	/*
 	 * XXX should we also nuke any default routers in the kernel, by
 	 * going through them by rtalloc1()?
 	 */
 }
 
 /*
  * Remove the default router with address `addr' on interface `ifp' from
  * the default router list and tear it down.
  *
  * Returns true if such a router existed, false otherwise.
  */
 bool
 defrouter_remove(struct in6_addr *addr, struct ifnet *ifp)
 {
 	struct nd_defrouter *victim;
 
 	ND6_WLOCK();
 	victim = defrouter_lookup_locked(addr, ifp);
 	if (victim != NULL)
 		defrouter_unlink(victim, NULL);
 	ND6_WUNLOCK();
 
 	if (victim == NULL)
 		return (false);
 
 	/* defrouter_del() must run without the ND6 lock held. */
 	defrouter_del(victim);
 	/* Drop the reference obtained by the lookup above. */
 	defrouter_rele(victim);
 	return (true);
 }
 
 /*
  * Map the router-preference bits of a router's RA flags to an RTPREF_*
  * value for default router selection.  The reserved encoding is treated
  * as medium preference.
  */
 static int
 rtpref(struct nd_defrouter *dr)
 {
 	int pref;
 
 	switch (dr->raflags & ND_RA_FLAG_RTPREF_MASK) {
 	case ND_RA_FLAG_RTPREF_HIGH:
 		pref = RTPREF_HIGH;
 		break;
 	case ND_RA_FLAG_RTPREF_LOW:
 		pref = RTPREF_LOW;
 		break;
 	case ND_RA_FLAG_RTPREF_MEDIUM:
 	case ND_RA_FLAG_RTPREF_RSV:
 		pref = RTPREF_MEDIUM;
 		break;
 	default:
 		/*
 		 * Not expected to be reachable; getting here would indicate
 		 * a kernel-internal bug, so always complain.
 		 */
 		log(LOG_ERR, "rtpref: impossible RA flag %x\n", dr->raflags);
 		pref = RTPREF_INVALID;
 		break;
 	}
 	return (pref);
 }
 
 /*
  * Default Router Selection according to Section 6.3.6 of RFC 2461 and
  * draft-ietf-ipngwg-router-selection:
  * 1) Routers that are reachable or probably reachable should be preferred.
  *    If we have more than one (probably) reachable router, prefer ones
  *    with the highest router preference.
  * 2) When no routers on the list are known to be reachable or
  *    probably reachable, routers SHOULD be selected in a round-robin
  *    fashion, regardless of router preference values.
  * 3) If the Default Router List is empty, assume that all
  *    destinations are on-link.
  *
  * We assume nd_defrouter is sorted by router preference value.
  * Since the code below covers both with and without router preference cases,
  * we do not need to classify the cases by ifdef.
  *
  * At this moment, we do not try to install more than one default router,
  * even when the multipath routing is available, because we're not sure about
  * the benefits for stub hosts comparing to the risk of making the code
  * complicated and the possibility of introducing bugs.
  *
  * We maintain a single list of routers for multiple FIBs, only considering one
  * at a time based on the receiving interface's FIB. If @fibnum is RT_ALL_FIBS,
  * we do the whole thing once per FIB and then return.
  *
  * Reference handling: while non-NULL, installed_dr and selected_dr each own
  * one reference of their own, even when they point at the same router.
  * Both references are dropped before returning.
  */
 void
 defrouter_select_fib(int fibnum)
 {
 	struct epoch_tracker et;
 	struct nd_defrouter *dr, *selected_dr, *installed_dr;
 	struct llentry *ln = NULL;
 
 	if (fibnum == RT_ALL_FIBS) {
 		for (fibnum = 0; fibnum < rt_numfibs; fibnum++) {
 			defrouter_select_fib(fibnum);
 		}
 		/*
 		 * Every FIB has been handled by the calls above; do not
 		 * fall through and run again with fibnum == rt_numfibs.
 		 */
 		return;
 	}
 
 	ND6_RLOCK();
 	/*
 	 * Let's handle easy case (3) first:
 	 * If default router list is empty, there's nothing to be done.
 	 */
 	if (TAILQ_EMPTY(&V_nd6_defrouter)) {
 		ND6_RUNLOCK();
 		return;
 	}
 
 	/*
 	 * Search for a (probably) reachable router from the list.
 	 * We just pick up the first reachable one (if any), assuming that
 	 * the ordering rule of the list described in defrtrlist_update().
 	 */
 	selected_dr = installed_dr = NULL;
 	TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry) {
 		NET_EPOCH_ENTER(et);
 		if (selected_dr == NULL && dr->ifp->if_fib == fibnum &&
 		    (ln = nd6_lookup(&dr->rtaddr, 0, dr->ifp)) &&
 		    ND6_IS_LLINFO_PROBREACH(ln)) {
 			selected_dr = dr;
 			defrouter_ref(selected_dr);
 		}
 		NET_EPOCH_EXIT(et);
 		if (ln != NULL) {
 			LLE_RUNLOCK(ln);
 			ln = NULL;
 		}
 
 		if (dr->installed && dr->ifp->if_fib == fibnum) {
 			if (installed_dr == NULL) {
 				installed_dr = dr;
 				defrouter_ref(installed_dr);
 			} else {
 				/*
 				 * this should not happen.
 				 * warn for diagnosis.
 				 */
 				log(LOG_ERR, "defrouter_select_fib: more than "
 				             "one router is installed\n");
 			}
 		}
 	}
 	/*
 	 * If none of the default routers was found to be reachable,
 	 * round-robin the list regardless of preference.
 	 * Otherwise, if we have an installed router, check if the selected
 	 * (reachable) router should really be preferred to the installed one.
 	 * We only prefer the new router when the old one is not reachable
 	 * or when the new one has a really higher preference value.
 	 */
 	if (selected_dr == NULL) {
 		if (installed_dr == NULL ||
 		    TAILQ_NEXT(installed_dr, dr_entry) == NULL)
 			dr = TAILQ_FIRST(&V_nd6_defrouter);
 		else
 			dr = TAILQ_NEXT(installed_dr, dr_entry);
 
 		/*
 		 * Ensure we select a router for this FIB.  This may pick
 		 * installed_dr again (e.g. it is the only router in the
 		 * FIB); selected_dr still takes its own reference.
 		 */
 		TAILQ_FOREACH_FROM(dr, &V_nd6_defrouter, dr_entry) {
 			if (dr->ifp->if_fib == fibnum) {
 				selected_dr = dr;
 				defrouter_ref(selected_dr);
 				break;
 			}
 		}
 	} else if (installed_dr != NULL) {
 		NET_EPOCH_ENTER(et);
 		if ((ln = nd6_lookup(&installed_dr->rtaddr, 0,
 		                     installed_dr->ifp)) &&
 		    ND6_IS_LLINFO_PROBREACH(ln) &&
 		    installed_dr->ifp->if_fib == fibnum &&
 		    rtpref(selected_dr) <= rtpref(installed_dr)) {
 			/*
 			 * Stick with the installed router.  Take the new
 			 * reference before dropping the old one so that
 			 * selected_dr keeps owning exactly one reference.
 			 */
 			defrouter_ref(installed_dr);
 			defrouter_rele(selected_dr);
 			selected_dr = installed_dr;
 		}
 		NET_EPOCH_EXIT(et);
 		if (ln != NULL)
 			LLE_RUNLOCK(ln);
 	}
 	ND6_RUNLOCK();
 
 	NET_EPOCH_ENTER(et);
 	/*
 	 * If we selected a router for this FIB and it's different
 	 * than the installed one, remove the installed router and
 	 * install the selected one in its place.
 	 */
 	if (installed_dr != selected_dr) {
 		if (installed_dr != NULL)
 			defrouter_delreq(installed_dr);
 		if (selected_dr != NULL)
 			defrouter_addreq(selected_dr);
 	}
 	/*
 	 * Drop both references unconditionally.  Releasing installed_dr
 	 * only when the two pointers differ would leak a reference when
 	 * they refer to the same router.
 	 */
 	if (installed_dr != NULL)
 		defrouter_rele(installed_dr);
 	if (selected_dr != NULL)
 		defrouter_rele(selected_dr);
 	NET_EPOCH_EXIT(et);
 }
 
 static struct nd_defrouter *
 defrtrlist_update(struct nd_defrouter *new)
 {
 	struct nd_defrouter *dr, *n;
 	uint64_t genid;
 	int oldpref;
 	bool writelocked;
 
 	if (new->rtlifetime == 0) {
 		defrouter_remove(&new->rtaddr, new->ifp);
 		return (NULL);
 	}
 
 	ND6_RLOCK();
 	writelocked = false;
 restart:
 	dr = defrouter_lookup_locked(&new->rtaddr, new->ifp);
 	if (dr != NULL) {
 		oldpref = rtpref(dr);
 
 		/* override */
 		dr->raflags = new->raflags; /* XXX flag check */
 		dr->rtlifetime = new->rtlifetime;
 		dr->expire = new->expire;
 
 		/*
 		 * If the preference does not change, there's no need
 		 * to sort the entries. Also make sure the selected
 		 * router is still installed in the kernel.
 		 */
 		if (dr->installed && rtpref(new) == oldpref) {
 			if (writelocked)
 				ND6_WUNLOCK();
 			else
 				ND6_RUNLOCK();
 			return (dr);
 		}
 	}
 
 	/*
 	 * The router needs to be reinserted into the default router
 	 * list, so upgrade to a write lock. If that fails and the list
 	 * has potentially changed while the lock was dropped, we'll
 	 * redo the lookup with the write lock held.
 	 */
 	if (!writelocked) {
 		writelocked = true;
 		if (!ND6_TRY_UPGRADE()) {
 			genid = V_nd6_list_genid;
 			ND6_RUNLOCK();
 			ND6_WLOCK();
 			if (genid != V_nd6_list_genid)
 				goto restart;
 		}
 	}
 
 	if (dr != NULL) {
 		/*
 		 * The preferred router may have changed, so relocate this
 		 * router.
 		 */
 		TAILQ_REMOVE(&V_nd6_defrouter, dr, dr_entry);
 		n = dr;
 	} else {
 		n = malloc(sizeof(*n), M_IP6NDP, M_NOWAIT | M_ZERO);
 		if (n == NULL) {
 			ND6_WUNLOCK();
 			return (NULL);
 		}
 		memcpy(n, new, sizeof(*n));
 		/* Initialize with an extra reference for the caller. */
 		refcount_init(&n->refcnt, 2);
 	}
 
 	/*
 	 * Insert the new router in the Default Router List;
 	 * The Default Router List should be in the descending order
 	 * of router-preferece.  Routers with the same preference are
 	 * sorted in the arriving time order.
 	 */
 
 	/* insert at the end of the group */
 	TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry) {
 		if (rtpref(n) > rtpref(dr))
 			break;
 	}
 	if (dr != NULL)
 		TAILQ_INSERT_BEFORE(dr, n, dr_entry);
 	else
 		TAILQ_INSERT_TAIL(&V_nd6_defrouter, n, dr_entry);
 	V_nd6_list_genid++;
 	ND6_WUNLOCK();
 
 	defrouter_select_fib(new->ifp->if_fib);
 
 	return (n);
 }
 
 /*
  * Derive a prefix's absolute preferred/expire times from its relative
  * preferred and valid lifetimes, measured from the current uptime.
  * An infinite lifetime is recorded as 0.  Always returns 0.
  */
 static int
 in6_init_prefix_ltimes(struct nd_prefix *ndpr)
 {
 	ndpr->ndpr_preferred =
 	    (ndpr->ndpr_pltime == ND6_INFINITE_LIFETIME) ?
 	    0 : time_uptime + ndpr->ndpr_pltime;
 	ndpr->ndpr_expire =
 	    (ndpr->ndpr_vltime == ND6_INFINITE_LIFETIME) ?
 	    0 : time_uptime + ndpr->ndpr_vltime;
 
 	return 0;
 }
 
 /*
  * Fill in the absolute expire/preferred times of an address lifetime
  * from its relative valid/preferred lifetimes, measured from the current
  * uptime.  An infinite lifetime is recorded as 0.  `new' is not used.
  */
 static void
 in6_init_address_ltimes(struct nd_prefix *new, struct in6_addrlifetime *lt6)
 {
 	/* Valid lifetime -> ia6t_expire. */
 	if (lt6->ia6t_vltime != ND6_INFINITE_LIFETIME)
 		lt6->ia6t_expire = time_uptime + lt6->ia6t_vltime;
 	else
 		lt6->ia6t_expire = 0;
 
 	/* Preferred lifetime -> ia6t_preferred. */
 	if (lt6->ia6t_pltime != ND6_INFINITE_LIFETIME)
 		lt6->ia6t_preferred = time_uptime + lt6->ia6t_pltime;
 	else
 		lt6->ia6t_preferred = 0;
 }
 
 static struct in6_ifaddr *
 in6_ifadd(struct nd_prefixctl *pr, int mcast)
 {
 	struct ifnet *ifp = pr->ndpr_ifp;
 	struct ifaddr *ifa;
 	struct in6_aliasreq ifra;
 	struct in6_ifaddr *ia, *ib;
 	int error, plen0;
 	struct in6_addr mask;
 	int prefixlen = pr->ndpr_plen;
 	int updateflags;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	in6_prefixlen2mask(&mask, prefixlen);
 
 	/*
 	 * find a link-local address (will be interface ID).
 	 * Is it really mandatory? Theoretically, a global or a site-local
 	 * address can be configured without a link-local address, if we
 	 * have a unique interface identifier...
 	 *
 	 * it is not mandatory to have a link-local address, we can generate
 	 * interface identifier on the fly.  we do this because:
 	 * (1) it should be the easiest way to find interface identifier.
 	 * (2) RFC2462 5.4 suggesting the use of the same interface identifier
 	 * for multiple addresses on a single interface, and possible shortcut
 	 * of DAD.  we omitted DAD for this reason in the past.
 	 * (3) a user can prevent autoconfiguration of global address
 	 * by removing link-local address by hand (this is partly because we
 	 * don't have other way to control the use of IPv6 on an interface.
 	 * this has been our design choice - cf. NRL's "ifconfig auto").
 	 * (4) it is easier to manage when an interface has addresses
 	 * with the same interface identifier, than to have multiple addresses
 	 * with different interface identifiers.
 	 */
 	ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, 0); /* 0 is OK? */
 	if (ifa)
 		ib = (struct in6_ifaddr *)ifa;
 	else
 		return NULL;
 
 	/* prefixlen + ifidlen must be equal to 128 */
 	plen0 = in6_mask2len(&ib->ia_prefixmask.sin6_addr, NULL);
 	if (prefixlen != plen0) {
 		ifa_free(ifa);
 		nd6log((LOG_INFO,
 		    "%s: wrong prefixlen for %s (prefix=%d ifid=%d)\n",
 		    __func__, if_name(ifp), prefixlen, 128 - plen0));
 		return NULL;
 	}
 
 	/* make ifaddr */
 	in6_prepare_ifra(&ifra, &pr->ndpr_prefix.sin6_addr, &mask);
 
 	IN6_MASK_ADDR(&ifra.ifra_addr.sin6_addr, &mask);
 	/* interface ID */
 	ifra.ifra_addr.sin6_addr.s6_addr32[0] |=
 	    (ib->ia_addr.sin6_addr.s6_addr32[0] & ~mask.s6_addr32[0]);
 	ifra.ifra_addr.sin6_addr.s6_addr32[1] |=
 	    (ib->ia_addr.sin6_addr.s6_addr32[1] & ~mask.s6_addr32[1]);
 	ifra.ifra_addr.sin6_addr.s6_addr32[2] |=
 	    (ib->ia_addr.sin6_addr.s6_addr32[2] & ~mask.s6_addr32[2]);
 	ifra.ifra_addr.sin6_addr.s6_addr32[3] |=
 	    (ib->ia_addr.sin6_addr.s6_addr32[3] & ~mask.s6_addr32[3]);
 	ifa_free(ifa);
 
 	/* lifetimes. */
 	ifra.ifra_lifetime.ia6t_vltime = pr->ndpr_vltime;
 	ifra.ifra_lifetime.ia6t_pltime = pr->ndpr_pltime;
 
 	/* XXX: scope zone ID? */
 
 	ifra.ifra_flags |= IN6_IFF_AUTOCONF; /* obey autoconf */
 
 	/*
 	 * Make sure that we do not have this address already.  This should
 	 * usually not happen, but we can still see this case, e.g., if we
 	 * have manually configured the exact address to be configured.
 	 */
 	ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp,
 	    &ifra.ifra_addr.sin6_addr);
 	if (ifa != NULL) {
 		ifa_free(ifa);
 		/* this should be rare enough to make an explicit log */
 		log(LOG_INFO, "in6_ifadd: %s is already configured\n",
 		    ip6_sprintf(ip6buf, &ifra.ifra_addr.sin6_addr));
 		return (NULL);
 	}
 
 	/*
 	 * Allocate ifaddr structure, link into chain, etc.
 	 * If we are going to create a new address upon receiving a multicasted
 	 * RA, we need to impose a random delay before starting DAD.
 	 * [draft-ietf-ipv6-rfc2462bis-02.txt, Section 5.4.2]
 	 */
 	updateflags = 0;
 	if (mcast)
 		updateflags |= IN6_IFAUPDATE_DADDELAY;
 	if ((error = in6_update_ifa(ifp, &ifra, NULL, updateflags)) != 0) {
 		nd6log((LOG_ERR,
 		    "%s: failed to make ifaddr %s on %s (errno=%d)\n", __func__,
 		    ip6_sprintf(ip6buf, &ifra.ifra_addr.sin6_addr),
 		    if_name(ifp), error));
 		return (NULL);	/* ifaddr must not have been allocated. */
 	}
 
 	ia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr);
 	/*
 	 * XXXRW: Assumption of non-NULLness here might not be true with
 	 * fine-grained locking -- should we validate it?  Or just return
 	 * earlier ifa rather than looking it up again?
 	 */
 	return (ia);		/* this is always non-NULL  and referenced. */
 }
 
 /*
  * Find the prefix list entry matching `key': same interface, same prefix
  * length, and equal prefix bits over that length.
  *
  * On a match a reference is taken on the entry and it is returned.
  * Returns NULL when nothing matches.  The ND6 lock must be held.
  */
 static struct nd_prefix *
 nd6_prefix_lookup_locked(struct nd_prefixctl *key)
 {
 	struct nd_prefix *pfx;
 
 	ND6_LOCK_ASSERT();
 
 	LIST_FOREACH(pfx, &V_nd_prefix, ndpr_entry) {
 		if (key->ndpr_ifp != pfx->ndpr_ifp)
 			continue;
 		if (key->ndpr_plen != pfx->ndpr_plen)
 			continue;
 		if (in6_are_prefix_equal(&key->ndpr_prefix.sin6_addr,
 		    &pfx->ndpr_prefix.sin6_addr, key->ndpr_plen)) {
 			nd6_prefix_ref(pfx);
 			return (pfx);
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Unlocked front end to nd6_prefix_lookup_locked(): takes the ND6 read
  * lock around the lookup.  A non-NULL result carries a reference.
  */
 struct nd_prefix *
 nd6_prefix_lookup(struct nd_prefixctl *key)
 {
 	struct nd_prefix *found;
 
 	ND6_RLOCK();
 	found = nd6_prefix_lookup_locked(key);
 	ND6_RUNLOCK();
 
 	return (found);
 }
 
 /*
  * Take an additional reference on prefix `pr'.  Each reference is dropped
  * with nd6_prefix_rele().
  */
 void
 nd6_prefix_ref(struct nd_prefix *pr)
 {
 
 	refcount_acquire(&pr->ndpr_refcnt);
 }
 
 /*
  * Drop one reference on prefix `pr' and free it when that was the last
  * one.  By then the prefix must have no advertising routers left.
  */
 void
 nd6_prefix_rele(struct nd_prefix *pr)
 {
 
 	if (!refcount_release(&pr->ndpr_refcnt))
 		return;
 
 	KASSERT(LIST_EMPTY(&pr->ndpr_advrtrs),
 	    ("prefix %p has advertising routers", pr));
 	free(pr, M_IP6NDP);
 }
 
 int
 nd6_prelist_add(struct nd_prefixctl *pr, struct nd_defrouter *dr,
     struct nd_prefix **newp)
 {
 	struct nd_prefix *new;
 	char ip6buf[INET6_ADDRSTRLEN];
 	int error;
 
 	new = malloc(sizeof(*new), M_IP6NDP, M_NOWAIT | M_ZERO);
 	if (new == NULL)
 		return (ENOMEM);
 	refcount_init(&new->ndpr_refcnt, newp != NULL ? 2 : 1);
 	new->ndpr_ifp = pr->ndpr_ifp;
 	new->ndpr_prefix = pr->ndpr_prefix;
 	new->ndpr_plen = pr->ndpr_plen;
 	new->ndpr_vltime = pr->ndpr_vltime;
 	new->ndpr_pltime = pr->ndpr_pltime;
 	new->ndpr_flags = pr->ndpr_flags;
 	if ((error = in6_init_prefix_ltimes(new)) != 0) {
 		free(new, M_IP6NDP);
 		return (error);
 	}
 	new->ndpr_lastupdate = time_uptime;
 
 	/* initialization */
 	LIST_INIT(&new->ndpr_advrtrs);
 	in6_prefixlen2mask(&new->ndpr_mask, new->ndpr_plen);
 	/* make prefix in the canonical form */
 	IN6_MASK_ADDR(&new->ndpr_prefix.sin6_addr, &new->ndpr_mask);
 
 	ND6_WLOCK();
 	LIST_INSERT_HEAD(&V_nd_prefix, new, ndpr_entry);
 	V_nd6_list_genid++;
 	ND6_WUNLOCK();
 
 	/* ND_OPT_PI_FLAG_ONLINK processing */
 	if (new->ndpr_raf_onlink) {
 		struct epoch_tracker et;
 
 		ND6_ONLINK_LOCK();
 		NET_EPOCH_ENTER(et);
 		if ((error = nd6_prefix_onlink(new)) != 0) {
 			nd6log((LOG_ERR, "%s: failed to make the prefix %s/%d "
 			    "on-link on %s (errno=%d)\n", __func__,
 			    ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr),
 			    pr->ndpr_plen, if_name(pr->ndpr_ifp), error));
 			/* proceed anyway. XXX: is it correct? */
 		}
 		NET_EPOCH_EXIT(et);
 		ND6_ONLINK_UNLOCK();
 	}
 
 	if (dr != NULL)
 		pfxrtr_add(new, dr);
 	if (newp != NULL)
 		*newp = new;
 	return (0);
 }
 
 /*
  * Take prefix `pr' off the prefix list and bump the list generation
  * count.  If `list' is non-NULL the prefix is put at the head of that
  * caller-provided list instead.
  *
  * The ND6 write lock must be held.
  */
 void
 nd6_prefix_unlink(struct nd_prefix *pr, struct nd_prhead *list)
 {
 
 	ND6_WLOCK_ASSERT();
 
 	LIST_REMOVE(pr, ndpr_entry);
 	V_nd6_list_genid++;
 
 	if (list == NULL)
 		return;
 	LIST_INSERT_HEAD(list, pr, ndpr_entry);
 }
 
 /*
  * Free an unlinked prefix, first marking it off-link if necessary.
  */
 void
 nd6_prefix_del(struct nd_prefix *pr)
 {
 	struct nd_pfxrouter *pfr, *next;
 	int e;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	KASSERT(pr->ndpr_addrcnt == 0,
 	    ("prefix %p has referencing addresses", pr));
 	ND6_UNLOCK_ASSERT();
 
 	/*
 	 * Though these flags are now meaningless, we'd rather keep the value
 	 * of pr->ndpr_raf_onlink and pr->ndpr_raf_auto not to confuse users
 	 * when executing "ndp -p".
 	 */
 	if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) {
 		ND6_ONLINK_LOCK();
 		if ((e = nd6_prefix_offlink(pr)) != 0) {
 			nd6log((LOG_ERR,
 			    "%s: failed to make the prefix %s/%d offlink on %s "
 			    "(errno=%d)\n", __func__,
 			    ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr),
 			    pr->ndpr_plen, if_name(pr->ndpr_ifp), e));
 			/* what should we do? */
 		}
 		ND6_ONLINK_UNLOCK();
 	}
 
 	/* Release references to routers that have advertised this prefix. */
 	ND6_WLOCK();
 	LIST_FOREACH_SAFE(pfr, &pr->ndpr_advrtrs, pfr_entry, next)
 		pfxrtr_del(pfr);
 	ND6_WUNLOCK();
 
 	nd6_prefix_rele(pr);
 
 	pfxlist_onlink_check();
 }
 
 static int
 prelist_update(struct nd_prefixctl *new, struct nd_defrouter *dr,
     struct mbuf *m, int mcast)
 {
 	struct in6_ifaddr *ia6 = NULL, *ia6_match = NULL;
 	struct ifaddr *ifa;
 	struct ifnet *ifp = new->ndpr_ifp;
 	struct nd_prefix *pr;
 	int error = 0;
 	int auth;
 	struct in6_addrlifetime lt6_tmp;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	NET_EPOCH_ASSERT();
 
 	auth = 0;
 	if (m) {
 		/*
 		 * Authenticity for NA consists authentication for
 		 * both IP header and IP datagrams, doesn't it ?
 		 */
 #if defined(M_AUTHIPHDR) && defined(M_AUTHIPDGM)
 		auth = ((m->m_flags & M_AUTHIPHDR) &&
 		    (m->m_flags & M_AUTHIPDGM));
 #endif
 	}
 
 	if ((pr = nd6_prefix_lookup(new)) != NULL) {
 		/*
 		 * nd6_prefix_lookup() ensures that pr and new have the same
 		 * prefix on a same interface.
 		 */
 
 		/*
 		 * Update prefix information.  Note that the on-link (L) bit
 		 * and the autonomous (A) bit should NOT be changed from 1
 		 * to 0.
 		 */
 		if (new->ndpr_raf_onlink == 1)
 			pr->ndpr_raf_onlink = 1;
 		if (new->ndpr_raf_auto == 1)
 			pr->ndpr_raf_auto = 1;
 		if (new->ndpr_raf_onlink) {
 			pr->ndpr_vltime = new->ndpr_vltime;
 			pr->ndpr_pltime = new->ndpr_pltime;
 			(void)in6_init_prefix_ltimes(pr); /* XXX error case? */
 			pr->ndpr_lastupdate = time_uptime;
 		}
 
 		if (new->ndpr_raf_onlink &&
 		    (pr->ndpr_stateflags & NDPRF_ONLINK) == 0) {
 			ND6_ONLINK_LOCK();
 			if ((error = nd6_prefix_onlink(pr)) != 0) {
 				nd6log((LOG_ERR,
 				    "%s: failed to make the prefix %s/%d "
 				    "on-link on %s (errno=%d)\n", __func__,
 				    ip6_sprintf(ip6buf,
 				        &pr->ndpr_prefix.sin6_addr),
 				    pr->ndpr_plen, if_name(pr->ndpr_ifp),
 				    error));
 				/* proceed anyway. XXX: is it correct? */
 			}
 			ND6_ONLINK_UNLOCK();
 		}
 
 		if (dr != NULL)
 			pfxrtr_add(pr, dr);
 	} else {
 		if (new->ndpr_vltime == 0)
 			goto end;
 		if (new->ndpr_raf_onlink == 0 && new->ndpr_raf_auto == 0)
 			goto end;
 
 		error = nd6_prelist_add(new, dr, &pr);
 		if (error != 0) {
 			nd6log((LOG_NOTICE, "%s: nd6_prelist_add() failed for "
 			    "the prefix %s/%d on %s (errno=%d)\n", __func__,
 			    ip6_sprintf(ip6buf, &new->ndpr_prefix.sin6_addr),
 			    new->ndpr_plen, if_name(new->ndpr_ifp), error));
 			goto end; /* we should just give up in this case. */
 		}
 
 		/*
 		 * XXX: from the ND point of view, we can ignore a prefix
 		 * with the on-link bit being zero.  However, we need a
 		 * prefix structure for references from autoconfigured
 		 * addresses.  Thus, we explicitly make sure that the prefix
 		 * itself expires now.
 		 */
 		if (pr->ndpr_raf_onlink == 0) {
 			pr->ndpr_vltime = 0;
 			pr->ndpr_pltime = 0;
 			in6_init_prefix_ltimes(pr);
 		}
 	}
 
 	/*
 	 * Address autoconfiguration based on Section 5.5.3 of RFC 2462.
 	 * Note that pr must be non NULL at this point.
 	 */
 
 	/* 5.5.3 (a). Ignore the prefix without the A bit set. */
 	if (!new->ndpr_raf_auto)
 		goto end;
 
 	/*
 	 * 5.5.3 (b). the link-local prefix should have been ignored in
 	 * nd6_ra_input.
 	 */
 
 	/* 5.5.3 (c). Consistency check on lifetimes: pltime <= vltime. */
 	if (new->ndpr_pltime > new->ndpr_vltime) {
 		error = EINVAL;	/* XXX: won't be used */
 		goto end;
 	}
 
 	/*
 	 * 5.5.3 (d).  If the prefix advertised is not equal to the prefix of
 	 * an address configured by stateless autoconfiguration already in the
 	 * list of addresses associated with the interface, and the Valid
 	 * Lifetime is not 0, form an address.  We first check if we have
 	 * a matching prefix.
 	 * Note: we apply a clarification in rfc2462bis-02 here.  We only
 	 * consider autoconfigured addresses while RFC2462 simply said
 	 * "address".
 	 */
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		struct in6_ifaddr *ifa6;
 		u_int32_t remaininglifetime;
 
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 
 		ifa6 = (struct in6_ifaddr *)ifa;
 
 		/*
 		 * We only consider autoconfigured addresses as per rfc2462bis.
 		 */
 		if (!(ifa6->ia6_flags & IN6_IFF_AUTOCONF))
 			continue;
 
 		/*
 		 * Spec is not clear here, but I believe we should concentrate
 		 * on unicast (i.e. not anycast) addresses.
 		 * XXX: other ia6_flags? detached or duplicated?
 		 */
 		if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0)
 			continue;
 
 		/*
 		 * Ignore the address if it is not associated with a prefix
 		 * or is associated with a prefix that is different from this
 		 * one.  (pr is never NULL here)
 		 */
 		if (ifa6->ia6_ndpr != pr)
 			continue;
 
 		if (ia6_match == NULL) /* remember the first one */
 			ia6_match = ifa6;
 
 		/*
 		 * An already autoconfigured address matched.  Now that we
 		 * are sure there is at least one matched address, we can
 		 * proceed to 5.5.3. (e): update the lifetimes according to the
 		 * "two hours" rule and the privacy extension.
 		 * We apply some clarifications in rfc2462bis:
 		 * - use remaininglifetime instead of storedlifetime as a
 		 *   variable name
 		 * - remove the dead code in the "two-hour" rule
 		 */
 #define TWOHOUR		(120*60)
 		lt6_tmp = ifa6->ia6_lifetime;
 
 		if (lt6_tmp.ia6t_vltime == ND6_INFINITE_LIFETIME)
 			remaininglifetime = ND6_INFINITE_LIFETIME;
 		else if (time_uptime - ifa6->ia6_updatetime >
 			 lt6_tmp.ia6t_vltime) {
 			/*
 			 * The case of "invalid" address.  We should usually
 			 * not see this case.
 			 */
 			remaininglifetime = 0;
 		} else
 			remaininglifetime = lt6_tmp.ia6t_vltime -
 			    (time_uptime - ifa6->ia6_updatetime);
 
 		/* when not updating, keep the current stored lifetime. */
 		lt6_tmp.ia6t_vltime = remaininglifetime;
 
 		if (TWOHOUR < new->ndpr_vltime ||
 		    remaininglifetime < new->ndpr_vltime) {
 			lt6_tmp.ia6t_vltime = new->ndpr_vltime;
 		} else if (remaininglifetime <= TWOHOUR) {
 			if (auth) {
 				lt6_tmp.ia6t_vltime = new->ndpr_vltime;
 			}
 		} else {
 			/*
 			 * new->ndpr_vltime <= TWOHOUR &&
 			 * TWOHOUR < remaininglifetime
 			 */
 			lt6_tmp.ia6t_vltime = TWOHOUR;
 		}
 
 		/* The 2 hour rule is not imposed for preferred lifetime. */
 		lt6_tmp.ia6t_pltime = new->ndpr_pltime;
 
 		in6_init_address_ltimes(pr, &lt6_tmp);
 
 		/*
 		 * We need to treat lifetimes for temporary addresses
 		 * differently, according to
 		 * draft-ietf-ipv6-privacy-addrs-v2-01.txt 3.3 (1);
 		 * we only update the lifetimes when they are in the maximum
 		 * intervals.
 		 */
 		if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0) {
 			u_int32_t maxvltime, maxpltime;
 
 			if (V_ip6_temp_valid_lifetime >
 			    (u_int32_t)((time_uptime - ifa6->ia6_createtime) +
 			    V_ip6_desync_factor)) {
 				maxvltime = V_ip6_temp_valid_lifetime -
 				    (time_uptime - ifa6->ia6_createtime) -
 				    V_ip6_desync_factor;
 			} else
 				maxvltime = 0;
 			if (V_ip6_temp_preferred_lifetime >
 			    (u_int32_t)((time_uptime - ifa6->ia6_createtime) +
 			    V_ip6_desync_factor)) {
 				maxpltime = V_ip6_temp_preferred_lifetime -
 				    (time_uptime - ifa6->ia6_createtime) -
 				    V_ip6_desync_factor;
 			} else
 				maxpltime = 0;
 
 			if (lt6_tmp.ia6t_vltime == ND6_INFINITE_LIFETIME ||
 			    lt6_tmp.ia6t_vltime > maxvltime) {
 				lt6_tmp.ia6t_vltime = maxvltime;
 			}
 			if (lt6_tmp.ia6t_pltime == ND6_INFINITE_LIFETIME ||
 			    lt6_tmp.ia6t_pltime > maxpltime) {
 				lt6_tmp.ia6t_pltime = maxpltime;
 			}
 		}
 		ifa6->ia6_lifetime = lt6_tmp;
 		ifa6->ia6_updatetime = time_uptime;
 	}
 	if (ia6_match == NULL && new->ndpr_vltime) {
 		int ifidlen;
 
 		/*
 		 * 5.5.3 (d) (continued)
 		 * No address matched and the valid lifetime is non-zero.
 		 * Create a new address.
 		 */
 
 		/*
 		 * Prefix Length check:
 		 * If the sum of the prefix length and interface identifier
 		 * length does not equal 128 bits, the Prefix Information
 		 * option MUST be ignored.  The length of the interface
 		 * identifier is defined in a separate link-type specific
 		 * document.
 		 */
 		ifidlen = in6_if2idlen(ifp);
 		if (ifidlen < 0) {
 			/* this should not happen, so we always log it. */
 			log(LOG_ERR, "prelist_update: IFID undefined (%s)\n",
 			    if_name(ifp));
 			goto end;
 		}
 		if (ifidlen + pr->ndpr_plen != 128) {
 			nd6log((LOG_INFO,
 			    "%s: invalid prefixlen %d for %s, ignored\n",
 			    __func__, pr->ndpr_plen, if_name(ifp)));
 			goto end;
 		}
 
 		if ((ia6 = in6_ifadd(new, mcast)) != NULL) {
 			/*
 			 * note that we should use pr (not new) for reference.
 			 */
 			pr->ndpr_addrcnt++;
 			ia6->ia6_ndpr = pr;
 
 			/*
 			 * RFC 3041 3.3 (2).
 			 * When a new public address is created as described
 			 * in RFC2462, also create a new temporary address.
 			 *
 			 * RFC 3041 3.5.
 			 * When an interface connects to a new link, a new
 			 * randomized interface identifier should be generated
 			 * immediately together with a new set of temporary
 			 * addresses.  Thus, we specify 1 as the 2nd arg of
 			 * in6_tmpifadd().
 			 */
 			if (V_ip6_use_tempaddr) {
 				int e;
 				if ((e = in6_tmpifadd(ia6, 1, 1)) != 0) {
 					nd6log((LOG_NOTICE, "%s: failed to "
 					    "create a temporary address "
 					    "(errno=%d)\n", __func__, e));
 				}
 			}
 			ifa_free(&ia6->ia_ifa);
 
 			/*
 			 * A newly added address might affect the status
 			 * of other addresses, so we check and update it.
 			 * XXX: what if address duplication happens?
 			 */
 			pfxlist_onlink_check();
 		} else {
 			/* just set an error. do not bark here. */
 			error = EADDRNOTAVAIL; /* XXX: might be unused. */
 		}
 	}
 
 end:
 	if (pr != NULL)
 		nd6_prefix_rele(pr);
 	return (error);
 }
 
 /*
  * Helper for the on-link detection below.
  * Walk the routers that advertised the prefix and return the first one
  * whose neighbor cache entry satisfies ND6_IS_LLINFO_PROBREACH(), or NULL
  * when no advertising router qualifies.  Routers without a neighbor cache
  * entry are skipped.
  */
 static struct nd_pfxrouter *
 find_pfxlist_reachable_router(struct nd_prefix *pr)
 {
 	struct epoch_tracker et;
 	struct nd_pfxrouter *cand;
 	struct llentry *lle;
 	int reachable;
 
 	ND6_LOCK_ASSERT();
 
 	NET_EPOCH_ENTER(et);
 	LIST_FOREACH(cand, &pr->ndpr_advrtrs, pfr_entry) {
 		lle = nd6_lookup(&cand->router->rtaddr, 0, cand->router->ifp);
 		if (lle != NULL) {
 			/* Sample the state before dropping the entry lock. */
 			reachable = ND6_IS_LLINFO_PROBREACH(lle);
 			LLE_RUNLOCK(lle);
 			if (reachable)
 				break;
 		}
 	}
 	NET_EPOCH_EXIT(et);
 
 	/* cand is NULL here when the list was exhausted without a match. */
 	return (cand);
 }
 
 /*
  * Check if each prefix in the prefix list has at least one available router
  * that advertised the prefix (a router is "available" if its neighbor cache
  * entry is reachable or probably reachable).
  * If the check fails, the prefix may be off-link, because, for example,
  * we have moved from the network but the lifetime of the prefix has not
  * expired yet.  So we should not use the prefix if there is another prefix
  * that has an available router.
  * But, if there is no prefix that has an available router, we still regard
  * all the prefixes as on-link.  This is because we can't tell if all the
  * routers are simply dead or if we really moved from the network and there
  * is no router around us.
  *
  * The function runs in three phases:
  *  1. recompute NDPRF_DETACHED on every eligible prefix,
  *  2. call nd6_prefix_offlink()/nd6_prefix_onlink() for prefixes whose
  *     NDPRF_DETACHED and NDPRF_ONLINK flags are both set or both clear,
  *  3. recompute IN6_IFF_DETACHED on autoconfigured addresses, restarting
  *     DAD for addresses that become attached again.
  *
  * Locking: ND6_ONLINK_LOCK() and ND6_RLOCK() are taken on entry and released
  * on exit.  ND6_RLOCK() is dropped around the onlink/offlink calls in phase
  * 2, and the list walk is restarted if V_nd6_list_genid changed meanwhile.
  */
 void
 pfxlist_onlink_check(void)
 {
 	struct nd_prefix *pr;
 	struct in6_ifaddr *ifa;
 	struct nd_defrouter *dr;
 	struct nd_pfxrouter *pfxrtr = NULL;
 	struct rm_priotracker in6_ifa_tracker;
 	uint64_t genid;
 	uint32_t flags;
 
 	ND6_ONLINK_LOCK();
 	ND6_RLOCK();
 
 	/*
 	 * Check if there is a prefix that has a reachable advertising
 	 * router.
 	 */
 	LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 		if (pr->ndpr_raf_onlink && find_pfxlist_reachable_router(pr))
 			break;
 	}
 
 	/*
 	 * If we have no such prefix, check whether we still have a router
 	 * that does not advertise any prefixes.
 	 */
 	if (pr == NULL) {
 		/*
 		 * The scan stops at the first default router that is found
 		 * on some prefix's advertising-router list, leaving pfxrtr
 		 * non-NULL; pfxrtr stays NULL only if no default router
 		 * advertises any prefix (or the router list is empty).
 		 */
 		TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry) {
 			struct nd_prefix *pr0;
 
 			LIST_FOREACH(pr0, &V_nd_prefix, ndpr_entry) {
 				if ((pfxrtr = pfxrtr_lookup(pr0, dr)) != NULL)
 					break;
 			}
 			if (pfxrtr != NULL)
 				break;
 		}
 	}
 	if (pr != NULL || (!TAILQ_EMPTY(&V_nd6_defrouter) && pfxrtr == NULL)) {
 		/*
 		 * There is at least one prefix that has a reachable router,
 		 * or at least a router which probably does not advertise
 		 * any prefixes.  The latter would be the case when we move
 		 * to a new link where we have a router that does not provide
 		 * prefixes and we configure an address by hand.
 		 * Detach prefixes which have no reachable advertising
 		 * router, and attach other prefixes.
 		 */
 		LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 			/* XXX: a link-local prefix should never be detached */
 			if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr) ||
 			    pr->ndpr_raf_onlink == 0 ||
 			    pr->ndpr_raf_auto == 0)
 				continue;
 
 			/* Only flip the flag when the state actually changes. */
 			if ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 &&
 			    find_pfxlist_reachable_router(pr) == NULL)
 				pr->ndpr_stateflags |= NDPRF_DETACHED;
 			else if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0 &&
 			    find_pfxlist_reachable_router(pr) != NULL)
 				pr->ndpr_stateflags &= ~NDPRF_DETACHED;
 		}
 	} else {
 		/* there is no prefix that has a reachable router */
 		LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 			if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr) ||
 			    pr->ndpr_raf_onlink == 0 ||
 			    pr->ndpr_raf_auto == 0)
 				continue;
 			pr->ndpr_stateflags &= ~NDPRF_DETACHED;
 		}
 	}
 
 	/*
 	 * Remove each interface route associated with a (just) detached
 	 * prefix, and reinstall the interface route for a (just) attached
 	 * prefix.  Note that all attempt of reinstallation does not
 	 * necessarily success, when a same prefix is shared among multiple
 	 * interfaces.  Such cases will be handled in nd6_prefix_onlink,
 	 * so we don't have to care about them.
 	 */
 restart:
 	LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 		char ip6buf[INET6_ADDRSTRLEN];
 		int e;
 
 		if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr) ||
 		    pr->ndpr_raf_onlink == 0 ||
 		    pr->ndpr_raf_auto == 0)
 			continue;
 
 		/*
 		 * Act only on inconsistent combinations: attached but not
 		 * on-link (flags == 0), or detached but still on-link.
 		 */
 		flags = pr->ndpr_stateflags & (NDPRF_DETACHED | NDPRF_ONLINK);
 		if (flags == 0 || flags == (NDPRF_DETACHED | NDPRF_ONLINK)) {
 			/*
 			 * Remember the list generation, then drop the lock:
 			 * nd6_prefix_offlink()/nd6_prefix_onlink() assert
 			 * that it is not held.
 			 */
 			genid = V_nd6_list_genid;
 			ND6_RUNLOCK();
 			if ((flags & NDPRF_ONLINK) != 0 &&
 			    (e = nd6_prefix_offlink(pr)) != 0) {
 				nd6log((LOG_ERR,
 				    "%s: failed to make %s/%d offlink "
 				    "(errno=%d)\n", __func__, 
 				    ip6_sprintf(ip6buf,
 					    &pr->ndpr_prefix.sin6_addr),
 					    pr->ndpr_plen, e));
 			} else if ((flags & NDPRF_ONLINK) == 0 &&
 			    (e = nd6_prefix_onlink(pr)) != 0) {
 				nd6log((LOG_ERR,
 				    "%s: failed to make %s/%d onlink "
 				    "(errno=%d)\n", __func__,
 				    ip6_sprintf(ip6buf,
 					    &pr->ndpr_prefix.sin6_addr),
 					    pr->ndpr_plen, e));
 			}
 			ND6_RLOCK();
 			/* The list changed while unlocked; rescan from the top. */
 			if (genid != V_nd6_list_genid)
 				goto restart;
 		}
 	}
 
 	/*
 	 * Changes on the prefix status might affect address status as well.
 	 * Make sure that all addresses derived from an attached prefix are
 	 * attached, and that all addresses derived from a detached prefix are
 	 * detached.  Note, however, that a manually configured address should
 	 * always be attached.
 	 * The precise detection logic is same as the one for prefixes.
 	 */
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	/* First pass: is there any autoconf address with a reachable router? */
 	CK_STAILQ_FOREACH(ifa, &V_in6_ifaddrhead, ia_link) {
 		if (!(ifa->ia6_flags & IN6_IFF_AUTOCONF))
 			continue;
 
 		if (ifa->ia6_ndpr == NULL) {
 			/*
 			 * This can happen when we first configure the address
 			 * (i.e. the address exists, but the prefix does not).
 			 * XXX: complicated relationships...
 			 */
 			continue;
 		}
 
 		if (find_pfxlist_reachable_router(ifa->ia6_ndpr))
 			break;
 	}
 	if (ifa) {
 		/*
 		 * At least one such address exists: attach addresses whose
 		 * prefix has a reachable router (restarting DAD on those that
 		 * were detached) and detach all the others.
 		 */
 		CK_STAILQ_FOREACH(ifa, &V_in6_ifaddrhead, ia_link) {
 			if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0)
 				continue;
 
 			if (ifa->ia6_ndpr == NULL) /* XXX: see above. */
 				continue;
 
 			if (find_pfxlist_reachable_router(ifa->ia6_ndpr)) {
 				if (ifa->ia6_flags & IN6_IFF_DETACHED) {
 					ifa->ia6_flags &= ~IN6_IFF_DETACHED;
 					ifa->ia6_flags |= IN6_IFF_TENTATIVE;
 					nd6_dad_start((struct ifaddr *)ifa, 0);
 				}
 			} else {
 				ifa->ia6_flags |= IN6_IFF_DETACHED;
 			}
 		}
 	} else {
 		/*
 		 * No address has a reachable router: re-attach every detached
 		 * autoconfigured address and restart DAD on it.
 		 */
 		CK_STAILQ_FOREACH(ifa, &V_in6_ifaddrhead, ia_link) {
 			if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0)
 				continue;
 
 			if (ifa->ia6_flags & IN6_IFF_DETACHED) {
 				ifa->ia6_flags &= ~IN6_IFF_DETACHED;
 				ifa->ia6_flags |= IN6_IFF_TENTATIVE;
 				/* Do we need a delay in this case? */
 				nd6_dad_start((struct ifaddr *)ifa, 0);
 			}
 		}
 	}
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 	ND6_RUNLOCK();
 	ND6_ONLINK_UNLOCK();
 }
 
 static int
 nd6_prefix_onlink_rtrequest(struct nd_prefix *pr, struct ifaddr *ifa)
 {
 	struct sockaddr_dl_short sdl;
 	struct rtentry *rt;
 	struct sockaddr_in6 mask6;
 	u_long rtflags;
 	int error, a_failure, fibnum, maxfib;
 
 	/*
 	 * in6_ifinit() sets nd6_rtrequest to ifa_rtrequest for all ifaddrs.
 	 * ifa->ifa_rtrequest = nd6_rtrequest;
 	 */
 	bzero(&mask6, sizeof(mask6));
 	mask6.sin6_len = sizeof(mask6);
 	mask6.sin6_addr = pr->ndpr_mask;
 	rtflags = (ifa->ifa_flags & ~IFA_RTSELF) | RTF_UP;
 
 	bzero(&sdl, sizeof(struct sockaddr_dl_short));
 	sdl.sdl_len = sizeof(struct sockaddr_dl_short);
 	sdl.sdl_family = AF_LINK;
 	sdl.sdl_type = ifa->ifa_ifp->if_type;
 	sdl.sdl_index = ifa->ifa_ifp->if_index;
 
 	if(V_rt_add_addr_allfibs) {
 		fibnum = 0;
 		maxfib = rt_numfibs;
 	} else {
 		fibnum = ifa->ifa_ifp->if_fib;
 		maxfib = fibnum + 1;
 	}
 	a_failure = 0;
 	for (; fibnum < maxfib; fibnum++) {
 
 		rt = NULL;
 		error = in6_rtrequest(RTM_ADD,
 		    (struct sockaddr *)&pr->ndpr_prefix, (struct sockaddr *)&sdl,
 		    (struct sockaddr *)&mask6, rtflags, &rt, fibnum);
 		if (error != 0) {
 			char ip6buf[INET6_ADDRSTRLEN];
 			char ip6bufg[INET6_ADDRSTRLEN];
 			char ip6bufm[INET6_ADDRSTRLEN];
 			struct sockaddr_in6 *sin6;
 
 			sin6 = (struct sockaddr_in6 *)ifa->ifa_addr;
 			nd6log((LOG_ERR, "%s: failed to add "
 			    "route for a prefix (%s/%d) on %s, gw=%s, mask=%s, "
 			    "flags=%lx errno = %d\n", __func__,
 			    ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr),
 			    pr->ndpr_plen, if_name(pr->ndpr_ifp),
 			    ip6_sprintf(ip6bufg, &sin6->sin6_addr),
 			    ip6_sprintf(ip6bufm, &mask6.sin6_addr),
 			    rtflags, error));
 
 			/* Save last error to return, see rtinit(). */
 			a_failure = error;
 			continue;
 		}
 
 		pr->ndpr_stateflags |= NDPRF_ONLINK;
 		rt_routemsg(RTM_ADD, rt, pr->ndpr_ifp, 0, fibnum);
 	}
 
 	/* Return the last error we got. */
 	return (a_failure);
 }
 
 /*
  * Make prefix pr on-link by installing its interface route.
  *
  * Must be called with the on-link lock held and the ND6 lock not held (both
  * are asserted).  Returns EEXIST if pr is already marked NDPRF_ONLINK, 0 if
  * an equal prefix is already on-link elsewhere or no usable interface
  * address exists, and otherwise the result of nd6_prefix_onlink_rtrequest().
  */
 static int
 nd6_prefix_onlink(struct nd_prefix *pr)
 {
 	struct epoch_tracker et;
 	struct ifnet *ifp = pr->ndpr_ifp;
 	struct nd_prefix *other;
 	struct ifaddr *ifa;
 	char ip6buf[INET6_ADDRSTRLEN];
 	int error;
 
 	ND6_ONLINK_LOCK_ASSERT();
 	ND6_UNLOCK_ASSERT();
 
 	if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0)
 		return (EEXIST);
 
 	/*
 	 * Before installing the interface route, look for the same prefix on
 	 * another interface that has already installed its route.  Such a
 	 * configuration is expected to be rare, but it is explicitly allowed:
 	 * in that case there is nothing to add.
 	 */
 	ND6_RLOCK();
 	LIST_FOREACH(other, &V_nd_prefix, ndpr_entry) {
 		if (other == pr ||
 		    (other->ndpr_stateflags & NDPRF_ONLINK) == 0)
 			continue;
 
 		/* With per-FIB routes, only prefixes in our FIB conflict. */
 		if (!V_rt_add_addr_allfibs &&
 		    other->ndpr_ifp->if_fib != pr->ndpr_ifp->if_fib)
 			continue;
 
 		if (other->ndpr_plen != pr->ndpr_plen)
 			continue;
 
 		if (in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr,
 		    &other->ndpr_prefix.sin6_addr, pr->ndpr_plen)) {
 			ND6_RUNLOCK();
 			return (0);
 		}
 	}
 	ND6_RUNLOCK();
 
 	/*
 	 * Pick the interface address to associate with the route.  A
 	 * link-local address is preferred; failing that, take the first
 	 * AF_INET6 address on the interface.
 	 */
 	NET_EPOCH_ENTER(et);
 	ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp,
 	    IN6_IFF_NOTREADY | IN6_IFF_ANYCAST);
 	if (ifa == NULL) {
 		/* XXX: freebsd does not have ifa_ifwithaf */
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET6)
 				continue;
 			/* Take a reference to pair with ifa_free() below. */
 			ifa_ref(ifa);
 			break;
 		}
 		/* should we care about ia6_flags? */
 	}
 
 	if (ifa != NULL) {
 		error = nd6_prefix_onlink_rtrequest(pr, ifa);
 		ifa_free(ifa);
 	} else {
 		/*
 		 * This can still happen, when, for example, we receive an RA
 		 * containing a prefix with the L bit set and the A bit clear,
 		 * after removing all IPv6 addresses on the receiving
 		 * interface.  This should, of course, be rare though.
 		 */
 		nd6log((LOG_NOTICE,
 		    "%s: failed to find any ifaddr to add route for a "
 		    "prefix(%s/%d) on %s\n", __func__,
 		    ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr),
 		    pr->ndpr_plen, if_name(ifp)));
 		error = 0;
 	}
 	NET_EPOCH_EXIT(et);
 
 	return (error);
 }
 
 /*
  * Take prefix pr off-link by deleting its interface route from every FIB
  * (or only the interface's FIB when V_rt_add_addr_allfibs is zero).
  *
  * Must be called with the on-link lock held and the ND6 lock not held (both
  * are asserted).  Returns EEXIST if pr is not marked NDPRF_ONLINK, 0 when
  * every route deletion succeeded, and otherwise the error of the last
  * deletion that failed.
  *
  * On success NDPRF_ONLINK is cleared and every other attached, off-link
  * prefix equal to pr is offered to nd6_prefix_onlink().  Unless one of those
  * calls succeeds, lltable_prefix_free() is invoked for the prefix; note that
  * this also happens when the route deletion itself failed.
  */
 int
 nd6_prefix_offlink(struct nd_prefix *pr)
 {
 	int error = 0;
 	struct ifnet *ifp = pr->ndpr_ifp;
 	struct nd_prefix *opr;
 	struct sockaddr_in6 sa6, mask6;
 	struct rtentry *rt;
 	char ip6buf[INET6_ADDRSTRLEN];
 	uint64_t genid;
 	int fibnum, maxfib, a_failure;
 	struct epoch_tracker et;
 
 	ND6_ONLINK_LOCK_ASSERT();
 	ND6_UNLOCK_ASSERT();
 
 	if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0)
 		return (EEXIST);
 
 	/* Build the destination and netmask sockaddrs for the route. */
 	bzero(&sa6, sizeof(sa6));
 	sa6.sin6_family = AF_INET6;
 	sa6.sin6_len = sizeof(sa6);
 	bcopy(&pr->ndpr_prefix.sin6_addr, &sa6.sin6_addr,
 	    sizeof(struct in6_addr));
 	bzero(&mask6, sizeof(mask6));
 	mask6.sin6_family = AF_INET6;
 	/* sa6 and mask6 have the same type, so sizeof(sa6) == sizeof(mask6). */
 	mask6.sin6_len = sizeof(sa6);
 	bcopy(&pr->ndpr_mask, &mask6.sin6_addr, sizeof(struct in6_addr));
 
 	/* Select the half-open range [fibnum, maxfib) of FIBs to update. */
 	if (V_rt_add_addr_allfibs) {
 		fibnum = 0;
 		maxfib = rt_numfibs;
 	} else {
 		fibnum = ifp->if_fib;
 		maxfib = fibnum + 1;
 	}
 
 	a_failure = 0;
 	NET_EPOCH_ENTER(et);
 	for (; fibnum < maxfib; fibnum++) {
 		rt = NULL;
 		error = in6_rtrequest(RTM_DELETE, (struct sockaddr *)&sa6, NULL,
 		    (struct sockaddr *)&mask6, 0, &rt, fibnum);
 		if (error != 0) {
 			/* Save last error to return, see rtinit(). */
 			a_failure = error;
 			continue;
 		}
 
 		/* report route deletion to the routing socket. */
 		rt_routemsg(RTM_DELETE, rt, ifp, 0, fibnum);
 	}
 	NET_EPOCH_EXIT(et);
 	error = a_failure;
 	/*
 	 * From here on a_failure is reused with a different meaning: it stays
 	 * non-zero until some other prefix has been made on-link in our place.
 	 */
 	a_failure = 1;
 	if (error == 0) {
 		pr->ndpr_stateflags &= ~NDPRF_ONLINK;
 
 		/*
 		 * There might be the same prefix on another interface,
 		 * the prefix which could not be on-link just because we have
 		 * the interface route (see comments in nd6_prefix_onlink).
 		 * If there's one, try to make the prefix on-link on the
 		 * interface.
 		 */
 		ND6_RLOCK();
 restart:
 		LIST_FOREACH(opr, &V_nd_prefix, ndpr_entry) {
 			/*
 			 * KAME specific: detached prefixes should not be
 			 * on-link.
 			 */
 			if (opr == pr || (opr->ndpr_stateflags &
 			    (NDPRF_ONLINK | NDPRF_DETACHED)) != 0)
 				continue;
 
 			if (opr->ndpr_plen == pr->ndpr_plen &&
 			    in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr,
 			    &opr->ndpr_prefix.sin6_addr, pr->ndpr_plen)) {
 				int e;
 
 				/*
 				 * nd6_prefix_onlink() asserts that the ND6
 				 * lock is not held, so drop it and remember
 				 * the list generation to detect changes.
 				 */
 				genid = V_nd6_list_genid;
 				ND6_RUNLOCK();
 				if ((e = nd6_prefix_onlink(opr)) != 0) {
 					nd6log((LOG_ERR,
 					    "%s: failed to recover a prefix "
 					    "%s/%d from %s to %s (errno=%d)\n",
 					    __func__, ip6_sprintf(ip6buf,
 						&opr->ndpr_prefix.sin6_addr),
 					    opr->ndpr_plen, if_name(ifp),
 					    if_name(opr->ndpr_ifp), e));
 				} else
 					a_failure = 0;
 				ND6_RLOCK();
 				/* The list changed while unlocked; rescan. */
 				if (genid != V_nd6_list_genid)
 					goto restart;
 			}
 		}
 		ND6_RUNLOCK();
 	} else {
 		/* XXX: can we still set the NDPRF_ONLINK flag? */
 		nd6log((LOG_ERR,
 		    "%s: failed to delete route: %s/%d on %s (errno=%d)\n",
 		    __func__, ip6_sprintf(ip6buf, &sa6.sin6_addr),
 		    pr->ndpr_plen, if_name(ifp), error));
 	}
 
 	/*
 	 * No other prefix took over (or the deletion failed): free the
 	 * link-layer table entries covered by the prefix.
 	 */
 	if (a_failure)
 		lltable_prefix_free(AF_INET6, (struct sockaddr *)&sa6,
 		    (struct sockaddr *)&mask6, LLE_STATIC);
 
 	return (error);
 }
 
 /*
  * Create a temporary address derived from the public address ia0.
  *
  * ia0      - corresponding public address
  * forcegen - passed to in6_get_tmpifid(); forced to 1 when a generated
  *            address collides with a local one and generation is retried
  * delay    - when non-zero, IN6_IFAUPDATE_DADDELAY is passed to
  *            in6_update_ifa()
  *
  * Returns 0 on success, and also when no address is created because the
  * computed preferred lifetime does not exceed V_ip6_temp_regen_advance.
  * Returns EINVAL when no random interface ID could be obtained or the new
  * address cannot be found after creation, EEXIST when every generated
  * address collided with a local one, or the error from in6_update_ifa().
  */
 int
 in6_tmpifadd(const struct in6_ifaddr *ia0, int forcegen, int delay)
 {
 	struct ifnet *ifp = ia0->ia_ifa.ifa_ifp;
 	struct in6_ifaddr *newia;
 	struct in6_aliasreq ifra;
 	int error;
 	int trylimit = 3;	/* XXX: adhoc value */
 	int updateflags;
 	u_int32_t randid[2];
 	time_t vltime0, pltime0;
 
 	in6_prepare_ifra(&ifra, &ia0->ia_addr.sin6_addr,
 	    &ia0->ia_prefixmask.sin6_addr);
 
 	ifra.ifra_addr = ia0->ia_addr;	/* XXX: do we need this ? */
 
   again:
 	/*
 	 * Clear the IFID bits on every attempt.  The random IFID is merged in
 	 * with "|=" below, so without this a retry would OR the new IFID on
 	 * top of the bits left by the previous attempt instead of replacing
 	 * them.  On the first pass this clears the IFID of the public address.
 	 */
 	IN6_MASK_ADDR(&ifra.ifra_addr.sin6_addr,
 	    &ifra.ifra_prefixmask.sin6_addr);
 
 	if (in6_get_tmpifid(ifp, (u_int8_t *)randid,
 	    (const u_int8_t *)&ia0->ia_addr.sin6_addr.s6_addr[8], forcegen)) {
 		nd6log((LOG_NOTICE, "%s: failed to find a good random IFID\n",
 		    __func__));
 		return (EINVAL);
 	}
 	/* Merge only the IFID bits that lie outside the prefix mask. */
 	ifra.ifra_addr.sin6_addr.s6_addr32[2] |=
 	    (randid[0] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[2]));
 	ifra.ifra_addr.sin6_addr.s6_addr32[3] |=
 	    (randid[1] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[3]));
 
 	/*
 	 * in6_get_tmpifid() quite likely provided a unique interface ID.
 	 * However, we may still have a chance to see collision, because
 	 * there may be a time lag between generation of the ID and generation
 	 * of the address.  So, we'll do one more sanity check.
 	 */
 
 	if (in6_localip(&ifra.ifra_addr.sin6_addr) != 0) {
 		if (trylimit-- > 0) {
 			forcegen = 1;
 			goto again;
 		}
 
 		/* Give up.  Something strange should have happened.  */
 		nd6log((LOG_NOTICE, "%s: failed to find a unique random IFID\n",
 		    __func__));
 		return (EEXIST);
 	}
 
 	/*
 	 * The Valid Lifetime is the lower of the Valid Lifetime of the
 	 * public address or TEMP_VALID_LIFETIME.
 	 * The Preferred Lifetime is the lower of the Preferred Lifetime
 	 * of the public address or TEMP_PREFERRED_LIFETIME -
 	 * DESYNC_FACTOR.
 	 */
 	if (ia0->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
 		/* Remaining valid lifetime of the public address. */
 		vltime0 = IFA6_IS_INVALID(ia0) ? 0 :
 		    (ia0->ia6_lifetime.ia6t_vltime -
 		    (time_uptime - ia0->ia6_updatetime));
 		if (vltime0 > V_ip6_temp_valid_lifetime)
 			vltime0 = V_ip6_temp_valid_lifetime;
 	} else
 		vltime0 = V_ip6_temp_valid_lifetime;
 	if (ia0->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
 		/* Remaining preferred lifetime of the public address. */
 		pltime0 = IFA6_IS_DEPRECATED(ia0) ? 0 :
 		    (ia0->ia6_lifetime.ia6t_pltime -
 		    (time_uptime - ia0->ia6_updatetime));
 		if (pltime0 > V_ip6_temp_preferred_lifetime - V_ip6_desync_factor){
 			pltime0 = V_ip6_temp_preferred_lifetime -
 			    V_ip6_desync_factor;
 		}
 	} else
 		pltime0 = V_ip6_temp_preferred_lifetime - V_ip6_desync_factor;
 	ifra.ifra_lifetime.ia6t_vltime = vltime0;
 	ifra.ifra_lifetime.ia6t_pltime = pltime0;
 
 	/*
 	 * A temporary address is created only if this calculated Preferred
 	 * Lifetime is greater than REGEN_ADVANCE time units.
 	 */
 	if (ifra.ifra_lifetime.ia6t_pltime <= V_ip6_temp_regen_advance)
 		return (0);
 
 	/* XXX: scope zone ID? */
 
 	ifra.ifra_flags |= (IN6_IFF_AUTOCONF|IN6_IFF_TEMPORARY);
 
 	/* allocate ifaddr structure, link into chain, etc. */
 	updateflags = 0;
 	if (delay)
 		updateflags |= IN6_IFAUPDATE_DADDELAY;
 	if ((error = in6_update_ifa(ifp, &ifra, NULL, updateflags)) != 0)
 		return (error);
 
 	newia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr);
 	if (newia == NULL) {	/* XXX: can it happen? */
 		nd6log((LOG_ERR,
 		    "%s: ifa update succeeded, but we got no ifaddr\n",
 		    __func__));
 		return (EINVAL); /* XXX */
 	}
 	/* The temporary address shares the public address's prefix. */
 	newia->ia6_ndpr = ia0->ia6_ndpr;
 	newia->ia6_ndpr->ndpr_addrcnt++;
 	ifa_free(&newia->ia_ifa);
 
 	/*
 	 * A newly added address might affect the status of other addresses.
 	 * XXX: when the temporary address is generated with a new public
 	 * address, the onlink check is redundant.  However, it would be safe
 	 * to do the check explicitly everywhere a new address is generated,
 	 * and, in fact, we surely need the check when we create a new
 	 * temporary address due to deprecation of an old temporary address.
 	 */
 	pfxlist_onlink_check();
 
 	return (0);
 }
 
 /*
  * Route-walk filter handed to rt_foreach_fib_walk_del() by rt6_flush().
  *
  * arg points to the struct in6_addr of the gateway being flushed.  Returns 1
  * only for a non-static host route whose next hop has an AF_INET6 gateway
  * equal to that address, and 0 for everything else.  rt itself is not
  * examined.
  */
 static int
 rt6_deleteroute(const struct rtentry *rt, const struct nhop_object *nh,
     void *arg)
 {
 	struct in6_addr *gate = (struct in6_addr *)arg;
 	int nh_rt_flags;
 
 	if (nh->gw_sa.sa_family != AF_INET6)
 		return (0);
 
 	if (!IN6_ARE_ADDR_EQUAL(gate, &nh->gw6_sa.sin6_addr))
 		return (0);
 
 	/*
 	 * Do not delete a static route.
 	 * XXX: this seems to be a bit ad-hoc. Should we consider the
 	 * 'cloned' bit instead?
 	 */
 	nh_rt_flags = nhop_get_rtflags(nh);
 	if ((nh_rt_flags & RTF_STATIC) != 0)
 		return (0);
 
 	/*
 	 * We delete only host route. This means, in particular, we don't
 	 * delete default route.
 	 */
 	if ((nh_rt_flags & RTF_HOST) == 0)
 		return (0);
 
 	return (1);
 }
 
 /*
  * Remove the routing table entries selected by rt6_deleteroute() for the
  * given gateway, walking the AF_INET6 tables of all FIBs.
  * Gateways that are not link-local are ignored.  ifp is not used.
  * XXX: this function causes search through all entries of routing table, so
  * it shouldn't be called when acting as a router.
  */
 void
 rt6_flush(struct in6_addr *gateway, struct ifnet *ifp)
 {
 
 	/* Only link-local gateway addresses are handled. */
 	if (IN6_IS_ADDR_LINKLOCAL(gateway)) {
 		/* XXX Do we really need to walk any but the default FIB? */
 		rt_foreach_fib_walk_del(AF_INET6, rt6_deleteroute,
 		    (void *)gateway);
 	}
 }
 
 /*
  * Select the default interface by index; 0 clears the selection.
  * Returns EINVAL when ifindex is outside [0, V_if_index] or is non-zero and
  * names no interface, and 0 otherwise.
  */
 int
 nd6_setdefaultiface(int ifindex)
 {
 
 	if (ifindex < 0 || ifindex > V_if_index)
 		return (EINVAL);
 	if (ifindex != 0 && ifnet_byindex(ifindex) == NULL)
 		return (EINVAL);
 
 	/* Nothing to do when the selection is unchanged. */
 	if (V_nd6_defifindex == ifindex)
 		return (0);
 
 	V_nd6_defifindex = ifindex;
 	V_nd6_defifp = (V_nd6_defifindex > 0) ?
 	    ifnet_byindex(V_nd6_defifindex) : NULL;
 
 	/*
 	 * Our current implementation assumes one-to-one mapping between
 	 * interfaces and links, so it would be natural to use the
 	 * default interface as the default link.
 	 */
 	scope6_setdefault(V_nd6_defifp);
 
 	return (0);
 }
 
 /*
  * Report whether the default router list currently has no entries.
  */
 bool
 nd6_defrouter_list_empty(void)
 {
 	bool empty;
 
 	empty = TAILQ_EMPTY(&V_nd6_defrouter);
 	return (empty);
 }
 
 void
 nd6_defrouter_timer(void)
 {
 	struct nd_defrouter *dr, *ndr;
 	struct nd6_drhead drq;
 
 	TAILQ_INIT(&drq);
 
 	ND6_WLOCK();
 	TAILQ_FOREACH_SAFE(dr, &V_nd6_defrouter, dr_entry, ndr)
 		if (dr->expire && dr->expire < time_uptime)
 			defrouter_unlink(dr, &drq);
 	ND6_WUNLOCK();
 
 	while ((dr = TAILQ_FIRST(&drq)) != NULL) {
 		TAILQ_REMOVE(&drq, dr, dr_entry);
 		defrouter_del(dr);
 	}
 }
 
 /*
  * Nuke default router list entries toward ifp.
  * We defer removal of default router list entries that is installed in the
  * routing table, in order to keep additional side effects as small as possible.
  */
 void
 nd6_defrouter_purge(struct ifnet *ifp)
 {
 	struct nd_defrouter *dr, *ndr;
 	struct nd6_drhead drq;
 
 	TAILQ_INIT(&drq);
 
 	ND6_WLOCK();
 	TAILQ_FOREACH_SAFE(dr, &V_nd6_defrouter, dr_entry, ndr) {
 		if (dr->installed)
 			continue;
 		if (dr->ifp == ifp)
 			defrouter_unlink(dr, &drq);
 	}
 	TAILQ_FOREACH_SAFE(dr, &V_nd6_defrouter, dr_entry, ndr) {
 		if (!dr->installed)
 			continue;
 		if (dr->ifp == ifp)
 			defrouter_unlink(dr, &drq);
 	}
 	ND6_WUNLOCK();
 
 	/* Delete the unlinked router objects. */
 	while ((dr = TAILQ_FIRST(&drq)) != NULL) {
 		TAILQ_REMOVE(&drq, dr, dr_entry);
 		defrouter_del(dr);
 	}
 }
 
 void
 nd6_defrouter_flush_all(void)
 {
 	struct nd_defrouter *dr;
 	struct nd6_drhead drq;
 
 	TAILQ_INIT(&drq);
 
 	ND6_WLOCK();
 	while ((dr = TAILQ_FIRST(&V_nd6_defrouter)) != NULL)
 		defrouter_unlink(dr, &drq);
 	ND6_WUNLOCK();
 
 	while ((dr = TAILQ_FIRST(&drq)) != NULL) {
 		TAILQ_REMOVE(&drq, dr, dr_entry);
 		defrouter_del(dr);
 	}
 }
 
 /*
  * Initialize the default router list head (V_nd6_defrouter) to empty.
  */
 void
 nd6_defrouter_init(void)
 {
 
 	TAILQ_INIT(&V_nd6_defrouter);
 }
 
 /*
  * Read-only sysctl handler exporting the default router list as a sequence
  * of struct in6_defrouter records.  Attempts to write return EPERM.
  * Returns 0 on success or the first error from sysctl_wire_old_buffer(),
  * sa6_recoverscope() or SYSCTL_OUT().
  */
 static int
 nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS)
 {
 	struct in6_defrouter d;
 	struct nd_defrouter *dr;
 	int error;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 
 	/* Fields that are identical for every exported record. */
 	bzero(&d, sizeof(d));
 	d.rtaddr.sin6_family = AF_INET6;
 	d.rtaddr.sin6_len = sizeof(d.rtaddr);
 
 	ND6_RLOCK();
 	TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry) {
 		d.rtaddr.sin6_addr = dr->rtaddr;
 		error = sa6_recoverscope(&d.rtaddr);
 		if (error == 0) {
 			d.flags = dr->raflags;
 			d.rtlifetime = dr->rtlifetime;
 			/* Rebase expiry from time_uptime to time_second. */
 			d.expire = dr->expire + (time_second - time_uptime);
 			d.if_index = dr->ifp->if_index;
 			error = SYSCTL_OUT(req, &d, sizeof(d));
 		}
 		if (error != 0)
 			break;
 	}
 	ND6_RUNLOCK();
 
 	return (error);
 }
 SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_DRLIST, nd6_drlist,
 	CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
 	NULL, 0, nd6_sysctl_drlist, "S,in6_defrouter",
 	"NDP default router list");