Index: projects/routing/sys/net/route.c
===================================================================
--- projects/routing/sys/net/route.c	(revision 274335)
+++ projects/routing/sys/net/route.c	(revision 274336)
@@ -1,1994 +1,2013 @@
 /*-
  * Copyright (c) 1980, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)route.c	8.3.1.1 (Berkeley) 2/23/95
  * $FreeBSD$
  */
 /************************************************************************
  * Note: In this file a 'fib' is a "forwarding information base"	*
  * Which is the new name for an in kernel routing (next hop) table.	*
  ***********************************************************************/
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_route.h"
 #include "opt_sctp.h"
 #include "opt_mrouting.h"
 #include "opt_mpath.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/syslog.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/rmlock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
 #include <sys/proc.h>
 #include <sys/domain.h>
 #include <sys/kernel.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/route_internal.h>
 #include <net/vnet.h>
 #include <net/flowtable.h>
 
 #ifdef RADIX_MPATH
 #include <net/radix_mpath.h>
 #endif
 
 #include <netinet/in.h>
 #include <netinet/ip_mroute.h>
 
 #include <vm/uma.h>
 
 #define	RT_MAXFIBS	UINT16_MAX
 
 /* Kernel config default option. */
 #ifdef ROUTETABLES
 #if ROUTETABLES <= 0
 #error "ROUTETABLES defined too low"
 #endif
 #if ROUTETABLES > RT_MAXFIBS
 #error "ROUTETABLES defined too big"
 #endif
 #define	RT_NUMFIBS	ROUTETABLES
 #endif /* ROUTETABLES */
 /* Initialize to default if not otherwise set. */
 #ifndef	RT_NUMFIBS
 #define	RT_NUMFIBS	1
 #endif
 
 #if defined(INET) || defined(INET6)
 #ifdef SCTP
 extern void sctp_addr_change(struct ifaddr *ifa, int cmd);
 #endif /* SCTP */
 #endif
 
 
 /* This is read-only.. */
 u_int rt_numfibs = RT_NUMFIBS;
 SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RDTUN, &rt_numfibs, 0, "");
 
 /*
  * By default add routes to all fibs for new interfaces.
  * Once this is set to 0 then only allocate routes on interface
  * changes for the FIB of the caller when adding a new set of addresses
-  * to an interface.  XXX this is a shotgun aproach to a problem that needs
+  * to an interface.  XXX this is a shotgun approach to a problem that needs
  * a more fine grained solution.. that will come.
  * XXX also has the problems getting the FIB from curthread which will not
  * always work given the fib can be overridden and prefixes can be added
  * from the network stack context.
  */
 VNET_DEFINE(u_int, rt_add_addr_allfibs) = 1;
 SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RWTUN | CTLFLAG_VNET,
     &VNET_NAME(rt_add_addr_allfibs), 0, "");
 
 VNET_DEFINE(struct rtstat, rtstat);
 #define	V_rtstat	VNET(rtstat)
 
 VNET_DEFINE(struct rib_head *, rt_tables);
 #define	V_rt_tables	VNET(rt_tables)
 
 VNET_DEFINE(int, rttrash);		/* routes not in table but not freed */
 #define	V_rttrash	VNET(rttrash)
 
 
 /*
  * Convert a 'struct radix_node *' to a 'struct rtentry *'.
  * The operation can be done safely (in this code) because a
  * 'struct rtentry' starts with two 'struct radix_node''s, the first
  * one representing leaf nodes in the routing tree, which is
  * what the code in radix.c passes us as a 'struct radix_node'.
  *
  * But because there are a lot of assumptions in this conversion,
  * do not cast explicitly, but always use the macro below.
  */
 #define RNTORT(p)	((struct rtentry *)(p))
 
 static VNET_DEFINE(uma_zone_t, rtzone);		/* Routing table UMA zone. */
 #define	V_rtzone	VNET(rtzone)
 
 static int rtrequest1_fib_change(struct rib_head *, struct rt_addrinfo *,
     struct rtentry **, u_int);
 static void rt_setmetrics(const struct rt_addrinfo *, struct rtentry *);
 static int rt_ifdelroute(struct rtentry *rt, void *arg);
 
 /*
  * handler for net.my_fibnum
  */
 static int
 sysctl_my_fibnum(SYSCTL_HANDLER_ARGS)
 {
         int fibnum;
         int error;
  
         fibnum = curthread->td_proc->p_fibnum;
         error = sysctl_handle_int(oidp, &fibnum, 0, req);
         return (error);
 }
 
 SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD,
             NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller");
 
 static __inline struct rib_head **
 rt_tables_get_rnh_ptr(int table, int fam)
 {
 	struct rib_head **rh;
 
 	KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds.",
 	    __func__));
 	KASSERT(fam >= 0 && fam < (AF_MAX+1), ("%s: fam out of bounds.",
 	    __func__));
 
 	/* rh is [fib=0][af=0]. */
 	rh = (struct rib_head **)V_rt_tables;
 	/* Get the offset to the requested table and fam. */
 	rh += table * (AF_MAX+1) + fam;
 
 	return (rh);
 }
 
 struct rib_head *
 rt_tables_get_rnh(int table, int fam)
 {
 
 	return (*rt_tables_get_rnh_ptr(table, fam));
 }
 
 /*
-  * route initialization must occur before ip6_init2(), which happenas at
+  * route initialization must occur before ip6_init2(), which happens at
  * SI_ORDER_MIDDLE.
  */
 static void
 route_init(void)
 {
 
 	/* whack the tunable ints into  line. */
 	if (rt_numfibs > RT_MAXFIBS)
 		rt_numfibs = RT_MAXFIBS;
 	if (rt_numfibs == 0)
 		rt_numfibs = 1;
 }
 SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0);
 
 static void
 vnet_route_init(const void *unused __unused)
 {
 	struct domain *dom;
 	struct rib_head **rh;
 	int table;
 	int fam;
 
 	V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) *
 	    sizeof(struct rib_head *), M_RTABLE, M_WAITOK|M_ZERO);
 
 	V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	for (dom = domains; dom; dom = dom->dom_next) {
 		if (dom->dom_rtattach == NULL)
 			continue;
 
 		for  (table = 0; table < rt_numfibs; table++) {
 			fam = dom->dom_family;
 			if (table != 0 && fam != AF_INET6 && fam != AF_INET)
 				break;
 
 			rh = rt_tables_get_rnh_ptr(table, fam);
 			if (rh == NULL)
 				panic("%s: rh NULL", __func__);
 			dom->dom_rtattach((void **)rh, 0);
 		}
 	}
 }
 VNET_SYSINIT(vnet_route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
     vnet_route_init, 0);
 
 #ifdef VIMAGE
 static void
 vnet_route_uninit(const void *unused __unused)
 {
 	int table;
 	int fam;
 	struct domain *dom;
 	struct rib_head **rh;
 
 	for (dom = domains; dom; dom = dom->dom_next) {
 		if (dom->dom_rtdetach == NULL)
 			continue;
 
 		for (table = 0; table < rt_numfibs; table++) {
 			fam = dom->dom_family;
 
 			if (table != 0 && fam != AF_INET6 && fam != AF_INET)
 				break;
 
 			rh = rt_tables_get_rnh_ptr(table, fam);
 			if (rh == NULL)
 				panic("%s: rh NULL", __func__);
 			dom->dom_rtdetach((void **)rh, 0);
 		}
 	}
 
 	free(V_rt_tables, M_RTABLE);
 	uma_zdestroy(V_rtzone);
 }
 VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
     vnet_route_uninit, 0);
 #endif
 
 struct rib_head *
 rt_table_init(int offset)
 {
 	struct rib_head *rh;
 
 	rh = malloc(sizeof(struct rib_head), M_RTABLE, M_WAITOK | M_ZERO);
 
 	/* XXX: These details should be hidded inside radix.c */
 	/* Init masks tree */
 	rn_inithead_internal(&rh->head, rh->rnh_nodes, offset);
 	rn_inithead_internal(&rh->rmhead.head, rh->rmhead.mask_nodes, 0);
 	rh->head.s.rnh_masks = &rh->rmhead.head;
 	rh->rmhead.head.s.mask_nodes = rh->rmhead.mask_nodes;
 
 	/* Init locks */
-	rw_init(&rh->rib_lock, "rib head");
+	rm_init(&rh->rib_lock, "rib head run");		/* runtime lock: rmlock(9) */
+	rw_init(&rh->rib_cfglock, "rib head cfg");	/* config lock: rwlock(9) */
 
 	/* Finally, set base callbacks */
 	rh->rnh_addaddr = rn_addroute;
 	rh->rnh_deladdr = rn_delete;
 	rh->rnh_matchaddr = rn_match;
 	rh->rnh_lookup = rn_lookup;
 	rh->rnh_walktree = rn_walktree;
 	rh->rnh_walktree_from = rn_walktree_from;
 
 	return (rh);
 }
 
 void
 rt_table_destroy(struct rib_head *rh)
 {
 
 	/* Assume table is already empty */
-	rw_destroy(&rh->rib_lock);
+	rw_destroy(&rh->rib_cfglock);	/* config rwlock, set up in rt_table_init() */
+	rm_destroy(&rh->rib_lock);	/* runtime rmlock, set up in rt_table_init() */
 	free(rh, M_RTABLE);
 }
 
 
 #ifndef _SYS_SYSPROTO_H_
 struct setfib_args {
 	int     fibnum;
 };
 #endif
 int
 sys_setfib(struct thread *td, struct setfib_args *uap)
 {
 	if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs)
 		return EINVAL;
 	td->td_proc->p_fibnum = uap->fibnum;
 	return (0);
 }
 
 /*
  * Packet routing routines.
  */
 
 /*
  * Legacy function for SCTP support.
  */
 void
 rtalloc_ign(struct route *ro, u_long ignore)
 {
 	struct rtentry *rt;
 
 	if ((rt = ro->ro_rt) != NULL) {
 		if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
 			return;
 		RTFREE(rt);
 		ro->ro_rt = NULL;
 	}
 	ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, RT_DEFAULT_FIB);
 	if (ro->ro_rt)
 		RT_UNLOCK(ro->ro_rt);
 }
 
 
 /*
  * Look up the route that matches the address given
  * Or, at least try.. Create a cloned route if needed.
  *
  * The returned route, if any, is locked.
  */
 struct rtentry *
 rtalloc1(struct sockaddr *dst, int report, u_long ignflags)
 {
 
 	return (rtalloc1_fib(dst, report, ignflags, RT_DEFAULT_FIB));
 }
 
 struct rtentry *
 rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags,
 		    u_int fibnum)
 {
 	struct rib_head *rh;
 	struct radix_node *rn;
 	struct rtentry *newrt;
 	struct rt_addrinfo info;
 	int err = 0, msgtype = RTM_MISS;
 	int needlock;
+	RIB_LOCK_READER;
 
 	KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum"));
 	rh = rt_tables_get_rnh(fibnum, dst->sa_family);
 	newrt = NULL;
 	if (rh == NULL)
 		goto miss;
 
 	/*
 	 * Look up the address in the table for that Address Family
 	 */
 	needlock = !(ignflags & RTF_RNH_LOCKED);
 	if (needlock)
 		RIB_RLOCK(rh);
 #ifdef INVARIANTS	
 	else
 		RIB_LOCK_ASSERT(rh);
 #endif
 	rn = rh->rnh_matchaddr(dst, &rh->head);
 	if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
 		newrt = RNTORT(rn);
 		RT_LOCK(newrt);
 		RT_ADDREF(newrt);
 		if (needlock)
 			RIB_RUNLOCK(rh);
 		goto done;
 
 	} else if (needlock)
 		RIB_RUNLOCK(rh);
 	
 	/*
 	 * Either we hit the root or couldn't find any match,
 	 * Which basically means
 	 * "caint get there frm here"
 	 */
 miss:
 	V_rtstat.rts_unreach++;
 
 	if (report) {
 		/*
 		 * If required, report the failure to the supervising
 		 * Authorities.
 		 * For a delete, this is not an error. (report == 0)
 		 */
 		bzero(&info, sizeof(info));
 		info.rti_info[RTAX_DST] = dst;
 		rt_missmsg_fib(msgtype, &info, 0, err, fibnum);
 	}	
 done:
 	if (newrt)
 		RT_LOCK_ASSERT(newrt);
 	return (newrt);
 }
 
 /*
  * Remove a reference count from an rtentry.
  * If the count gets low enough, take it out of the routing table
  */
 void
 rtfree(struct rtentry *rt)
 {
 	struct rib_head *rh;
 
 	KASSERT(rt != NULL,("%s: NULL rt", __func__));
 	rh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family);
 	KASSERT(rh != NULL,("%s: NULL rh", __func__));
 
 	RT_LOCK_ASSERT(rt);
 
 	/*
 	 * The callers should use RTFREE_LOCKED() or RTFREE(), so
 	 * we should come here exactly with the last reference.
 	 */
 	RT_REMREF(rt);
 	if (rt->rt_refcnt > 0) {
 		log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt);
 		goto done;
 	}
 
 	/*
 	 * On last reference give the "close method" a chance
 	 * to cleanup private state.  This also permits (for
 	 * IPv4 and IPv6) a chance to decide if the routing table
 	 * entry should be purged immediately or at a later time.
 	 * When an immediate purge is to happen the close routine
 	 * typically calls rtexpunge which clears the RTF_UP flag
 	 * on the entry so that the code below reclaims the storage.
 	 */
 	if (rt->rt_refcnt == 0 && rh->rnh_close)
 		rh->rnh_close((struct radix_node *)rt, &rh->head);
 
 	/*
 	 * If we are no longer "up" (and ref == 0)
 	 * then we can free the resources associated
 	 * with the route.
 	 */
 	if ((rt->rt_flags & RTF_UP) == 0) {
 		if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT))
 			panic("rtfree 2");
 		/*
 		 * the rtentry must have been removed from the routing table
 		 * so it is represented in rttrash.. remove that now.
 		 */
 		V_rttrash--;
 #ifdef	DIAGNOSTIC
 		if (rt->rt_refcnt < 0) {
 			printf("rtfree: %p not freed (neg refs)\n", rt);
 			goto done;
 		}
 #endif
 		/*
 		 * release references on items we hold them on..
 		 * e.g other routes and ifaddrs.
 		 */
 		if (rt->rt_ifa)
 			ifa_free(rt->rt_ifa);
 		/*
 		 * The key is separatly alloc'd so free it (see rt_setgate()).
 		 * This also frees the gateway, as they are always malloc'd
 		 * together.
 		 */
 		Free(rt_key(rt));
 
 		/*
 		 * and the rtentry itself of course
 		 */
 		uma_zfree(V_rtzone, rt);
 		return;
 	}
 done:
 	RT_UNLOCK(rt);
 }
 
 
 /*
  * Force a routing table entry to the specified
  * destination to go through the given gateway.
  * Normally called as a result of a routing redirect
  * message from the network layer.
  */
 void
 rtredirect(struct sockaddr *dst,
 	struct sockaddr *gateway,
 	struct sockaddr *netmask,
 	int flags,
 	struct sockaddr *src)
 {
 
 	rtredirect_fib(dst, gateway, netmask, flags, src, RT_DEFAULT_FIB);
 }
 
 void
 rtredirect_fib(struct sockaddr *dst,
 	struct sockaddr *gateway,
 	struct sockaddr *netmask,
 	int flags,
 	struct sockaddr *src,
 	u_int fibnum)
 {
 	struct rtentry *rt, *rt0 = NULL;
 	int error = 0;
 	short *stat = NULL;
 	struct rt_addrinfo info;
 	struct ifaddr *ifa;
 	struct rib_head *rh;
 
 	ifa = NULL;
 	rh = rt_tables_get_rnh(fibnum, dst->sa_family);
 	if (rh == NULL) {
 		error = EAFNOSUPPORT;
 		goto out;
 	}
 
 	/* verify the gateway is directly reachable */
 	if ((ifa = ifa_ifwithnet(gateway, 0, fibnum)) == NULL) {
 		error = ENETUNREACH;
 		goto out;
 	}
 	rt = rtalloc1_fib(dst, 0, 0UL, fibnum);	/* NB: rt is locked */
 	/*
 	 * If the redirect isn't from our current router for this dst,
 	 * it's either old or wrong.  If it redirects us to ourselves,
 	 * we have a routing loop, perhaps as a result of an interface
 	 * going down recently.
 	 */
 	if (!(flags & RTF_DONE) && rt &&
 	     (!sa_equal(src, rt->rt_gateway) || rt->rt_ifa != ifa))
 		error = EINVAL;
 	else if (ifa_ifwithaddr_check(gateway))
 		error = EHOSTUNREACH;
 	if (error)
 		goto done;
 	/*
 	 * Create a new entry if we just got back a wildcard entry
 	 * or the lookup failed.  This is necessary for hosts
 	 * which use routing redirects generated by smart gateways
 	 * to dynamically build the routing tables.
 	 */
 	if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2))
 		goto create;
 	/*
 	 * Don't listen to the redirect if it's
 	 * for a route to an interface.
 	 */
 	if (rt->rt_flags & RTF_GATEWAY) {
 		if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) {
 			/*
 			 * Changing from route to net => route to host.
 			 * Create new route, rather than smashing route to net.
 			 */
 		create:
 			rt0 = rt;
 			rt = NULL;
-		
 			flags |=  RTF_GATEWAY | RTF_DYNAMIC;
 			bzero((caddr_t)&info, sizeof(info));
 			info.rti_info[RTAX_DST] = dst;
 			info.rti_info[RTAX_GATEWAY] = gateway;
 			info.rti_info[RTAX_NETMASK] = netmask;
 			info.rti_ifa = ifa;
 			info.rti_flags = flags;
 			if (rt0 != NULL)
 				RT_UNLOCK(rt0);	/* drop lock to avoid LOR with rh */
 			error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum);
 			if (rt != NULL) {
 				RT_LOCK(rt);
 				if (rt0 != NULL)
 					EVENTHANDLER_INVOKE(route_redirect_event, rt0, rt, dst);
 				flags = rt->rt_flags;
 			}
 			if (rt0 != NULL)
 				RTFREE(rt0);
-			
 			stat = &V_rtstat.rts_dynamic;
 		} else {
 			struct rtentry *gwrt;
 
 			/*
 			 * Smash the current notion of the gateway to
 			 * this destination.  Should check about netmask!!!
 			 */
 			rt->rt_flags |= RTF_MODIFIED;
 			flags |= RTF_MODIFIED;
 			stat = &V_rtstat.rts_newgateway;
 			/*
 			 * add the key and gateway (in one malloc'd chunk).
 			 */
 			RT_UNLOCK(rt);
+			RIB_CFG_WLOCK(rh);
 			RIB_WLOCK(rh);
 			RT_LOCK(rt);
 			rt_setgate(rt, rt_key(rt), gateway);
-			gwrt = rtalloc1(gateway, 1, RTF_RNH_LOCKED);
+			gwrt = rtalloc1_fib(gateway, 1, RTF_RNH_LOCKED, fibnum);
 			RIB_WUNLOCK(rh);
+			RIB_CFG_WUNLOCK(rh);
 			EVENTHANDLER_INVOKE(route_redirect_event, rt, gwrt, dst);
-			RTFREE_LOCKED(gwrt);
+			/* The gateway lookup may miss, leaving gwrt NULL. */
+			if (gwrt != NULL)
+				RTFREE_LOCKED(gwrt);
 		}
 	} else
 		error = EHOSTUNREACH;
 done:
 	if (rt)
 		RTFREE_LOCKED(rt);
 out:
 	if (error)
 		V_rtstat.rts_badredirect++;
 	else if (stat != NULL)
 		(*stat)++;
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_DST] = dst;
 	info.rti_info[RTAX_GATEWAY] = gateway;
 	info.rti_info[RTAX_NETMASK] = netmask;
 	info.rti_info[RTAX_AUTHOR] = src;
 	rt_missmsg_fib(RTM_REDIRECT, &info, flags, error, fibnum);
 	if (ifa != NULL)
 		ifa_free(ifa);
 }
 
 int
 rtioctl(u_long req, caddr_t data)
 {
 
 	return (rtioctl_fib(req, data, RT_DEFAULT_FIB));
 }
 
 /*
  * Routing table ioctl interface.
  */
 int
 rtioctl_fib(u_long req, caddr_t data, u_int fibnum)
 {
 
 	/*
 	 * If more ioctl commands are added here, make sure the proper
 	 * super-user checks are being performed because it is possible for
 	 * prison-root to make it this far if raw sockets have been enabled
 	 * in jails.
 	 */
 #ifdef INET
 	/* Multicast goop, grrr... */
 	return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP;
 #else /* INET */
 	return ENXIO;
 #endif /* INET */
 }
 
 struct ifaddr *
 ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway,
 				u_int fibnum)
 {
 	struct ifaddr *ifa;
 	int not_found = 0;
 
 	if ((flags & RTF_GATEWAY) == 0) {
 		/*
 		 * If we are adding a route to an interface,
 		 * and the interface is a pt to pt link
 		 * we should search for the destination
 		 * as our clue to the interface.  Otherwise
 		 * we can use the local address.
 		 */
 		ifa = NULL;
 		if (flags & RTF_HOST)
 			ifa = ifa_ifwithdstaddr(dst, fibnum);
 		if (ifa == NULL)
 			ifa = ifa_ifwithaddr(gateway);
 	} else {
 		/*
 		 * If we are adding a route to a remote net
 		 * or host, the gateway may still be on the
 		 * other end of a pt to pt link.
 		 */
 		ifa = ifa_ifwithdstaddr(gateway, fibnum);
 	}
 	if (ifa == NULL)
 		ifa = ifa_ifwithnet(gateway, 0, fibnum);
 	if (ifa == NULL) {
 		struct rtentry *rt = rtalloc1_fib(gateway, 0, RTF_RNH_LOCKED, fibnum);
 		if (rt == NULL)
 			return (NULL);
 		/*
 		 * dismiss a gateway that is reachable only
 		 * through the default router
 		 */
 		switch (gateway->sa_family) {
 		case AF_INET:
 			if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY)
 				not_found = 1;
 			break;
 		case AF_INET6:
 			if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr))
 				not_found = 1;
 			break;
 		default:
 			break;
 		}
 		if (!not_found && rt->rt_ifa != NULL) {
 			ifa = rt->rt_ifa;
 			ifa_ref(ifa);
 		}
 		RT_REMREF(rt);
 		RT_UNLOCK(rt);
 		if (not_found || ifa == NULL)
 			return (NULL);
 	}
 	if (ifa->ifa_addr->sa_family != dst->sa_family) {
 		struct ifaddr *oifa = ifa;
 		ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp);
 		if (ifa == NULL)
 			ifa = oifa;
 		else
 			ifa_free(oifa);
 	}
 	return (ifa);
 }
 
 /*
  * Do appropriate manipulations of a routing tree given
  * all the bits of info needed
  */
 int
 rtrequest(int req,
 	struct sockaddr *dst,
 	struct sockaddr *gateway,
 	struct sockaddr *netmask,
 	int flags,
 	struct rtentry **ret_nrt)
 {
 
 	return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt,
 	    RT_DEFAULT_FIB));
 }
 
 int
 rtrequest_fib(int req,
 	struct sockaddr *dst,
 	struct sockaddr *gateway,
 	struct sockaddr *netmask,
 	int flags,
 	struct rtentry **ret_nrt,
 	u_int fibnum)
 {
 	struct rt_addrinfo info;
 
 	if (dst->sa_len == 0)
 		return(EINVAL);
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_flags = flags;
 	info.rti_info[RTAX_DST] = dst;
 	info.rti_info[RTAX_GATEWAY] = gateway;
 	info.rti_info[RTAX_NETMASK] = netmask;
 	return rtrequest1_fib(req, &info, ret_nrt, fibnum);
 }
 
 
 void
 rt_foreach_fib(int af, rt_setwarg_t *setwa_f, rt_walktree_f_t *wa_f, void *arg)
 {
 	struct rib_head *rh;
 	uint32_t fibnum;
 	int i;
 
 	for (fibnum = 0; fibnum < rt_numfibs; fibnum++) {
 		/* Do we want some specific family? */
 		if (af != AF_UNSPEC) {
 			rh = rt_tables_get_rnh(fibnum, af);
 			if (rh == NULL)
 				continue;
 			if (setwa_f != NULL)
-				setwa_f(rh, fibnum, i, arg);
+				setwa_f(rh, fibnum, af, arg);
 
+			RIB_CFG_WLOCK(rh);
+			/* Cfg lock first, then runtime lock, held for the walk */
 			RIB_WLOCK(rh);
 			rh->rnh_walktree(&rh->head, (walktree_f_t *)wa_f, arg);
 			RIB_WUNLOCK(rh);
+			RIB_CFG_WUNLOCK(rh);
 			continue;
 		}
 
 		for (i = 1; i <= AF_MAX; i++) {
 			rh = rt_tables_get_rnh(fibnum, i);
 			if (rh == NULL)
 				continue;
 			if (setwa_f != NULL)
 				setwa_f(rh, fibnum, i, arg);
 
+			RIB_CFG_WLOCK(rh);
+			/* Cfg lock first, then runtime lock, held for the walk */
 			RIB_WLOCK(rh);
 			rh->rnh_walktree(&rh->head, (walktree_f_t *)wa_f, arg);
 			RIB_WUNLOCK(rh);
+			RIB_CFG_WUNLOCK(rh);
 		}
 	}
 }
 
 /*
  * Delete Routes for a Network Interface
  *
  * Called for each routing entry via the rh->rnh_walktree() call above
  * to delete all route entries referencing a detaching network interface.
  *
  * Arguments:
  *	rt	pointer to rtentry
  *	arg	argument passed to rh->rnh_walktree() - detaching interface
  *
  * Returns:
  *	0	successful
  *	errno	failed - reason indicated
  */
 static int
 rt_ifdelroute(struct rtentry *rt, void *arg)
 {
 	struct ifnet	*ifp = arg;
 	int		err;
 
 	if (rt->rt_ifp != ifp)
 		return (0);
 
 	/*
 	 * Protect (sorta) against walktree recursion problems
 	 * with cloned routes
 	 */
 	if ((rt->rt_flags & RTF_UP) == 0)
 		return (0);
 
 	err = rtrequest_fib(RTM_DELETE, rt_key(rt), rt->rt_gateway,
 			rt_mask(rt),
 			rt->rt_flags | RTF_RNH_LOCKED | RTF_PINNED,
 			(struct rtentry **) NULL, rt->rt_fibnum);
 	if (err != 0)
 		log(LOG_WARNING, "rt_ifdelroute: error %d\n", err);
 
 	return (0);
 }
 
 /*
  * Delete all remaining routes using this interface
-  * Unfortuneatly the only way to do this is to slog through
+  * Unfortunately the only way to do this is to slog through
  * the entire routing table looking for routes which point
  * to this interface...oh well...
  */
 void
 rt_flushifroutes(struct ifnet *ifp)
 {
 
 	rt_foreach_fib(AF_UNSPEC, NULL, rt_ifdelroute, ifp);
 }
 
 /*
  * These (questionable) definitions of apparent local variables apply
  * to the next two functions.  XXXXXX!!!
  */
 #define	dst	info->rti_info[RTAX_DST]
 #define	gateway	info->rti_info[RTAX_GATEWAY]
 #define	netmask	info->rti_info[RTAX_NETMASK]
 #define	ifaaddr	info->rti_info[RTAX_IFA]
 #define	ifpaddr	info->rti_info[RTAX_IFP]
 #define	flags	info->rti_flags
 
 int
 rt_getifa(struct rt_addrinfo *info)
 {
 
 	return (rt_getifa_fib(info, RT_DEFAULT_FIB));
 }
 
 /*
  * Look up rt_addrinfo for a specific fib.  Note that if rti_ifa is defined,
  * it will be referenced so the caller must free it.
  */
 int
 rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum)
 {
 	struct ifaddr *ifa;
 	int error = 0;
 
 	/*
 	 * ifp may be specified by sockaddr_dl
 	 * when protocol address is ambiguous.
 	 */
 	if (info->rti_ifp == NULL && ifpaddr != NULL &&
 	    ifpaddr->sa_family == AF_LINK &&
 	    (ifa = ifa_ifwithnet(ifpaddr, 0, fibnum)) != NULL) {
 		info->rti_ifp = ifa->ifa_ifp;
 		ifa_free(ifa);
 	}
 	if (info->rti_ifa == NULL && ifaaddr != NULL)
 		info->rti_ifa = ifa_ifwithaddr(ifaaddr);
 	if (info->rti_ifa == NULL) {
 		struct sockaddr *sa;
 
 		sa = ifaaddr != NULL ? ifaaddr :
 		    (gateway != NULL ? gateway : dst);
 		if (sa != NULL && info->rti_ifp != NULL)
 			info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp);
 		else if (dst != NULL && gateway != NULL)
 			info->rti_ifa = ifa_ifwithroute(flags, dst, gateway,
 							fibnum);
 		else if (sa != NULL)
 			info->rti_ifa = ifa_ifwithroute(flags, sa, sa,
 							fibnum);
 	}
 	if ((ifa = info->rti_ifa) != NULL) {
 		if (info->rti_ifp == NULL)
 			info->rti_ifp = ifa->ifa_ifp;
 	} else
 		error = ENETUNREACH;
 	return (error);
 }
 
 /*
  * Expunges references to a route that's about to be reclaimed.
  * The route must be locked.
  */
 int
 rt_expunge(struct rib_head *rh, struct rtentry *rt)
 {
 #if !defined(RADIX_MPATH)
 	struct radix_node *rn;
 #else
 	struct rt_addrinfo info;
 	int fib;
 	struct rtentry *rt0;
 #endif
 	struct ifaddr *ifa;
 	int error = 0;
 
 	RT_LOCK_ASSERT(rt);
 	RIB_LOCK_ASSERT(rh);
 
 #ifdef RADIX_MPATH
 	fib = rt->rt_fibnum;
 	bzero(&info, sizeof(info));
 	info.rti_ifp = rt->rt_ifp;
 	info.rti_flags = RTF_RNH_LOCKED;
 	info.rti_info[RTAX_DST] = rt_key(rt);
 	info.rti_info[RTAX_GATEWAY] = rt->rt_ifa->ifa_addr;
 
 	RT_UNLOCK(rt);
 	error = rtrequest1_fib(RTM_DELETE, &info, &rt0, fib);
 
 	if (error == 0 && rt0 != NULL) {
 		rt = rt0;
 		RT_LOCK(rt);
 	} else if (error != 0) {
 		RT_LOCK(rt);
 		return (error);
 	}
 #else
 	/*
 	 * Remove the item from the tree; it should be there,
 	 * but when callers invoke us blindly it may not (sigh).
 	 */
 	rn = rh->rnh_deladdr(rt_key(rt), rt_mask(rt), &rh->head);
 	if (rn == NULL) {
 		error = ESRCH;
 		goto bad;
 	}
 	KASSERT((rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) == 0,
 		("unexpected flags 0x%x", rn->rn_flags));
 	KASSERT(rt == RNTORT(rn),
 		("lookup mismatch, rt %p rn %p", rt, rn));
 #endif /* RADIX_MPATH */
 
 	rt->rt_flags &= ~RTF_UP;
 
 	/*
 	 * Give the protocol a chance to keep things in sync.
 	 */
 	if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) {
 		struct rt_addrinfo info;
 
 		bzero((caddr_t)&info, sizeof(info));
 		info.rti_flags = rt->rt_flags;
 		info.rti_info[RTAX_DST] = rt_key(rt);
 		info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
 		info.rti_info[RTAX_NETMASK] = rt_mask(rt);
 		ifa->ifa_rtrequest(RTM_DELETE, rt, &info);
 	}
 
 	/*
 	 * one more rtentry floating around that is not
 	 * linked to the routing table.
 	 */
 	V_rttrash++;
 #if !defined(RADIX_MPATH)
 bad:
 #endif
 	return (error);
 }
 
 #if 0
 int p_sockaddr(char *buf, int buflen, struct sockaddr *s);
 int rt_print(char *buf, int buflen, struct rtentry *rt);
 
 int
 p_sockaddr(char *buf, int buflen, struct sockaddr *s)
 {
 	void *paddr = NULL;
 
 	switch (s->sa_family) {
 	case AF_INET:
 		paddr = &((struct sockaddr_in *)s)->sin_addr;
 		break;
 	case AF_INET6:
 		paddr = &((struct sockaddr_in6 *)s)->sin6_addr;
 		break;
 	}
 
 	if (paddr == NULL)
 		return (0);
 
 	if (inet_ntop(s->sa_family, paddr, buf, buflen) == NULL)
 		return (0);
 	
 	return (strlen(buf));
 }
 
 int
 rt_print(char *buf, int buflen, struct rtentry *rt)
 {
 	struct sockaddr *addr, *mask;
 	int i = 0;
 
 	addr = rt_key(rt);
 	mask = rt_mask(rt);
 
 	i = p_sockaddr(buf, buflen, addr);
 	if (!(rt->rt_flags & RTF_HOST)) {
 		buf[i++] = '/';
 		i += p_sockaddr(buf + i, buflen - i, mask);
 	}
 
 	if (rt->rt_flags & RTF_GATEWAY) {
 		buf[i++] = '>';
 		i += p_sockaddr(buf + i, buflen - i, rt->rt_gateway);
 	}
 
 	return (i);
 }
 #endif
 
 #ifdef RADIX_MPATH
 static int
 rn_mpath_update(int req, struct rt_addrinfo *info,
     struct rib_head *rh, struct rtentry **ret_nrt)
 {
 	/*
 	 * if we got multipath routes, we require users to specify
 	 * a matching RTAX_GATEWAY.
 	 */
 	struct rtentry *rt, *rto = NULL;
 	struct radix_node *rn;
 	int error = 0;
 
-	rn = rh->rnh_lookup(dst, netmask, rh);
+	rn = rh->rnh_lookup(dst, netmask, &rh->head);
 	if (rn == NULL)
 		return (ESRCH);
 	rto = rt = RNTORT(rn);
 
 	rt = rt_mpath_matchgate(rt, gateway);
 	if (rt == NULL)
 		return (ESRCH);
 	/*
 	 * this is the first entry in the chain
 	 */
 	if (rto == rt) {
 		rn = rn_mpath_next((struct radix_node *)rt);
 		/*
 		 * there is another entry, now it's active
 		 */
 		if (rn) {
 			rto = RNTORT(rn);
 			RT_LOCK(rto);
 			rto->rt_flags |= RTF_UP;
 			RT_UNLOCK(rto);
 		} else if (rt->rt_flags & RTF_GATEWAY) {
 			/*
 			 * For gateway routes, we need to 
 			 * make sure that we we are deleting
 			 * the correct gateway. 
 			 * rt_mpath_matchgate() does not 
 			 * check the case when there is only
 			 * one route in the chain.  
 			 */
 			if (gateway &&
 			    (rt->rt_gateway->sa_len != gateway->sa_len ||
 				memcmp(rt->rt_gateway, gateway, gateway->sa_len)))
 				error = ESRCH;
 			else {
 				/*
 				 * remove from tree before returning it
 				 * to the caller
 				 */
-				rn = rh->rnh_deladdr(dst, netmask, rh);
+				rn = rh->rnh_deladdr(dst, netmask, &rh->head);
 				KASSERT(rt == RNTORT(rn), ("radix node disappeared"));
 				goto gwdelete;
 			}
 			
 		}
 		/*
 		 * use the normal delete code to remove
 		 * the first entry
 		 */
 		if (req != RTM_DELETE) 
 			goto nondelete;
 
 		error = ENOENT;
 		goto done;
 	}
 		
 	/*
 	 * if the entry is 2nd and on up
 	 */
 	if ((req == RTM_DELETE) && !rt_mpath_deldup(rto, rt))
 		panic ("rtrequest1: rt_mpath_deldup");
 gwdelete:
 	RT_LOCK(rt);
 	RT_ADDREF(rt);
 	if (req == RTM_DELETE) {
 		rt->rt_flags &= ~RTF_UP;
 		/*
 		 * One more rtentry floating around that is not
 		 * linked to the routing table. rttrash will be decremented
 		 * when RTFREE(rt) is eventually called.
 		 */
 		V_rttrash++;
 	}
 	
 nondelete:
 	if (req != RTM_DELETE)
 		panic("unrecognized request %d", req);
 	
 
 	/*
 	 * If the caller wants it, then it can have it,
 	 * but it's up to it to free the rtentry as we won't be
 	 * doing it.
 	 */
 	if (ret_nrt) {
 		*ret_nrt = rt;
 		RT_UNLOCK(rt);
 	} else
 		RTFREE_LOCKED(rt);
 done:
 	return (error);
 }
 #endif
 
 /*
  * Apply a routing request to the table of the destination's address
  * family in FIB 'fibnum'.
  *
  * 'req' is RTM_ADD, RTM_DELETE or RTM_CHANGE; RTM_RESOLVE is accepted and
  * does nothing, anything else yields EOPNOTSUPP.  dst, gateway, netmask
  * and flags are macros defined before this function (and #undef'd after
  * it).  Families other than AF_INET/AF_INET6 always use RT_DEFAULT_FIB.
  *
  * Unless RTF_RNH_LOCKED is set in the flags, the table's config lock and
  * data path lock are write-locked here and dropped again at 'bad:'.  With
  * RTF_RNH_LOCKED the caller must already hold the table lock (asserted).
  *
  * If 'ret_nrt' is non-NULL it receives the affected rtentry together with
  * a reference that the caller has to release.
  *
  * Returns 0 on success or an errno value.
  */
 int
 rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
 				u_int fibnum)
 {
 	int error = 0, needlock = 0;
 	struct rtentry *rt;
 #ifdef FLOWTABLE
 	struct rtentry *rt0;
 #endif
 	struct radix_node *rn;
 	struct rib_head *rh;
 	struct ifaddr *ifa;
 	struct sockaddr *ndst;
 	struct sockaddr_storage mdst;
 #define senderr(x) { error = x ; goto bad; }
 
 	KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum"));
 	switch (dst->sa_family) {
 	case AF_INET6:
 	case AF_INET:
 		/* We support multiple FIBs. */
 		break;
 	default:
 		fibnum = RT_DEFAULT_FIB;
 		break;
 	}
 
 	/*
 	 * Find the correct routing tree to use for this Address Family
 	 */
 	rh = rt_tables_get_rnh(fibnum, dst->sa_family);
 	if (rh == NULL)
 		return (EAFNOSUPPORT);
 	needlock = ((flags & RTF_RNH_LOCKED) == 0);
 	flags &= ~RTF_RNH_LOCKED;
-	if (needlock)
+	if (needlock) {
+		RIB_CFG_WLOCK(rh);
 		RIB_WLOCK(rh);
-	else
+	} else
 		RIB_LOCK_ASSERT(rh);
 	/*
 	 * If we are adding a host route then we don't want to put
 	 * a netmask in the tree, nor do we want to clone it.
 	 */
 	if (flags & RTF_HOST)
 		netmask = NULL;
 
 	switch (req) {
 	case RTM_DELETE:
 		if (netmask) {
 			rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
 			dst = (struct sockaddr *)&mdst;
 		}
 #ifdef RADIX_MPATH
 		if (rn_mpath_capable(rh)) {
 			error = rn_mpath_update(req, info, rh, ret_nrt);
 			/*
 			 * "bad" holds true for the success case
 			 * as well
 			 */
 			if (error != ENOENT)
 				goto bad;
 			error = 0;
 		}
 #endif
 		if ((flags & RTF_PINNED) == 0) {
 			/* Check if target route can be deleted */
 			rt = (struct rtentry *)rh->rnh_lookup(dst,
 			    netmask, &rh->head);
 			if ((rt != NULL) && (rt->rt_flags & RTF_PINNED))
 				senderr(EADDRINUSE);
 		}
 
 		/*
 		 * Remove the item from the tree and return it.
 		 * Complain if it is not there and do no more processing.
 		 */
 		rn = rh->rnh_deladdr(dst, netmask, &rh->head);
 		if (rn == NULL)
 			senderr(ESRCH);
 		if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
 			panic ("rtrequest delete");
 		rt = RNTORT(rn);
 		RT_LOCK(rt);
 		RT_ADDREF(rt);
 		rt->rt_flags &= ~RTF_UP;
 
 		/*
 		 * give the protocol a chance to keep things in sync.
 		 */
 		if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest)
 			ifa->ifa_rtrequest(RTM_DELETE, rt, info);
 
 		/*
 		 * One more rtentry floating around that is not
 		 * linked to the routing table. rttrash will be decremented
 		 * when RTFREE(rt) is eventually called.
 		 */
 		V_rttrash++;
 
 		/*
 		 * If the caller wants it, then it can have it,
 		 * but it's up to it to free the rtentry as we won't be
 		 * doing it.
 		 */
 		if (ret_nrt) {
 			*ret_nrt = rt;
 			RT_UNLOCK(rt);
 		} else
 			RTFREE_LOCKED(rt);
 		break;
 	case RTM_RESOLVE:
 		/*
 		 * resolve was only used for route cloning
 		 * here for compat
 		 */
 		break;
 	case RTM_ADD:
 		if ((flags & RTF_GATEWAY) && !gateway)
 			senderr(EINVAL);
 		if (dst && gateway && (dst->sa_family != gateway->sa_family) && 
 		    (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK))
 			senderr(EINVAL);
 
 		/* Either look up an ifa (referenced) or reference the given one. */
 		if (info->rti_ifa == NULL) {
 			error = rt_getifa_fib(info, fibnum);
 			if (error)
 				senderr(error);
 		} else
 			ifa_ref(info->rti_ifa);
 		ifa = info->rti_ifa;
 		rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO);
 		if (rt == NULL) {
 			ifa_free(ifa);
 			senderr(ENOBUFS);
 		}
 		RT_LOCK_INIT(rt);
 		rt->rt_flags = RTF_UP | flags;
 		rt->rt_fibnum = fibnum;
 		/*
 		 * Add the gateway. Possibly re-malloc-ing the storage for it.
 		 */
 		RT_LOCK(rt);
 		if ((error = rt_setgate(rt, dst, gateway)) != 0) {
 			RT_LOCK_DESTROY(rt);
 			ifa_free(ifa);
 			uma_zfree(V_rtzone, rt);
 			senderr(error);
 		}
 
 		/*
 		 * point to the (possibly newly malloc'd) dest address.
 		 */
 		ndst = (struct sockaddr *)rt_key(rt);
 
 		/*
 		 * make sure it contains the value we want (masked if needed).
 		 */
 		if (netmask) {
 			rt_maskedcopy(dst, ndst, netmask);
 		} else
 			bcopy(dst, ndst, dst->sa_len);
 
 		/*
 		 * We use the ifa reference returned by rt_getifa_fib().
 		 * This moved from below so that rh->rnh_addaddr() can
 		 * examine the ifa and  ifa->ifa_ifp if it so desires.
 		 */
 		rt->rt_ifa = ifa;
 		rt->rt_ifp = ifa->ifa_ifp;
 		rt->rt_weight = 1;
 
 		rt_setmetrics(info, rt);
 
 #ifdef RADIX_MPATH
 		/* do not permit exactly the same dst/mask/gw pair */
 		if (rn_mpath_capable(rh) &&
 			rt_mpath_conflict(rh, rt, netmask)) {
 			ifa_free(rt->rt_ifa);
 			Free(rt_key(rt));
 			RT_LOCK_DESTROY(rt);
 			uma_zfree(V_rtzone, rt);
 			senderr(EEXIST);
 		}
 #endif
 
 #ifdef FLOWTABLE
 		rt0 = NULL;
 		/* "flow-table" only supports IPv6 and IPv4 at the moment. */
 		switch (dst->sa_family) {
 #ifdef INET6
 		case AF_INET6:
 #endif
 #ifdef INET
 		case AF_INET:
 #endif
 #if defined(INET6) || defined(INET)
 			rn = rh->rnh_matchaddr(dst, rh);
 			if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
 				struct sockaddr *mask;
 				u_char *m, *n;
 				int len;
 				
 				/*
 				 * compare mask to see if the new route is
 				 * more specific than the existing one
 				 */
 				rt0 = RNTORT(rn);
 				RT_LOCK(rt0);
 				RT_ADDREF(rt0);
 				RT_UNLOCK(rt0);
 				/*
 				 * A host route is already present, so 
 				 * leave the flow-table entries as is.
 				 */
 				if (rt0->rt_flags & RTF_HOST) {
 					RTFREE(rt0);
 					rt0 = NULL;
 				} else if (!(flags & RTF_HOST) && netmask) {
 					mask = rt_mask(rt0);
 					len = mask->sa_len;
 					m = (u_char *)mask;
 					n = (u_char *)netmask;
 					while (len-- > 0) {
 						if (*n != *m)
 							break;
 						n++;
 						m++;
 					}
 					if (len == 0 || (*n < *m)) {
 						RTFREE(rt0);
 						rt0 = NULL;
 					}
 				}
 			}
 #endif/* INET6 || INET */
 		}
 #endif /* FLOWTABLE */
 
 		/* XXX mtu manipulation will be done in rnh_addaddr -- itojun */
 		rn = rh->rnh_addaddr(ndst, netmask, &rh->head, rt->rt_nodes);
 		/*
 		 * If it still failed to go into the tree,
 		 * then un-make it (this should be a function)
 		 */
 		if (rn == NULL) {
 			ifa_free(rt->rt_ifa);
 			Free(rt_key(rt));
 			/*
 			 * rt_mtx was initialized and is held since the
 			 * rt_setgate() call above; tear it down before the
 			 * entry goes back to the zone, exactly as the other
 			 * failure paths of this case do.
 			 */
 			RT_LOCK_DESTROY(rt);
 			uma_zfree(V_rtzone, rt);
 #ifdef FLOWTABLE
 			if (rt0 != NULL)
 				RTFREE(rt0);
 #endif
 			senderr(EEXIST);
 		} 
 #ifdef FLOWTABLE
 		else if (rt0 != NULL) {
 			flowtable_route_flush(dst->sa_family, rt0);
 			RTFREE(rt0);
 		}
 #endif
 
 		/*
 		 * If this protocol has something to add to this then
 		 * allow it to do that as well.
 		 */
 		if (ifa->ifa_rtrequest)
 			ifa->ifa_rtrequest(req, rt, info);
 
 		/*
 		 * actually return a resultant rtentry and
 		 * give the caller a single reference.
 		 */
 		if (ret_nrt) {
 			*ret_nrt = rt;
 			RT_ADDREF(rt);
 		}
 		RT_UNLOCK(rt);
 		break;
 	case RTM_CHANGE:
 		error = rtrequest1_fib_change(rh, info, ret_nrt, fibnum);
 		break;
 	default:
 		error = EOPNOTSUPP;
 	}
 bad:
 	/* Common exit: release the table locks in reverse order of acquisition. */
-	if (needlock)
+	if (needlock) {
 		RIB_WUNLOCK(rh);
+		RIB_CFG_WUNLOCK(rh);
+	}
 	return (error);
 #undef senderr
 }
 
 #undef dst
 #undef gateway
 #undef netmask
 #undef ifaaddr
 #undef ifpaddr
 #undef flags
 
 /*
  * RTM_CHANGE handler, called from rtrequest1_fib().
  *
  * Looks up the route that exactly matches RTAX_DST/RTAX_NETMASK in 'rh'
  * and updates its metrics, gateway, interface address and flags from
  * 'info'.  No table lock is taken here.
  *
  * If 'ret_nrt' is non-NULL, the changed entry is stored there with an
  * additional reference.
  *
  * Returns 0 on success, ESRCH when no matching route exists, or the error
  * reported by rt_getifa_fib() / rt_setgate().
  */
 static int
 rtrequest1_fib_change(struct rib_head *rh, struct rt_addrinfo *info,
     struct rtentry **ret_nrt, u_int fibnum)
 {
 	struct rtentry *rt = NULL;
 	int error = 0;
 	int free_ifa = 0;
 	int family, mtu;
 
 	rt = (struct rtentry *)rh->rnh_lookup(info->rti_info[RTAX_DST],
 	    info->rti_info[RTAX_NETMASK], &rh->head);
 
 	if (rt == NULL)
 		return (ESRCH);
 
 #ifdef RADIX_MPATH
 	/*
 	 * If we got multipath routes,
 	 * we require users to specify a matching RTAX_GATEWAY.
 	 */
 	if (rn_mpath_capable(rh)) {
 		rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]);
 		if (rt == NULL)
 			return (ESRCH);
 	}
 #endif
 
 	/* The entry stays locked until the 'bad:' label below. */
 	RT_LOCK(rt);
 
 	rt_setmetrics(info, rt);
 
 	/*
 	 * New gateway could require new ifaddr, ifp;
 	 * flags may also be different; ifp may be specified
 	 * by ll sockaddr when protocol address is ambiguous
 	 */
 	if (((rt->rt_flags & RTF_GATEWAY) &&
 	    info->rti_info[RTAX_GATEWAY] != NULL) ||
 	    info->rti_info[RTAX_IFP] != NULL ||
 	    (info->rti_info[RTAX_IFA] != NULL &&
 	     !sa_equal(info->rti_info[RTAX_IFA], rt->rt_ifa->ifa_addr))) {
 
 		error = rt_getifa_fib(info, fibnum);
 		/*
 		 * Remember that info->rti_ifa has to be released on exit,
 		 * even when rt_getifa_fib() reported an error.
 		 */
 		if (info->rti_ifa != NULL)
 			free_ifa = 1;
 
 		if (error != 0)
 			goto bad;
 	}
 
 	/* Check if outgoing interface has changed */
 	if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa &&
 	    rt->rt_ifa != NULL && rt->rt_ifa->ifa_rtrequest != NULL) {
 		/* Let the old ifa see the route go away, then drop its reference. */
 		rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt, info);
 		ifa_free(rt->rt_ifa);
 	}
 	/* Update gateway address */
 	if (info->rti_info[RTAX_GATEWAY] != NULL) {
 		error = rt_setgate(rt, rt_key(rt), info->rti_info[RTAX_GATEWAY]);
 		if (error != 0)
 			goto bad;
 
 		/* RTF_GATEWAY now mirrors what the request asked for. */
 		rt->rt_flags &= ~RTF_GATEWAY;
 		rt->rt_flags |= (RTF_GATEWAY & info->rti_flags);
 	}
 
 	/* Switch to the new ifa/ifp, taking a reference of our own on the ifa. */
 	if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa) {
 		ifa_ref(info->rti_ifa);
 		rt->rt_ifa = info->rti_ifa;
 		rt->rt_ifp = info->rti_ifp;
 	}
 	/* Allow some flags to be toggled on change. */
 	rt->rt_flags &= ~RTF_FMASK;
 	rt->rt_flags |= info->rti_flags & RTF_FMASK;
 
 	/* Announce the (possibly new) route to the current ifa. */
 	if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest != NULL)
 	       rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, info);
 
 	/* Ensure route MTU is not bigger than interface MTU */
 	if (rt->rt_ifp != NULL) {
 		family = info->rti_info[RTAX_DST]->sa_family;
 		mtu = if_getmtu_family(rt->rt_ifp, family);
 		if (rt->rt_mtu > mtu)
 			rt->rt_mtu = mtu;
 	}
 
 	if (ret_nrt) {
 		*ret_nrt = rt;
 		RT_ADDREF(rt);
 	}
 bad:
 	RT_UNLOCK(rt);
 	/* Release info->rti_ifa if the rt_getifa_fib() call above left one set. */
 	if (free_ifa != 0)
 		ifa_free(info->rti_ifa);
 	return (error);
 }
 
 /*
  * Copy the metrics selected by info->rti_mflags (MTU, weight, expiration
  * time) from info->rti_rmx into the route entry.  Metrics whose RTV_* bit
  * is not set are left untouched.
  */
 static void
 rt_setmetrics(const struct rt_addrinfo *info, struct rtentry *rt)
 {
 
 	if (info->rti_mflags & RTV_MTU)
 		rt->rt_mtu = info->rti_rmx->rmx_mtu;
 
 	if (info->rti_mflags & RTV_WEIGHT)
 		rt->rt_weight = info->rti_rmx->rmx_weight;
 
 	if (info->rti_mflags & RTV_EXPIRE) {
 		/*
 		 * A non-zero rmx_expire is rebased from the time_second
 		 * clock onto the time_uptime clock; zero is stored as is.
 		 */
 		if (info->rti_rmx->rmx_expire)
 			rt->rt_expire = info->rti_rmx->rmx_expire -
 			    time_second + time_uptime;
 		else
 			rt->rt_expire = 0;
 	}
 }
 
 /*
  * Store gateway address 'gate' in route 'rt'.  The route key and the
  * gateway share one allocation, key first.
  *
  * When the route has no gateway yet, or the current gateway slot is
  * smaller than 'gate', a new block of SA_SIZE(dst) + SA_SIZE(gate) bytes
  * is allocated, 'dst' is copied into it as the new key and the old key
  * block is freed.  Otherwise the existing gateway slot is overwritten.
  *
  * The route lock must be held (asserted).
  * Returns 0 on success or ENOBUFS if the allocation fails.
  */
 int
 rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate)
 {
 	/* XXX dst may be overwritten, can we move this to below */
 	int dlen = SA_SIZE(dst), glen = SA_SIZE(gate);
 #ifdef INVARIANTS
 	struct rib_head *rh;
 
 	rh = rt_tables_get_rnh(rt->rt_fibnum, dst->sa_family);
 #endif
 
 	RT_LOCK_ASSERT(rt);
 	/* NOTE(review): 'rh' is only declared under INVARIANTS - confirm RIB_LOCK_ASSERT() expands to nothing otherwise. */
 	RIB_LOCK_ASSERT(rh);
 	
 	/*
 	 * Prepare to store the gateway in rt->rt_gateway.
 	 * Both dst and gateway are stored one after the other in the same
 	 * malloc'd chunk. If we have room, we can reuse the old buffer,
 	 * rt_gateway already points to the right place.
 	 * Otherwise, malloc a new block and update the 'dst' address.
 	 */
 	if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) {
 		caddr_t new;
 
 		R_Malloc(new, caddr_t, dlen + glen);
 		if (new == NULL)
 			return ENOBUFS;
 		/*
 		 * XXX note, we copy from *dst and not *rt_key(rt) because
 		 * rt_setgate() can be called to initialize a newly
 		 * allocated route entry, in which case rt_key(rt) == NULL
 		 * (and also rt->rt_gateway == NULL).
 		 * Free()/free() handle a NULL argument just fine.
 		 */
 		bcopy(dst, new, dlen);
 		Free(rt_key(rt));	/* free old block, if any */
 		rt_key(rt) = (struct sockaddr *)new;
 		/* The gateway slot starts right behind the key. */
 		rt->rt_gateway = (struct sockaddr *)(new + dlen);
 	}
 
 	/*
 	 * Copy the new gateway value into the memory chunk.
 	 */
 	bcopy(gate, rt->rt_gateway, glen);
 
 	return (0);
 }
 
 void
 rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask)
 {
 	u_char *cp1 = (u_char *)src;
 	u_char *cp2 = (u_char *)dst;
 	u_char *cp3 = (u_char *)netmask;
 	u_char *cplim = cp2 + *cp3;
 	u_char *cplim2 = cp2 + *cp1;
 
 	*cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
 	cp3 += 2;
 	if (cplim > cplim2)
 		cplim = cplim2;
 	while (cp2 < cplim)
 		*cp2++ = *cp1++ & *cp3++;
 	if (cp2 < cplim2)
 		bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2));
 }
 
 /*
  * Add or delete the route that belongs to interface address 'ifa'
  * (normally an interface route) in one FIB or in a range of FIBs.
  *
  * The destination is ifa_dstaddr for RTF_HOST requests and
  * ifa_addr/ifa_netmask otherwise.  With fibnum == RT_ALL_FIBS every FIB
  * is processed, except that RTM_ADD is restricted to the interface's own
  * FIB when V_rt_add_addr_allfibs is 0.  Address families other than
  * AF_INET/AF_INET6 always use RT_DEFAULT_FIB.
  *
  * For RTM_DELETE the route is only removed from a FIB if it exists there
  * and belongs to 'ifa'; the call succeeds if at least one FIB was changed
  * and returns EHOSTUNREACH/ENETUNREACH otherwise.  For other commands the
  * last error seen in any FIB is returned.
  */
 #define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */
 static inline  int
 rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum)
 {
 	struct sockaddr *dst;
 	struct sockaddr *netmask;
 	struct rtentry *rt = NULL;
 	struct rt_addrinfo info;
 	int error = 0;
 	int startfib, endfib;
 	char tempbuf[_SOCKADDR_TMPSIZE];
 	int didwork = 0;
 	int a_failure = 0;
 	static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK};
 	struct rib_head *rh;
 
 	if (flags & RTF_HOST) {
 		dst = ifa->ifa_dstaddr;
 		netmask = NULL;
 	} else {
 		dst = ifa->ifa_addr;
 		netmask = ifa->ifa_netmask;
 	}
 	if (dst->sa_len == 0)
 		return(EINVAL);
 	switch (dst->sa_family) {
 	case AF_INET6:
 	case AF_INET:
 		/* We support multiple FIBs. */
 		break;
 	default:
 		fibnum = RT_DEFAULT_FIB;
 		break;
 	}
 	if (fibnum == RT_ALL_FIBS) {
 		if (V_rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD)
 			startfib = endfib = ifa->ifa_ifp->if_fib;
 		else {
 			startfib = 0;
 			endfib = rt_numfibs - 1;
 		}
 	} else {
 		KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum"));
 		startfib = fibnum;
 		endfib = fibnum;
 	}
 
 	/*
 	 * If it's a delete, check that if it exists,
 	 * it's on the correct interface or we might scrub
 	 * a route to another ifa which would
 	 * be confusing at best and possibly worse.
 	 */
 	if (cmd == RTM_DELETE) {
 		/*
 		 * It's a delete, so it should already exist..
 		 * If it's a net, mask off the host bits
 		 * (Assuming we have a mask)
 		 * XXX this is kinda inet specific..
 		 */
 		if (netmask != NULL) {
 			rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask);
 			dst = (struct sockaddr *)tempbuf;
 		}
 	}
 	/*
 	 * Now go through all the requested tables (fibs) and do the
 	 * requested action. Realistically, this will either be fib 0
 	 * for protocols that don't do multiple tables or all the
 	 * tables for those that do.
 	 */
 	for ( fibnum = startfib; fibnum <= endfib; fibnum++) {
 		if (cmd == RTM_DELETE) {
 			struct radix_node *rn;
 			/*
 			 * Look up an rtentry that is in the routing tree and
 			 * contains the correct info.
 			 */
 			rh = rt_tables_get_rnh(fibnum, dst->sa_family);
 			if (rh == NULL)
 				/* this table doesn't exist but others might */
 				continue;
-			RIB_RLOCK(rh);
+			RIB_CFG_RLOCK(rh);
 			rn = rh->rnh_lookup(dst, netmask, &rh->head);
 #ifdef RADIX_MPATH
 			if (rn_mpath_capable(rh)) {
 				/*
 				 * Start from a clean slate: 'error' may still
 				 * carry the outcome of a previous FIB.
 				 */
 				error = 0;
 
 				if (rn == NULL) 
 					error = ESRCH;
 				else {
 					rt = RNTORT(rn);
 					/*
 					 * for interface route the
 					 * rt->rt_gateway is sockaddr_intf
 					 * for cloning ARP entries, so
 					 * rt_mpath_matchgate must use the
 					 * interface address
 					 */
 					rt = rt_mpath_matchgate(rt,
 					    ifa->ifa_addr);
 					if (rt == NULL) 
 						error = ESRCH;
 				}
 			}
 			/*
 			 * The generic ownership check below must not
 			 * overwrite the multipath verdict computed above.
 			 */
 			else
 #endif
 			error = (rn == NULL ||
 			    (rn->rn_flags & RNF_ROOT) ||
 			    RNTORT(rn)->rt_ifa != ifa);
-			RIB_RUNLOCK(rh);
+			RIB_CFG_RUNLOCK(rh);
 			if (error) {
 				/* this is only an error if bad on ALL tables */
 				continue;
 			}
 		}
 		/*
 		 * Do the actual request
 		 */
 		bzero((caddr_t)&info, sizeof(info));
 		info.rti_ifa = ifa;
 		info.rti_flags = flags |
 		    (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED;
 		info.rti_info[RTAX_DST] = dst;
 		/* 
 		 * doing this for compatibility reasons
 		 */
 		if (cmd == RTM_ADD)
 			info.rti_info[RTAX_GATEWAY] =
 			    (struct sockaddr *)&null_sdl;
 		else
 			info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr;
 		info.rti_info[RTAX_NETMASK] = netmask;
 		error = rtrequest1_fib(cmd, &info, &rt, fibnum);
 
 		if ((error == EEXIST) && (cmd == RTM_ADD)) {
 			/*
 			 * Interface route addition failed.
 			 * Atomically delete current prefix generating
 			 * RTM_DELETE message, and retry adding
 			 * interface prefix.
 			 */
 			rh = rt_tables_get_rnh(fibnum, dst->sa_family);
+			RIB_CFG_WLOCK(rh);
 			RIB_WLOCK(rh);
 
 			/* Delete old prefix */
 			info.rti_ifa = NULL;
 			info.rti_flags = RTF_RNH_LOCKED;
 
 			error = rtrequest1_fib(RTM_DELETE, &info, NULL, fibnum);
 			if (error == 0) {
 				info.rti_ifa = ifa;
 				info.rti_flags = flags | RTF_RNH_LOCKED |
 				    (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED;
 				error = rtrequest1_fib(cmd, &info, &rt, fibnum);
 			}
 
 			RIB_WUNLOCK(rh);
+			RIB_CFG_WUNLOCK(rh);
 		}
 
 
 		if (error == 0 && rt != NULL) {
 			/*
 			 * notify any listening routing agents of the change
 			 */
 			RT_LOCK(rt);
 #ifdef RADIX_MPATH
 			/*
 			 * in case address alias finds the first address
 			 * e.g. ifconfig bge0 192.0.2.246/24
 			 * e.g. ifconfig bge0 192.0.2.247/24
 			 * the address set in the route is 192.0.2.246
 			 * so we need to replace it with 192.0.2.247
 			 */
 			if (memcmp(rt->rt_ifa->ifa_addr,
 			    ifa->ifa_addr, ifa->ifa_addr->sa_len)) {
 				ifa_free(rt->rt_ifa);
 				ifa_ref(ifa);
 				rt->rt_ifp = ifa->ifa_ifp;
 				rt->rt_ifa = ifa;
 			}
 #endif
 			/* 
 			 * doing this for compatibility reasons
 			 */
 			if (cmd == RTM_ADD) {
 			    ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type  =
 				rt->rt_ifp->if_type;
 			    ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index =
 				rt->rt_ifp->if_index;
 			}
 			/* Hold a reference while the entry is unlocked for the notification. */
 			RT_ADDREF(rt);
 			RT_UNLOCK(rt);
 			rt_newaddrmsg_fib(cmd, ifa, error, rt, fibnum);
 			RT_LOCK(rt);
 			RT_REMREF(rt);
 			if (cmd == RTM_DELETE) {
 				/*
 				 * If we are deleting, and we found an entry,
 				 * then it's been removed from the tree..
 				 * now throw it away.
 				 */
 				RTFREE_LOCKED(rt);
 			} else {
 				if (cmd == RTM_ADD) {
 					/*
 					 * We just wanted to add it..
 					 * we don't actually need a reference.
 					 */
 					RT_REMREF(rt);
 				}
 				RT_UNLOCK(rt);
 			}
 			didwork = 1;
 		}
 		if (error)
 			a_failure = error;
 	}
 	if (cmd == RTM_DELETE) {
 		if (didwork) {
 			error = 0;
 		} else {
 			/* we only give an error if it wasn't in any table */
 			error = ((flags & RTF_HOST) ?
 			    EHOSTUNREACH : ENETUNREACH);
 		}
 	} else {
 		if (a_failure) {
 			/* return an error if any of them failed */
 			error = a_failure;
 		}
 	}
 	return (error);
 }
 
 /*
  * Set up (or remove) the routing table entry for an interface address.
  *
  * Thin front end for rtinit1(): IPv4 and IPv6 requests are applied with
  * RT_ALL_FIBS, every other address family with RT_DEFAULT_FIB.
  */
 int
 rtinit(struct ifaddr *ifa, int cmd, int flags)
 {
 	struct sockaddr *sa;
 	int fib;
 
 	/* Host routes are keyed by the destination address of the ifa. */
 	sa = (flags & RTF_HOST) ? ifa->ifa_dstaddr : ifa->ifa_addr;
 
 	if (sa->sa_family == AF_INET || sa->sa_family == AF_INET6)
 		fib = RT_ALL_FIBS;
 	else
 		fib = RT_DEFAULT_FIB;
 
 	return (rtinit1(ifa, cmd, flags, fib));
 }
 
 /*
  * Announce interface address arrival/withdraw
  * Returns 0 on success.
  */
 int
 rt_addrmsg(int cmd, struct ifaddr *ifa, int fibnum)
 {
 
 	KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE,
 	    ("unexpected cmd %d", cmd));
 	
 	KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs),
 	    ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs));
 
 #if defined(INET) || defined(INET6)
 #ifdef SCTP
 	/*
 	 * notify the SCTP stack
 	 * this will only get called when an address is added/deleted
 	 * XXX pass the ifaddr struct instead if ifa->ifa_addr...
 	 */
 	sctp_addr_change(ifa, cmd);
 #endif /* SCTP */
 #endif
 	return (rtsock_addrmsg(cmd, ifa, fibnum));
 }
 
 /*
  * Announce route addition/removal.
  *
  * 'cmd' must be RTM_ADD or RTM_DELETE, 'fibnum' either RT_ALL_FIBS or a
  * valid FIB number, and 'rt' must carry a key (all asserted).  The message
  * itself is produced by rtsock_routemsg().
  *
  * Users of this function MUST validate input data BEFORE calling.
  * However, we have to be able to handle invalid data:
  * if some userland app sends us an "invalid" route message (invalid mask,
  * no dst, wrong address families, etc...) we need to pass it back
  * to the app (and any other rtsock consumers) with the rtm_errno field
  * set to a non-zero value.
  * Returns 0 on success.
  */
 int
 rt_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt,
     int fibnum)
 {
 
 	KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE,
 	    ("unexpected cmd %d", cmd));
 	
 	KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs),
 	    ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs));
 
 	KASSERT(rt_key(rt) != NULL, (":%s: rt_key must be supplied", __func__));
 
 	return (rtsock_routemsg(cmd, ifp, error, rt, fibnum));
 }
 
 /*
  * Convenience wrapper: same as rt_newaddrmsg_fib() with the FIB argument
  * fixed to RT_ALL_FIBS.
  */
 void
 rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt)
 {
 
 	rt_newaddrmsg_fib(cmd, ifa, error, rt, RT_ALL_FIBS);
 }
 
 /*
  * Generate the routing socket messages for an address being added to or
  * removed from a network interface.
  *
  * For RTM_ADD the address message goes out before the route message; for
  * RTM_DELETE the order is reversed.  The route message is skipped when
  * 'rt' is NULL.
  */
 void
 rt_newaddrmsg_fib(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt,
     int fibnum)
 {
 
 	KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE,
 		("unexpected cmd %u", cmd));
 	KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs),
 	    ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs));
 
 	/* Adding: announce the address first. */
 	if (cmd == RTM_ADD)
 		rt_addrmsg(cmd, ifa, fibnum);
 
 	if (rt != NULL)
 		rt_routemsg(cmd, ifa->ifa_ifp, error, rt, fibnum);
 
 	/* Deleting: the address announcement comes last. */
 	if (cmd != RTM_ADD)
 		rt_addrmsg(cmd, ifa, fibnum);
 }
 
Index: projects/routing/sys/net/route_internal.h
===================================================================
--- projects/routing/sys/net/route_internal.h	(revision 274335)
+++ projects/routing/sys/net/route_internal.h	(revision 274336)
@@ -1,137 +1,146 @@
 /*-
  * Copyright (c) 2014
  * 	Alexander V. Chernikov <melifaro@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #ifndef _NET_ROUTE_INTERNAL_H_
 #define	_NET_ROUTE_INTERNAL_H_
 
 /*
  * Head of one routing table: the radix tree head, the method pointers
  * used to search and modify the tree, and the locks that protect it.
  */
 struct rib_head {
 	struct radix_head head;
+	struct rmlock	rib_lock;	/* data path lock */
 	rn_matchaddr_f_t	*rnh_matchaddr;	/* longest match for sockaddr */
 	rn_addaddr_f_t	*rnh_addaddr;	/* add based on sockaddr*/
 	rn_deladdr_f_t	*rnh_deladdr;	/* remove based on sockaddr */
 	rn_lookup_f_t	*rnh_lookup;	/* exact match for sockaddr */
 	rn_walktree_t	*rnh_walktree;	/* traverse tree */
 	rn_walktree_from_t	*rnh_walktree_from; /* traverse tree below a */
 	rn_close_t	*rnh_close;	/*do something when the last ref drops*/
 	struct	radix_node rnh_nodes[3];	/* empty tree for common case */
-	struct	rwlock rib_lock;		/* locks entire radix tree */
+	struct	rwlock rib_cfglock;		/* config lock */
 	struct radix_mask_head rmhead;	/* masks radix head */
 };
 
 /*
  * Locking macros for struct rib_head.
  *
  * RIB_* operate on rib_lock, RIB_CFG_* on rib_cfglock.
  * RIB_RLOCK()/RIB_RUNLOCK() refer to a local variable named 'tracker';
  * RIB_LOCK_READER expands to a declaration of that variable.
  */
-#define	RIB_RLOCK(rh)	rw_rlock(&(rh)->rib_lock)
-#define	RIB_RUNLOCK(rh)	rw_runlock(&(rh)->rib_lock)
-#define	RIB_WLOCK(rh)	rw_wlock(&(rh)->rib_lock)
-#define	RIB_WUNLOCK(rh)	rw_wunlock(&(rh)->rib_lock)
-#define	RIB_LOCK_ASSERT(rh)	rw_assert(&(rh)->rib_lock, RA_LOCKED)
-#define	RIB_WLOCK_ASSERT(rh)	rw_assert(&(rh)->rib_lock, RA_WLOCKED)
+#define	RIB_RLOCK(rh)		rm_rlock(&(rh)->rib_lock, &tracker)
+#define	RIB_RUNLOCK(rh)		rm_runlock(&(rh)->rib_lock, &tracker)
+#define	RIB_WLOCK(rh)		rm_wlock(&(rh)->rib_lock)
+#define	RIB_WUNLOCK(rh)		rm_wunlock(&(rh)->rib_lock)
+#define	RIB_WLOCK_ASSERT(rh)	rm_assert(&(rh)->rib_lock, RA_WLOCKED)
+#define	RIB_LOCK_READER		struct rm_priotracker tracker
+#define	RIB_LOCK_ASSERT(rh)	rm_assert(&(rh)->rib_lock, RA_LOCKED)
+
+#define	RIB_CFG_RLOCK(rh)		rw_rlock(&(rh)->rib_cfglock)
+#define	RIB_CFG_RUNLOCK(rh)		rw_runlock(&(rh)->rib_cfglock)
+#define	RIB_CFG_WLOCK(rh)		rw_wlock(&(rh)->rib_cfglock)
+#define	RIB_CFG_WUNLOCK(rh)		rw_wunlock(&(rh)->rib_cfglock)
+#define	RIB_CFG_LOCK_ASSERT(rh)		rw_assert(&(rh)->rib_cfglock, RA_LOCKED)
+#define	RIB_CFG_WLOCK_ASSERT(rh)	rw_assert(&(rh)->rib_cfglock, RA_WLOCKED)
 
 struct rib_head *rt_table_init(int offset);
 void rt_table_destroy(struct rib_head *rh);
 
 
 /*
  * One routing table entry: the radix tree linkage, the gateway, the
  * outgoing interface and interface address, flags, reference count,
  * metrics and the per-entry mutex.
  */
 struct rtentry {
 	struct	radix_node rt_nodes[2];	/* tree glue, and other values */
 	/*
 	 * XXX struct rtentry must begin with a struct radix_node (or two!)
 	 * because the code does some casts of a 'struct radix_node *'
 	 * to a 'struct rtentry *'
 	 */
 	/* rt_key()/rt_mask() view the first node's rn_key/rn_mask as sockaddr pointers. */
 #define	rt_key(r)	(*((struct sockaddr **)(&(r)->rt_nodes->rn_key)))
 #define	rt_mask(r)	(*((struct sockaddr **)(&(r)->rt_nodes->rn_mask)))
 	struct	sockaddr *rt_gateway;	/* value */
 	struct	ifnet *rt_ifp;		/* the answer: interface to use */
 	struct	ifaddr *rt_ifa;		/* the answer: interface address to use */
 	int		rt_flags;	/* up/down?, host/net */
 	int		rt_refcnt;	/* # held references */
 	u_int		rt_fibnum;	/* which FIB */
 	u_long		rt_mtu;		/* MTU for this path */
 	u_long		rt_weight;	/* absolute weight */ 
 	u_long		rt_expire;	/* lifetime for route, e.g. redirect */
 #define	rt_endzero	rt_mtx
 	struct mtx	rt_mtx;		/* mutex for routing entry */
 };
 
 /*
  * Per-entry mutex helpers.  RT_UNLOCK_COND() releases the mutex only if
  * mtx_owned() reports it as held.
  */
 #define	RT_LOCK_INIT(_rt) \
 	mtx_init(&(_rt)->rt_mtx, "rtentry", NULL, MTX_DEF | MTX_DUPOK)
 #define	RT_LOCK(_rt)		mtx_lock(&(_rt)->rt_mtx)
 #define	RT_UNLOCK(_rt)		mtx_unlock(&(_rt)->rt_mtx)
 #define	RT_LOCK_DESTROY(_rt)	mtx_destroy(&(_rt)->rt_mtx)
 #define	RT_LOCK_ASSERT(_rt)	mtx_assert(&(_rt)->rt_mtx, MA_OWNED)
 #define	RT_UNLOCK_COND(_rt)	do {				\
 	if (mtx_owned(&(_rt)->rt_mtx))				\
 		mtx_unlock(&(_rt)->rt_mtx);			\
 } while (0)
 
 /* Take one reference; the entry lock must be held (asserted). */
 #define	RT_ADDREF(_rt)	do {					\
 	RT_LOCK_ASSERT(_rt);					\
 	KASSERT((_rt)->rt_refcnt >= 0,				\
 		("negative refcnt %d", (_rt)->rt_refcnt));	\
 	(_rt)->rt_refcnt++;					\
 } while (0)
 
 /* Drop one reference without freeing; the entry lock must be held (asserted). */
 #define	RT_REMREF(_rt)	do {					\
 	RT_LOCK_ASSERT(_rt);					\
 	KASSERT((_rt)->rt_refcnt > 0,				\
 		("bogus refcnt %d", (_rt)->rt_refcnt));	\
 	(_rt)->rt_refcnt--;					\
 } while (0)
 
 /*
  * Release a reference on a locked entry.  With a reference count of 1 or
  * less the entry is handed to rtfree(); otherwise the count is decremented
  * and the entry unlocked.  In both cases the caller's '_rt' expression is
  * set to 0 afterwards, so it has to be an lvalue.
  */
 #define	RTFREE_LOCKED(_rt) do {					\
 	if ((_rt)->rt_refcnt <= 1)				\
 		rtfree(_rt);					\
 	else {							\
 		RT_REMREF(_rt);					\
 		RT_UNLOCK(_rt);					\
 	}							\
 	/* guard against invalid refs */			\
 	_rt = 0;						\
 } while (0)
 
 /* Same as RTFREE_LOCKED(), but locks the entry first. */
 #define	RTFREE(_rt) do {					\
 	RT_LOCK(_rt);						\
 	RTFREE_LOCKED(_rt);					\
 } while (0)
 
 /*
  * Release the route cached in a 'struct route'-like object, if any.  When
  * RT_NORTREF is set in ro_flags the flag and the pointer are simply
  * cleared; otherwise the entry is locked and released through
  * RTFREE_LOCKED().
  */
 #define	RO_RTFREE(_ro) do {					\
 	if ((_ro)->ro_rt) {					\
 		if ((_ro)->ro_flags & RT_NORTREF) {		\
 			(_ro)->ro_flags &= ~RT_NORTREF;		\
 			(_ro)->ro_rt = NULL;			\
 		} else {					\
 			RT_LOCK((_ro)->ro_rt);			\
 			RTFREE_LOCKED((_ro)->ro_rt);		\
 		}						\
 	}							\
 } while (0)
 
 
 
 #endif
 
 
Index: projects/routing/sys/net/rt_nhops.c
===================================================================
--- projects/routing/sys/net/rt_nhops.c	(revision 274335)
+++ projects/routing/sys/net/rt_nhops.c	(revision 274336)
@@ -1,1378 +1,1390 @@
 /*-
  * Copyright (c) 2014
  * 	Alexander V. Chernikov <melifaro@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Temporary file. In future it should be split between net/route.c
  * and per-AF files like netinet/in_rmx.c | netinet6/in6_rmx.c
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_route.h"
 #include "opt_mpath.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/syslog.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/rmlock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
 #include <sys/proc.h>
 #include <sys/domain.h>
 #include <sys/kernel.h>
 #include <sys/sbuf.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/route_internal.h>
 #include <net/vnet.h>
 
 #ifdef RADIX_MPATH
 #include <net/radix_mpath.h>
 #endif
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_mroute.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/scope6_var.h>
 
 #include <net/if_llatbl.h>
 
 #include <net/if_types.h>
 #include <netinet/if_ether.h>
 #include <net/ethernet.h>
 #include <net/rt_nhops.h>
 
 #include <vm/uma.h>
 
 /*
  * Pairs a lookup callback with an opaque state pointer.
  * NOTE(review): in this part of the file it is referenced only by the
  * disabled (#if 0) fwd_db[] declaration.
  */
 struct fwd_info {
 	fib_lookup_t	*lookup;
 	void		*state;
 };
 
 /* Values for fwd_control.fwd_state */
 #define	FWD_FSM_NONE	0
 #define	FWD_FSM_INIT	1
 #define	FWD_FSM_FWD	2
 /*
  * Control record: current FSM state plus a pointer to a forwarding module.
  * NOTE(review): in this part of the file it is referenced only by the
  * disabled (#if 0) fwd_ctl[] declaration.
  */
 struct fwd_control {
 	int		fwd_state;	/* FSM */
 	struct fwd_module	*fm;
 };
 
 #if 0
 static struct fwd_info *fwd_db[FWD_SIZE];
 static struct fwd_control *fwd_ctl[FWD_SIZE];
 
 static TAILQ_HEAD(fwd_module_list, fwd_module)	modulehead = TAILQ_HEAD_INITIALIZER(modulehead);
 static struct fwd_module_list fwd_modules[FWD_SIZE];
 
 static uint8_t fwd_map_af[] = {
 	AF_INET,
 	AF_INET6,
 };
 
 static struct rwlock fwd_lock;
 #define	FWD_LOCK_INIT()	rw_init(&fwd_lock, "fwd_lock")
 #define	FWD_RLOCK()	rw_rlock(&fwd_lock)
 #define	FWD_RUNLOCK()	rw_runlock(&fwd_lock)
 #define	FWD_WLOCK()	rw_wlock(&fwd_lock)
 #define	FWD_WUNLOCK()	rw_wunlock(&fwd_lock)
 
 int fwd_attach_fib(struct fwd_module *fm, u_int fib);
 int fwd_destroy_fib(struct fwd_module *fm, u_int fib);
 #endif
 
 static inline uint16_t fib_rte_to_nh_flags(int rt_flags);
 #ifdef INET
 static void rib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst,
     struct rt4_extended *prt4);
 static void fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst,
     struct nhop4_extended *pnh4);
 static void fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst,
     struct nhop4_basic *pnh4);
 #endif
 #ifdef INET6
 static void fib6_rte_to_nh_extended(struct rtentry *rte, struct in6_addr *dst,
     struct nhop6_extended *pnh6);
 static void fib6_rte_to_nh_basic(struct rtentry *rte, struct in6_addr *dst,
     struct nhop6_basic *pnh6);
 static int fib6_storelladdr(struct ifnet *ifp, struct in6_addr *dst,
     int mm_flags, u_char *desten);
 static uint16_t fib6_get_ifa(struct rtentry *rte);
 static int fib6_lla_to_nh_basic(struct in6_addr *dst, uint32_t scopeid,
     struct nhop6_basic *pnh6);
 static int fib6_lla_to_nh_extended(struct in6_addr *dst, uint32_t scopeid,
     struct nhop6_extended *pnh6);
 static int fib6_lla_to_nh(struct in6_addr *dst, uint32_t scopeid,
     struct nhop_prepend *nh, struct ifnet **lifp);
 #endif
 
 MALLOC_DEFINE(M_RTFIB, "rtfib", "routing fwd");
 
 
 
 /*
  * Per-AF fast routines returning minimal needed info.
  * It is not safe to dereference any pointers since it
  * may end up with use-after-free case.
  * Typically it may be used to check if outgoing
  * interface matches or to calculate proper MTU.
  *
  * Note that returned interface pointer is logical one,
  * e.g. actual transmit ifp may be different.
  * Difference may be triggered by
  * 1) loopback routes installed for interface addresses.
  *  e.g. for address 10.0.0.1 with prefix /24 bound to
  *  interface ix0, "logical" interface will be "ix0",
  *  while "transmit" interface will be "lo0" since this is
  *  loopback route. You should consider using other
  *  functions if you need "transmit" interface or both.
  *
  *
  * Returns 0 on match, error code otherwise.
  */
 
 //#define	NHOP_DIRECT	
 /* Casts a radix node pointer to the struct rtentry pointer type. */
 #define RNTORT(p)	((struct rtentry *)(p))
 
 
 /*
  * Copies proper nexthop data based on @nh_src nexthop.
  *
  * For non-ECMP nexthop function simply copies @nh_src.
  * For ECMP nexthops flowid is used to select proper
  * nexthop.
  *
  */
 static inline void
 fib_choose_prepend(uint32_t fibnum, struct nhop_prepend *nh_src,
     uint32_t flowid, struct nhop_prepend *nh, int af)
 {
 	struct nhop_multi *nh_multi;
 	int idx;
 
 	if ((nh_src->nh_flags & NHF_RECURSE) != 0) {
 
 		/*
 		 * Recursive nexthop. Choose direct nexthop
 		 * based on flowid.
 		 */
 		nh_multi = (struct nhop_multi *)nh_src;
 		/*
 		 * NOTE(review): idx is computed but never used below, so
 		 * @nh_src is copied as is even for NHF_RECURSE nexthops.
 		 * The modulo also assumes nh_count != 0 - confirm.
 		 */
 		idx = nh_multi->nh_nhops[flowid % nh_multi->nh_count];
 #if 0
 		KASSERT((fibnum < rt_numfibs), ("fib4_lookup_prepend§: bad fibnum"));
 		rh = rt_tables_get_rnh(fibnum, AF_INET);
 		//nh_src = &rh->nhops[i];
 #endif
 	}
 
 	/* Structure copy of the source nexthop into the caller's buffer */
 	*nh = *nh_src; 
 	/* TODO: Do some light-weight refcounting on egress ifp's */
 }
 
 /*
  * Counterpart of fib_choose_prepend().
  * Currently a no-op: none of the arguments are used.
  */
 static inline void
 fib_free_nh_prepend(uint32_t fibnum, struct nhop_prepend *nh, int af)
 {
 
 	/* TODO: Do some light-weight refcounting on egress ifp's */
 }
 
 #ifdef INET
 /*
  * IPv4 entry point: forwards to fib_free_nh_prepend() with AF_INET.
  */
 void
 fib4_free_nh_prepend(uint32_t fibnum, struct nhop_prepend *nh)
 {
 
 	fib_free_nh_prepend(fibnum, nh, AF_INET);
 }
 
 /*
  * IPv4 entry point for nexthop selection.
  *
  * Fills @nh from @nh_src via fib_choose_prepend(). When @nh_ext is
  * supplied, the interface (NH_LIFP), MTU and flags of the chosen
  * nexthop are mirrored into it as well.
  */
 void
 fib4_choose_prepend(uint32_t fibnum, struct nhop_prepend *nh_src,
     uint32_t flowid, struct nhop_prepend *nh, struct nhop4_extended *nh_ext)
 {
 
 	fib_choose_prepend(fibnum, nh_src, flowid, nh, AF_INET);
 
 	if (nh_ext != NULL) {
 		nh_ext->nh_flags = nh->nh_flags;
 		nh_ext->nh_mtu = nh->nh_mtu;
 		nh_ext->nh_ifp = NH_LIFP(nh);
 		/*
 		 * TODO: copy source/gw address (nh_addr, nh_src)
 		 * from extended nexthop data.
 		 */
 	}
 }
 
 /*
  * Function performs lookup in IPv4 table fib @fibnum.
  *
  * In case of successful lookup @nh header is filled with
  * appropriate interface info and full L2 header to prepend.
  *
  * If no valid ARP record is present, NHF_L2_INCOMPLETE flag
  * is set and gateway address is stored into nh->d.gw4
  *
  * If @nh_ext is not NULL, additional nexthop data is stored there.
  *
  * Returns 0 on success.
  *
  */
 int
 fib4_lookup_prepend(uint32_t fibnum, struct in_addr dst, struct mbuf *m,
     struct nhop_prepend *nh, struct nhop4_extended *nh_ext)
 {
 	struct rib_head *rh;
 	struct radix_node *rn;
 	struct sockaddr_in *gw_sa, sin;
 	struct ifnet *lifp;
 	struct in_addr gw;
 	struct ether_header *eh;
 	int error, flags;
-	//uint32_t flowid;
 	struct rtentry *rte;
+	RIB_LOCK_READER;
 
 	KASSERT((fibnum < rt_numfibs), ("fib4_lookup_prepend: bad fibnum"));
 	rh = rt_tables_get_rnh(fibnum, AF_INET);
 	if (rh == NULL)
 		return (EHOSTUNREACH);
 
 	/* Prepare lookup key */
 	memset(&sin, 0, sizeof(sin));
 	sin.sin_len = sizeof(struct sockaddr_in);
 	sin.sin_addr = dst;
 
 	RIB_RLOCK(rh);
 	rn = rh->rnh_matchaddr((void *)&sin, &rh->head);
 	/* Plain cast; rte is dereferenced only after rn passed the NULL check */
 	rte = RNTORT(rn);
 	if (rn == NULL || ((rn->rn_flags & RNF_ROOT) != 0) ||
 	    RT_LINK_IS_UP(rte->rt_ifp) == 0) {
 		RIB_RUNLOCK(rh);
 		return (EHOSTUNREACH);
 	}
 
 	/*
 	 * Currently we fill in @nh ourselves.
 	 * In near future rte will have nhop index to copy from.
 	 */
 
 	/* Calculate L3 info */
 	flags = 0;
 	nh->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
 	/* Next L3 hop is the gateway for RTF_GATEWAY routes, else @dst itself */
 	if (rte->rt_flags & RTF_GATEWAY) {
 		gw_sa = (struct sockaddr_in *)rte->rt_gateway;
 		gw = gw_sa->sin_addr;
 	} else
 		gw = dst;
 	/* Set flags */
 	flags = fib_rte_to_nh_flags(rte->rt_flags);
 	/* gw_sa is reused here to inspect the route key: all-zero key means default route */
 	gw_sa = (struct sockaddr_in *)rt_key(rte);
 	if (gw_sa->sin_addr.s_addr == 0)
 		flags |= NHF_DEFAULT;
 
 	/*
 	 * TODO: nh L2/L3 resolve.
 	 * Currently all we have is rte ifp.
 	 * Simply use it.
 	 */
 	/* Save interface address ifp */
 	lifp = rte->rt_ifa->ifa_ifp;
 	nh->aifp_idx = lifp->if_index;
 	/* Save both logical and transmit interface indexes */
 	lifp = rte->rt_ifp;
 	nh->lifp_idx = lifp->if_index;
 	nh->i.ifp_idx = nh->lifp_idx;
 
 	if (nh_ext != NULL) {
 		/* Fill in extended info */
 		fib4_rte_to_nh_extended(rte, dst, nh_ext);
 	}
 
 	RIB_RUNLOCK(rh);
 
 	/* NOTE(review): lifp (rte->rt_ifp) is still used below, after the rib lock was dropped */
 	nh->nh_flags = flags;
 	/*
 	 * Try to lookup L2 info.
 	 * Do this using separate LLE locks.
 	 * TODO: move this under radix lock.
 	 */
 	if (lifp->if_type == IFT_ETHER) {
 		eh = (struct ether_header *)nh->d.data;
 
 		/*
 		 * Fill in ethernet header.
 		 * It should be already presented if we're
 		 * sending data via known gateway.
 		 */
 		error = arpresolve_fast(lifp, gw, m ? m->m_flags : 0,
 		    eh->ether_dhost);
 		if (error == 0) {
 			memcpy(&eh->ether_shost, IF_LLADDR(lifp), ETHER_ADDR_LEN);
 			eh->ether_type = htons(ETHERTYPE_IP);
 			/* nh_count carries the length of the prepend data */
 			nh->nh_count = ETHER_HDR_LEN;
 			return (0);
 		}
 	}
 
 	/* Notify caller that no L2 info is linked */
 	nh->nh_count = 0;
 	nh->nh_flags |= NHF_L2_INCOMPLETE;
 	/* ..And save gateway address */
 	nh->d.gw4 = gw;
 	return (0);
 }
 
 int
 fib4_sendmbuf(struct ifnet *ifp, struct mbuf *m, struct nhop_prepend *nh,
     struct in_addr dst)
 {
 	int error;
 
 	if (nh != NULL && (nh->nh_flags & NHF_L2_INCOMPLETE) == 0) {
 
 		/*
 		 * Fast path case. Most packets should
 		 * be sent from here.
 		 * TODO: Make special ifnet
 		 * 'if_output_frame' handler for that.
 		 */
 		struct nhop_info ni;
 		struct ether_header *eh;
 		bzero(&ni, sizeof(ni));
 		ni.ni_flags = RT_NHOP;
 		ni.ni_family = AF_INET;
 		ni.ni_nh = nh;
 
 		M_PREPEND(m, nh->nh_count, M_NOWAIT);
 		if (m == NULL)
 			return (ENOBUFS);
 		eh = mtod(m, struct ether_header *);
 		memcpy(eh, nh->d.data, nh->nh_count);
 		error = (*ifp->if_output)(ifp, m, NULL, &ni);
 	} else {
 		struct sockaddr_in gw_out;
 		memset(&gw_out, 0, sizeof(gw_out));
 		gw_out.sin_len = sizeof(gw_out);
 		gw_out.sin_family = AF_INET;
 		gw_out.sin_addr = nh ? nh->d.gw4 : dst;
 		error = (*ifp->if_output)(ifp, m,
 		    (const struct sockaddr *)&gw_out, NULL);
 	}
 
 	return (error);
 }
 
 /*
  * Translates route flags (RTF_*) into the matching nexthop flags (NHF_*).
  * Route flags without a mapping below are dropped.
  */
 static inline uint16_t
 fib_rte_to_nh_flags(int rt_flags)
 {
 	uint16_t nh_flags = 0;
 
 	if ((rt_flags & RTF_REJECT) != 0)
 		nh_flags |= NHF_REJECT;
 	if ((rt_flags & RTF_BLACKHOLE) != 0)
 		nh_flags |= NHF_BLACKHOLE;
 	/* Either RTF_DYNAMIC or RTF_MODIFIED maps to NHF_REDIRECT */
 	if ((rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) != 0)
 		nh_flags |= NHF_REDIRECT;
 	if ((rt_flags & RTF_BROADCAST) != 0)
 		nh_flags |= NHF_BROADCAST;
 	if ((rt_flags & RTF_GATEWAY) != 0)
 		nh_flags |= NHF_GATEWAY;
 
 	return (nh_flags);
 }
 
 /*
  * Fills basic IPv4 nexthop @pnh4 from route entry @rte.
  *
  * nh_ifp is the interface owning the route's ifaddr, nh_mtu is the
  * smaller of the route MTU and the rt_ifp MTU, nh_addr is the gateway
  * for RTF_GATEWAY routes and @dst otherwise. NHF_DEFAULT is added when
  * the route key address is zero.
  */
 static void
 fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst,
     struct nhop4_basic *pnh4)
 {
 	struct sockaddr_in *key_sa, *gw_sa;
 
 	pnh4->nh_ifp = rte->rt_ifa->ifa_ifp;
 	pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
 
 	if ((rte->rt_flags & RTF_GATEWAY) != 0) {
 		gw_sa = (struct sockaddr_in *)rte->rt_gateway;
 		pnh4->nh_addr = gw_sa->sin_addr;
 	} else {
 		pnh4->nh_addr = dst;
 	}
 
 	/* Convert route flags, then mark the default route */
 	pnh4->nh_flags = fib_rte_to_nh_flags(rte->rt_flags);
 	key_sa = (struct sockaddr_in *)rt_key(rte);
 	if (key_sa->sin_addr.s_addr == 0)
 		pnh4->nh_flags |= NHF_DEFAULT;
 	/* XXX: Set RTF_BROADCAST if GW address is broadcast */
 }
 
 /*
  * Fills extended IPv4 nexthop @pnh4 from route entry @rte.
  *
  * Stores the same interface, MTU, address and flags data as the basic
  * variant, plus nh_src taken from the address of the route's ifaddr.
  */
 static void
 fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst,
     struct nhop4_extended *pnh4)
 {
 	struct sockaddr_in *key_sa, *gw_sa;
 	struct in_ifaddr *src_ia;
 
 	pnh4->nh_ifp = rte->rt_ifa->ifa_ifp;
 	pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
 
 	if ((rte->rt_flags & RTF_GATEWAY) != 0) {
 		gw_sa = (struct sockaddr_in *)rte->rt_gateway;
 		pnh4->nh_addr = gw_sa->sin_addr;
 	} else {
 		pnh4->nh_addr = dst;
 	}
 
 	/* Convert route flags, then mark the default route */
 	pnh4->nh_flags = fib_rte_to_nh_flags(rte->rt_flags);
 	key_sa = (struct sockaddr_in *)rt_key(rte);
 	if (key_sa->sin_addr.s_addr == 0)
 		pnh4->nh_flags |= NHF_DEFAULT;
 	/* XXX: Set RTF_BROADCAST if GW address is broadcast */
 
 	/* Source address comes from the ifaddr attached to the route */
 	src_ia = ifatoia(rte->rt_ifa);
 	pnh4->nh_src = IA_SIN(src_ia)->sin_addr;
 }
 
 /*
  * Fills route description @prt4 from route entry @rte.
  *
  * @prt4 is zeroed first. Prefix and mask come from the route key/mask
  * (a missing mask is reported as INADDR_BROADCAST), the gateway is the
  * route gateway for RTF_GATEWAY routes and @dst otherwise. Route flags
  * are copied unconverted.
  */
 static void
 rib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst,
     struct rt4_extended *prt4)
 {
 	struct sockaddr_in *key_sa, *mask_sa, *gw_sa;
 	struct in_ifaddr *src_ia;
 
 	/* Do explicit nexthop zero unless we're copying it */
 	memset(prt4, 0, sizeof(*prt4));
 
 	key_sa = (struct sockaddr_in *)rt_key(rte);
 	prt4->rt_addr = key_sa->sin_addr;
 
 	mask_sa = (struct sockaddr_in *)rt_mask(rte);
 	if (mask_sa != NULL)
 		prt4->rt_mask.s_addr = mask_sa->sin_addr.s_addr;
 	else
 		prt4->rt_mask.s_addr = INADDR_BROADCAST;
 
 	if ((rte->rt_flags & RTF_GATEWAY) != 0) {
 		gw_sa = (struct sockaddr_in *)rte->rt_gateway;
 		prt4->rt_gateway = gw_sa->sin_addr;
 	} else {
 		prt4->rt_gateway = dst;
 	}
 
 	prt4->rt_lifp = rte->rt_ifp;
 	prt4->rt_aifp = rte->rt_ifa->ifa_ifp;
 	prt4->rt_flags = rte->rt_flags;
 	prt4->rt_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
 
 	prt4->rt_nhop = 0; /* XXX: fill real nexthop */
 
 	/* Source address comes from the ifaddr attached to the route */
 	src_ia = ifatoia(rte->rt_ifa);
 	prt4->rt_src = IA_SIN(src_ia)->sin_addr;
 }
 
 /*
  * Performs IPv4 route table lookup on @dst. Returns 0 on success.
  * Stores nexthop info provided @pnh4 structure.
  * Note that
  * - nh_ifp cannot be safely dereferenced
  * - nh_ifp represents ifaddr ifp (e.g. if looking up address on
  *   interface "ix0" pointer to "ix0" interface will be returned instead
  *   of "lo0")
  * - however, the mtu from the "transmit" interface will be returned.
  */
 int
 fib4_lookup_nh_basic(uint32_t fibnum, struct in_addr dst, uint32_t flowid,
     struct nhop4_basic *pnh4)
 {
 	struct rib_head *rh;
 	struct radix_node *rn;
 	struct sockaddr_in sin;
 	struct rtentry *rte;
+	RIB_LOCK_READER;
 
 	/* NOTE: @flowid is not used by this function */
 	KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_basic: bad fibnum"));
 	rh = rt_tables_get_rnh(fibnum, AF_INET);
 	if (rh == NULL)
 		return (ENOENT);
 
 	/* Prepare lookup key */
 	memset(&sin, 0, sizeof(sin));
 	sin.sin_len = sizeof(struct sockaddr_in);
 	sin.sin_addr = dst;
 
 	RIB_RLOCK(rh);
 	rn = rh->rnh_matchaddr((void *)&sin, &rh->head);
 	/* Nodes flagged RNF_ROOT are not treated as a match */
 	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
 		rte = RNTORT(rn);
 		/* Ensure route & ifp is UP */
 		if (RT_LINK_IS_UP(rte->rt_ifp)) {
 			/* Copy everything needed while the rib lock is held */
 			fib4_rte_to_nh_basic(rte, dst, pnh4);
 			RIB_RUNLOCK(rh);
 
 			return (0);
 		}
 	}
 	RIB_RUNLOCK(rh);
 
 	return (ENOENT);
 }
 
 /*
  * Performs IPv4 route table lookup on @dst in fib @fibnum.
  * Stores basic nexthop info in @pnh4 like fib4_lookup_nh_basic() does,
  * except that nh_ifp is set to the route's rt_ifp instead of the
  * interface owning the route's ifaddr.
  *
  * @flowid is currently unused.
  *
  * Returns 0 on success, ENOENT if the table does not exist or no usable
  * (non-root, link up) route matches.
  */
 int
 fib4_lookup_nh_ifp(uint32_t fibnum, struct in_addr dst, uint32_t flowid,
     struct nhop4_basic *pnh4)
 {
 	struct rib_head *rh;
 	struct radix_node *rn;
 	struct sockaddr_in sin;
 	struct rtentry *rte;
+	RIB_LOCK_READER;
 
 	KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_ifp: bad fibnum"));
 	rh = rt_tables_get_rnh(fibnum, AF_INET);
 	if (rh == NULL)
 		return (ENOENT);
 
 	/* Prepare lookup key */
 	memset(&sin, 0, sizeof(sin));
 	sin.sin_len = sizeof(struct sockaddr_in);
 	sin.sin_addr = dst;
 
 	RIB_RLOCK(rh);
 	rn = rh->rnh_matchaddr((void *)&sin, &rh->head);
 	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
 		rte = RNTORT(rn);
 		/* Ensure route & ifp is UP */
 		if (RT_LINK_IS_UP(rte->rt_ifp)) {
 			fib4_rte_to_nh_basic(rte, dst, pnh4);
 			/*
 			 * Read rt_ifp before dropping the rib lock:
 			 * the route entry must not be dereferenced once
 			 * the lock is released.
 			 */
 			pnh4->nh_ifp = rte->rt_ifp;
 			RIB_RUNLOCK(rh);
 			return (0);
 		}
 	}
 	RIB_RUNLOCK(rh);
 
 	return (ENOENT);
 }
 
 /*
  * Performs IPv4 route table lookup on @dst. Returns 0 on success.
  * Stores extended nexthop info in the provided @pnh4 structure.
  * Note that
  * - nh_ifp cannot be safely dereferenced unless NHOP_LOOKUP_REF is specified.
  * - in that case you need to call fib4_free_nh_ext()
  * - nh_ifp represents logical transmit interface (rt_ifp)
  * - mtu from logical transmit interface will be returned.
  */
 int
 fib4_lookup_nh_ext(uint32_t fibnum, struct in_addr dst, uint32_t flowid,
     uint32_t flags, struct nhop4_extended *pnh4)
 {
 	struct rib_head *rh;
 	struct radix_node *rn;
 	struct sockaddr_in sin;
 	struct rtentry *rte;
+	RIB_LOCK_READER;
 
 	/* NOTE: @flowid is not used by this function */
 	KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_ext: bad fibnum"));
 	rh = rt_tables_get_rnh(fibnum, AF_INET);
 	if (rh == NULL)
 		return (ENOENT);
 
 	/* Prepare lookup key */
 	memset(&sin, 0, sizeof(sin));
 	sin.sin_len = sizeof(struct sockaddr_in);
 	sin.sin_addr = dst;
 
 	RIB_RLOCK(rh);
 	rn = rh->rnh_matchaddr((void *)&sin, &rh->head);
 	/* Nodes flagged RNF_ROOT are not treated as a match */
 	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
 		rte = RNTORT(rn);
 		/* Ensure route & ifp is UP */
 		if (RT_LINK_IS_UP(rte->rt_ifp)) {
 			fib4_rte_to_nh_extended(rte, dst, pnh4);
 			/* NHOP_LOOKUP_REF is accepted but no reference is taken yet */
 			if ((flags & NHOP_LOOKUP_REF) != 0) {
 				/* TODO: Do lwref on egress ifp's */
 			}
 			RIB_RUNLOCK(rh);
 
 			return (0);
 		}
 	}
 	RIB_RUNLOCK(rh);
 
 	return (ENOENT);
 }
 
 /*
  * Counterpart of fib4_lookup_nh_ext().
  * Currently an empty stub: neither argument is used.
  */
 void
 fib4_free_nh_ext(uint32_t fibnum, struct nhop4_extended *pnh4)
 {
 
 }
 
 /*
  * Performs IPv4 route table lookup on @dst in fib @fibnum and stores the
  * route description produced by rib4_rte_to_nh_extended() in @prt4.
  *
  * Returns 0 on success, ENOENT if the table does not exist or no usable
  * (non-root, link up) route matches.
  */
 int
 rib4_lookup_nh_ext(uint32_t fibnum, struct in_addr dst, uint32_t flowid,
     uint32_t flags, struct rt4_extended *prt4)
 {
 	struct rib_head *rh;
 	struct radix_node *rn;
 	struct sockaddr_in sin;
 	struct rtentry *rte;
+	RIB_LOCK_READER;
 
 	/* NOTE: @flowid is not used by this function */
 	KASSERT((fibnum < rt_numfibs), ("rib4_lookup_nh_ext: bad fibnum"));
 	rh = rt_tables_get_rnh(fibnum, AF_INET);
 	if (rh == NULL)
 		return (ENOENT);
 
 	/* Prepare lookup key */
 	memset(&sin, 0, sizeof(sin));
 	sin.sin_len = sizeof(struct sockaddr_in);
 	sin.sin_addr = dst;
 
 	RIB_RLOCK(rh);
 	rn = rh->rnh_matchaddr((void *)&sin, &rh->head);
 	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
 		rte = RNTORT(rn);
 		/* Ensure route & ifp is UP */
 		if (RT_LINK_IS_UP(rte->rt_ifp)) {
 			rib4_rte_to_nh_extended(rte, dst, prt4);
 			/* NHOP_LOOKUP_REF is accepted but no reference is taken yet */
 			if ((flags & NHOP_LOOKUP_REF) != 0) {
 				/* TODO: Do lwref on egress ifp's */
 			}
 			RIB_RUNLOCK(rh);
 			return (0);
 		}
 	}
 	RIB_RUNLOCK(rh);
 
 	return (ENOENT);
 }
 
 /*
  * Counterpart of rib4_lookup_nh_ext().
  * Currently an empty stub: neither argument is used.
  */
 void
 rib4_free_nh_ext(uint32_t fibnum, struct rt4_extended *prt4)
 {
 
 }
 
 #endif
 
 #ifdef INET6
 /*
  * IPv6 entry point: forwards to fib_free_nh_prepend() with AF_INET6.
  */
 void
 fib6_free_nh_prepend(uint32_t fibnum, struct nhop_prepend *nh)
 {
 
 	fib_free_nh_prepend(fibnum, nh, AF_INET6);
 }
 
 /*
  * IPv6 entry point for nexthop selection.
  *
  * Fills @nh from @nh_src via fib_choose_prepend(). When @nh_ext is
  * supplied, the interface (NH_LIFP), MTU and flags of the chosen
  * nexthop are mirrored into it as well.
  */
 void
 fib6_choose_prepend(uint32_t fibnum, struct nhop_prepend *nh_src,
     uint32_t flowid, struct nhop_prepend *nh, struct nhop6_extended *nh_ext)
 {
 
 	fib_choose_prepend(fibnum, nh_src, flowid, nh, AF_INET6);
 
 	if (nh_ext != NULL) {
 		nh_ext->nh_flags = nh->nh_flags;
 		nh_ext->nh_mtu = nh->nh_mtu;
 		nh_ext->nh_ifp = NH_LIFP(nh);
 		/* nh_addr and nh_src are not filled in yet */
 	}
 }
 
 /*
  * Temporary function to copy ethernet address from valid lle.
  *
  * Stores the link-layer destination address for IPv6 address @dst on
  * interface @ifp into @desten.
  * For multicast packets (M_MCAST set in @mm_flags) the address is derived
  * from @dst directly. Otherwise the interface's IPv6 lltable is consulted
  * and the stored address is used only if the entry is in REACHABLE or
  * DELAY state.
  *
  * Returns 0 if @desten was filled, 1 if no usable entry exists.
  */
 static int
 fib6_storelladdr(struct ifnet *ifp, struct in6_addr *dst, int mm_flags,
     u_char *desten)
 {
 	struct llentry *ln;
 	struct sockaddr_in6 dst_sa;
 
 	if (mm_flags & M_MCAST) {
 		/*
 		 * @dst already points to the address. Passing &dst
 		 * would make the macro read the bytes of the local
 		 * pointer variable instead of the IPv6 address.
 		 */
 		ETHER_MAP_IPV6_MULTICAST(dst, desten);
 		return (0);
 	}
 
 	memset(&dst_sa, 0, sizeof(dst_sa));
 	dst_sa.sin6_family = AF_INET6;
 	dst_sa.sin6_len = sizeof(dst_sa);
 	dst_sa.sin6_addr = *dst;
 	dst_sa.sin6_scope_id = ifp->if_index;
 
 	/*
 	 * the entry should have been created in nd6_store_lladdr
 	 */
 	IF_AFDATA_RLOCK(ifp);
 	ln = lla_lookup(LLTABLE6(ifp), 0, (struct sockaddr *)&dst_sa);
 
 	/*
 	 * Perform fast path for the following cases:
 	 * 1) lle state is REACHABLE
 	 * 2) lle state is DELAY (NS message sent)
 	 *
 	 * Every other case involves lle modification, so we handle
 	 * them separately.
 	 */
 	if (ln == NULL || (ln->ln_state != ND6_LLINFO_REACHABLE &&
 	    ln->ln_state != ND6_LLINFO_DELAY)) {
 		/* A found entry is released with LLE_RUNLOCK on every path */
 		if (ln != NULL)
 			LLE_RUNLOCK(ln);
 		IF_AFDATA_RUNLOCK(ifp);
 		return (1);
 	}
 	bcopy(&ln->ll_addr, desten, ifp->if_addrlen);
 	LLE_RUNLOCK(ln);
 	IF_AFDATA_RUNLOCK(ifp);
 
 	return (0);
 }
 
 /*
  * Function performs lookup in IPv6 table fib @fibnum.
  *
  * Link-local destinations are not looked up in the routing table:
  * the nexthop is built from interface index @scopeid instead.
  *
  * On success @nh is filled with interface indexes, MTU and flags.
  * If the L2 address could be resolved, the ethernet header is stored
  * in nh->d.data and nh_count is set to its length. Otherwise
  * NHF_L2_INCOMPLETE is set and the gateway address is stored in nh->d.gw6.
  *
  * NOTE(review): @nh_ext is not referenced anywhere in this function.
  *
  * Returns 0 on success.
  */
 int
 fib6_lookup_prepend(uint32_t fibnum, struct in6_addr *dst, uint32_t scopeid,
     struct mbuf *m, struct nhop_prepend *nh, struct nhop6_extended *nh_ext)
 {
 	struct rib_head *rh;
 	struct radix_node *rn;
 	struct sockaddr_in6 sin6, *gw_sa;
 	struct in6_addr gw6;
 	struct rtentry *rte;
 	struct ifnet *lifp;
 	struct ether_header *eh;
+	RIB_LOCK_READER;
 	uint32_t flags;
 	int error;
 
 	if (IN6_IS_SCOPE_LINKLOCAL(dst)) {
 		/* Do not lookup link-local addresses in rtable */
 		error = fib6_lla_to_nh(dst, scopeid, nh, &lifp);
 		if (error != 0)
 			return (error);
 		/* */
 		gw6 = *dst;
 		goto do_l2;
 	}
 
 
 	KASSERT((fibnum < rt_numfibs), ("fib6_lookup_prepend: bad fibnum"));
 	rh = rt_tables_get_rnh(fibnum, AF_INET6);
 	if (rh == NULL)
 		return (ENOENT);
 
 	/* Prepare lookup key */
 	memset(&sin6, 0, sizeof(sin6));
 	sin6.sin6_len = sizeof(struct sockaddr_in6);
 	sin6.sin6_addr = *dst;
 	sin6.sin6_scope_id = scopeid;
 	sa6_embedscope(&sin6, 0);
 	
 
 	RIB_RLOCK(rh);
 	rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
 	/* Plain cast; rte is dereferenced only after rn passed the NULL check */
 	rte = RNTORT(rn);
 	if (rn == NULL || ((rn->rn_flags & RNF_ROOT) != 0) ||
 	    RT_LINK_IS_UP(rte->rt_ifp) == 0) {
 		RIB_RUNLOCK(rh);
 		return (EHOSTUNREACH);
 	}
 
 	/* Explicitly zero nexthop */
 	memset(nh, 0, sizeof(*nh));
 	flags = 0;
 	nh->nh_mtu = min(rte->rt_mtu, IN6_LINKMTU(rte->rt_ifp));
 	/* Next L3 hop is the gateway for RTF_GATEWAY routes, else @dst itself */
 	if (rte->rt_flags & RTF_GATEWAY) {
 		gw_sa = (struct sockaddr_in6 *)rte->rt_gateway;
 		gw6 = gw_sa->sin6_addr;
 		in6_clearscope(&gw6);
 	} else
 		gw6 = *dst;
 	/* Set flags */
 	flags = fib_rte_to_nh_flags(rte->rt_flags);
 	/* gw_sa is reused here to inspect the route key: unspecified key means default route */
 	gw_sa = (struct sockaddr_in6 *)rt_key(rte);
 	if (IN6_IS_ADDR_UNSPECIFIED(&gw_sa->sin6_addr))
 		flags |= NHF_DEFAULT;
 
 	/*
 	 * TODO: nh L2/L3 resolve.
 	 * Currently all we have is rte ifp.
 	 * Simply use it.
 	 */
 	/* Save interface address ifp */
 	nh->aifp_idx = fib6_get_ifa(rte);
 	/* Save both logical and transmit interface indexes */
 	lifp = rte->rt_ifp;
 	nh->lifp_idx = lifp->if_index;
 	nh->i.ifp_idx = nh->lifp_idx;
 
 	RIB_RUNLOCK(rh);
 
 	/* NOTE(review): lifp (rte->rt_ifp) is still used below, after the rib lock was dropped */
 	nh->nh_flags = flags;
 do_l2:
 	/*
 	 * Try to lookup L2 info.
 	 * Do this using separate LLE locks.
 	 * TODO: move this under radix lock.
 	 */
 	if (lifp->if_type == IFT_ETHER) {
 		eh = (struct ether_header *)nh->d.data;
 
 		/*
 		 * Fill in ethernet header.
 		 * It should be already presented if we're
 		 * sending data via known gateway.
 		 */
 		error = fib6_storelladdr(lifp, &gw6, m ? m->m_flags : 0,
 		    eh->ether_dhost);
 		if (error == 0) {
 			memcpy(&eh->ether_shost, IF_LLADDR(lifp), ETHER_ADDR_LEN);
 			eh->ether_type = htons(ETHERTYPE_IPV6);
 			/* nh_count carries the length of the prepend data */
 			nh->nh_count = ETHER_HDR_LEN;
 			return (0);
 		}
 	}
 
 	/* Notify caller that no L2 info is linked */
 	nh->nh_count = 0;
 	nh->nh_flags |= NHF_L2_INCOMPLETE;
 	/* ..And save gateway address */
 	nh->d.gw6 = gw6;
 	return (0);
 }
 
 int
 fib6_sendmbuf(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m,
     struct nhop_prepend *nh)
 {
 	int error;
 
 	if (nh != NULL && (nh->nh_flags & NHF_L2_INCOMPLETE) == 0) {
 
 		/*
 		 * Fast path case. Most packets should
 		 * be sent from here.
 		 * TODO: Make special ifnet
 		 * 'if_output_frame' handler for that.
 		 */
 		struct nhop_info ni;
 		struct ether_header *eh;
 		bzero(&ni, sizeof(ni));
 		ni.ni_family = AF_INET6;
 		ni.ni_flags = RT_NHOP;
 		ni.ni_nh = nh;
 
 		M_PREPEND(m, nh->nh_count, M_NOWAIT);
 		if (m == NULL)
 			return (ENOBUFS);
 		eh = mtod(m, struct ether_header *);
 		memcpy(eh, nh->d.data, nh->nh_count);
 		error = (*ifp->if_output)(ifp, m, NULL, &ni);
 	} else {
 		/* We need to perform ND lookup */
 		struct sockaddr_in6 gw_out;
 
 		memset(&gw_out, 0, sizeof(gw_out));
 		gw_out.sin6_family = AF_INET6;
 		gw_out.sin6_len = sizeof(gw_out);
 		gw_out.sin6_addr = nh->d.gw6;
 		gw_out.sin6_scope_id = ifp->if_index;
 		sa6_embedscope(&gw_out, 0);
 
 		error = nd6_output(ifp, origifp, m, &gw_out, NULL);
 	}
 
 	return (error);
 }
 
 /*
  * Returns the interface index to report as the "address" interface of
  * route @rte.
  *
  * If the route points to an IFF_LOOPBACK interface and its gateway is an
  * AF_LINK sockaddr, the index stored in that sockaddr_dl is returned.
  * Otherwise the index of rt_ifp is returned.
  */
 static uint16_t
 fib6_get_ifa(struct rtentry *rte)
 {
 	struct ifnet *ifp;
 	struct sockaddr_dl *sdl;
 
 	ifp = rte->rt_ifp;
 	if ((ifp->if_flags & IFF_LOOPBACK) &&
 	    rte->rt_gateway->sa_family == AF_LINK) {
 		sdl = (struct sockaddr_dl *)rte->rt_gateway;
 		return (sdl->sdl_index);
 	}
 
 	return (ifp->if_index);
 	/* Everything below is compiled out and kept for reference only */
 #if 0
 	/* IPv6 case */
 	/* Alternative way to get interface address ifp */
 	/*
 	 * Adjust the "outgoing" interface.  If we're going to loop 
 	 * the packet back to ourselves, the ifp would be the loopback 
 	 * interface. However, we'd rather know the interface associated 
 	 * to the destination address (which should probably be one of 
 	 * our own addresses.)
 	 */
 	if (rt) {
 		if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) &&
 		    (rt->rt_gateway->sa_family == AF_LINK))
 			*retifp = 
 				ifnet_byindex(((struct sockaddr_dl *)
 					       rt->rt_gateway)->sdl_index);
 	}
 	/* IPv4 case */
 	//pnh6->nh_ifp = rte->rt_ifa->ifa_ifp;
 #endif
 }
 
 /*
  * Builds basic nexthop @pnh6 for link-local destination @dst.
  *
  * The interface is taken from index @scopeid; @pnh6 is zeroed and then
  * filled with that interface, its IN6_LINKMTU value and @dst as the
  * nexthop address. No flags are set.
  *
  * Returns 0 on success, ENOENT if no interface has index @scopeid.
  */
 static int
 fib6_lla_to_nh_basic(struct in6_addr *dst, uint32_t scopeid,
     struct nhop6_basic *pnh6)
 {
 	struct ifnet *scope_ifp;
 
 	scope_ifp = ifnet_byindex_locked(scopeid);
 	if (scope_ifp != NULL) {
 		/* Do explicit nexthop zero unless we're copying it */
 		memset(pnh6, 0, sizeof(*pnh6));
 
 		pnh6->nh_ifp = scope_ifp;
 		pnh6->nh_mtu = IN6_LINKMTU(scope_ifp);
 		pnh6->nh_addr = *dst;
 
 		return (0);
 	}
 
 	return (ENOENT);
 }
 
 /*
  * Builds extended nexthop @pnh6 for link-local destination @dst.
  *
  * The interface is taken from index @scopeid; @pnh6 is zeroed and then
  * filled with that interface, its IN6_LINKMTU value and @dst as the
  * nexthop address. No flags are set.
  *
  * Returns 0 on success, ENOENT if no interface has index @scopeid.
  */
 static int
 fib6_lla_to_nh_extended(struct in6_addr *dst, uint32_t scopeid,
     struct nhop6_extended *pnh6)
 {
 	struct ifnet *scope_ifp;
 
 	scope_ifp = ifnet_byindex_locked(scopeid);
 	if (scope_ifp != NULL) {
 		/* Do explicit nexthop zero unless we're copying it */
 		memset(pnh6, 0, sizeof(*pnh6));
 
 		pnh6->nh_ifp = scope_ifp;
 		pnh6->nh_mtu = IN6_LINKMTU(scope_ifp);
 		pnh6->nh_addr = *dst;
 
 		return (0);
 	}
 
 	return (ENOENT);
 }
 
 /*
  * Builds route description @prt6 for link-local destination @dst
  * on the interface with index @scopeid.
  *
  * Returns 0 on success, ENOENT if no interface has index @scopeid.
  */
 static int
 rib6_lla_to_nh_extended(struct in6_addr *dst, uint32_t scopeid,
     struct rt6_extended *prt6)
 {
 	struct ifnet *ifp;
 
 	ifp = ifnet_byindex_locked(scopeid);
 	if (ifp == NULL)
 		return (ENOENT);
 
 	/* Do explicit nexthop zero unless we're copying it */
 	memset(prt6, 0, sizeof(*prt6));
 
 	/* Report the prefix as fe80::/64; the rest of rt_addr stays zero */
 	prt6->rt_addr.s6_addr16[0] = htons(0xFE80);
 	prt6->rt_mask = 64; /* XXX check RFC */
 
 	prt6->rt_aifp = ifp;
 	prt6->rt_lifp = ifp;
 	/* Check id this is for-us address */
 	if (in6_ifawithifp_lla(ifp, dst)) {
 		/* Report V_loif as rt_lifp when it is available */
 		if ((ifp = V_loif) != NULL)
 			prt6->rt_lifp = ifp;
 	}
 
 	/*
 	 * NOTE(review): ifp may have been replaced by V_loif above (and may
 	 * even be NULL if V_loif is NULL), so the MTU is taken from that
 	 * interface rather than from the scope interface - confirm intent.
 	 */
 	prt6->rt_mtu = IN6_LINKMTU(ifp);
 	/* No flags set */
 
 	return (0);
 }
 
 /*
  * Builds prepend nexthop @nh for link-local destination @dst on the
  * interface with index @scopeid and returns that interface via @lifp.
  *
  * Returns 0 on success, ENOENT if no interface has index @scopeid.
  */
 static int
 fib6_lla_to_nh(struct in6_addr *dst, uint32_t scopeid,
     struct nhop_prepend *nh, struct ifnet **lifp)
 {
 	struct ifnet *ifp;
 
 	ifp = ifnet_byindex_locked(scopeid);
 	if (ifp == NULL)
 		return (ENOENT);
 
 	/* Do explicit nexthop zero unless we're copying it */
 	memset(nh, 0, sizeof(*nh));
 	/* No flags set */
 	nh->nh_mtu = IN6_LINKMTU(ifp);
 
 	/* Save lifp */
 	*lifp = ifp;
 
 	nh->aifp_idx = scopeid;
 	nh->lifp_idx = scopeid;
 	/* Check id this is for-us address */
 	if (in6_ifawithifp_lla(ifp, dst)) {
 		/* Only lifp_idx switches to V_loif; *lifp and the MTU keep the scope interface */
 		if ((ifp = V_loif) != NULL)
 			nh->lifp_idx = ifp->if_index;
 	}
 
 	return (0);
 }
 
 
 /*
  * Fills basic IPv6 nexthop @pnh6 from route entry @rte.
  *
  * @pnh6 is zeroed first. nh_ifp is resolved from the index returned by
  * fib6_get_ifa(), nh_mtu is the smaller of the route MTU and the
  * IN6_LINKMTU of rt_ifp, nh_addr is the scope-cleared gateway for
  * RTF_GATEWAY routes and *@dst otherwise. NHF_DEFAULT is added when the
  * route key is the unspecified address.
  */
 static void
 fib6_rte_to_nh_basic(struct rtentry *rte, struct in6_addr *dst,
     struct nhop6_basic *pnh6)
 {
 	struct sockaddr_in6 *key_sa, *gw_sa;
 
 	/* Do explicit nexthop zero unless we're copying it */
 	memset(pnh6, 0, sizeof(*pnh6));
 
 	pnh6->nh_ifp = ifnet_byindex(fib6_get_ifa(rte));
 	pnh6->nh_mtu = min(rte->rt_mtu, IN6_LINKMTU(rte->rt_ifp));
 
 	if ((rte->rt_flags & RTF_GATEWAY) != 0) {
 		gw_sa = (struct sockaddr_in6 *)rte->rt_gateway;
 		pnh6->nh_addr = gw_sa->sin6_addr;
 		in6_clearscope(&pnh6->nh_addr);
 	} else {
 		pnh6->nh_addr = *dst;
 	}
 
 	/* Convert route flags, then mark the default route */
 	pnh6->nh_flags = fib_rte_to_nh_flags(rte->rt_flags);
 	key_sa = (struct sockaddr_in6 *)rt_key(rte);
 	if (IN6_IS_ADDR_UNSPECIFIED(&key_sa->sin6_addr))
 		pnh6->nh_flags |= NHF_DEFAULT;
 }
 
 /*
  * Fills extended IPv6 nexthop @pnh6 from route entry @rte.
  *
  * @pnh6 is zeroed first; interface, MTU, nexthop address and flags are
  * filled the same way as in fib6_rte_to_nh_basic().
  */
 static void
 fib6_rte_to_nh_extended(struct rtentry *rte, struct in6_addr *dst,
     struct nhop6_extended *pnh6)
 {
 	struct sockaddr_in6 *gw;
 	struct in6_ifaddr *ia;
 
 	/* Do explicit nexthop zero unless we're copying it */
 	memset(pnh6, 0, sizeof(*pnh6));
 
 	pnh6->nh_ifp = ifnet_byindex(fib6_get_ifa(rte));
 	pnh6->nh_mtu = min(rte->rt_mtu, IN6_LINKMTU(rte->rt_ifp));
 	if (rte->rt_flags & RTF_GATEWAY) {
 		gw = (struct sockaddr_in6 *)rte->rt_gateway;
 		pnh6->nh_addr = gw->sin6_addr;
 		/* Nexthop address is stored with the scope cleared */
 		in6_clearscope(&pnh6->nh_addr);
 	} else
 		pnh6->nh_addr = *dst;
 	/* Set flags */
 	pnh6->nh_flags = fib_rte_to_nh_flags(rte->rt_flags);
 	/* gw is reused here to inspect the route key: unspecified key means default route */
 	gw = (struct sockaddr_in6 *)rt_key(rte);
 	if (IN6_IS_ADDR_UNSPECIFIED(&gw->sin6_addr))
 		pnh6->nh_flags |= NHF_DEFAULT;
 
 	/*
 	 * NOTE(review): ia is assigned but never used; unlike the IPv4
 	 * variant no source address is stored in @pnh6.
 	 */
 	ia = ifatoia6(rte->rt_ifa);
 }
 
 /*
  * Number of bits set in IPv6 address (mask) @x, computed as the sum of
  * bitcount32() over its four 32-bit words.
  * The whole expansion is parenthesized so that the macro can be used
  * inside larger expressions. @x is evaluated four times.
  */
 #define ipv6_masklen(x)		(bitcount32((x).__u6_addr.__u6_addr32[0]) + \
 				bitcount32((x).__u6_addr.__u6_addr32[1]) + \
 				bitcount32((x).__u6_addr.__u6_addr32[2]) + \
 				bitcount32((x).__u6_addr.__u6_addr32[3]))
 /*
  * Fills route description @prt6 from route entry @rte.
  *
  * @prt6 is zeroed first. The prefix comes from the route key, the mask
  * length is the bit count of the route mask (128 when there is no mask),
  * the gateway is the scope-cleared route gateway for RTF_GATEWAY routes
  * and *@dst otherwise. Flags are stored in nexthop (NHF_*) form.
  */
 static void
 rib6_rte_to_nh_extended(struct rtentry *rte, struct in6_addr *dst,
     struct rt6_extended *prt6)
 {
 	struct sockaddr_in6 *key_sa, *mask_sa, *gw_sa;
 
 	/* Do explicit nexthop zero unless we're copying it */
 	memset(prt6, 0, sizeof(*prt6));
 
 	key_sa = (struct sockaddr_in6 *)rt_key(rte);
 	prt6->rt_addr = key_sa->sin6_addr;
 
 	mask_sa = (struct sockaddr_in6 *)rt_mask(rte);
 	if (mask_sa != NULL)
 		prt6->rt_mask = ipv6_masklen(mask_sa->sin6_addr);
 	else
 		prt6->rt_mask = 128;
 
 	if ((rte->rt_flags & RTF_GATEWAY) != 0) {
 		gw_sa = (struct sockaddr_in6 *)rte->rt_gateway;
 		prt6->rt_gateway = gw_sa->sin6_addr;
 		in6_clearscope(&prt6->rt_gateway);
 	} else {
 		prt6->rt_gateway = *dst;
 	}
 
 	prt6->rt_lifp = rte->rt_ifp;
 	prt6->rt_aifp = ifnet_byindex(fib6_get_ifa(rte));
 	prt6->rt_flags = fib_rte_to_nh_flags(rte->rt_flags);
 	prt6->rt_mtu = min(rte->rt_mtu, IN6_LINKMTU(rte->rt_ifp));
 }
 
 /*
  * Performs IPv6 route table lookup on @dst/@scopeid in fib @fibnum.
  * Stores basic nexthop info in @pnh6 like fib6_lookup_nh_basic() does,
  * except that nh_ifp is set to the route's rt_ifp.
  *
  * Link-local destinations are not looked up in the routing table: the
  * nexthop is built from interface index @scopeid instead.
  *
  * @flowid is currently unused.
  *
  * Returns 0 on success, ENOENT if the table does not exist or no usable
  * (non-root, link up) route matches.
  */
 int
 fib6_lookup_nh_ifp(uint32_t fibnum, struct in6_addr *dst, uint32_t scopeid,
     uint32_t flowid, struct nhop6_basic *pnh6)
 {
 	struct rib_head *rh;
 	struct radix_node *rn;
 	struct sockaddr_in6 sin6;
 	struct rtentry *rte;
+	RIB_LOCK_READER;
 
 	if (IN6_IS_SCOPE_LINKLOCAL(dst)) {
 		/* Do not lookup link-local addresses in rtable */
 		/* XXX: Check if dst is local */
 		return (fib6_lla_to_nh_basic(dst, scopeid, pnh6));
 	}
 
 	KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_ifp: bad fibnum"));
 	rh = rt_tables_get_rnh(fibnum, AF_INET6);
 	if (rh == NULL)
 		return (ENOENT);
 
 	/*
 	 * Prepare lookup key.
 	 * The length byte has to be set like in the other lookup
 	 * routines of this file: a key left with zero length does not
 	 * describe the address that follows.
 	 */
 	memset(&sin6, 0, sizeof(sin6));
 	sin6.sin6_len = sizeof(struct sockaddr_in6);
 	sin6.sin6_addr = *dst;
 	sin6.sin6_scope_id = scopeid;
 	sa6_embedscope(&sin6, 0);
 
 	RIB_RLOCK(rh);
 	rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
 	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
 		rte = RNTORT(rn);
 		/* Ensure route & ifp is UP */
 		if (RT_LINK_IS_UP(rte->rt_ifp)) {
 			fib6_rte_to_nh_basic(rte, dst, pnh6);
 			/* Override nh_ifp while the rib lock is still held */
 			pnh6->nh_ifp = rte->rt_ifp;
 			RIB_RUNLOCK(rh);
 			return (0);
 		}
 	}
 	RIB_RUNLOCK(rh);
 
 	return (ENOENT);
 }
 
 /*
  * Performs IPv6 route table lookup on @dst/@scopeid in fib @fibnum and
  * stores basic nexthop info in @pnh6.
  *
  * Link-local destinations are not looked up in the routing table: the
  * nexthop is built from interface index @scopeid instead.
  *
  * @flowid is currently unused.
  *
  * Returns 0 on success, ENOENT if the table does not exist or no usable
  * (non-root, link up) route matches.
  */
 int
 fib6_lookup_nh_basic(uint32_t fibnum, struct in6_addr *dst, uint32_t scopeid,
     uint32_t flowid, struct nhop6_basic *pnh6)
 {
 	struct rib_head *rh;
 	struct radix_node *rn;
 	struct sockaddr_in6 sin6;
 	struct rtentry *rte;
+	RIB_LOCK_READER;
 
 	if (IN6_IS_SCOPE_LINKLOCAL(dst)) {
 		/* Do not lookup link-local addresses in rtable */
 		return (fib6_lla_to_nh_basic(dst, scopeid, pnh6));
 	}
 
 	KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_basic: bad fibnum"));
 	rh = rt_tables_get_rnh(fibnum, AF_INET6);
 	if (rh == NULL)
 		return (ENOENT);
 
 	/*
 	 * Prepare lookup key.
 	 * The length byte has to be set like in the other lookup
 	 * routines of this file: a key left with zero length does not
 	 * describe the address that follows.
 	 */
 	memset(&sin6, 0, sizeof(sin6));
 	sin6.sin6_len = sizeof(struct sockaddr_in6);
 	sin6.sin6_addr = *dst;
 	sin6.sin6_scope_id = scopeid;
 	sa6_embedscope(&sin6, 0);
 
 	RIB_RLOCK(rh);
 	rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
 	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
 		rte = RNTORT(rn);
 		/* Ensure route & ifp is UP */
 		if (RT_LINK_IS_UP(rte->rt_ifp)) {
 			/* Copy everything needed while the rib lock is held */
 			fib6_rte_to_nh_basic(rte, dst, pnh6);
 			RIB_RUNLOCK(rh);
 			return (0);
 		}
 	}
 	RIB_RUNLOCK(rh);
 
 	return (ENOENT);
 }
 
 /*
  * Performs IPv6 route table lookup on @dst. Returns 0 on success and
  * ENOENT otherwise.
  * Stores extended nexthop info in the provided @pnh6 structure.
  * Note that
  * - nh_ifp cannot be safely dereferenced unless NHOP_LOOKUP_REF is specified.
  * - in that case you need to call fib6_free_nh_ext()
  * - nh_ifp represents logical transmit interface (rt_ifp)
  * - mtu from logical transmit interface will be returned.
  * - NHOP_LOOKUP_REF handling is still a TODO here: no reference is taken yet.
  * Link-local destinations are resolved by fib6_lla_to_nh_extended() without
  * a table lookup. @flowid is currently unused.
  */
 int
 fib6_lookup_nh_ext(uint32_t fibnum, struct in6_addr *dst, uint32_t scopeid,
     uint32_t flowid, uint32_t flags, struct nhop6_extended *pnh6)
 {
 	struct rib_head *rh;
 	struct radix_node *rn;
 	struct sockaddr_in6 sin6;
 	struct rtentry *rte;
+	RIB_LOCK_READER;
 
 	if (IN6_IS_SCOPE_LINKLOCAL(dst)) {
 		/* Do not lookup link-local addresses in rtable */
 		/* XXX: Do lwref on egress ifp */
 		return (fib6_lla_to_nh_extended(dst, scopeid, pnh6));
 	}
 
 	KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_ext: bad fibnum"));
 	rh = rt_tables_get_rnh(fibnum, AF_INET6);
 	if (rh == NULL)
 		return (ENOENT);
 
 	/* Prepare lookup key */
 	memset(&sin6, 0, sizeof(sin6));
 	sin6.sin6_len = sizeof(struct sockaddr_in6);
 	sin6.sin6_addr = *dst;
 	sin6.sin6_scope_id = scopeid;
 	sa6_embedscope(&sin6, 0);
 
 	RIB_RLOCK(rh);
 	rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
 	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
 		rte = RNTORT(rn);
 		/* Ensure route & ifp is UP */
 		if (RT_LINK_IS_UP(rte->rt_ifp)) {
 			fib6_rte_to_nh_extended(rte, dst, pnh6);
 			if ((flags & NHOP_LOOKUP_REF) != 0) {
 				/* TODO: Do lwref on egress ifp's */
 			}
 			RIB_RUNLOCK(rh);
 
 			return (0);
 		}
 	}
 	RIB_RUNLOCK(rh);
 
 	return (ENOENT);
 }
 
 /*
  * Release counterpart for fib6_lookup_nh_ext() lookups done with
  * NHOP_LOOKUP_REF. Currently an empty placeholder: it performs no work.
  */
 void
 fib6_free_nh_ext(uint32_t fibnum, struct nhop6_extended *pnh6)
 {
 
 }
 
 /*
  * Looks up @dst in the AF_INET6 table of fib @fibnum and, on a match whose
  * interface passes RT_LINK_IS_UP(), fills @prt6 through
  * rib6_rte_to_nh_extended(). Returns 0 on success, ENOENT otherwise.
  * Link-local destinations skip the table and are answered by
  * rib6_lla_to_nh_extended(). @flowid is currently unused.
  */
 int
 rib6_lookup_nh_ext(uint32_t fibnum, struct in6_addr *dst, uint32_t scopeid,
     uint32_t flowid, uint32_t flags, struct rt6_extended *prt6)
 {
 	struct sockaddr_in6 key;
 	struct rib_head *rh;
 	struct radix_node *node;
 	struct rtentry *rte;
 	int error;
+	RIB_LOCK_READER;
 
 	/* Do not lookup link-local addresses in rtable */
 	/* XXX: Do lwref on egress ifp */
 	if (IN6_IS_SCOPE_LINKLOCAL(dst))
 		return (rib6_lla_to_nh_extended(dst, scopeid, prt6));
 
 	KASSERT((fibnum < rt_numfibs), ("rib6_lookup_nh_ext: bad fibnum"));
 	rh = rt_tables_get_rnh(fibnum, AF_INET6);
 	if (rh == NULL)
 		return (ENOENT);
 
 	/* Build the radix key with the scope embedded into the address. */
 	memset(&key, 0, sizeof(key));
 	key.sin6_len = sizeof(struct sockaddr_in6);
 	key.sin6_addr = *dst;
 	key.sin6_scope_id = scopeid;
 	sa6_embedscope(&key, 0);
 
 	/* Assume failure; flipped to 0 only on a usable match. */
 	error = ENOENT;
 
 	RIB_RLOCK(rh);
 	node = rh->rnh_matchaddr((void *)&key, &rh->head);
 	if (node != NULL && (node->rn_flags & RNF_ROOT) == 0) {
 		rte = RNTORT(node);
 		/* Ensure route & ifp is UP */
 		if (RT_LINK_IS_UP(rte->rt_ifp)) {
 			rib6_rte_to_nh_extended(rte, dst, prt6);
 			if ((flags & NHOP_LOOKUP_REF) != 0) {
 				/* TODO: Do lwref on egress ifp's */
 			}
 			error = 0;
 		}
 	}
 	RIB_RUNLOCK(rh);
 
 	return (error);
 }
 
 /*
  * Empty placeholder: performs no work.
  * NOTE(review): takes struct nhop6_extended while rib6_lookup_nh_ext()
  * fills struct rt6_extended - confirm the intended parameter type.
  */
 void
 rib6_free_nh_ext(uint32_t fibnum, struct nhop6_extended *prt6)
 {
 
 }
 
 #endif
 
 /*
  * Address-family independent free routine for extended nexthop data.
  * Currently an empty placeholder: it performs no work.
  */
 void
 fib_free_nh_ext(uint32_t fibnum, struct nhopu_extended *pnhu)
 {
 
 }
 
 
 #if 0
 typedef void nhop_change_cb_t(void *state);
 
 
 /*
  * Record built by nhop_alloc_tracked(). This code is compiled out
  * (it sits inside an #if 0 region).
  */
 struct nhop_tracker {
 	TAILQ_ENTRY(nhop_tracker)	next;	/* tail queue linkage */
 	nhop_change_cb_t	*f;	/* callback supplied by the creator */
 	void		*state;	/* opaque argument for the callback */
 	uint32_t	fibnum;	/* fib number supplied by the creator */
 	struct sockaddr_storage	ss;	/* copy of the tracked sockaddr */
 };
 
 /*
  * Allocates a zeroed nhop_tracker and records the callback @f, its @state
  * argument, @fibnum and a copy of @sa in it.
  * The allocation uses M_WAITOK, so the returned pointer is never NULL.
  * @sa must fit into struct sockaddr_storage.
  */
 struct nhop_tracker *
 nhop_alloc_tracked(uint32_t fibnum, struct sockaddr *sa, nhop_change_cb_t *f,
     void *state)
 {
 	struct nhop_tracker *nt;
 
 	KASSERT(sa->sa_len <= sizeof(nt->ss),
 	    ("%s: sockaddr too long", __func__));
 
 	nt = malloc(sizeof(struct nhop_tracker), M_RTFIB, M_WAITOK | M_ZERO);
 
 	nt->f = f;
 	nt->state = state;
 	nt->fibnum = fibnum;
 	memcpy(&nt->ss, sa, sa->sa_len);
 
 	return (nt);
 }
 
 
 /*
  * Unfinished stub (compiled out by the surrounding #if 0): takes and
  * releases the nexthop lock without doing anything in between, then
  * returns 0.
  * NOTE(review): `nnh' is not declared in this function and @nt is unused -
  * this needs completing before the region can be enabled.
  */
 int
 nhop_bind(struct nhop_tracker *nt)
 {
 	NHOP_LOCK(nnh);
 
 	NHOP_UNLOCK(nnh);
 
 	return (0);
 }
 #endif
 
 
 
 
 
 
 
 
Index: projects/routing/sys/net/rtsock.c
===================================================================
--- projects/routing/sys/net/rtsock.c	(revision 274335)
+++ projects/routing/sys/net/rtsock.c	(revision 274336)
@@ -1,1921 +1,1924 @@
 /*-
  * Copyright (c) 1988, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)rtsock.c	8.7 (Berkeley) 10/12/95
  * $FreeBSD$
  */
 #include "opt_compat.h"
 #include "opt_mpath.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/domain.h>
 #include <sys/lock.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/rmlock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/rwlock.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_llatbl.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/raw_cb.h>
 #include <net/route.h>
 #include <net/route_internal.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip_carp.h>
 #ifdef INET6
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #endif
 
 #ifdef COMPAT_FREEBSD32
 #include <sys/mount.h>
 #include <compat/freebsd32/freebsd32.h>
 
 /*
  * Message header layouts with explicitly sized (fixed-width) fields, only
  * built when COMPAT_FREEBSD32 is defined.
  * NOTE(review): presumably these mirror if_msghdr / if_msghdrl / ifa_msghdrl
  * as seen by 32-bit processes - confirm against net/if.h.
  */
 struct if_msghdr32 {
 	uint16_t ifm_msglen;
 	uint8_t	ifm_version;
 	uint8_t	ifm_type;
 	int32_t	ifm_addrs;
 	int32_t	ifm_flags;
 	uint16_t ifm_index;
 	struct	if_data ifm_data;
 };
 
 /* Variant of if_msghdr32 with extra length/offset fields before ifm_data. */
 struct if_msghdrl32 {
 	uint16_t ifm_msglen;
 	uint8_t	ifm_version;
 	uint8_t	ifm_type;
 	int32_t	ifm_addrs;
 	int32_t	ifm_flags;
 	uint16_t ifm_index;
 	uint16_t _ifm_spare1;
 	uint16_t ifm_len;
 	uint16_t ifm_data_off;
 	struct	if_data ifm_data;
 };
 
 /*
  * Address message header; rtsock_msg_buffer() uses its size for
  * NET_RT_IFLISTL requests whose sysctl request has SCTL_MASK32 set.
  */
 struct ifa_msghdrl32 {
 	uint16_t ifam_msglen;
 	uint8_t	ifam_version;
 	uint8_t	ifam_type;
 	int32_t	ifam_addrs;
 	int32_t	ifam_flags;
 	uint16_t ifam_index;
 	uint16_t _ifam_spare1;
 	uint16_t ifam_len;
 	uint16_t ifam_data_off;
 	int32_t	ifam_metric;
 	struct	if_data ifam_data;
 };
 #endif /* COMPAT_FREEBSD32 */
 
 /* malloc type "routetbl", described as "routing tables". */
 MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables");
 
 /* NB: these are not modified */
 /* Source address handed to raw_input_ext() by rts_input(). */
 static struct	sockaddr route_src = { 2, PF_ROUTE, };
 /* Safe placeholder rt_xaddrs() points at when it meets a zero-length sockaddr. */
 static struct	sockaddr sa_zero   = { sizeof(sa_zero), AF_INET, };
 
 /* These are external hooks for CARP. */
 int	(*carp_get_vhid_p)(struct ifaddr *);
 
 /*
  * Used by rtsock/raw_input callback code to decide whether to filter the update
  * notification to a socket bound to a particular FIB.
  */
 #define	RTS_FILTER_FIB	M_PROTO8
 
 /*
  * Per-vnet counts of attached routing sockets; updated by rts_attach() and
  * rts_detach() while holding RTSOCK_LOCK().
  */
 typedef struct {
 	int	ip_count;	/* attached w/ AF_INET */
 	int	ip6_count;	/* attached w/ AF_INET6 */
 	int	any_count;	/* total attached */
 } route_cb_t;
 static VNET_DEFINE(route_cb_t, route_cb);
 #define	V_route_cb VNET(route_cb)
 
 /* Mutex protecting the route_cb counters. */
 struct mtx rtsock_mtx;
 MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF);
 
 #define	RTSOCK_LOCK()	mtx_lock(&rtsock_mtx)
 #define	RTSOCK_UNLOCK()	mtx_unlock(&rtsock_mtx)
 #define	RTSOCK_LOCK_ASSERT()	mtx_assert(&rtsock_mtx, MA_OWNED)
 
 /* Parent node for the net.route sysctl subtree. */
 static SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD, 0, "");
 
 /* State passed to the message builders and the sysctl dump routines. */
 struct walkarg {
 	int	w_tmemsize;	/* size of the w_tmem buffer */
 	int	w_op, w_arg;	/* w_op: request kind (e.g. NET_RT_IFLISTL) */
 	caddr_t	w_tmem;		/* buffer messages are written into */
 	struct sysctl_req *w_req;	/* sysctl request; flags checked for SCTL_MASK32 */
 };
 
 static void	rts_input(struct mbuf *m);
 static struct mbuf *rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo);
 static int	rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo,
 			struct walkarg *w, int *plen);
 static int	rt_xaddrs(caddr_t cp, caddr_t cplim,
 			struct rt_addrinfo *rtinfo);
 static int	sysctl_dumpentry(struct radix_node *rn, void *vw);
 static int	sysctl_iflist(int af, struct walkarg *w);
 static int	sysctl_ifmalist(int af, struct walkarg *w);
 static int	route_output(struct mbuf *m, struct socket *so, ...);
 static void	rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out);
 static void	rt_dispatch(struct mbuf *, sa_family_t);
 static struct sockaddr	*rtsock_fix_netmask(struct sockaddr *dst,
 			struct sockaddr *smask, struct sockaddr_storage *dmask);
 
 /*
  * netisr handler description for routing socket messages: queued mbufs are
  * delivered to rts_input(). Registered by rts_init().
  */
 static struct netisr_handler rtsock_nh = {
 	.nh_name = "rtsock",
 	.nh_handler = rts_input,
 	.nh_proto = NETISR_ROUTE,
 	.nh_policy = NETISR_POLICY_SOURCE,
 };
 
 /*
  * Sysctl handler for net.route.netisr_maxqlen.
  * Reports the current rtsock netisr queue limit and, when a new value is
  * supplied, applies it; values below 1 are rejected with EINVAL.
  */
 static int
 sysctl_route_netisr_maxqlen(SYSCTL_HANDLER_ARGS)
 {
 	int qlimit, rc;
 
 	netisr_getqlimit(&rtsock_nh, &qlimit);
 	rc = sysctl_handle_int(oidp, &qlimit, 0, req);
 	if (rc != 0)
 		return (rc);
 	/* Read-only access: nothing to update. */
 	if (req->newptr == NULL)
 		return (0);
 	if (qlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&rtsock_nh, qlimit));
 }
 SYSCTL_PROC(_net_route, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW,
     0, 0, sysctl_route_netisr_maxqlen, "I",
     "maximum routing socket dispatch queue length");
 
 /*
  * Boot-time setup: applies the net.route.netisr_maxqlen tunable, if set,
  * to the handler's queue limit and registers the handler with netisr.
  */
 static void
 rts_init(void)
 {
 	int qlen;
 
 	if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &qlen) != 0)
 		rtsock_nh.nh_qlimit = qlen;
 
 	netisr_register(&rtsock_nh);
 }
 SYSINIT(rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rts_init, 0);
 
 /*
  * Per-socket filter callback passed to raw_input_ext() by rts_input().
  * Returns 1 when the socket behind @rp must be skipped for message @m and
  * 0 when delivery may proceed. A socket is skipped only if @m carries
  * RTS_FILTER_FIB, @proto is PF_ROUTE, the control block has a socket and
  * that socket's fib differs from the fib recorded in @m.
  */
 static int
 raw_input_rts_cb(struct mbuf *m, struct sockproto *proto, struct sockaddr *src,
     struct rawcb *rp)
 {
 	int fibnum;
 
 	KASSERT(m != NULL, ("%s: m is NULL", __func__));
 	KASSERT(proto != NULL, ("%s: proto is NULL", __func__));
 	KASSERT(rp != NULL, ("%s: rp is NULL", __func__));
 
 	/* No filtering requested. */
 	if ((m->m_flags & RTS_FILTER_FIB) == 0)
 		return (0);
 
 	fibnum = M_GETFIB(m);
 
 	/* Not a routing socket message: never filtered here. */
 	if (proto->sp_family != PF_ROUTE)
 		return (0);
 	/* No socket attached to the control block. */
 	if (rp->rcb_socket == NULL)
 		return (0);
 	/* The socket's fib matches the message's fib. */
 	if (rp->rcb_socket->so_fibnum == fibnum)
 		return (0);
 
 	/* Filtering requested and no match, the socket shall be skipped. */
 	return (1);
 }
 
 static void
 rts_input(struct mbuf *m)
 {
 	struct sockproto route_proto;
 	unsigned short *family;
 	struct m_tag *tag;
 
 	route_proto.sp_family = PF_ROUTE;
 	tag = m_tag_find(m, PACKET_TAG_RTSOCKFAM, NULL);
 	if (tag != NULL) {
 		family = (unsigned short *)(tag + 1);
 		route_proto.sp_protocol = *family;
 		m_tag_delete(m, tag);
 	} else
 		route_proto.sp_protocol = 0;
 
 	raw_input_ext(m, &route_proto, &route_src, raw_input_rts_cb);
 }
 
 /*
  * It really doesn't make any sense at all for this code to share much
  * with raw_usrreq.c, since its functionality is so restricted.  XXX
  */
 /* pru_abort handler: forwards to the generic raw socket implementation. */
 static void
 rts_abort(struct socket *so)
 {
 
 	raw_usrreqs.pru_abort(so);
 }
 
 /* pru_close handler: forwards to the generic raw socket implementation. */
 static void
 rts_close(struct socket *so)
 {
 
 	raw_usrreqs.pru_close(so);
 }
 
 /* pru_accept is EOPNOTSUPP */
 
 /*
  * pru_attach handler for routing sockets.
  * Allocates the raw control block, binds the socket to the creating
  * process' fib, attaches it through raw_attach() and accounts for it in
  * the route_cb counters (by protocol family and in total). The socket is
  * marked connected and gets SO_USELOOPBACK set.
  * Returns 0 on success or the error reported by raw_attach().
  */
 static int
 rts_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct rawcb *rp;
 	int error;
 
 	KASSERT(so->so_pcb == NULL, ("rts_attach: so_pcb != NULL"));
 
 	/* An M_WAITOK allocation cannot fail, so no NULL check is needed. */
 	rp = malloc(sizeof *rp, M_PCB, M_WAITOK | M_ZERO);
 
 	so->so_pcb = (caddr_t)rp;
 	so->so_fibnum = td->td_proc->p_fibnum;
 	error = raw_attach(so, proto);
 	rp = sotorawcb(so);
 	if (error) {
 		/* Undo the pcb hookup before reporting the failure. */
 		so->so_pcb = NULL;
 		free(rp, M_PCB);
 		return (error);
 	}
 	RTSOCK_LOCK();
 	switch(rp->rcb_proto.sp_protocol) {
 	case AF_INET:
 		V_route_cb.ip_count++;
 		break;
 	case AF_INET6:
 		V_route_cb.ip6_count++;
 		break;
 	}
 	V_route_cb.any_count++;
 	RTSOCK_UNLOCK();
 	soisconnected(so);
 	so->so_options |= SO_USELOOPBACK;
 	return (0);
 }
 
 /* pru_bind handler: forwards to the generic raw socket implementation. */
 static int
 rts_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 
 	return (raw_usrreqs.pru_bind(so, nam, td)); /* xxx just EINVAL */
 }
 
 /* pru_connect handler: forwards to the generic raw socket implementation. */
 static int
 rts_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 
 	return (raw_usrreqs.pru_connect(so, nam, td)); /* XXX just EINVAL */
 }
 
 /* pru_connect2 is EOPNOTSUPP */
 /* pru_control is EOPNOTSUPP */
 
 /*
  * pru_detach handler: removes the socket from the route_cb accounting
  * (per protocol family and total) under RTSOCK_LOCK(), then lets the
  * generic raw socket code finish the detach.
  */
 static void
 rts_detach(struct socket *so)
 {
 	struct rawcb *rp;
 
 	rp = sotorawcb(so);
 	KASSERT(rp != NULL, ("rts_detach: rp == NULL"));
 
 	RTSOCK_LOCK();
 	if (rp->rcb_proto.sp_protocol == AF_INET)
 		V_route_cb.ip_count--;
 	else if (rp->rcb_proto.sp_protocol == AF_INET6)
 		V_route_cb.ip6_count--;
 	V_route_cb.any_count--;
 	RTSOCK_UNLOCK();
 
 	raw_usrreqs.pru_detach(so);
 }
 
 /* pru_disconnect handler: forwards to the generic raw socket implementation. */
 static int
 rts_disconnect(struct socket *so)
 {
 
 	return (raw_usrreqs.pru_disconnect(so));
 }
 
 /* pru_listen is EOPNOTSUPP */
 
 /* pru_peeraddr handler: forwards to the generic raw socket implementation. */
 static int
 rts_peeraddr(struct socket *so, struct sockaddr **nam)
 {
 
 	return (raw_usrreqs.pru_peeraddr(so, nam));
 }
 
 /* pru_rcvd is EOPNOTSUPP */
 /* pru_rcvoob is EOPNOTSUPP */
 
 /* pru_send handler: forwards all arguments to the generic raw socket send. */
 static int
 rts_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
 	 struct mbuf *control, struct thread *td)
 {
 
 	return (raw_usrreqs.pru_send(so, flags, m, nam, control, td));
 }
 
 /* pru_sense is null */
 
 /* pru_shutdown handler: forwards to the generic raw socket implementation. */
 static int
 rts_shutdown(struct socket *so)
 {
 
 	return (raw_usrreqs.pru_shutdown(so));
 }
 
 /* pru_sockaddr handler: forwards to the generic raw socket implementation. */
 static int
 rts_sockaddr(struct socket *so, struct sockaddr **nam)
 {
 
 	return (raw_usrreqs.pru_sockaddr(so, nam));
 }
 
 /*
  * User request switch for routing sockets; every entry points at one of
  * the rts_*() handlers defined above. Requests without an entry here are
  * the ones the comments above list as EOPNOTSUPP or null.
  */
 static struct pr_usrreqs route_usrreqs = {
 	.pru_abort =		rts_abort,
 	.pru_attach =		rts_attach,
 	.pru_bind =		rts_bind,
 	.pru_connect =		rts_connect,
 	.pru_detach =		rts_detach,
 	.pru_disconnect =	rts_disconnect,
 	.pru_peeraddr =		rts_peeraddr,
 	.pru_send =		rts_send,
 	.pru_shutdown =		rts_shutdown,
 	.pru_sockaddr =		rts_sockaddr,
 	.pru_close =		rts_close,
 };
 
 /* Guarded so the union is defined only once if another header provides it. */
 #ifndef _SOCKADDR_UNION_DEFINED
 #define	_SOCKADDR_UNION_DEFINED
 /*
  * The union of all possible address formats we handle.
  */
 union sockaddr_union {
 	struct sockaddr		sa;	/* generic view */
 	struct sockaddr_in	sin;	/* IPv4 view */
 	struct sockaddr_in6	sin6;	/* IPv6 view */
 };
 #endif /* _SOCKADDR_UNION_DEFINED */
 
 /*
  * Chooses the RTAX_IFA address reported for route @rt so that it is
  * acceptable for the credential @cred.
  *
  * If the route's own interface address passes prison_if(), it is used
  * directly. Otherwise, depending on the family of the RTAX_DST address in
  * @info, the addresses of @ifp are searched for one that passes the
  * prison_check_ip4()/prison_check_ip6() test; failing that, the address
  * produced by prison_get_ip4()/prison_get_ip6() is used. The chosen
  * address is stored in @saun and RTAX_IFA is pointed at it.
  *
  * Returns 0 on success and ESRCH when no address could be selected or the
  * destination family is not handled.
  */
 static int
 rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp,
     struct rtentry *rt, union sockaddr_union *saun, struct ucred *cred)
 {
 
 	/* First, see if the returned address is part of the jail. */
 	if (prison_if(cred, rt->rt_ifa->ifa_addr) == 0) {
 		info->rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
 		return (0);
 	}
 
 	switch (info->rti_info[RTAX_DST]->sa_family) {
 #ifdef INET
 	case AF_INET:
 	{
 		struct in_addr ia;
 		struct ifaddr *ifa;
 		int found;
 
 		found = 0;
 		/*
 		 * Try to find an address on the given outgoing interface
 		 * that belongs to the jail.
 		 */
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			struct sockaddr *sa;
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			/* Copy the address out; it is used after the lock is dropped. */
 			ia = ((struct sockaddr_in *)sa)->sin_addr;
 			if (prison_check_ip4(cred, &ia) == 0) {
 				found = 1;
 				break;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		if (!found) {
 			/*
 			 * As a last resort return the 'default' jail address.
 			 */
 			ia = ((struct sockaddr_in *)rt->rt_ifa->ifa_addr)->
 			    sin_addr;
 			if (prison_get_ip4(cred, &ia) != 0)
 				return (ESRCH);
 		}
 		/* Build the reply sockaddr in caller-provided storage. */
 		bzero(&saun->sin, sizeof(struct sockaddr_in));
 		saun->sin.sin_len = sizeof(struct sockaddr_in);
 		saun->sin.sin_family = AF_INET;
 		saun->sin.sin_addr.s_addr = ia.s_addr;
 		info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin;
 		break;
 	}
 #endif
 #ifdef INET6
 	case AF_INET6:
 	{
 		struct in6_addr ia6;
 		struct ifaddr *ifa;
 		int found;
 
 		found = 0;
 		/*
 		 * Try to find an address on the given outgoing interface
 		 * that belongs to the jail.
 		 */
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			struct sockaddr *sa;
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET6)
 				continue;
 			/* Copy the address out; it is used after the lock is dropped. */
 			bcopy(&((struct sockaddr_in6 *)sa)->sin6_addr,
 			    &ia6, sizeof(struct in6_addr));
 			if (prison_check_ip6(cred, &ia6) == 0) {
 				found = 1;
 				break;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		if (!found) {
 			/*
 			 * As a last resort return the 'default' jail address.
 			 */
 			ia6 = ((struct sockaddr_in6 *)rt->rt_ifa->ifa_addr)->
 			    sin6_addr;
 			if (prison_get_ip6(cred, &ia6) != 0)
 				return (ESRCH);
 		}
 		/* Build the reply sockaddr in caller-provided storage. */
 		bzero(&saun->sin6, sizeof(struct sockaddr_in6));
 		saun->sin6.sin6_len = sizeof(struct sockaddr_in6);
 		saun->sin6.sin6_family = AF_INET6;
 		bcopy(&ia6, &saun->sin6.sin6_addr, sizeof(struct in6_addr));
 		/* A failed scope recovery is reported as ESRCH. */
 		if (sa6_recoverscope(&saun->sin6) != 0)
 			return (ESRCH);
 		info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin6;
 		break;
 	}
 #endif
 	default:
 		return (ESRCH);
 	}
 	return (0);
 }
 
 /*ARGSUSED*/
 static int
 route_output(struct mbuf *m, struct socket *so, ...)
 {
 	struct rt_msghdr *rtm = NULL;
 	struct rtentry *rt = NULL;
 	struct rib_head *rh;
 	struct rt_addrinfo info;
 	struct sockaddr_storage ss;
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 	int i, rti_need_deembed = 0;
 #endif
 	int alloc_len = 0, len, error = 0, fibnum;
 	struct ifnet *ifp = NULL;
 	union sockaddr_union saun;
 	sa_family_t saf = AF_UNSPEC;
 	struct rawcb *rp = NULL;
 	struct walkarg w;
 
 	fibnum = so->so_fibnum;
 
 #define senderr(e) { error = e; goto flush;}
 	if (m == NULL || ((m->m_len < sizeof(long)) &&
 		       (m = m_pullup(m, sizeof(long))) == NULL))
 		return (ENOBUFS);
 	if ((m->m_flags & M_PKTHDR) == 0)
 		panic("route_output");
 	len = m->m_pkthdr.len;
 	if (len < sizeof(*rtm) ||
 	    len != mtod(m, struct rt_msghdr *)->rtm_msglen)
 		senderr(EINVAL);
 
 	/*
 	 * Most of current messages are in range 200-240 bytes,
 	 * minimize possible re-allocation on reply using larger size
 	 * buffer aligned on 1k boundaty.
 	 */
 	alloc_len = roundup2(len, 1024);
 	if ((rtm = malloc(alloc_len, M_TEMP, M_NOWAIT)) == NULL)
 		senderr(ENOBUFS);
 
 	m_copydata(m, 0, len, (caddr_t)rtm);
 	bzero(&info, sizeof(info));
 	bzero(&w, sizeof(w));
 
 	if (rtm->rtm_version != RTM_VERSION) {
 		/* Do not touch message since format is unknown */
 		free(rtm, M_TEMP);
 		rtm = NULL;
 		senderr(EPROTONOSUPPORT);
 	}
 
 	/*
 	 * Starting from here, it is possible
 	 * to alter original message and insert
 	 * caller PID and error value.
 	 */
 
 	rtm->rtm_pid = curproc->p_pid;
 	info.rti_addrs = rtm->rtm_addrs;
 
 	info.rti_mflags = rtm->rtm_inits;
 	info.rti_rmx = &rtm->rtm_rmx;
 
 	/*
 	 * rt_xaddrs() performs s6_addr[2] := sin6_scope_id for AF_INET6
 	 * link-local address because rtrequest requires addresses with
 	 * embedded scope id.
 	 */
 	if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info))
 		senderr(EINVAL);
 
 	info.rti_flags = rtm->rtm_flags;
 	if (info.rti_info[RTAX_DST] == NULL ||
 	    info.rti_info[RTAX_DST]->sa_family >= AF_MAX ||
 	    (info.rti_info[RTAX_GATEWAY] != NULL &&
 	     info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX))
 		senderr(EINVAL);
 	saf = info.rti_info[RTAX_DST]->sa_family;
 	/*
 	 * Verify that the caller has the appropriate privilege; RTM_GET
 	 * is the only operation the non-superuser is allowed.
 	 */
 	if (rtm->rtm_type != RTM_GET) {
 		error = priv_check(curthread, PRIV_NET_ROUTE);
 		if (error)
 			senderr(error);
 	}
 
 	/*
 	 * The given gateway address may be an interface address.
 	 * For example, issuing a "route change" command on a route
 	 * entry that was created from a tunnel, and the gateway
 	 * address given is the local end point. In this case the 
 	 * RTF_GATEWAY flag must be cleared or the destination will
 	 * not be reachable even though there is no error message.
 	 */
 	if (info.rti_info[RTAX_GATEWAY] != NULL &&
 	    info.rti_info[RTAX_GATEWAY]->sa_family != AF_LINK) {
 		struct rtentry *rt;
 		/* XXX-ME: Is this enough? */
 		struct sockaddr dst;
 
 		bzero(&dst, sizeof(dst));
 		dst = *info.rti_info[RTAX_GATEWAY];
 		rt = rtalloc1_fib(&dst, 0, 0, fibnum);
 		/* 
 		 * A host route through the loopback interface is 
 		 * installed for each interface adddress. In pre 8.0
 		 * releases the interface address of a PPP link type
 		 * is not reachable locally. This behavior is fixed as 
 		 * part of the new L2/L3 redesign and rewrite work. The
 		 * signature of this interface address route is the
 		 * AF_LINK sa_family type of the rt_gateway, and the
 		 * rt_ifp has the IFF_LOOPBACK flag set.
 		 */
 		if (rt != NULL && rt->rt_gateway->sa_family == AF_LINK &&
 		    rt->rt_ifp->if_flags & IFF_LOOPBACK) {
 			info.rti_flags &= ~RTF_GATEWAY;
 			info.rti_flags |= RTF_GWFLAG_COMPAT;
 		}
 		if (rt != NULL)
 			RTFREE_LOCKED(rt);
 	}
 
 	switch (rtm->rtm_type) {
 		struct rtentry *saved_nrt;
 
 	case RTM_ADD:
 	case RTM_CHANGE:
 		if (info.rti_info[RTAX_GATEWAY] == NULL)
 			senderr(EINVAL);
 		saved_nrt = NULL;
 
 		/* support for new ARP code */
 		if (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK &&
 		    (rtm->rtm_flags & RTF_LLDATA) != 0) {
 			error = lla_rt_output(rtm, &info);
 #ifdef INET6
 			if (error == 0)
 				rti_need_deembed = (V_deembed_scopeid) ? 1 : 0;
 #endif
 			break;
 		}
 		error = rtrequest1_fib(rtm->rtm_type, &info, &saved_nrt,
 		    fibnum);
 		if (error == 0 && saved_nrt != NULL) {
 #ifdef INET6
 			rti_need_deembed = (V_deembed_scopeid) ? 1 : 0;
 #endif
 			RT_LOCK(saved_nrt);
 			rtm->rtm_index = saved_nrt->rt_ifp->if_index;
 			RT_REMREF(saved_nrt);
 			RT_UNLOCK(saved_nrt);
 		}
 		break;
 
 	case RTM_DELETE:
 		saved_nrt = NULL;
 		/* support for new ARP code */
 		if (info.rti_info[RTAX_GATEWAY] && 
 		    (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK) &&
 		    (rtm->rtm_flags & RTF_LLDATA) != 0) {
 			error = lla_rt_output(rtm, &info);
 #ifdef INET6
 			if (error == 0)
 				rti_need_deembed = (V_deembed_scopeid) ? 1 : 0;
 #endif
 			break;
 		}
 		error = rtrequest1_fib(RTM_DELETE, &info, &saved_nrt, fibnum);
 		if (error == 0) {
 			RT_LOCK(saved_nrt);
 			rt = saved_nrt;
 			goto report;
 		}
 #ifdef INET6
 		/* rt_msg2() will not be used when RTM_DELETE fails. */
 		rti_need_deembed = (V_deembed_scopeid) ? 1 : 0;
 #endif
 		break;
 
 	case RTM_GET:
 		rh = rt_tables_get_rnh(fibnum, saf);
 		if (rh == NULL)
 			senderr(EAFNOSUPPORT);
 
-		RIB_RLOCK(rh);
+		RIB_CFG_RLOCK(rh);
 
 		if (info.rti_info[RTAX_NETMASK] == NULL &&
 		    rtm->rtm_type == RTM_GET) {
 			/*
 			 * Provide logest prefix match for
 			 * address lookup (no mask).
 			 * 'route -n get addr'
 			 */
 			rt = (struct rtentry *) rh->rnh_matchaddr(
 			    info.rti_info[RTAX_DST], &rh->head);
 		} else
 			rt = (struct rtentry *) rh->rnh_lookup(
 			    info.rti_info[RTAX_DST],
 			    info.rti_info[RTAX_NETMASK], &rh->head);
 
 		if (rt == NULL) {
-			RIB_RUNLOCK(rh);
+			RIB_CFG_RUNLOCK(rh);
 			senderr(ESRCH);
 		}
 #ifdef RADIX_MPATH
 		/*
 		 * for RTM_CHANGE/LOCK, if we got multipath routes,
 		 * we require users to specify a matching RTAX_GATEWAY.
 		 *
 		 * for RTM_GET, gate is optional even with multipath.
 		 * if gate == NULL the first match is returned.
 		 * (no need to call rt_mpath_matchgate if gate == NULL)
 		 */
 		if (rn_mpath_capable(rh) &&
 		    (rtm->rtm_type != RTM_GET || info.rti_info[RTAX_GATEWAY])) {
 			rt = rt_mpath_matchgate(rt, info.rti_info[RTAX_GATEWAY]);
 			if (!rt) {
-				RIB_RUNLOCK(rh);
+				RIB_CFG_RUNLOCK(rh);
 				senderr(ESRCH);
 			}
 		}
 #endif
 		/*
 		 * If performing proxied L2 entry insertion, and
 		 * the actual PPP host entry is found, perform
 		 * another search to retrieve the prefix route of
 		 * the local end point of the PPP link.
 		 */
 		if (rtm->rtm_flags & RTF_ANNOUNCE) {
 			struct sockaddr laddr;
 
 			if (rt->rt_ifp != NULL && 
 			    rt->rt_ifp->if_type == IFT_PROPVIRTUAL) {
 				struct ifaddr *ifa;
 
 				ifa = ifa_ifwithnet(info.rti_info[RTAX_DST], 1,
 						RT_ALL_FIBS);
 				if (ifa != NULL)
 					rt_maskedcopy(ifa->ifa_addr,
 						      &laddr,
 						      ifa->ifa_netmask);
 			} else
 				rt_maskedcopy(rt->rt_ifa->ifa_addr,
 					      &laddr,
 					      rt->rt_ifa->ifa_netmask);
 			/* 
 			 * refactor rt and no lock operation necessary
 			 */
 			rt = (struct rtentry *)rh->rnh_matchaddr(&laddr, &rh->head);
 			if (rt == NULL) {
-				RIB_RUNLOCK(rh);
+				RIB_CFG_RUNLOCK(rh);
 				senderr(ESRCH);
 			}
 		} 
 		RT_LOCK(rt);
 		RT_ADDREF(rt);
-		RIB_RUNLOCK(rh);
+		RIB_CFG_RUNLOCK(rh);
 
 report:
 		RT_LOCK_ASSERT(rt);
 		if ((rt->rt_flags & RTF_HOST) == 0
 		    ? jailed_without_vnet(curthread->td_ucred)
 		    : prison_if(curthread->td_ucred,
 		    rt_key(rt)) != 0) {
 			RT_UNLOCK(rt);
 			senderr(ESRCH);
 		}
 		info.rti_info[RTAX_DST] = rt_key(rt);
 		info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
 		info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt),
 		    rt_mask(rt), &ss);
 		info.rti_info[RTAX_GENMASK] = 0;
 		if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) {
 			ifp = rt->rt_ifp;
 			if (ifp) {
 				info.rti_info[RTAX_IFP] =
 				    ifp->if_addr->ifa_addr;
 				error = rtm_get_jailed(&info, ifp, rt,
 				    &saun, curthread->td_ucred);
 				if (error != 0) {
 					RT_UNLOCK(rt);
 					senderr(error);
 				}
 				if (ifp->if_flags & IFF_POINTOPOINT)
 					info.rti_info[RTAX_BRD] =
 					    rt->rt_ifa->ifa_dstaddr;
 				rtm->rtm_index = ifp->if_index;
 			} else {
 				info.rti_info[RTAX_IFP] = NULL;
 				info.rti_info[RTAX_IFA] = NULL;
 			}
 		} else if ((ifp = rt->rt_ifp) != NULL) {
 			rtm->rtm_index = ifp->if_index;
 		}
 
 		/* Check if we need to realloc storage */
 		rtsock_msg_buffer(rtm->rtm_type, &info, NULL, &len);
 		if (len > alloc_len) {
 			struct rt_msghdr *new_rtm;
 			new_rtm = malloc(len, M_TEMP, M_NOWAIT);
 			if (new_rtm == NULL) {
 				RT_UNLOCK(rt);
 				senderr(ENOBUFS);
 			}
 			bcopy(rtm, new_rtm, rtm->rtm_msglen);
 			free(rtm, M_TEMP);
 			rtm = new_rtm;
 			alloc_len = len;
 		}
 
 		w.w_tmem = (caddr_t)rtm;
 		w.w_tmemsize = alloc_len;
 		rtsock_msg_buffer(rtm->rtm_type, &info, &w, &len);
 
 		if (rt->rt_flags & RTF_GWFLAG_COMPAT)
 			rtm->rtm_flags = RTF_GATEWAY | 
 				(rt->rt_flags & ~RTF_GWFLAG_COMPAT);
 		else
 			rtm->rtm_flags = rt->rt_flags;
 		rt_getmetrics(rt, &rtm->rtm_rmx);
 		rtm->rtm_addrs = info.rti_addrs;
 
 		RT_UNLOCK(rt);
 		break;
 
 	default:
 		senderr(EOPNOTSUPP);
 	}
 
 flush:
 	if (rt != NULL)
 		RTFREE(rt);
 	/*
 	 * Check to see if we don't want our own messages.
 	 */
 	if ((so->so_options & SO_USELOOPBACK) == 0) {
 		if (V_route_cb.any_count <= 1) {
 			if (rtm != NULL)
 				free(rtm, M_TEMP);
 			m_freem(m);
 			return (error);
 		}
 		/* There is another listener, so construct message */
 		rp = sotorawcb(so);
 	}
 
 	if (rtm != NULL) {
 #ifdef INET6
 		if (rti_need_deembed) {
 			/* sin6_scope_id is recovered before sending rtm. */
 			sin6 = (struct sockaddr_in6 *)&ss;
 			for (i = 0; i < RTAX_MAX; i++) {
 				if (info.rti_info[i] == NULL)
 					continue;
 				if (info.rti_info[i]->sa_family != AF_INET6)
 					continue;
 				bcopy(info.rti_info[i], sin6, sizeof(*sin6));
 				if (sa6_recoverscope(sin6) == 0)
 					bcopy(sin6, info.rti_info[i],
 						    sizeof(*sin6));
 			}
 		}
 #endif
 		if (error != 0)
 			rtm->rtm_errno = error;
 		else
 			rtm->rtm_flags |= RTF_DONE;
 
 		m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm);
 		if (m->m_pkthdr.len < rtm->rtm_msglen) {
 			m_freem(m);
 			m = NULL;
 		} else if (m->m_pkthdr.len > rtm->rtm_msglen)
 			m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len);
 
 		free(rtm, M_TEMP);
 	}
 	if (m != NULL) {
 		M_SETFIB(m, fibnum);
 		m->m_flags |= RTS_FILTER_FIB;
 		if (rp) {
 			/*
 			 * XXX insure we don't get a copy by
 			 * invalidating our protocol
 			 */
 			unsigned short family = rp->rcb_proto.sp_family;
 			rp->rcb_proto.sp_family = 0;
 			rt_dispatch(m, saf);
 			rp->rcb_proto.sp_family = family;
 		} else
 			rt_dispatch(m, saf);
 	}
 
 	return (error);
 }
 
 /*
  * Fills @out from route @rt: the structure is zeroed, then mtu and weight
  * are copied and a non-zero expiry is converted from the kernel's uptime
  * timebase to wall-clock time. A zero expiry stays zero.
  */
 static void
 rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out)
 {
 
 	bzero(out, sizeof(*out));
 	out->rmx_weight = rt->rt_weight;
 	out->rmx_mtu = rt->rt_mtu;
 	/* Kernel -> userland timebase conversion. */
 	if (rt->rt_expire)
 		out->rmx_expire = rt->rt_expire - time_uptime + time_second;
 	else
 		out->rmx_expire = 0;
 }
 
 /*
  * Extract the addresses of the passed sockaddrs.
  * Do a little sanity checking so as to avoid bad memory references.
  * This data is derived straight from userland.
  *
  * Walks the buffer [@cp, @cplim) and, for every bit set in
  * @rtinfo->rti_addrs, records a pointer to the corresponding sockaddr in
  * @rtinfo->rti_info[]. AF_INET6 addresses get their scope embedded in
  * place. Returns 0 on success and EINVAL when a sockaddr does not fit
  * into the buffer.
  */
 static int
 rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo)
 {
 	struct sockaddr *sa;
 	int i;
 
 	for (i = 0; i < RTAX_MAX && cp < cplim; i++) {
 		if ((rtinfo->rti_addrs & (1 << i)) == 0)
 			continue;
 		sa = (struct sockaddr *)cp;
 		/*
 		 * It won't fit.
 		 */
 		if (cp + sa->sa_len > cplim)
 			return (EINVAL);
 		/*
 		 * there are no more.. quit now
 		 * If there are more bits, they are in error.
 		 * I've seen this. route(1) can evidently generate these.
 		 * This causes kernel to core dump.
 		 * for compatibility, If we see this, point to a safe address.
 		 */
 		if (sa->sa_len == 0) {
 			rtinfo->rti_info[i] = &sa_zero;
 			return (0); /* should be EINVAL but for compat */
 		}
 		/* accept it */
 #ifdef INET6
 		/*
 		 * Only touch the scope of a complete sockaddr_in6: a shorter
 		 * AF_INET6 sockaddr does not contain sin6_scope_id, so
 		 * embedding would access bytes outside of it.
 		 */
 		if (sa->sa_family == AF_INET6 &&
 		    sa->sa_len >= sizeof(struct sockaddr_in6))
 			sa6_embedscope((struct sockaddr_in6 *)sa,
 			    V_ip6_use_defzone);
 #endif
 		rtinfo->rti_info[i] = sa;
 		cp += SA_SIZE(sa);
 	}
 	return (0);
 }
 
 /*
  * Builds in @dmask a netmask whose length and family are taken from @dst
  * and whose contents are copied from @smask; @smask itself is not
  * modified. Mostly used with radix netmasks.
  * Returns @dmask viewed as a sockaddr, or NULL if @dst or @smask is NULL.
  */
 static struct sockaddr *
 rtsock_fix_netmask(struct sockaddr *dst, struct sockaddr *smask,
     struct sockaddr_storage *dmask)
 {
 
 	if (dst == NULL)
 		return (NULL);
 	if (smask == NULL)
 		return (NULL);
 
 	/* Clear the part that will be reported, then overlay the mask. */
 	memset(dmask, 0, dst->sa_len);
 	memcpy(dmask, smask, smask->sa_len);
 
 	/* Length and family always come from the destination. */
 	dmask->ss_family = dst->sa_family;
 	dmask->ss_len = dst->sa_len;
 
 	return ((struct sockaddr *)dmask);
 }
 
 /*
  * Writes information related to @rtinfo object to newly-allocated mbuf.
  * Assumes MCLBYTES is enough to construct any message.
  * Used for OS notifications of various events (if/ifa announces, etc)
  *
  * The header size is chosen from @type; every non-NULL sockaddr in
  * @rtinfo->rti_info[] is then appended and its bit is set in
  * @rtinfo->rti_addrs. Only rtm_msglen, rtm_version and rtm_type of the
  * header are filled in here; the rest of the header is left zeroed.
  *
  * Returns allocated mbuf or NULL on failure.
  */
 static struct mbuf *
 rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo)
 {
 	struct rt_msghdr *rtm;
 	struct mbuf *m;
 	int i;
 	struct sockaddr *sa;
 #ifdef INET6
 	struct sockaddr_storage ss;
 	struct sockaddr_in6 *sin6;
 #endif
 	int len, dlen;
 
 	/* Pick the fixed header size for this message type. */
 	switch (type) {
 
 	case RTM_DELADDR:
 	case RTM_NEWADDR:
 		len = sizeof(struct ifa_msghdr);
 		break;
 
 	case RTM_DELMADDR:
 	case RTM_NEWMADDR:
 		len = sizeof(struct ifma_msghdr);
 		break;
 
 	case RTM_IFINFO:
 		len = sizeof(struct if_msghdr);
 		break;
 
 	case RTM_IFANNOUNCE:
 	case RTM_IEEE80211:
 		len = sizeof(struct if_announcemsghdr);
 		break;
 
 	default:
 		len = sizeof(struct rt_msghdr);
 	}
 
 	/* XXXGL: can we use MJUMPAGESIZE cluster here? */
 	KASSERT(len <= MCLBYTES, ("%s: message too big", __func__));
 	if (len > MHLEN)
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 	else
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (m);
 
 	m->m_pkthdr.len = m->m_len = len;
 	rtm = mtod(m, struct rt_msghdr *);
 	bzero((caddr_t)rtm, len);
 	/* Append each present sockaddr after the header. */
 	for (i = 0; i < RTAX_MAX; i++) {
 		if ((sa = rtinfo->rti_info[i]) == NULL)
 			continue;
 		rtinfo->rti_addrs |= (1 << i);
 		dlen = SA_SIZE(sa);
 #ifdef INET6
 		/* Work on a copy so the caller's sockaddr is left untouched. */
 		if (V_deembed_scopeid && sa->sa_family == AF_INET6) {
 			sin6 = (struct sockaddr_in6 *)&ss;
 			bcopy(sa, sin6, sizeof(*sin6));
 			if (sa6_recoverscope(sin6) == 0)
 				sa = (struct sockaddr *)sin6;
 		}
 #endif
 		m_copyback(m, len, dlen, (caddr_t)sa);
 		len += dlen;
 	}
 	/* A length mismatch means m_copyback() could not store everything. */
 	if (m->m_pkthdr.len != len) {
 		m_freem(m);
 		return (NULL);
 	}
 	rtm->rtm_msglen = len;
 	rtm->rtm_version = RTM_VERSION;
 	rtm->rtm_type = type;
 	return (m);
 }
 
 /*
  * Writes information related to @rtinfo object to preallocated buffer.
  * Stores needed size in @plen. If @w is NULL, calculates size without
  * writing.
  * Used for sysctl dumps and rtsock answers (RTM_DEL/RTM_GET) generation.
  *
  * The buffer is w->w_tmem (w->w_tmemsize bytes).  @rtinfo->rti_addrs is
  * reset and rebuilt from the non-NULL entries of rti_info[].  The size
  * stored in @plen is always the full, ALIGN()ed message size, even when
  * the buffer turns out to be too small.
  *
  * Returns 0 on success, ENOBUFS when @w is given but its buffer cannot
  * hold the message.
  *
  */
 static int
 rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen)
 {
 	int i;
 	int len, buflen = 0, dlen;
 	caddr_t cp = NULL;
 	struct rt_msghdr *rtm = NULL;
 #ifdef INET6
 	struct sockaddr_storage ss;
 	struct sockaddr_in6 *sin6;
 #endif
 
 	/*
 	 * Select the fixed header size.  It depends on the message type,
 	 * on the sysctl operation (the NET_RT_IFLISTL headers differ) and,
 	 * with COMPAT_FREEBSD32, on whether the request is a 32-bit one.
 	 */
 	switch (type) {
 
 	case RTM_DELADDR:
 	case RTM_NEWADDR:
 		if (w != NULL && w->w_op == NET_RT_IFLISTL) {
 #ifdef COMPAT_FREEBSD32
 			if (w->w_req->flags & SCTL_MASK32)
 				len = sizeof(struct ifa_msghdrl32);
 			else
 #endif
 				len = sizeof(struct ifa_msghdrl);
 		} else
 			len = sizeof(struct ifa_msghdr);
 		break;
 
 	case RTM_IFINFO:
 #ifdef COMPAT_FREEBSD32
 		if (w != NULL && w->w_req->flags & SCTL_MASK32) {
 			if (w->w_op == NET_RT_IFLISTL)
 				len = sizeof(struct if_msghdrl32);
 			else
 				len = sizeof(struct if_msghdr32);
 			break;
 		}
 #endif
 		if (w != NULL && w->w_op == NET_RT_IFLISTL)
 			len = sizeof(struct if_msghdrl);
 		else
 			len = sizeof(struct if_msghdr);
 		break;
 
 	case RTM_NEWMADDR:
 		len = sizeof(struct ifma_msghdr);
 		break;
 
 	default:
 		len = sizeof(struct rt_msghdr);
 	}
 
 	/*
 	 * With a walkarg, cp points just past the header inside the
 	 * buffer and buflen is the space left after it.  cp doubles as
 	 * the "still writing" flag: it is set to NULL once space runs out.
 	 */
 	if (w != NULL) {
 		rtm = (struct rt_msghdr *)w->w_tmem;
 		buflen = w->w_tmemsize - len;
 		cp = (caddr_t)w->w_tmem + len;
 	}
 
 	rtinfo->rti_addrs = 0;
 	for (i = 0; i < RTAX_MAX; i++) {
 		struct sockaddr *sa;
 
 		if ((sa = rtinfo->rti_info[i]) == NULL)
 			continue;
 		rtinfo->rti_addrs |= (1 << i);
 		dlen = SA_SIZE(sa);
 		if (cp != NULL && buflen >= dlen) {
 #ifdef INET6
 			/*
 			 * Recover the scope on a local copy; the copy is
 			 * used only if sa6_recoverscope() succeeds.
 			 */
 			if (V_deembed_scopeid && sa->sa_family == AF_INET6) {
 				sin6 = (struct sockaddr_in6 *)&ss;
 				bcopy(sa, sin6, sizeof(*sin6));
 				if (sa6_recoverscope(sin6) == 0)
 					sa = (struct sockaddr *)sin6;
 			}
 #endif
 			bcopy((caddr_t)sa, cp, (unsigned)dlen);
 			cp += dlen;
 			buflen -= dlen;
 		} else if (cp != NULL) {
 			/*
 			 * Buffer too small. Count needed size
 			 * and return with error.
 			 */
 			cp = NULL;
 		}
 
 		/* The needed size is accumulated whether or not we wrote. */
 		len += dlen;
 	}
 
 	/* Account for the padding that ALIGN() adds to the total. */
 	if (cp != NULL) {
 		dlen = ALIGN(len) - len;
 		if (buflen < dlen)
 			cp = NULL;
 		else
 			buflen -= dlen;
 	}
 	len = ALIGN(len);
 
 	if (cp != NULL) {
 		/* fill header iff buffer is large enough */
 		rtm->rtm_version = RTM_VERSION;
 		rtm->rtm_type = type;
 		rtm->rtm_msglen = len;
 	}
 
 	*plen = len;
 
 	/* A buffer was supplied but could not hold the message. */
 	if (w != NULL && cp == NULL)
 		return (ENOBUFS);
 
 	return (0);
 }
 
 /*
  * This routine is called to generate a message from the routing
  * socket indicating that a redirect has occured, a routing lookup
  * has failed, or that a protocol has detected timeouts to a particular
  * destination.
  */
 void
 rt_missmsg_fib(int type, struct rt_addrinfo *rtinfo, int flags, int error,
     int fibnum)
 {
 	struct rt_msghdr *rtm;
 	struct mbuf *m;
 	struct sockaddr *sa = rtinfo->rti_info[RTAX_DST];
 
 	if (V_route_cb.any_count == 0)
 		return;
 	m = rtsock_msg_mbuf(type, rtinfo);
 	if (m == NULL)
 		return;
 
 	if (fibnum != RT_ALL_FIBS) {
 		KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: fibnum out "
 		    "of range 0 <= %d < %d", __func__, fibnum, rt_numfibs));
 		M_SETFIB(m, fibnum);
 		m->m_flags |= RTS_FILTER_FIB;
 	}
 
 	rtm = mtod(m, struct rt_msghdr *);
 	rtm->rtm_flags = RTF_DONE | flags;
 	rtm->rtm_errno = error;
 	rtm->rtm_addrs = rtinfo->rti_addrs;
 	rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC);
 }
 
 /*
  * Convenience wrapper around rt_missmsg_fib() that passes RT_ALL_FIBS
  * as the FIB number.
  */
 void
 rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error)
 {
 
 	rt_missmsg_fib(type, rtinfo, flags, error, RT_ALL_FIBS);
 }
 
 /*
  * Generate an RTM_IFINFO routing socket message announcing that the
  * status of network interface @ifp has changed.  The message carries no
  * sockaddrs, only the interface index, flags and statistics.
  */
 void
 rt_ifmsg(struct ifnet *ifp)
 {
 	struct rt_addrinfo info;
 	struct if_msghdr *hdr;
 	struct mbuf *m;
 
 	if (V_route_cb.any_count == 0)
 		return;
 
 	memset(&info, 0, sizeof(info));
 	if ((m = rtsock_msg_mbuf(RTM_IFINFO, &info)) == NULL)
 		return;
 
 	hdr = mtod(m, struct if_msghdr *);
 	hdr->ifm_index = ifp->if_index;
 	hdr->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 	if_data_copy(ifp, &hdr->ifm_data);
 	hdr->ifm_addrs = 0;
 
 	rt_dispatch(m, AF_UNSPEC);
 }
 
 /*
  * Announce interface address arrival/withdraw.
  * Please do not call directly, use rt_addrmsg().
  * Assume input data to be valid.
  *
  * @cmd is RTM_ADD for a new address; any other value is announced as
  * RTM_DELADDR.  When @fibnum is not RT_ALL_FIBS the message is tagged
  * with that FIB and marked RTS_FILTER_FIB.
  *
  * Returns 0 on success (including the case where nobody listens),
  * ENOBUFS if the message mbuf cannot be allocated.
  */
 int
 rtsock_addrmsg(int cmd, struct ifaddr *ifa, int fibnum)
 {
 	struct rt_addrinfo info;
 	struct sockaddr *sa;
 	int ncmd;
 	struct mbuf *m;
 	struct ifa_msghdr *ifam;
 	struct ifnet *ifp = ifa->ifa_ifp;
 	struct sockaddr_storage ss;
 
 	if (V_route_cb.any_count == 0)
 		return (0);
 
 	ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR;
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr;
 	info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr;
 	/*
 	 * The netmask belongs to the announced address, so take its
 	 * length and family from RTAX_IFA (as sysctl_iflist() does with
 	 * ifa->ifa_addr), not from the interface's own address in RTAX_IFP.
 	 */
 	info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(
 	    info.rti_info[RTAX_IFA], ifa->ifa_netmask, &ss);
 	info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
 	if ((m = rtsock_msg_mbuf(ncmd, &info)) == NULL)
 		return (ENOBUFS);
 	ifam = mtod(m, struct ifa_msghdr *);
 	ifam->ifam_index = ifp->if_index;
 	ifam->ifam_metric = ifa->ifa_ifp->if_metric;
 	ifam->ifam_flags = ifa->ifa_flags;
 	ifam->ifam_addrs = info.rti_addrs;
 
 	if (fibnum != RT_ALL_FIBS) {
 		M_SETFIB(m, fibnum);
 		m->m_flags |= RTS_FILTER_FIB;
 	}
 
 	rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC);
 
 	return (0);
 }
 
 /*
  * Announce route addition/removal.
  * Please do not call directly, use rt_routemsg().
  * Note that @rt data MAY be inconsistent/invalid:
  * if some userland app sends us "invalid" route message (invalid mask,
  * no dst, wrong address families, etc...) we need to pass it back
  * to app (and any other rtsock consumers) with rtm_errno field set to
  * non-zero value.
  *
  * Returns 0 on success.
  */
 int
 rtsock_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt,
     int fibnum)
 {
 	struct rt_addrinfo info;
 	struct sockaddr *sa;
 	struct mbuf *m;
 	struct rt_msghdr *rtm;
 	struct sockaddr_storage ss;
 
 	if (V_route_cb.any_count == 0)
 		return (0);
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_DST] = sa = rt_key(rt);
 	info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(sa, rt_mask(rt), &ss);
 	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
 	if ((m = rtsock_msg_mbuf(cmd, &info)) == NULL)
 		return (ENOBUFS);
 	rtm = mtod(m, struct rt_msghdr *);
 	rtm->rtm_index = ifp->if_index;
 	rtm->rtm_flags |= rt->rt_flags;
 	rtm->rtm_errno = error;
 	rtm->rtm_addrs = info.rti_addrs;
 
 	if (fibnum != RT_ALL_FIBS) {
 		M_SETFIB(m, fibnum);
 		m->m_flags |= RTS_FILTER_FIB;
 	}
 
 	rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC);
 
 	return (0);
 }
 
 /*
  * This is the analogue to the rt_newaddrmsg which performs the same
  * function but for multicast group memberships.  This is easier since
  * there is no route state to worry about.
  *
  * @cmd is passed through as the message type.  Nothing is sent when no
  * routing socket is open or when the message mbuf cannot be allocated.
  */
 void
 rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma)
 {
 	struct rt_addrinfo info;
 	struct mbuf *m = NULL;
 	struct ifnet *ifp = ifma->ifma_ifp;
 	struct ifma_msghdr *ifmam;
 
 	if (V_route_cb.any_count == 0)
 		return;
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_IFA] = ifma->ifma_addr;
 	info.rti_info[RTAX_IFP] = ifp ? ifp->if_addr->ifa_addr : NULL;
 	/*
 	 * If a link-layer address is present, present it as a ``gateway''
 	 * (similarly to how ARP entries, e.g., are presented).
 	 */
 	info.rti_info[RTAX_GATEWAY] = ifma->ifma_lladdr;
 	m = rtsock_msg_mbuf(cmd, &info);
 	if (m == NULL)
 		return;
 	ifmam = mtod(m, struct ifma_msghdr *);
 	/*
 	 * NOTE(review): ifp is tolerated as NULL above but dereferenced
 	 * below; the KASSERT is the only guard -- confirm that callers
 	 * never pass an ifma without an ifp.
 	 */
 	KASSERT(ifp != NULL, ("%s: link-layer multicast address w/o ifp\n",
 	    __func__));
 	ifmam->ifmam_index = ifp->if_index;
 	ifmam->ifmam_addrs = info.rti_addrs;
 	rt_dispatch(m, ifma->ifma_addr ? ifma->ifma_addr->sa_family : AF_UNSPEC);
 }
 
 static struct mbuf *
 rt_makeifannouncemsg(struct ifnet *ifp, int type, int what,
 	struct rt_addrinfo *info)
 {
 	struct if_announcemsghdr *ifan;
 	struct mbuf *m;
 
 	if (V_route_cb.any_count == 0)
 		return NULL;
 	bzero((caddr_t)info, sizeof(*info));
 	m = rtsock_msg_mbuf(type, info);
 	if (m != NULL) {
 		ifan = mtod(m, struct if_announcemsghdr *);
 		ifan->ifan_index = ifp->if_index;
 		strlcpy(ifan->ifan_name, ifp->if_xname,
 			sizeof(ifan->ifan_name));
 		ifan->ifan_what = what;
 	}
 	return m;
 }
 
 /*
  * This is called to generate routing socket messages indicating
  * IEEE80211 wireless events.
  * XXX we piggyback on the RTM_IFANNOUNCE msg format in a clumsy way.
  *
  * @data / @data_len describe a payload that is appended after the
  * announce header.  The message is silently dropped when an mbuf cannot
  * be allocated or when the payload does not fit into one additional
  * mbuf.
  */
 void
 rt_ieee80211msg(struct ifnet *ifp, int what, void *data, size_t data_len)
 {
 	struct mbuf *m;
 	struct rt_addrinfo info;
 
 	m = rt_makeifannouncemsg(ifp, RTM_IEEE80211, what, &info);
 	if (m == NULL)
 		return;
 
 	/*
 	 * Append the ieee80211 data.  Try to stick it in the
 	 * mbuf containing the ifannounce msg; otherwise allocate
 	 * a new mbuf and append.
 	 *
 	 * NB: we assume m is a single mbuf.
 	 */
 	if (data_len > M_TRAILINGSPACE(m)) {
 		struct mbuf *n = m_get(M_NOWAIT, MT_DATA);
 
 		if (n == NULL) {
 			m_freem(m);
 			return;
 		}
 		/*
 		 * Never copy more than the new mbuf can hold; an
 		 * oversized payload would overflow its data area.
 		 */
 		if (data_len > M_TRAILINGSPACE(n)) {
 			m_freem(n);
 			m_freem(m);
 			return;
 		}
 		bcopy(data, mtod(n, void *), data_len);
 		n->m_len = data_len;
 		m->m_next = n;
 	} else if (data_len > 0) {
 		bcopy(data, mtod(m, u_int8_t *) + m->m_len, data_len);
 		m->m_len += data_len;
 	}
 	/* Keep the packet header and message length in sync with the data. */
 	if (m->m_flags & M_PKTHDR)
 		m->m_pkthdr.len += data_len;
 	mtod(m, struct if_announcemsghdr *)->ifan_msglen += data_len;
 	rt_dispatch(m, AF_UNSPEC);
 }
 
 /*
  * Generate an RTM_IFANNOUNCE routing socket message indicating network
  * interface arrival or departure; @what is stored in ifan_what.
  */
 void
 rt_ifannouncemsg(struct ifnet *ifp, int what)
 {
 	struct rt_addrinfo info;
 	struct mbuf *m;
 
 	m = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info);
 	if (m == NULL)
 		return;
 
 	rt_dispatch(m, AF_UNSPEC);
 }
 
 /*
  * Queue message @m to the NETISR_ROUTE netisr.  @saf is the address
  * family to record for the message, or AF_UNSPEC for none.
  * The mbuf is consumed on every path: it is either queued or freed.
  */
 static void
 rt_dispatch(struct mbuf *m, sa_family_t saf)
 {
 	struct m_tag *tag;
 
 	/*
 	 * Preserve the family from the sockaddr, if any, in an m_tag for
 	 * use when injecting the mbuf into the routing socket buffer from
 	 * the netisr.
 	 */
 	if (saf != AF_UNSPEC) {
 		tag = m_tag_get(PACKET_TAG_RTSOCKFAM, sizeof(unsigned short),
 		    M_NOWAIT);
 		if (tag == NULL) {
 			m_freem(m);
 			return;
 		}
 		/* The family is stored right after the tag header. */
 		*(unsigned short *)(tag + 1) = saf;
 		m_tag_prepend(m, tag);
 	}
 #ifdef VIMAGE
 	/* Without a loopback interface in this vnet the message is dropped. */
 	if (V_loif)
 		m->m_pkthdr.rcvif = V_loif;
 	else {
 		m_freem(m);
 		return;
 	}
 #endif
 	netisr_queue(NETISR_ROUTE, m);	/* mbuf is free'd on failure. */
 }
 
 /*
  * This is used in dumping the kernel table via sysctl().
  *
  * Tree-walk callback: @rn is the route entry, @vw the struct walkarg.
  * Builds an RTM_GET message for the route in w->w_tmem and copies it out
  * with SYSCTL_OUT().  Entries filtered out (by flags or by jail
  * visibility) are skipped by returning 0.
  *
  * Returns 0 or the error from rtsock_msg_buffer()/SYSCTL_OUT().
  */
 static int
 sysctl_dumpentry(struct radix_node *rn, void *vw)
 {
 	struct walkarg *w = vw;
 	struct rtentry *rt = (struct rtentry *)rn;
 	int error = 0, size;
 	struct rt_addrinfo info;
 	struct sockaddr_storage ss;
 
 	/* NET_RT_FLAGS: only report routes having one of the w_arg flags. */
 	if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg))
 		return 0;
 	/*
 	 * Jail filtering: non-host routes are skipped when
 	 * jailed_without_vnet() is true for the requester; host routes
 	 * are skipped when prison_if() rejects the destination.
 	 */
 	if ((rt->rt_flags & RTF_HOST) == 0
 	    ? jailed_without_vnet(w->w_req->td->td_ucred)
 	    : prison_if(w->w_req->td->td_ucred, rt_key(rt)) != 0)
 		return (0);
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_DST] = rt_key(rt);
 	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
 	info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt),
 	    rt_mask(rt), &ss);
 	info.rti_info[RTAX_GENMASK] = 0;
 	if (rt->rt_ifp) {
 		info.rti_info[RTAX_IFP] = rt->rt_ifp->if_addr->ifa_addr;
 		info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
 		if (rt->rt_ifp->if_flags & IFF_POINTOPOINT)
 			info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr;
 	}
 	if ((error = rtsock_msg_buffer(RTM_GET, &info, w, &size)) != 0)
 		return (error);
 	if (w->w_req && w->w_tmem) {
 		struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem;
 
 		/* Report RTF_GWFLAG_COMPAT routes as plain RTF_GATEWAY ones. */
 		if (rt->rt_flags & RTF_GWFLAG_COMPAT)
 			rtm->rtm_flags = RTF_GATEWAY | 
 				(rt->rt_flags & ~RTF_GWFLAG_COMPAT);
 		else
 			rtm->rtm_flags = rt->rt_flags;
 		rt_getmetrics(rt, &rtm->rtm_rmx);
 		/*
 		 * NOTE(review): rt_ifp is tested for NULL above but
 		 * dereferenced unconditionally here -- confirm it cannot be
 		 * NULL for entries reachable by this walk.
 		 */
 		rtm->rtm_index = rt->rt_ifp->if_index;
 		rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0;
 		rtm->rtm_addrs = info.rti_addrs;
 		error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size);
 		return (error);
 	}
 	return (error);
 }
 
 /*
  * Complete the if_msghdrl (or, for 32-bit requests under
  * COMPAT_FREEBSD32, the if_msghdrl32) at the start of w->w_tmem for
  * @ifp and copy @len bytes of the buffer out to the sysctl request.
  *
  * Returns the result of SYSCTL_OUT().
  */
 static int
 sysctl_iflist_ifml(struct ifnet *ifp, struct rt_addrinfo *info,
     struct walkarg *w, int len)
 {
 	struct if_msghdrl *ifm;
 	struct if_data *ifd;
 
 	ifm = (struct if_msghdrl *)w->w_tmem;
 
 #ifdef COMPAT_FREEBSD32
 	if (w->w_req->flags & SCTL_MASK32) {
 		struct if_msghdrl32 *ifm32;
 
 		/* Same buffer, viewed through the 32-bit header layout. */
 		ifm32 = (struct if_msghdrl32 *)ifm;
 		ifm32->ifm_addrs = info->rti_addrs;
 		ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm32->ifm_index = ifp->if_index;
 		ifm32->_ifm_spare1 = 0;
 		ifm32->ifm_len = sizeof(*ifm32);
 		ifm32->ifm_data_off = offsetof(struct if_msghdrl32, ifm_data);
 		ifd = &ifm32->ifm_data;
 	} else
 #endif
 	{
 		ifm->ifm_addrs = info->rti_addrs;
 		ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm->ifm_index = ifp->if_index;
 		ifm->_ifm_spare1 = 0;
 		ifm->ifm_len = sizeof(*ifm);
 		ifm->ifm_data_off = offsetof(struct if_msghdrl, ifm_data);
 		ifd = &ifm->ifm_data;
 	}
 
 	/* ifd points at whichever header variant was filled in above. */
 	if_data_copy(ifp, ifd);
 
 	return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len));
 }
 
 /*
  * Complete the if_msghdr (or, for 32-bit requests under
  * COMPAT_FREEBSD32, the if_msghdr32) at the start of w->w_tmem for @ifp
  * and copy @len bytes of the buffer out to the sysctl request.
  *
  * Returns the result of SYSCTL_OUT().
  */
 static int
 sysctl_iflist_ifm(struct ifnet *ifp, struct rt_addrinfo *info,
     struct walkarg *w, int len)
 {
 	struct if_msghdr *ifm;
 	struct if_data *ifd;
 
 	ifm = (struct if_msghdr *)w->w_tmem;
 
 #ifdef COMPAT_FREEBSD32
 	if (w->w_req->flags & SCTL_MASK32) {
 		struct if_msghdr32 *ifm32;
 
 		/* Same buffer, viewed through the 32-bit header layout. */
 		ifm32 = (struct if_msghdr32 *)ifm;
 		ifm32->ifm_addrs = info->rti_addrs;
 		ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm32->ifm_index = ifp->if_index;
 		ifd = &ifm32->ifm_data;
 	} else
 #endif
 	{
 		ifm->ifm_addrs = info->rti_addrs;
 		ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm->ifm_index = ifp->if_index;
 		ifd = &ifm->ifm_data;
 	}
 
 	/* ifd points at whichever header variant was filled in above. */
 	if_data_copy(ifp, ifd);
 
 	return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len));
 }
 
 /*
  * Complete the ifa_msghdrl (or, for 32-bit requests under
  * COMPAT_FREEBSD32, the ifa_msghdrl32) at the start of w->w_tmem for
  * address @ifa, fill its if_data with the per-address counters, and
  * copy @len bytes of the buffer out to the sysctl request.
  *
  * Returns the result of SYSCTL_OUT().
  */
 static int
 sysctl_iflist_ifaml(struct ifaddr *ifa, struct rt_addrinfo *info,
     struct walkarg *w, int len)
 {
 	struct ifa_msghdrl *ifam;
 	struct if_data *ifd;
 
 	ifam = (struct ifa_msghdrl *)w->w_tmem;
 
 #ifdef COMPAT_FREEBSD32
 	if (w->w_req->flags & SCTL_MASK32) {
 		struct ifa_msghdrl32 *ifam32;
 
 		/* Same buffer, viewed through the 32-bit header layout. */
 		ifam32 = (struct ifa_msghdrl32 *)ifam;
 		ifam32->ifam_addrs = info->rti_addrs;
 		ifam32->ifam_flags = ifa->ifa_flags;
 		ifam32->ifam_index = ifa->ifa_ifp->if_index;
 		ifam32->_ifam_spare1 = 0;
 		ifam32->ifam_len = sizeof(*ifam32);
 		ifam32->ifam_data_off =
 		    offsetof(struct ifa_msghdrl32, ifam_data);
 		ifam32->ifam_metric = ifa->ifa_ifp->if_metric;
 		ifd = &ifam32->ifam_data;
 	} else
 #endif
 	{
 		ifam->ifam_addrs = info->rti_addrs;
 		ifam->ifam_flags = ifa->ifa_flags;
 		ifam->ifam_index = ifa->ifa_ifp->if_index;
 		ifam->_ifam_spare1 = 0;
 		ifam->ifam_len = sizeof(*ifam);
 		ifam->ifam_data_off = offsetof(struct ifa_msghdrl, ifam_data);
 		ifam->ifam_metric = ifa->ifa_ifp->if_metric;
 		ifd = &ifam->ifam_data;
 	}
 
 	/* Only the four per-address counters (and vhid) are reported. */
 	bzero(ifd, sizeof(*ifd));
 	ifd->ifi_datalen = sizeof(struct if_data);
 	ifd->ifi_ipackets = counter_u64_fetch(ifa->ifa_ipackets);
 	ifd->ifi_opackets = counter_u64_fetch(ifa->ifa_opackets);
 	ifd->ifi_ibytes = counter_u64_fetch(ifa->ifa_ibytes);
 	ifd->ifi_obytes = counter_u64_fetch(ifa->ifa_obytes);
 
 	/* Fixup if_data carp(4) vhid. */
 	if (carp_get_vhid_p != NULL)
 		ifd->ifi_vhid = (*carp_get_vhid_p)(ifa);
 
 	return (SYSCTL_OUT(w->w_req, w->w_tmem, len));
 }
 
 static int
 sysctl_iflist_ifam(struct ifaddr *ifa, struct rt_addrinfo *info,
     struct walkarg *w, int len)
 {
 	struct ifa_msghdr *ifam;
 
 	ifam = (struct ifa_msghdr *)w->w_tmem;
 	ifam->ifam_addrs = info->rti_addrs;
 	ifam->ifam_flags = ifa->ifa_flags;
 	ifam->ifam_index = ifa->ifa_ifp->if_index;
 	ifam->ifam_metric = ifa->ifa_ifp->if_metric;
 
 	return (SYSCTL_OUT(w->w_req, w->w_tmem, len));
 }
 
 /*
  * Dump the interface list for the NET_RT_IFLIST / NET_RT_IFLISTL
  * sysctls: for every interface (or only the one whose index equals
  * w->w_arg when that is non-zero) emit one RTM_IFINFO message followed
  * by one RTM_NEWADDR message per address after the first one on the
  * interface's list.  Addresses are filtered by @af (0 matches all) and
  * by prison_if().
  *
  * Returns 0 or the first error from rtsock_msg_buffer()/SYSCTL_OUT().
  */
 static int
 sysctl_iflist(int af, struct walkarg *w)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct rt_addrinfo info;
 	int len, error = 0;
 	struct sockaddr_storage ss;
 
 	bzero((caddr_t)&info, sizeof(info));
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (w->w_arg && w->w_arg != ifp->if_index)
 			continue;
 		IF_ADDR_RLOCK(ifp);
 		/* The RTM_IFINFO message carries if_addr as RTAX_IFP. */
 		ifa = ifp->if_addr;
 		info.rti_info[RTAX_IFP] = ifa->ifa_addr;
 		error = rtsock_msg_buffer(RTM_IFINFO, &info, w, &len);
 		if (error != 0)
 			goto done;
 		info.rti_info[RTAX_IFP] = NULL;
 		if (w->w_req && w->w_tmem) {
 			if (w->w_op == NET_RT_IFLISTL)
 				error = sysctl_iflist_ifml(ifp, &info, w, len);
 			else
 				error = sysctl_iflist_ifm(ifp, &info, w, len);
 			if (error)
 				goto done;
 		}
 		/* Walk the addresses that follow if_addr on the list. */
 		while ((ifa = TAILQ_NEXT(ifa, ifa_link)) != NULL) {
 			if (af && af != ifa->ifa_addr->sa_family)
 				continue;
 			if (prison_if(w->w_req->td->td_ucred,
 			    ifa->ifa_addr) != 0)
 				continue;
 			info.rti_info[RTAX_IFA] = ifa->ifa_addr;
 			info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(
 			    ifa->ifa_addr, ifa->ifa_netmask, &ss);
 			info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
 			error = rtsock_msg_buffer(RTM_NEWADDR, &info, w, &len);
 			if (error != 0)
 				goto done;
 			if (w->w_req && w->w_tmem) {
 				if (w->w_op == NET_RT_IFLISTL)
 					error = sysctl_iflist_ifaml(ifa, &info,
 					    w, len);
 				else
 					error = sysctl_iflist_ifam(ifa, &info,
 					    w, len);
 				if (error)
 					goto done;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		/* Reset the per-address slots before the next interface. */
 		info.rti_info[RTAX_IFA] = NULL;
 		info.rti_info[RTAX_NETMASK] = NULL;
 		info.rti_info[RTAX_BRD] = NULL;
 	}
 done:
 	/*
 	 * ifp is non-NULL only when the loop was left through "goto done",
 	 * i.e. with that interface's address lock still held.
 	 */
 	if (ifp != NULL)
 		IF_ADDR_RUNLOCK(ifp);
 	IFNET_RUNLOCK_NOSLEEP();
 	return (error);
 }
 
 /*
  * Dump multicast memberships for the NET_RT_IFMALIST sysctl: for every
  * interface (or only the one whose index equals w->w_arg when that is
  * non-zero) emit one RTM_NEWMADDR message per multicast address.
  * Addresses are filtered by @af (0 matches all) and by prison_if().
  *
  * Every exit path releases the per-interface address lock before the
  * interface list lock is dropped.
  *
  * Returns 0 or the first error from rtsock_msg_buffer()/SYSCTL_OUT().
  */
 static int
 sysctl_ifmalist(int af, struct walkarg *w)
 {
 	struct ifnet *ifp;
 	struct ifmultiaddr *ifma;
 	struct	rt_addrinfo info;
 	int	len, error = 0;
 	struct ifaddr *ifa;
 
 	bzero((caddr_t)&info, sizeof(info));
 	IFNET_RLOCK_NOSLEEP();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (w->w_arg && w->w_arg != ifp->if_index)
 			continue;
 		ifa = ifp->if_addr;
 		info.rti_info[RTAX_IFP] = ifa ? ifa->ifa_addr : NULL;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (af && af != ifma->ifma_addr->sa_family)
 				continue;
 			if (prison_if(w->w_req->td->td_ucred,
 			    ifma->ifma_addr) != 0)
 				continue;
 			info.rti_info[RTAX_IFA] = ifma->ifma_addr;
 			/* Report the link-layer address as the "gateway". */
 			info.rti_info[RTAX_GATEWAY] =
 			    (ifma->ifma_addr->sa_family != AF_LINK) ?
 			    ifma->ifma_lladdr : NULL;
 			error = rtsock_msg_buffer(RTM_NEWMADDR, &info, w, &len);
 			if (error != 0)
 				break;
 			if (w->w_req && w->w_tmem) {
 				struct ifma_msghdr *ifmam;
 
 				ifmam = (struct ifma_msghdr *)w->w_tmem;
 				ifmam->ifmam_index = ifma->ifma_ifp->if_index;
 				ifmam->ifmam_flags = 0;
 				ifmam->ifmam_addrs = info.rti_addrs;
 				error = SYSCTL_OUT(w->w_req, w->w_tmem, len);
 				if (error != 0)
 					break;
 			}
 		}
 		/* Single unlock point for both the normal and error exits. */
 		IF_ADDR_RUNLOCK(ifp);
 		if (error != 0)
 			break;
 	}
 	IFNET_RUNLOCK_NOSLEEP();
 	return (error);
 }
 
 /*
  * Sysctl handler for the routing-table node.  After the leading name
  * component is skipped, the remaining components are interpreted as
  * name[0] = address family, name[1] = operation (NET_RT_*),
  * name[2] = operation argument and, for NET_RT_DUMP only, an optional
  * name[3] = FIB number.  The node is read-only: a request with a new
  * value gets EPERM.
  *
  * Messages are assembled in a temporary 64KB buffer that is freed
  * before returning.
  */
 static int
 sysctl_rtsock(SYSCTL_HANDLER_ARGS)
 {
 	int	*name = (int *)arg1;
 	u_int	namelen = arg2;
 	struct rib_head *rh = NULL; /* silence compiler. */
 	int	i, lim, error = EINVAL;
 	int	fib = 0;
 	u_char	af;
 	struct	walkarg w;
 
 	name ++;
 	namelen--;
 	if (req->newptr)
 		return (EPERM);
 	/*
 	 * NOTE(review): name[1] is read here before namelen has been
 	 * validated -- confirm the sysctl layer guarantees enough
 	 * components.
 	 */
 	if (name[1] == NET_RT_DUMP) {
 		/* Without an explicit FIB, use the calling process' one. */
 		if (namelen == 3)
 			fib = req->td->td_proc->p_fibnum;
 		else if (namelen == 4)
 			fib = (name[3] == RT_ALL_FIBS) ?
 			    req->td->td_proc->p_fibnum : name[3];
 		else
 			return ((namelen < 3) ? EISDIR : ENOTDIR);
 		if (fib < 0 || fib >= rt_numfibs)
 			return (EINVAL);
 	} else if (namelen != 3)
 		return ((namelen < 3) ? EISDIR : ENOTDIR);
 	/* name[0] is narrowed to u_char before the range check. */
 	af = name[0];
 	if (af > AF_MAX)
 		return (EINVAL);
 	bzero(&w, sizeof(w));
 	w.w_op = name[1];
 	w.w_arg = name[2];
 	w.w_req = req;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error)
 		return (error);
 	
 	/*
 	 * Allocate reply buffer in advance.
 	 * All rtsock messages has maximum length of u_short.
 	 */
 	w.w_tmemsize = 65536;
 	w.w_tmem = malloc(w.w_tmemsize, M_TEMP, M_WAITOK);
 
 	switch (w.w_op) {
 
 	case NET_RT_DUMP:
 	case NET_RT_FLAGS:
 		if (af == 0) {			/* dump all tables */
 			i = 1;
 			lim = AF_MAX;
 		} else				/* dump only one table */
 			i = lim = af;
 
 		/*
 		 * take care of llinfo entries, the caller must
 		 * specify an AF
 		 */
 		if (w.w_op == NET_RT_FLAGS &&
 		    (w.w_arg == 0 || w.w_arg & RTF_LLINFO)) {
 			if (af != 0)
 				error = lltable_sysctl_dumparp(af, w.w_req);
 			else
 				error = EINVAL;
 			break;
 		}
 		/*
 		 * take care of routing entries
 		 */
 		for (error = 0; error == 0 && i <= lim; i++) {
 			rh = rt_tables_get_rnh(fib, i);
 			if (rh != NULL) {
-				RIB_RLOCK(rh); 
+				RIB_CFG_RLOCK(rh); 
 			    	error = rh->rnh_walktree(&rh->head,
 				    sysctl_dumpentry, &w);
-				RIB_RUNLOCK(rh);
+				RIB_CFG_RUNLOCK(rh);
 			} else if (af != 0)
 				error = EAFNOSUPPORT;
 		}
 		break;
 
 	case NET_RT_IFLIST:
 	case NET_RT_IFLISTL:
 		error = sysctl_iflist(af, &w);
 		break;
 
 	case NET_RT_IFMALIST:
 		error = sysctl_ifmalist(af, &w);
 		break;
 	}
 
 	/* An unrecognised w_op falls through with the wire result (0). */
 	free(w.w_tmem, M_TEMP);
 	return (error);
 }
 
 /*
  * Read-only sysctl node under _net, numbered PF_ROUTE and served by
  * sysctl_rtsock().
  */
 static SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD, sysctl_rtsock, "");
 
 /*
  * Definitions of protocols supported in the ROUTE domain.
  */
 
 static struct domain routedomain;		/* or at least forward */
 
 /* The single protocol of the domain: a raw, atomic routing socket. */
 static struct protosw routesw[] = {
 {
 	.pr_type =		SOCK_RAW,
 	.pr_domain =		&routedomain,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_output =		route_output,
 	.pr_ctlinput =		raw_ctlinput,
 	.pr_init =		raw_init,
 	.pr_usrreqs =		&route_usrreqs
 }
 };
 
 /* dom_protoswNPROTOSW points one past the last routesw[] entry. */
 static struct domain routedomain = {
 	.dom_family =		PF_ROUTE,
 	.dom_name =		 "route",
 	.dom_protosw =		routesw,
 	.dom_protoswNPROTOSW =	&routesw[sizeof(routesw)/sizeof(routesw[0])]
 };
 
 VNET_DOMAIN_SET(route);
Index: projects/routing/sys/netinet/in_rmx.c
===================================================================
--- projects/routing/sys/netinet/in_rmx.c	(revision 274335)
+++ projects/routing/sys/netinet/in_rmx.c	(revision 274336)
@@ -1,434 +1,439 @@
 /*-
  * Copyright 1994, 1995 Massachusetts Institute of Technology
  *
  * Permission to use, copy, modify, and distribute this software and
  * its documentation for any purpose and without fee is hereby
  * granted, provided that both the above copyright notice and this
  * permission notice appear in all copies, that both the above
  * copyright notice and this permission notice appear in all
  * supporting documentation, and that the name of M.I.T. not be used
  * in advertising or publicity pertaining to distribution of the
  * software without specific, written prior permission.  M.I.T. makes
  * no representations about the suitability of this software for any
  * purpose.  It is provided "as is" without express or implied
  * warranty.
  *
  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
  * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/rmlock.h>
 #include <sys/sysctl.h>
 #include <sys/socket.h>
 #include <sys/mbuf.h>
 #include <sys/syslog.h>
 #include <sys/callout.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/route_internal.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_var.h>
 
 extern int	in_inithead(void **head, int off);
 #ifdef VIMAGE
 extern int	in_detachhead(void **head, int off);
 #endif
 
 static void in_setifarnh(struct rib_head *rh, uint32_t fibnum,
     int af, void *_arg);
 static void in_rtqtimo_setrnh(struct rib_head *rh, uint32_t fibnum,
     int af, void *_arg);
 
 #define RTPRF_OURS		RTF_PROTO3	/* set on routes we manage */
 
 /*
  * Do what we need to do when inserting a route.
  */
 static struct radix_node *
 in_addroute(void *v_arg, void *n_arg, struct radix_head *head,
     struct radix_node *treenodes)
 {
 	struct rtentry *rt = (struct rtentry *)treenodes;
 	struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt);
 
 	/*
 	 * A little bit of help for both IP output and input:
 	 *   For host routes, we make sure that RTF_BROADCAST
 	 *   is set for anything that looks like a broadcast address.
 	 *   This way, we can avoid an expensive call to in_broadcast()
 	 *   in ip_output() most of the time (because the route passed
 	 *   to ip_output() is almost always a host route).
 	 *
 	 *   We also do the same for local addresses, with the thought
 	 *   that this might one day be used to speed up ip_input().
 	 *
 	 * We also mark routes to multicast addresses as such, because
 	 * it's easy to do and might be useful (but this is much more
 	 * dubious since it's so easy to inspect the address).
 	 */
 	if (rt->rt_flags & RTF_HOST) {
 		if (in_broadcast(sin->sin_addr, rt->rt_ifp)) {
 			rt->rt_flags |= RTF_BROADCAST;
 		} else if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr ==
 		    sin->sin_addr.s_addr) {
 			rt->rt_flags |= RTF_LOCAL;
 		}
 	}
 	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
 		rt->rt_flags |= RTF_MULTICAST;
 
 	if (rt->rt_ifp != NULL) {
 
 		/*
 		 * Check route MTU:
 		 * inherit interface MTU if not set or
 		 * check if MTU is too large.
 		 */
 		if (rt->rt_mtu == 0) {
 			rt->rt_mtu = rt->rt_ifp->if_mtu;
 		} else if (rt->rt_mtu > rt->rt_ifp->if_mtu)
 			rt->rt_mtu = rt->rt_ifp->if_mtu;
 	}
 
 	return (rn_addroute(v_arg, n_arg, head, treenodes));
 }
 
 /*
  * This code is the inverse of in_clsroute: on first reference, if we
  * were managing the route, stop doing so and set the expiration timer
  * back off again.
  */
 static struct radix_node *
 in_matroute(void *v_arg, struct radix_head *head)
 {
 	struct radix_node *rn = rn_match(v_arg, head);
 	struct rtentry *rt = (struct rtentry *)rn;
 
 	if (rt) {
 		RT_LOCK(rt);
 		if (rt->rt_flags & RTPRF_OURS) {
 			rt->rt_flags &= ~RTPRF_OURS;
 			rt->rt_expire = 0;
 		}
 		RT_UNLOCK(rt);
 	}
 	return rn;
 }
 
 static VNET_DEFINE(int, rtq_reallyold) = 60*60; /* one hour is "really old" */
 #define	V_rtq_reallyold		VNET(rtq_reallyold)
 SYSCTL_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(rtq_reallyold), 0,
     "Default expiration time on dynamically learned routes");
 
 /*
  * On last reference drop, mark the route as belonging to us so that it
  * can be timed out.  Only routes that are up, dynamic and not already
  * marked are considered.  The caller must hold the route lock.
  */
 static void
 in_clsroute(struct radix_node *rn, struct radix_head *head)
 {
 	struct rib_head *rh = (struct rib_head *)head;
 	struct rtentry *rt = (struct rtentry *)rn;
 
 	RT_LOCK_ASSERT(rt);
 
 	/* Not up (prophylactic), already ours, or not dynamic: ignore. */
 	if ((rt->rt_flags & RTF_UP) == 0 ||
 	    (rt->rt_flags & RTPRF_OURS) != 0 ||
 	    (rt->rt_flags & RTF_DYNAMIC) == 0)
 		return;
 
 	/*
 	 * If rtq_reallyold is 0, just delete the route without
 	 * waiting for a timeout cycle to kill it.
 	 */
 	if (V_rtq_reallyold == 0) {
 		rt_expunge(rh, rt);
 		return;
 	}
 
 	rt->rt_flags |= RTPRF_OURS;
 	rt->rt_expire = time_uptime + V_rtq_reallyold;
 }
 
 /* Walk state shared between in_rtqtimo_setrnh() and in_rtqkill(). */
 struct rtqk_arg {
 	struct rib_head *rh;	/* table being walked, set per table */
 	int draining;		/* non-zero: delete even unexpired routes */
 	int killed;		/* routes successfully deleted */
 	int found;		/* RTPRF_OURS routes seen */
 };
 
 /*
  * Get rid of old routes.  When draining, this deletes everything, even when
  * the timeout is not expired yet.
  *
  * Per-route callback; @rock is a struct rtqk_arg.  Only routes marked
  * RTPRF_OURS are considered.  A deletion failure is logged and does not
  * stop the walk: the function always returns 0.
  */
 static int
 in_rtqkill(struct rtentry *rt, void *rock)
 {
 	struct rtqk_arg *ap = rock;
 	int err;
 
-	RIB_WLOCK_ASSERT(ap->rh);
+	//RIB_WLOCK_ASSERT(ap->rh);
 
 	if (rt->rt_flags & RTPRF_OURS) {
 		ap->found++;
 
 		if (ap->draining || rt->rt_expire <= time_uptime) {
 			/* A managed route must have no references left. */
 			if (rt->rt_refcnt > 0)
 				panic("rtqkill route really not free");
 
 			/* RTF_RNH_LOCKED is added to the route's own flags. */
 			err = in_rtrequest(RTM_DELETE,
 					(struct sockaddr *)rt_key(rt),
 					rt->rt_gateway, rt_mask(rt),
 					rt->rt_flags | RTF_RNH_LOCKED, 0,
 					rt->rt_fibnum);
 			if (err != 0) {
 				log(LOG_WARNING, "in_rtqkill: error %d\n", err);
 			} else
 				ap->killed++;
 		}
 	}
 
 	return 0;
 }
 
 #define RTQ_TIMEOUT	(60*10)	/* run no less than once every ten minutes */
 static VNET_DEFINE(int, rtq_timeout) = RTQ_TIMEOUT;
 static VNET_DEFINE(struct callout, rtq_timer);
 
 #define	V_rtq_timeout		VNET(rtq_timeout)
 #define	V_rtq_timer		VNET(rtq_timer)
 
 /*
  * Per-table setup callback used with rt_foreach_fib(): reset the
  * counters of the struct rtqk_arg passed as @_arg and point it at
  * @rh, while preserving the caller's draining flag.
  */
 static void
 in_rtqtimo_setrnh(struct rib_head *rh, uint32_t fibnum, int af,
     void *_arg)
 {
 	struct rtqk_arg *arg;
 	int draining;
 
 	arg = (struct rtqk_arg *)_arg;
 
 	/* Save the flag across the memset() that clears the counters. */
 	draining = arg->draining;
 	memset(arg, 0, sizeof(*arg));
 	arg->rh = rh;
 	arg->draining = draining;
 }
 
 /*
  * Callout handler: run in_rtqkill() over the AF_INET tables (via
  * rt_foreach_fib()) in the vnet passed as @rock, then re-arm the
  * callout to fire again in V_rtq_timeout seconds.
  */
 static void
 in_rtqtimo(void *rock)
 {
 	CURVNET_SET((struct vnet *) rock);
 	struct rtqk_arg arg;
 	struct timeval atv;
 
 	/* draining stays 0 here: only expired routes are removed. */
 	memset(&arg, 0, sizeof(arg));
 	rt_foreach_fib(AF_INET, in_rtqtimo_setrnh, in_rtqkill, &arg);
 
 	atv.tv_usec = 0;
 	atv.tv_sec = V_rtq_timeout;
 	callout_reset(&V_rtq_timer, tvtohz(&atv), in_rtqtimo, rock);
 	CURVNET_RESTORE();
 }
 
 /*
  * Run in_rtqkill() over the AF_INET tables of every vnet with the
  * draining flag set, so that managed routes are meant to be removed
  * regardless of their expiration time.
  */
 void
 in_rtqdrain(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 	struct rtqk_arg arg;
 
 	memset(&arg, 0, sizeof(arg));
 	arg.draining = 1;
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 
 		rt_foreach_fib(AF_INET, in_rtqtimo_setrnh, in_rtqkill, &arg);
 
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Select the lookup function of table @rh: plain rn_match() when @val
  * is non-zero, in_matroute() otherwise.  The pointer is switched with
  * the table's locks held for writing.
  */
 void
 in_setmatchfunc(struct rib_head *rh, int val)
 {
 
+	RIB_CFG_WLOCK(rh);
 	RIB_WLOCK(rh);
 	rh->rnh_matchaddr = (val != 0) ?  rn_match : in_matroute;
 	RIB_WUNLOCK(rh);
+	RIB_CFG_WUNLOCK(rh);
 }
 
 /* Set once the expiration callout has been started by in_inithead(). */
 static int _in_rt_was_here;
 /*
  * Initialize our routing tree.
  *
  * Creates a table with rt_table_init(32), installs the IPv4 add, match
  * and close hooks, and stores the table in *@head.  @off is unused.
  * The first call also initializes and starts the route expiration
  * callout.
  *
  * Returns 1 on success, 0 if the table cannot be created.
  */
 int
 in_inithead(void **head, int off)
 {
 	struct rib_head *rh;
 
 	rh = rt_table_init(32);
 	if (rh == NULL)
 		return (0);
 
 	rh->rnh_addaddr = in_addroute;
 	in_setmatchfunc(rh, V_drop_redirect);
 	rh->rnh_close = in_clsroute;
 	*head = (void *)rh;
 
 	/*
 	 * NOTE(review): the guard is a plain file-scope static while the
 	 * timer is a VNET variable, so presumably only the first vnet to
 	 * get here has its callout initialized -- confirm.
 	 */
 	if (_in_rt_was_here == 0 ) {
 		callout_init(&V_rtq_timer, CALLOUT_MPSAFE);
 		callout_reset(&V_rtq_timer, 1, in_rtqtimo, curvnet);
 		_in_rt_was_here = 1;
 	}
 	return 1;
 }
 
 #ifdef VIMAGE
 /*
  * VIMAGE teardown counterpart of in_inithead(): drain the route
  * expiration callout.  Both arguments are unused; always returns 1.
  */
 int
 in_detachhead(void **head, int off)
 {
 
 	callout_drain(&V_rtq_timer);
 	return (1);
 }
 #endif
 
 /*
  * This zaps old routes when the interface goes down or interface
  * address is deleted.  In the latter case, it deletes static routes
  * that point to this address.  If we don't do this, we may end up
  * using the old address in the future.  The ones we always want to
  * get rid of are things like ARP entries, since the user might down
  * the interface, walk over to a completely different network, and
  * plug back in.
  */
 struct in_ifadown_arg {
 	struct rib_head *rh;	/* table being walked, set by in_setifarnh() */
 	struct ifaddr *ifa;	/* address whose routes are to be removed */
 	int del;		/* non-zero: remove RTF_STATIC routes too */
 };
 
 /*
  * Per-route callback for in_ifadown(): expunge @rt when it uses the
  * address being torn down.  Static routes are spared unless the
  * address itself is being deleted.  Always returns 0.
  */
 static int
 in_ifadownkill(struct rtentry *rt, void *xap)
 {
 	struct in_ifadown_arg *ctx = xap;
 
 	RT_LOCK(rt);
 
 	/* Some other address, or a static route we must keep. */
 	if (rt->rt_ifa != ctx->ifa ||
 	    (!ctx->del && (rt->rt_flags & RTF_STATIC))) {
 		RT_UNLOCK(rt);
 		return (0);
 	}
 
 	/*
 	 * Acquire a reference so that the entry can be freed below; the
 	 * refcount would otherwise be 0 here, at least in the ctx->del
 	 * case.
 	 */
 	RT_ADDREF(rt);
 
 	/* Disconnect it from the tree and permit protocols to clean up. */
 	rt_expunge(ctx->rh, rt);
 
 	/*
 	 * At this point it is an rttrash node, and if the reference taken
 	 * above is the only one we must free it: otherwise no one would
 	 * hold a pointer and the rtentry would be leaked forever.
 	 * If someone else holds a reference we are fine, as we only
 	 * decrement the refcount.  Should that other entity call
 	 * RT_REMREF we will still be leaking, but at least we tried.
 	 */
 	RTFREE_LOCKED(rt);
 
 	return (0);
 }
 
 /*
  * Per-table setup callback for in_ifadown(): record the table about to
  * be walked in the struct in_ifadown_arg passed as @_arg.
  */
 static void
 in_setifarnh(struct rib_head *rh, uint32_t fibnum, int af,
     void *_arg)
 {
 	struct in_ifadown_arg *ctx = _arg;
 
 	ctx->rh = rh;
 }
 
 /*
  * Remove the routes that use interface address @ifa from the AF_INET
  * tables (via rt_foreach_fib()) and clear IFA_ROUTE on the address.
  * With @delete non-zero static routes are removed as well.
  * @ifa must be an AF_INET address.
  */
 void
 in_ifadown(struct ifaddr *ifa, int delete)
 {
 	struct in_ifadown_arg arg;
 
 	KASSERT(ifa->ifa_addr->sa_family == AF_INET,
 	    ("%s: wrong family", __func__));
 
 	/* arg.rh is filled in per table by in_setifarnh(). */
 	arg.ifa = ifa;
 	arg.del = delete;
 
 	rt_foreach_fib(AF_INET, in_setifarnh, in_ifadownkill, &arg);
 	ifa->ifa_flags &= ~IFA_ROUTE;		/* XXXlocking? */
 }
 
 /*
  * inet versions of rt functions.  These have fib extensions and
  * for now will just reference the _fib variants;
  * eventually this order will be reversed.
  *
  * in_rtrequest() forwards all of its arguments to rtrequest_fib() and
  * returns its result.
  */
 int
 in_rtrequest(int req, struct sockaddr *dst, struct sockaddr *gateway,
     struct sockaddr *netmask, int flags, struct rtentry **ret_nrt,
     u_int fibnum)
 {
 	int error;
 
 	error = rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt,
 	    fibnum);
 	return (error);
 }
 
 /*
  * inet wrapper that forwards all of its arguments to rtredirect_fib().
  */
 void
 in_rtredirect(struct sockaddr *dst, struct sockaddr *gateway,
     struct sockaddr *netmask, int flags, struct sockaddr *src,
     u_int fibnum)
 {
 
 	rtredirect_fib(dst, gateway, netmask, flags, src, fibnum);
 }
  
 #if 0
 int	 in_rt_getifa(struct rt_addrinfo *, u_int fibnum);
 int	 in_rtioctl(u_long, caddr_t, u_int);
 int	 in_rtrequest1(int, struct rt_addrinfo *, struct rtentry **, u_int);
 #endif
 
 
Index: projects/routing/sys/netinet6/nd6_rtr.c
===================================================================
--- projects/routing/sys/netinet6/nd6_rtr.c	(revision 274335)
+++ projects/routing/sys/netinet6/nd6_rtr.c	(revision 274336)
@@ -1,2131 +1,2134 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: nd6_rtr.c,v 1.111 2001/04/27 01:37:15 jinmei Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/errno.h>
 #include <sys/rwlock.h>
+#include <sys/rmlock.h>
 #include <sys/syslog.h>
 #include <sys/queue.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/route_internal.h>
 #include <net/radix.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <net/if_llatbl.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/in6_ifattach.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/scope6_var.h>
 
 /* Prototypes for functions private to this file. */
 static int rtpref(struct nd_defrouter *);
 static struct nd_defrouter *defrtrlist_update(struct nd_defrouter *);
 static int prelist_update(struct nd_prefixctl *, struct nd_defrouter *,
     struct mbuf *, int);
 static struct in6_ifaddr *in6_ifadd(struct nd_prefixctl *, int);
 static struct nd_pfxrouter *pfxrtr_lookup(struct nd_prefix *,
 	struct nd_defrouter *);
 static void pfxrtr_add(struct nd_prefix *, struct nd_defrouter *);
 static void pfxrtr_del(struct nd_pfxrouter *);
 static struct nd_pfxrouter *find_pfxlist_reachable_router
 (struct nd_prefix *);
 static void defrouter_delreq(struct nd_defrouter *);
 static void nd6_rtmsg(int, struct rtentry *);
 
 static int in6_init_prefix_ltimes(struct nd_prefix *);
 static void in6_init_address_ltimes(struct nd_prefix *,
 	struct in6_addrlifetime *);
 
 static int nd6_prefix_onlink(struct nd_prefix *);
 static int nd6_prefix_offlink(struct nd_prefix *);
 
 static int rt6_deleteroute(struct rtentry *, void *);
 
 /* Declared here, defined elsewhere; nd6_ra_input() copies it to recalctm. */
 VNET_DECLARE(int, nd6_recalc_reachtm_interval);
 #define	V_nd6_recalc_reachtm_interval	VNET(nd6_recalc_reachtm_interval)
 
 static VNET_DEFINE(struct ifnet *, nd6_defifp);
 VNET_DEFINE(int, nd6_defifindex);
 #define	V_nd6_defifp			VNET(nd6_defifp)
 
 VNET_DEFINE(int, ip6_use_tempaddr) = 0;
 
 /* Temporary-address tunables, read by prelist_update(). */
 VNET_DEFINE(int, ip6_desync_factor);
 VNET_DEFINE(u_int32_t, ip6_temp_preferred_lifetime) = DEF_TEMP_PREFERRED_LIFETIME;
 VNET_DEFINE(u_int32_t, ip6_temp_valid_lifetime) = DEF_TEMP_VALID_LIFETIME;
 
 VNET_DEFINE(int, ip6_temp_regen_advance) = TEMPADDR_REGEN_ADVANCE;
 
 /*
  * Router preference values returned by rtpref(); a larger value is
  * preferred when sorting the default router list.
  */
 /* RTPREF_MEDIUM has to be 0! */
 #define RTPREF_HIGH	1
 #define RTPREF_MEDIUM	0
 #define RTPREF_LOW	(-1)
 #define RTPREF_RESERVED	(-2)
 #define RTPREF_INVALID	(-3)	/* internal */
 
 /*
  * Receive Router Solicitation Message - just for routers.
  * Router solicitation/advertisement is mostly managed by userland program
  * (rtadvd) so here we have no function like nd6_ra_output().
  *
  * m        - received packet; the receiving interface is taken from
  *            m->m_pkthdr.rcvif and the IPv6 header from the start of the
  *            mbuf data.  Freed on every path that reaches the freeit/bad
  *            labels.
  * off      - offset from the IPv6 header to the RS message.
  * icmp6len - length of the ICMPv6 message; what is left after the fixed
  *            RS header is parsed as ND options.
  *
  * Packets that fail a sanity check leave through "bad", which also bumps
  * icp6s_badrs; packets that are merely ignored leave through "freeit".
  *
  * Based on RFC 2461
  */
 void
 nd6_rs_input(struct mbuf *m, int off, int icmp6len)
 {
 	struct ifnet *ifp = m->m_pkthdr.rcvif;
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	struct nd_router_solicit *nd_rs;
 	struct in6_addr saddr6 = ip6->ip6_src;
 	char *lladdr = NULL;
 	int lladdrlen = 0;
 	union nd_opts ndopts;
 	char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 
 	/*
 	 * Accept RS only when V_ip6_forwarding=1 and the interface has
 	 * no ND6_IFF_ACCEPT_RTADV.
 	 */
 	if (!V_ip6_forwarding || ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV)
 		goto freeit;
 
 	/* Sanity checks */
 	if (ip6->ip6_hlim != 255) {
 		nd6log((LOG_ERR,
 		    "nd6_rs_input: invalid hlim (%d) from %s to %s on %s\n",
 		    ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src),
 		    ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp)));
 		goto bad;
 	}
 
 	/*
 	 * Don't update the neighbor cache, if src = ::.
 	 * This indicates that the src has no IP address assigned yet.
 	 */
 	if (IN6_IS_ADDR_UNSPECIFIED(&saddr6))
 		goto freeit;
 
 	/*
 	 * NOTE(review): the early exits in this section do not call
 	 * m_freem(); presumably IP6_EXTHDR_CHECK/IP6_EXTHDR_GET have
 	 * already disposed of the mbuf when they fail - confirm.
 	 */
 #ifndef PULLDOWN_TEST
 	IP6_EXTHDR_CHECK(m, off, icmp6len,);
 	nd_rs = (struct nd_router_solicit *)((caddr_t)ip6 + off);
 #else
 	IP6_EXTHDR_GET(nd_rs, struct nd_router_solicit *, m, off, icmp6len);
 	if (nd_rs == NULL) {
 		ICMP6STAT_INC(icp6s_tooshort);
 		return;
 	}
 #endif
 
 	/* the options start right after the fixed RS header */
 	icmp6len -= sizeof(*nd_rs);
 	nd6_option_init(nd_rs + 1, icmp6len, &ndopts);
 	if (nd6_options(&ndopts) < 0) {
 		nd6log((LOG_INFO,
 		    "nd6_rs_input: invalid ND option, ignored\n"));
 		/* nd6_options have incremented stats */
 		goto freeit;
 	}
 
 	if (ndopts.nd_opts_src_lladdr) {
 		/* address bytes follow the option header; length is in 8-byte units */
 		lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1);
 		lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3;
 	}
 
 	/*
 	 * The option length must equal the interface address length plus
 	 * 2, rounded up to a multiple of 8.
 	 */
 	if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
 		nd6log((LOG_INFO,
 		    "nd6_rs_input: lladdrlen mismatch for %s "
 		    "(if %d, RS packet %d)\n",
 		    ip6_sprintf(ip6bufs, &saddr6),
 		    ifp->if_addrlen, lladdrlen - 2));
 		goto bad;
 	}
 
 	nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, ND_ROUTER_SOLICIT, 0);
 
  freeit:
 	m_freem(m);
 	return;
 
  bad:
 	ICMP6STAT_INC(icp6s_badrs);
 	m_freem(m);
 }
 
 /*
  * Receive Router Advertisement Message.
  *
  * m        - received packet; the receiving interface is taken from
  *            m->m_pkthdr.rcvif.  Freed on every path that reaches the
  *            freeit/bad labels.
  * off      - offset from the IPv6 header to the RA message.
  * icmp6len - length of the ICMPv6 message; what is left after the fixed
  *            RA header is parsed as ND options.
  *
  * Processing order: sanity checks, per-interface parameters and default
  * router list update, prefix information options, MTU option, source
  * link-layer address option.  Failed sanity checks leave through "bad",
  * which also bumps icp6s_badra.
  *
  * Based on RFC 2461
  * TODO: on-link bit on prefix information
  * TODO: ND_RA_FLAG_{OTHER,MANAGED} processing
  */
 void
 nd6_ra_input(struct mbuf *m, int off, int icmp6len)
 {
 	struct ifnet *ifp = m->m_pkthdr.rcvif;
 	struct nd_ifinfo *ndi = ND_IFINFO(ifp);
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	struct nd_router_advert *nd_ra;
 	struct in6_addr saddr6 = ip6->ip6_src;
 	int mcast = 0;
 	union nd_opts ndopts;
 	struct nd_defrouter *dr;
 	char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 
 	/*
 	 * We accept RAs only when the per-interface flag
 	 * ND6_IFF_ACCEPT_RTADV is on the receiving interface.
 	 */
 	if (!(ndi->flags & ND6_IFF_ACCEPT_RTADV))
 		goto freeit;
 
 	if (ip6->ip6_hlim != 255) {
 		nd6log((LOG_ERR,
 		    "nd6_ra_input: invalid hlim (%d) from %s to %s on %s\n",
 		    ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src),
 		    ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp)));
 		goto bad;
 	}
 
 	if (!IN6_IS_ADDR_LINKLOCAL(&saddr6)) {
 		nd6log((LOG_ERR,
 		    "nd6_ra_input: src %s is not link-local\n",
 		    ip6_sprintf(ip6bufs, &saddr6)));
 		goto bad;
 	}
 
 	/*
 	 * NOTE(review): the early exits in this section do not call
 	 * m_freem(); presumably IP6_EXTHDR_CHECK/IP6_EXTHDR_GET have
 	 * already disposed of the mbuf when they fail - confirm.
 	 */
 #ifndef PULLDOWN_TEST
 	IP6_EXTHDR_CHECK(m, off, icmp6len,);
 	nd_ra = (struct nd_router_advert *)((caddr_t)ip6 + off);
 #else
 	IP6_EXTHDR_GET(nd_ra, struct nd_router_advert *, m, off, icmp6len);
 	if (nd_ra == NULL) {
 		ICMP6STAT_INC(icp6s_tooshort);
 		return;
 	}
 #endif
 
 	/* the options start right after the fixed RA header */
 	icmp6len -= sizeof(*nd_ra);
 	nd6_option_init(nd_ra + 1, icmp6len, &ndopts);
 	if (nd6_options(&ndopts) < 0) {
 		nd6log((LOG_INFO,
 		    "nd6_ra_input: invalid ND option, ignored\n"));
 		/* nd6_options have incremented stats */
 		goto freeit;
 	}
 
     {
 	struct nd_defrouter dr0;
 	u_int32_t advreachable = nd_ra->nd_ra_reachable;
 
 	/* remember if this is a multicasted advertisement */
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
 		mcast = 1;
 
 	/* dr0 is a stack template; defrtrlist_update() copies what it needs */
 	bzero(&dr0, sizeof(dr0));
 	dr0.rtaddr = saddr6;
 	dr0.flags  = nd_ra->nd_ra_flags_reserved;
 	/*
 	 * Effectively-disable routes from RA messages when
 	 * ND6_IFF_NO_RADR enabled on the receiving interface or
 	 * (ip6.forwarding == 1 && ip6.rfc6204w3 != 1).
 	 */
 	if (ndi->flags & ND6_IFF_NO_RADR)
 		dr0.rtlifetime = 0;
 	else if (V_ip6_forwarding && !V_ip6_rfc6204w3)
 		dr0.rtlifetime = 0;
 	else
 		dr0.rtlifetime = ntohs(nd_ra->nd_ra_router_lifetime);
 	dr0.expire = time_uptime + dr0.rtlifetime;
 	dr0.ifp = ifp;
 	/* unspecified or not? (RFC 2461 6.3.4) */
 	if (advreachable) {
 		advreachable = ntohl(advreachable);
 		if (advreachable <= MAX_REACHABLE_TIME &&
 		    ndi->basereachable != advreachable) {
 			ndi->basereachable = advreachable;
 			ndi->reachable = ND_COMPUTE_RTIME(ndi->basereachable);
 			ndi->recalctm = V_nd6_recalc_reachtm_interval; /* reset */
 		}
 	}
 	/* zero in either field leaves the current interface value untouched */
 	if (nd_ra->nd_ra_retransmit)
 		ndi->retrans = ntohl(nd_ra->nd_ra_retransmit);
 	if (nd_ra->nd_ra_curhoplimit)
 		ndi->chlim = nd_ra->nd_ra_curhoplimit;
 	/* dr may be NULL (zero lifetime or allocation failure) */
 	dr = defrtrlist_update(&dr0);
     }
 
 	/*
 	 * prefix
 	 */
 	if (ndopts.nd_opts_pi) {
 		struct nd_opt_hdr *pt;
 		struct nd_opt_prefix_info *pi = NULL;
 		struct nd_prefixctl pr;
 
 		/*
 		 * Walk the options from the first to the last prefix
 		 * information option, stepping by each option's own length
 		 * and skipping options of any other type.
 		 */
 		for (pt = (struct nd_opt_hdr *)ndopts.nd_opts_pi;
 		     pt <= (struct nd_opt_hdr *)ndopts.nd_opts_pi_end;
 		     pt = (struct nd_opt_hdr *)((caddr_t)pt +
 						(pt->nd_opt_len << 3))) {
 			if (pt->nd_opt_type != ND_OPT_PREFIX_INFORMATION)
 				continue;
 			pi = (struct nd_opt_prefix_info *)pt;
 
 			if (pi->nd_opt_pi_len != 4) {
 				nd6log((LOG_INFO,
 				    "nd6_ra_input: invalid option "
 				    "len %d for prefix information option, "
 				    "ignored\n", pi->nd_opt_pi_len));
 				continue;
 			}
 
 			if (128 < pi->nd_opt_pi_prefix_len) {
 				nd6log((LOG_INFO,
 				    "nd6_ra_input: invalid prefix "
 				    "len %d for prefix information option, "
 				    "ignored\n", pi->nd_opt_pi_prefix_len));
 				continue;
 			}
 
 			if (IN6_IS_ADDR_MULTICAST(&pi->nd_opt_pi_prefix)
 			 || IN6_IS_ADDR_LINKLOCAL(&pi->nd_opt_pi_prefix)) {
 				nd6log((LOG_INFO,
 				    "nd6_ra_input: invalid prefix "
 				    "%s, ignored\n",
 				    ip6_sprintf(ip6bufs,
 					&pi->nd_opt_pi_prefix)));
 				continue;
 			}
 
 			/* build the control block handed to prelist_update() */
 			bzero(&pr, sizeof(pr));
 			pr.ndpr_prefix.sin6_family = AF_INET6;
 			pr.ndpr_prefix.sin6_len = sizeof(pr.ndpr_prefix);
 			pr.ndpr_prefix.sin6_addr = pi->nd_opt_pi_prefix;
 			pr.ndpr_ifp = (struct ifnet *)m->m_pkthdr.rcvif;
 
 			pr.ndpr_raf_onlink = (pi->nd_opt_pi_flags_reserved &
 			    ND_OPT_PI_FLAG_ONLINK) ? 1 : 0;
 			pr.ndpr_raf_auto = (pi->nd_opt_pi_flags_reserved &
 			    ND_OPT_PI_FLAG_AUTO) ? 1 : 0;
 			pr.ndpr_plen = pi->nd_opt_pi_prefix_len;
 			pr.ndpr_vltime = ntohl(pi->nd_opt_pi_valid_time);
 			pr.ndpr_pltime = ntohl(pi->nd_opt_pi_preferred_time);
 			(void)prelist_update(&pr, dr, m, mcast);
 		}
 	}
 
 	/*
 	 * MTU
 	 */
 	if (ndopts.nd_opts_mtu && ndopts.nd_opts_mtu->nd_opt_mtu_len == 1) {
 		u_long mtu;
 		u_long maxmtu;
 
 		mtu = (u_long)ntohl(ndopts.nd_opts_mtu->nd_opt_mtu_mtu);
 
 		/* lower bound */
 		if (mtu < IPV6_MMTU) {
 			nd6log((LOG_INFO, "nd6_ra_input: bogus mtu option "
 			    "mtu=%lu sent from %s, ignoring\n",
 			    mtu, ip6_sprintf(ip6bufs, &ip6->ip6_src)));
 			goto skip;
 		}
 
 		/* upper bound */
 		/* a non-zero ndi->maxmtu below if_mtu caps it, else if_mtu */
 		maxmtu = (ndi->maxmtu && ndi->maxmtu < ifp->if_mtu)
 		    ? ndi->maxmtu : ifp->if_mtu;
 		if (mtu <= maxmtu) {
 			int change = (ndi->linkmtu != mtu);
 
 			ndi->linkmtu = mtu;
 			if (change) /* in6_maxmtu may change */
 				in6_setmaxmtu();
 		} else {
 			nd6log((LOG_INFO, "nd6_ra_input: bogus mtu "
 			    "mtu=%lu sent from %s; "
 			    "exceeds maxmtu %lu, ignoring\n",
 			    mtu, ip6_sprintf(ip6bufs, &ip6->ip6_src), maxmtu));
 		}
 	}
 
  skip:
 
 	/*
 	 * Source link layer address
 	 */
     {
 	char *lladdr = NULL;
 	int lladdrlen = 0;
 
 	if (ndopts.nd_opts_src_lladdr) {
 		/* address bytes follow the option header; length is in 8-byte units */
 		lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1);
 		lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3;
 	}
 
 	/*
 	 * The option length must equal the interface address length plus
 	 * 2, rounded up to a multiple of 8.
 	 */
 	if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
 		nd6log((LOG_INFO,
 		    "nd6_ra_input: lladdrlen mismatch for %s "
 		    "(if %d, RA packet %d)\n", ip6_sprintf(ip6bufs, &saddr6),
 		    ifp->if_addrlen, lladdrlen - 2));
 		goto bad;
 	}
 
 	nd6_cache_lladdr(ifp, &saddr6, lladdr,
 	    lladdrlen, ND_ROUTER_ADVERT, 0);
 
 	/*
 	 * Installing a link-layer address might change the state of the
 	 * router's neighbor cache, which might also affect our on-link
 	 * detection of advertised prefixes.
 	 */
 	pfxlist_onlink_check();
     }
 
  freeit:
 	m_freem(m);
 	return;
 
  bad:
 	ICMP6STAT_INC(icp6s_badra);
 	m_freem(m);
 }
 
 /*
  * default router list processing subroutines
  */
 
 /*
  * Tell user processes watching the routing socket about a change to `rt'.
  *
  * cmd - routing message type handed to rt_missmsg_fib(); callers in this
  *       file pass RTM_ADD or RTM_DELETE.
  * rt  - route being reported.  Its key, gateway and mask always go into
  *       the message.  When the route has an interface, the address at the
  *       head of that interface's address list is reported as RTAX_IFP and
  *       the route's own ifa address as RTAX_IFA.
  */
 static void
 nd6_rtmsg(int cmd, struct rtentry *rt)
 {
 	struct rt_addrinfo info;
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_DST] = rt_key(rt);
 	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
 	info.rti_info[RTAX_NETMASK] = rt_mask(rt);
 	ifp = rt->rt_ifp;
 	if (ifp != NULL) {
 		IF_ADDR_RLOCK(ifp);
 		ifa = TAILQ_FIRST(&ifp->if_addrhead);
 		/*
 		 * TAILQ_FIRST() yields NULL for an empty list, so only
 		 * report and reference the head address when there is one.
 		 */
 		if (ifa != NULL) {
 			info.rti_info[RTAX_IFP] = ifa->ifa_addr;
 			/* hold the address past the unlock, until the message is sent */
 			ifa_ref(ifa);
 		}
 		IF_ADDR_RUNLOCK(ifp);
 		info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
 	} else
 		ifa = NULL;
 
 	rt_missmsg_fib(cmd, &info, rt->rt_flags, 0, rt->rt_fibnum);
 	if (ifa != NULL)
 		ifa_free(ifa);
 }
 
 static void
 defrouter_addreq(struct nd_defrouter *new)
 {
 	struct sockaddr_in6 def, mask, gate;
 	struct rtentry *newrt = NULL;
 	int error;
 
 	bzero(&def, sizeof(def));
 	bzero(&mask, sizeof(mask));
 	bzero(&gate, sizeof(gate));
 
 	def.sin6_len = mask.sin6_len = gate.sin6_len =
 	    sizeof(struct sockaddr_in6);
 	def.sin6_family = gate.sin6_family = AF_INET6;
 	gate.sin6_addr = new->rtaddr;
 
 	error = in6_rtrequest(RTM_ADD, (struct sockaddr *)&def,
 	    (struct sockaddr *)&gate, (struct sockaddr *)&mask,
 	    RTF_GATEWAY, &newrt, RT_DEFAULT_FIB);
 	if (newrt) {
 		nd6_rtmsg(RTM_ADD, newrt); /* tell user process */
 		RTFREE(newrt);
 	}
 	if (error == 0)
 		new->installed = 1;
 	return;
 }
 
 /*
  * Find the default router list entry whose interface is `ifp' and whose
  * address equals `addr'.  Returns the entry, or NULL when none matches.
  */
 struct nd_defrouter *
 defrouter_lookup(struct in6_addr *addr, struct ifnet *ifp)
 {
 	struct nd_defrouter *cur;
 
 	TAILQ_FOREACH(cur, &V_nd_defrouter, dr_entry) {
 		if (cur->ifp != ifp)
 			continue;
 		if (IN6_ARE_ADDR_EQUAL(addr, &cur->rtaddr))
 			return (cur);
 	}
 
 	/* search failed */
 	return (NULL);
 }
 
 /*
  * Remove the default route for a given router.
  * This is just a subroutine function for defrouter_select(), and should
  * not be called from anywhere else.
  */
 static void
 defrouter_delreq(struct nd_defrouter *dr)
 {
 	struct sockaddr_in6 def, mask, gate;
 	struct rtentry *oldrt = NULL;
 
 	bzero(&def, sizeof(def));
 	bzero(&mask, sizeof(mask));
 	bzero(&gate, sizeof(gate));
 
 	def.sin6_len = mask.sin6_len = gate.sin6_len =
 	    sizeof(struct sockaddr_in6);
 	def.sin6_family = gate.sin6_family = AF_INET6;
 	gate.sin6_addr = dr->rtaddr;
 
 	in6_rtrequest(RTM_DELETE, (struct sockaddr *)&def,
 	    (struct sockaddr *)&gate,
 	    (struct sockaddr *)&mask, RTF_GATEWAY, &oldrt, RT_DEFAULT_FIB);
 	if (oldrt) {
 		nd6_rtmsg(RTM_DELETE, oldrt);
 		RTFREE(oldrt);
 	}
 
 	dr->installed = 0;
 }
 
 /*
  * remove all default routes from default router list
  */
 void
 defrouter_reset(void)
 {
 	struct nd_defrouter *dr;
 
 	TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry)
 		defrouter_delreq(dr);
 
 	/*
 	 * XXX should we also nuke any default routers in the kernel, by
 	 * going through them by rtalloc1()?
 	 */
 }
 
 /*
  * Take `dr' off the default router list and free it.
  *
  * Along the way: routes using the router as next hop are flushed (only
  * when the interface accepts RAs), its default route is withdrawn if it
  * was installed, every prefix drops its reference to the router, and -
  * if it was the installed one - a replacement is selected.
  */
 void
 defrtrlist_del(struct nd_defrouter *dr)
 {
 	struct nd_pfxrouter *pfxrtr;
 	struct nd_prefix *pr;
 	int was_installed;
 
 	/*
 	 * Flush all the routing table entries that use the router
 	 * as a next hop.
 	 */
 	if (ND_IFINFO(dr->ifp)->flags & ND6_IFF_ACCEPT_RTADV)
 		rt6_flush(&dr->rtaddr, dr->ifp);
 
 	/* remember this now: defrouter_delreq() clears dr->installed */
 	was_installed = (dr->installed != 0);
 	if (was_installed)
 		defrouter_delreq(dr);
 	TAILQ_REMOVE(&V_nd_defrouter, dr, dr_entry);
 
 	/*
 	 * Also delete all the pointers to the router in each prefix lists.
 	 */
 	LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 		pfxrtr = pfxrtr_lookup(pr, dr);
 		if (pfxrtr != NULL)
 			pfxrtr_del(pfxrtr);
 	}
 	pfxlist_onlink_check();
 
 	/*
 	 * If the router is the primary one, choose a new one.
 	 * Note that defrouter_select() will remove the current gateway
 	 * from the routing table.
 	 */
 	if (was_installed)
 		defrouter_select();
 
 	free(dr, M_IP6NDP);
 }
 
 /*
  * Default Router Selection according to Section 6.3.6 of RFC 2461 and
  * draft-ietf-ipngwg-router-selection:
  * 1) Routers that are reachable or probably reachable should be preferred.
  *    If we have more than one (probably) reachable router, prefer ones
  *    with the highest router preference.
  * 2) When no routers on the list are known to be reachable or
  *    probably reachable, routers SHOULD be selected in a round-robin
  *    fashion, regardless of router preference values.
  * 3) If the Default Router List is empty, assume that all
  *    destinations are on-link.
  *
  * We assume nd_defrouter is sorted by router preference value.
  * Since the code below covers both with and without router preference cases,
  * we do not need to classify the cases by ifdef.
  *
  * At this moment, we do not try to install more than one default router,
  * even when the multipath routing is available, because we're not sure about
  * the benefits for stub hosts comparing to the risk of making the code
  * complicated and the possibility of introducing bugs.
  *
  * Takes no arguments and returns nothing; the outcome is visible through
  * the `installed' flag of the list entries, which defrouter_addreq() and
  * defrouter_delreq() maintain.
  */
 void
 defrouter_select(void)
 {
 	struct nd_defrouter *dr, *selected_dr = NULL, *installed_dr = NULL;
 	struct llentry *ln = NULL;
 
 	/*
 	 * Let's handle easy case (3) first:
 	 * If default router list is empty, there's nothing to be done.
 	 */
 	if (TAILQ_EMPTY(&V_nd_defrouter))
 		return;
 
 	/*
 	 * Search for a (probably) reachable router from the list.
 	 * We just pick up the first reachable one (if any), assuming that
 	 * the ordering rule of the list described in defrtrlist_update().
 	 */
 	TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) {
 		IF_AFDATA_RLOCK(dr->ifp);
 		/*
 		 * Once a candidate has been picked the lookup is skipped
 		 * (short-circuit), so ln stays NULL for later entries.
 		 */
 		if (selected_dr == NULL &&
 		    (ln = nd6_lookup(&dr->rtaddr, 0, dr->ifp)) &&
 		    ND6_IS_LLINFO_PROBREACH(ln)) {
 			selected_dr = dr;
 		}
 		IF_AFDATA_RUNLOCK(dr->ifp);
 		/* release the entry found above and reset for the next pass */
 		if (ln != NULL) {
 			LLE_RUNLOCK(ln);
 			ln = NULL;
 		}
 
 		/* the whole list is scanned so the installed entry is always found */
 		if (dr->installed && installed_dr == NULL)
 			installed_dr = dr;
 		else if (dr->installed && installed_dr) {
 			/* this should not happen.  warn for diagnosis. */
 			log(LOG_ERR, "defrouter_select: more than one router"
 			    " is installed\n");
 		}
 	}
 	/*
 	 * If none of the default routers was found to be reachable,
 	 * round-robin the list regardless of preference.
 	 * Otherwise, if we have an installed router, check if the selected
 	 * (reachable) router should really be preferred to the installed one.
 	 * We only prefer the new router when the old one is not reachable
 	 * or when the new one has a really higher preference value.
 	 */
 	if (selected_dr == NULL) {
 		/* round-robin: entry after the installed one, wrapping to the head */
 		if (installed_dr == NULL || !TAILQ_NEXT(installed_dr, dr_entry))
 			selected_dr = TAILQ_FIRST(&V_nd_defrouter);
 		else
 			selected_dr = TAILQ_NEXT(installed_dr, dr_entry);
 	} else if (installed_dr) {
 		IF_AFDATA_RLOCK(installed_dr->ifp);
 		/* keep the installed router unless it is unreachable or outranked */
 		if ((ln = nd6_lookup(&installed_dr->rtaddr, 0, installed_dr->ifp)) &&
 		    ND6_IS_LLINFO_PROBREACH(ln) &&
 		    rtpref(selected_dr) <= rtpref(installed_dr)) {
 			selected_dr = installed_dr;
 		}
 		IF_AFDATA_RUNLOCK(installed_dr->ifp);
 		if (ln != NULL)
 			LLE_RUNLOCK(ln);
 	}
 
 	/*
 	 * If the selected router is different than the installed one,
 	 * remove the installed router and install the selected one.
 	 * Note that the selected router is never NULL here.
 	 */
 	if (installed_dr != selected_dr) {
 		if (installed_dr)
 			defrouter_delreq(installed_dr);
 		defrouter_addreq(selected_dr);
 	}
 
 	return;
 }
 
 /*
  * Map the router-preference bits of dr->flags to one of the RTPREF_*
  * values used for default router selection.  The field is regarded as a
  * 2-bit signed integer; the reserved encoding is treated as medium.
  */
 static int
 rtpref(struct nd_defrouter *dr)
 {
 	int pref;
 
 	switch (dr->flags & ND_RA_FLAG_RTPREF_MASK) {
 	case ND_RA_FLAG_RTPREF_HIGH:
 		pref = RTPREF_HIGH;
 		break;
 	case ND_RA_FLAG_RTPREF_MEDIUM:
 	case ND_RA_FLAG_RTPREF_RSV:
 		pref = RTPREF_MEDIUM;
 		break;
 	case ND_RA_FLAG_RTPREF_LOW:
 		pref = RTPREF_LOW;
 		break;
 	default:
 		/*
 		 * This case should never happen.  If it did, it would mean a
 		 * serious bug of kernel internal.  We thus always bark here.
 		 * Or, can we even panic?
 		 */
 		log(LOG_ERR, "rtpref: impossible RA flag %x\n", dr->flags);
 		pref = RTPREF_INVALID;
 		break;
 	}
 
 	return (pref);
 }
 
 /*
  * Merge the router described by `new' into the default router list.
  *
  * - Known router, new->rtlifetime == 0: the entry is deleted and NULL is
  *   returned.
  * - Known router, otherwise: flags, lifetime and expiry are refreshed.
  *   The entry is returned as is when it is installed and its preference
  *   did not change; otherwise it is taken off the list and re-inserted.
  * - Unknown router: nothing is done for a zero lifetime; otherwise a copy
  *   of *new is allocated (M_NOWAIT, NULL on failure) and inserted.
  *
  * Every insertion is followed by defrouter_select().  Returns the list
  * entry for the router, or NULL.
  */
 static struct nd_defrouter *
 defrtrlist_update(struct nd_defrouter *new)
 {
 	struct nd_defrouter *dr, *n;
 
 	if ((dr = defrouter_lookup(&new->rtaddr, new->ifp)) != NULL) {
 		/* entry exists */
 		if (new->rtlifetime == 0) {
 			defrtrlist_del(dr);
 			dr = NULL;
 		} else {
 			/* sample the preference before the flags are overwritten */
 			int oldpref = rtpref(dr);
 
 			/* override */
 			dr->flags = new->flags; /* xxx flag check */
 			dr->rtlifetime = new->rtlifetime;
 			dr->expire = new->expire;
 
 			/*
 			 * If the preference does not change, there's no need
 			 * to sort the entries. Also make sure the selected
 			 * router is still installed in the kernel.
 			 */
 			if (dr->installed && rtpref(new) == oldpref)
 				return (dr);
 
 			/*
 			 * preferred router may be changed, so relocate
 			 * this router.
 			 * XXX: calling TAILQ_REMOVE directly is a bad manner.
 			 * However, since defrtrlist_del() has many side
 			 * effects, we intentionally do so here.
 			 * defrouter_select() below will handle routing
 			 * changes later.
 			 */
 			TAILQ_REMOVE(&V_nd_defrouter, dr, dr_entry);
 			n = dr;
 			goto insert;
 		}
 		/* only the deletion case reaches here, so this returns NULL */
 		return (dr);
 	}
 
 	/* entry does not exist */
 	if (new->rtlifetime == 0)
 		return (NULL);
 
 	n = (struct nd_defrouter *)malloc(sizeof(*n), M_IP6NDP, M_NOWAIT);
 	if (n == NULL)
 		return (NULL);
 	bzero(n, sizeof(*n));
 	*n = *new;
 
 insert:
 	/*
 	 * Insert the new router in the Default Router List;
 	 * The Default Router List should be in the descending order
 	 * of router-preference.  Routers with the same preference are
 	 * sorted in the arriving time order.
 	 */
 
 	/* insert at the end of the group */
 	/* i.e. in front of the first entry with a strictly lower preference */
 	TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) {
 		if (rtpref(n) > rtpref(dr))
 			break;
 	}
 	if (dr)
 		TAILQ_INSERT_BEFORE(dr, n, dr_entry);
 	else
 		TAILQ_INSERT_TAIL(&V_nd_defrouter, n, dr_entry);
 
 	defrouter_select();
 
 	return (n);
 }
 
 /*
  * Look through the advertising-router list of prefix `pr' for the entry
  * that points at router `dr'.  Returns that entry, or NULL if `dr' is not
  * on the list.
  */
 static struct nd_pfxrouter *
 pfxrtr_lookup(struct nd_prefix *pr, struct nd_defrouter *dr)
 {
 	struct nd_pfxrouter *pfr;
 
 	LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) {
 		if (pfr->router == dr)
 			return (pfr);
 	}
 
 	return (NULL);
 }
 
 /*
  * Record router `dr' as an advertiser of prefix `pr' by putting a new
  * entry at the head of the prefix's router list, then re-run the on-link
  * check.  If the allocation (M_NOWAIT) fails nothing is changed.
  */
 static void
 pfxrtr_add(struct nd_prefix *pr, struct nd_defrouter *dr)
 {
 	struct nd_pfxrouter *pfr;
 
 	pfr = (struct nd_pfxrouter *)malloc(sizeof(*pfr), M_IP6NDP, M_NOWAIT);
 	if (pfr != NULL) {
 		bzero(pfr, sizeof(*pfr));
 		pfr->router = dr;
 
 		LIST_INSERT_HEAD(&pr->ndpr_advrtrs, pfr, pfr_entry);
 
 		pfxlist_onlink_check();
 	}
 }
 
 /*
  * Unlink `pfr' from the router list it is on and free it.
  */
 static void
 pfxrtr_del(struct nd_pfxrouter *pfr)
 {
 
 	LIST_REMOVE(pfr, pfr_entry);
 	free(pfr, M_IP6NDP);
 }
 
 /*
  * Find the prefix list entry that matches `key': same interface, same
  * prefix length, and equal prefix bits over that length.  Returns the
  * entry, or NULL when there is none.
  */
 struct nd_prefix *
 nd6_prefix_lookup(struct nd_prefixctl *key)
 {
 	struct nd_prefix *cur;
 
 	LIST_FOREACH(cur, &V_nd_prefix, ndpr_entry) {
 		if (key->ndpr_ifp != cur->ndpr_ifp)
 			continue;
 		if (key->ndpr_plen != cur->ndpr_plen)
 			continue;
 		if (in6_are_prefix_equal(&key->ndpr_prefix.sin6_addr,
 		    &cur->ndpr_prefix.sin6_addr, key->ndpr_plen))
 			return (cur);
 	}
 
 	return (NULL);
 }
 
 int
 nd6_prelist_add(struct nd_prefixctl *pr, struct nd_defrouter *dr,
     struct nd_prefix **newp)
 {
 	struct nd_prefix *new = NULL;
 	int error = 0;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	new = (struct nd_prefix *)malloc(sizeof(*new), M_IP6NDP, M_NOWAIT);
 	if (new == NULL)
 		return(ENOMEM);
 	bzero(new, sizeof(*new));
 	new->ndpr_ifp = pr->ndpr_ifp;
 	new->ndpr_prefix = pr->ndpr_prefix;
 	new->ndpr_plen = pr->ndpr_plen;
 	new->ndpr_vltime = pr->ndpr_vltime;
 	new->ndpr_pltime = pr->ndpr_pltime;
 	new->ndpr_flags = pr->ndpr_flags;
 	if ((error = in6_init_prefix_ltimes(new)) != 0) {
 		free(new, M_IP6NDP);
 		return(error);
 	}
 	new->ndpr_lastupdate = time_uptime;
 	if (newp != NULL)
 		*newp = new;
 
 	/* initialization */
 	LIST_INIT(&new->ndpr_advrtrs);
 	in6_prefixlen2mask(&new->ndpr_mask, new->ndpr_plen);
 	/* make prefix in the canonical form */
 	IN6_MASK_ADDR(&new->ndpr_prefix.sin6_addr, &new->ndpr_mask);
 
 	/* link ndpr_entry to nd_prefix list */
 	LIST_INSERT_HEAD(&V_nd_prefix, new, ndpr_entry);
 
 	/* ND_OPT_PI_FLAG_ONLINK processing */
 	if (new->ndpr_raf_onlink) {
 		int e;
 
 		if ((e = nd6_prefix_onlink(new)) != 0) {
 			nd6log((LOG_ERR, "nd6_prelist_add: failed to make "
 			    "the prefix %s/%d on-link on %s (errno=%d)\n",
 			    ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr),
 			    pr->ndpr_plen, if_name(pr->ndpr_ifp), e));
 			/* proceed anyway. XXX: is it correct? */
 		}
 	}
 
 	if (dr)
 		pfxrtr_add(new, dr);
 
 	return 0;
 }
 
 void
 prelist_remove(struct nd_prefix *pr)
 {
 	struct nd_pfxrouter *pfr, *next;
 	int e;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	/* make sure to invalidate the prefix until it is really freed. */
 	pr->ndpr_vltime = 0;
 	pr->ndpr_pltime = 0;
 
 	/*
 	 * Though these flags are now meaningless, we'd rather keep the value
 	 * of pr->ndpr_raf_onlink and pr->ndpr_raf_auto not to confuse users
 	 * when executing "ndp -p".
 	 */
 
 	if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0 &&
 	    (e = nd6_prefix_offlink(pr)) != 0) {
 		nd6log((LOG_ERR, "prelist_remove: failed to make %s/%d offlink "
 		    "on %s, errno=%d\n",
 		    ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr),
 		    pr->ndpr_plen, if_name(pr->ndpr_ifp), e));
 		/* what should we do? */
 	}
 
 	if (pr->ndpr_refcnt > 0)
 		return;		/* notice here? */
 
 	/* unlink ndpr_entry from nd_prefix list */
 	LIST_REMOVE(pr, ndpr_entry);
 
 	/* free list of routers that adversed the prefix */
 	LIST_FOREACH_SAFE(pfr, &pr->ndpr_advrtrs, pfr_entry, next) {
 		free(pfr, M_IP6NDP);
 	}
 	free(pr, M_IP6NDP);
 
 	pfxlist_onlink_check();
 }
 
 /*
  * dr - may be NULL
  */
 
 static int
 prelist_update(struct nd_prefixctl *new, struct nd_defrouter *dr,
     struct mbuf *m, int mcast)
 {
 	struct in6_ifaddr *ia6 = NULL, *ia6_match = NULL;
 	struct ifaddr *ifa;
 	struct ifnet *ifp = new->ndpr_ifp;
 	struct nd_prefix *pr;
 	int error = 0;
 	int newprefix = 0;
 	int auth;
 	struct in6_addrlifetime lt6_tmp;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	auth = 0;
 	if (m) {
 		/*
 		 * Authenticity for NA consists authentication for
 		 * both IP header and IP datagrams, doesn't it ?
 		 */
 #if defined(M_AUTHIPHDR) && defined(M_AUTHIPDGM)
 		auth = ((m->m_flags & M_AUTHIPHDR) &&
 		    (m->m_flags & M_AUTHIPDGM));
 #endif
 	}
 
 	if ((pr = nd6_prefix_lookup(new)) != NULL) {
 		/*
 		 * nd6_prefix_lookup() ensures that pr and new have the same
 		 * prefix on a same interface.
 		 */
 
 		/*
 		 * Update prefix information.  Note that the on-link (L) bit
 		 * and the autonomous (A) bit should NOT be changed from 1
 		 * to 0.
 		 */
 		if (new->ndpr_raf_onlink == 1)
 			pr->ndpr_raf_onlink = 1;
 		if (new->ndpr_raf_auto == 1)
 			pr->ndpr_raf_auto = 1;
 		if (new->ndpr_raf_onlink) {
 			pr->ndpr_vltime = new->ndpr_vltime;
 			pr->ndpr_pltime = new->ndpr_pltime;
 			(void)in6_init_prefix_ltimes(pr); /* XXX error case? */
 			pr->ndpr_lastupdate = time_uptime;
 		}
 
 		if (new->ndpr_raf_onlink &&
 		    (pr->ndpr_stateflags & NDPRF_ONLINK) == 0) {
 			int e;
 
 			if ((e = nd6_prefix_onlink(pr)) != 0) {
 				nd6log((LOG_ERR,
 				    "prelist_update: failed to make "
 				    "the prefix %s/%d on-link on %s "
 				    "(errno=%d)\n",
 				    ip6_sprintf(ip6buf,
 					    &pr->ndpr_prefix.sin6_addr),
 				    pr->ndpr_plen, if_name(pr->ndpr_ifp), e));
 				/* proceed anyway. XXX: is it correct? */
 			}
 		}
 
 		if (dr && pfxrtr_lookup(pr, dr) == NULL)
 			pfxrtr_add(pr, dr);
 	} else {
 		struct nd_prefix *newpr = NULL;
 
 		newprefix = 1;
 
 		if (new->ndpr_vltime == 0)
 			goto end;
 		if (new->ndpr_raf_onlink == 0 && new->ndpr_raf_auto == 0)
 			goto end;
 
 		error = nd6_prelist_add(new, dr, &newpr);
 		if (error != 0 || newpr == NULL) {
 			nd6log((LOG_NOTICE, "prelist_update: "
 			    "nd6_prelist_add failed for %s/%d on %s "
 			    "errno=%d, returnpr=%p\n",
 			    ip6_sprintf(ip6buf, &new->ndpr_prefix.sin6_addr),
 			    new->ndpr_plen, if_name(new->ndpr_ifp),
 			    error, newpr));
 			goto end; /* we should just give up in this case. */
 		}
 
 		/*
 		 * XXX: from the ND point of view, we can ignore a prefix
 		 * with the on-link bit being zero.  However, we need a
 		 * prefix structure for references from autoconfigured
 		 * addresses.  Thus, we explicitly make sure that the prefix
 		 * itself expires now.
 		 */
 		if (newpr->ndpr_raf_onlink == 0) {
 			newpr->ndpr_vltime = 0;
 			newpr->ndpr_pltime = 0;
 			in6_init_prefix_ltimes(newpr);
 		}
 
 		pr = newpr;
 	}
 
 	/*
 	 * Address autoconfiguration based on Section 5.5.3 of RFC 2462.
 	 * Note that pr must be non NULL at this point.
 	 */
 
 	/* 5.5.3 (a). Ignore the prefix without the A bit set. */
 	if (!new->ndpr_raf_auto)
 		goto end;
 
 	/*
 	 * 5.5.3 (b). the link-local prefix should have been ignored in
 	 * nd6_ra_input.
 	 */
 
 	/* 5.5.3 (c). Consistency check on lifetimes: pltime <= vltime. */
 	if (new->ndpr_pltime > new->ndpr_vltime) {
 		error = EINVAL;	/* XXX: won't be used */
 		goto end;
 	}
 
 	/*
 	 * 5.5.3 (d).  If the prefix advertised is not equal to the prefix of
 	 * an address configured by stateless autoconfiguration already in the
 	 * list of addresses associated with the interface, and the Valid
 	 * Lifetime is not 0, form an address.  We first check if we have
 	 * a matching prefix.
 	 * Note: we apply a clarification in rfc2462bis-02 here.  We only
 	 * consider autoconfigured addresses while RFC2462 simply said
 	 * "address".
 	 */
 	IF_ADDR_RLOCK(ifp);
 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		struct in6_ifaddr *ifa6;
 		u_int32_t remaininglifetime;
 
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 
 		ifa6 = (struct in6_ifaddr *)ifa;
 
 		/*
 		 * We only consider autoconfigured addresses as per rfc2462bis.
 		 */
 		if (!(ifa6->ia6_flags & IN6_IFF_AUTOCONF))
 			continue;
 
 		/*
 		 * Spec is not clear here, but I believe we should concentrate
 		 * on unicast (i.e. not anycast) addresses.
 		 * XXX: other ia6_flags? detached or duplicated?
 		 */
 		if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0)
 			continue;
 
 		/*
 		 * Ignore the address if it is not associated with a prefix
 		 * or is associated with a prefix that is different from this
 		 * one.  (pr is never NULL here)
 		 */
 		if (ifa6->ia6_ndpr != pr)
 			continue;
 
 		if (ia6_match == NULL) /* remember the first one */
 			ia6_match = ifa6;
 
 		/*
 		 * An already autoconfigured address matched.  Now that we
 		 * are sure there is at least one matched address, we can
 		 * proceed to 5.5.3. (e): update the lifetimes according to the
 		 * "two hours" rule and the privacy extension.
 		 * We apply some clarifications in rfc2462bis:
 		 * - use remaininglifetime instead of storedlifetime as a
 		 *   variable name
 		 * - remove the dead code in the "two-hour" rule
 		 */
 #define TWOHOUR		(120*60)
 		lt6_tmp = ifa6->ia6_lifetime;
 
 		if (lt6_tmp.ia6t_vltime == ND6_INFINITE_LIFETIME)
 			remaininglifetime = ND6_INFINITE_LIFETIME;
 		else if (time_uptime - ifa6->ia6_updatetime >
 			 lt6_tmp.ia6t_vltime) {
 			/*
 			 * The case of "invalid" address.  We should usually
 			 * not see this case.
 			 */
 			remaininglifetime = 0;
 		} else
 			remaininglifetime = lt6_tmp.ia6t_vltime -
 			    (time_uptime - ifa6->ia6_updatetime);
 
 		/* when not updating, keep the current stored lifetime. */
 		lt6_tmp.ia6t_vltime = remaininglifetime;
 
 		if (TWOHOUR < new->ndpr_vltime ||
 		    remaininglifetime < new->ndpr_vltime) {
 			lt6_tmp.ia6t_vltime = new->ndpr_vltime;
 		} else if (remaininglifetime <= TWOHOUR) {
 			if (auth) {
 				lt6_tmp.ia6t_vltime = new->ndpr_vltime;
 			}
 		} else {
 			/*
 			 * new->ndpr_vltime <= TWOHOUR &&
 			 * TWOHOUR < remaininglifetime
 			 */
 			lt6_tmp.ia6t_vltime = TWOHOUR;
 		}
 
 		/* The 2 hour rule is not imposed for preferred lifetime. */
 		lt6_tmp.ia6t_pltime = new->ndpr_pltime;
 
 		in6_init_address_ltimes(pr, &lt6_tmp);
 
 		/*
 		 * We need to treat lifetimes for temporary addresses
 		 * differently, according to
 		 * draft-ietf-ipv6-privacy-addrs-v2-01.txt 3.3 (1);
 		 * we only update the lifetimes when they are in the maximum
 		 * intervals.
 		 */
 		if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0) {
 			u_int32_t maxvltime, maxpltime;
 
 			if (V_ip6_temp_valid_lifetime >
 			    (u_int32_t)((time_uptime - ifa6->ia6_createtime) +
 			    V_ip6_desync_factor)) {
 				maxvltime = V_ip6_temp_valid_lifetime -
 				    (time_uptime - ifa6->ia6_createtime) -
 				    V_ip6_desync_factor;
 			} else
 				maxvltime = 0;
 			if (V_ip6_temp_preferred_lifetime >
 			    (u_int32_t)((time_uptime - ifa6->ia6_createtime) +
 			    V_ip6_desync_factor)) {
 				maxpltime = V_ip6_temp_preferred_lifetime -
 				    (time_uptime - ifa6->ia6_createtime) -
 				    V_ip6_desync_factor;
 			} else
 				maxpltime = 0;
 
 			if (lt6_tmp.ia6t_vltime == ND6_INFINITE_LIFETIME ||
 			    lt6_tmp.ia6t_vltime > maxvltime) {
 				lt6_tmp.ia6t_vltime = maxvltime;
 			}
 			if (lt6_tmp.ia6t_pltime == ND6_INFINITE_LIFETIME ||
 			    lt6_tmp.ia6t_pltime > maxpltime) {
 				lt6_tmp.ia6t_pltime = maxpltime;
 			}
 		}
 		ifa6->ia6_lifetime = lt6_tmp;
 		ifa6->ia6_updatetime = time_uptime;
 	}
 	IF_ADDR_RUNLOCK(ifp);
 	if (ia6_match == NULL && new->ndpr_vltime) {
 		int ifidlen;
 
 		/*
 		 * 5.5.3 (d) (continued)
 		 * No address matched and the valid lifetime is non-zero.
 		 * Create a new address.
 		 */
 
 		/*
 		 * Prefix Length check:
 		 * If the sum of the prefix length and interface identifier
 		 * length does not equal 128 bits, the Prefix Information
 		 * option MUST be ignored.  The length of the interface
 		 * identifier is defined in a separate link-type specific
 		 * document.
 		 */
 		ifidlen = in6_if2idlen(ifp);
 		if (ifidlen < 0) {
 			/* this should not happen, so we always log it. */
 			log(LOG_ERR, "prelist_update: IFID undefined (%s)\n",
 			    if_name(ifp));
 			goto end;
 		}
 		if (ifidlen + pr->ndpr_plen != 128) {
 			nd6log((LOG_INFO,
 			    "prelist_update: invalid prefixlen "
 			    "%d for %s, ignored\n",
 			    pr->ndpr_plen, if_name(ifp)));
 			goto end;
 		}
 
 		if ((ia6 = in6_ifadd(new, mcast)) != NULL) {
 			/*
 			 * note that we should use pr (not new) for reference.
 			 */
 			pr->ndpr_refcnt++;
 			ia6->ia6_ndpr = pr;
 
 			/*
 			 * RFC 3041 3.3 (2).
 			 * When a new public address is created as described
 			 * in RFC2462, also create a new temporary address.
 			 *
 			 * RFC 3041 3.5.
 			 * When an interface connects to a new link, a new
 			 * randomized interface identifier should be generated
 			 * immediately together with a new set of temporary
 			 * addresses.  Thus, we specifiy 1 as the 2nd arg of
 			 * in6_tmpifadd().
 			 */
 			if (V_ip6_use_tempaddr) {
 				int e;
 				if ((e = in6_tmpifadd(ia6, 1, 1)) != 0) {
 					nd6log((LOG_NOTICE, "prelist_update: "
 					    "failed to create a temporary "
 					    "address, errno=%d\n",
 					    e));
 				}
 			}
 			ifa_free(&ia6->ia_ifa);
 
 			/*
 			 * A newly added address might affect the status
 			 * of other addresses, so we check and update it.
 			 * XXX: what if address duplication happens?
 			 */
 			pfxlist_onlink_check();
 		} else {
 			/* just set an error. do not bark here. */
 			error = EADDRNOTAVAIL; /* XXX: might be unused. */
 		}
 	}
 
  end:
 	return error;
 }
 
 /*
  * Helper for the on-link detection below: walk the routers that advertised
  * the given prefix and return the first one whose neighbor cache entry is
  * (probably) reachable.  Returns NULL when no advertising router qualifies.
  */
 static struct nd_pfxrouter *
 find_pfxlist_reachable_router(struct nd_prefix *pr)
 {
 	struct nd_pfxrouter *cand;
 	struct llentry *lle;
 	int reachable;
 
 	LIST_FOREACH(cand, &pr->ndpr_advrtrs, pfr_entry) {
 		/* The neighbor lookup is done under the interface afdata lock. */
 		IF_AFDATA_RLOCK(cand->router->ifp);
 		lle = nd6_lookup(&cand->router->rtaddr, 0, cand->router->ifp);
 		IF_AFDATA_RUNLOCK(cand->router->ifp);
 		if (lle != NULL) {
 			/* Sample the state, then release the entry lock. */
 			reachable = ND6_IS_LLINFO_PROBREACH(lle);
 			LLE_RUNLOCK(lle);
 			if (reachable)
 				return (cand);
 		}
 	}
 
 	return (NULL);
 }
 
 /*
  * Check if each prefix in the prefix list has at least one available router
  * that advertised the prefix (a router is "available" if its neighbor cache
  * entry is reachable or probably reachable).
  * If the check fails, the prefix may be off-link, because, for example,
  * we have moved from the network but the lifetime of the prefix has not
  * expired yet.  So we should not use the prefix if there is another prefix
  * that has an available router.
  * But, if there is no prefix that has an available router, we still regard
  * all the prefixes as on-link.  This is because we can't tell if all the
  * routers are simply dead or if we really moved from the network and there
  * is no router around us.
  *
  * The function works in three passes:
  *  1. update NDPRF_DETACHED on every autoconfigurable on-link prefix,
  *  2. remove/install interface routes so they match the new prefix state,
  *  3. update IN6_IFF_DETACHED on every autoconfigured address (restarting
  *     DAD for addresses that become attached again).
  */
 void
 pfxlist_onlink_check(void)
 {
 	struct nd_prefix *pr;
 	struct in6_ifaddr *ifa;
 	struct nd_defrouter *dr;
 	struct nd_pfxrouter *pfxrtr = NULL;
 
 	/*
 	 * Check if there is a prefix that has a reachable advertising
 	 * router.  pr is left NULL when the whole list was scanned without
 	 * a hit.
 	 */
 	LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 		if (pr->ndpr_raf_onlink &&
 		    find_pfxlist_reachable_router(pr) != NULL)
 			break;
 	}
 
 	/*
 	 * If we have no such prefix, check whether we still have a router
 	 * that does not advertise any prefixes.  pfxrtr ends up non-NULL as
 	 * soon as some default router is found to advertise some prefix.
 	 */
 	if (pr == NULL) {
 		TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) {
 			struct nd_prefix *pr0;
 
 			LIST_FOREACH(pr0, &V_nd_prefix, ndpr_entry) {
 				if ((pfxrtr = pfxrtr_lookup(pr0, dr)) != NULL)
 					break;
 			}
 			if (pfxrtr != NULL)
 				break;
 		}
 	}
 	if (pr != NULL || (!TAILQ_EMPTY(&V_nd_defrouter) && pfxrtr == NULL)) {
 		/*
 		 * There is at least one prefix that has a reachable router,
 		 * or at least a router which probably does not advertise
 		 * any prefixes.  The latter would be the case when we move
 		 * to a new link where we have a router that does not provide
 		 * prefixes and we configure an address by hand.
 		 * Detach prefixes which have no reachable advertising
 		 * router, and attach other prefixes.
 		 */
 		LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 			/* XXX: a link-local prefix should never be detached */
 			if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr))
 				continue;
 
 			/*
 			 * we aren't interested in prefixes without the L bit
 			 * set.
 			 */
 			if (pr->ndpr_raf_onlink == 0)
 				continue;
 
 			/* nor in prefixes without the A bit set. */
 			if (pr->ndpr_raf_auto == 0)
 				continue;
 
 			if ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 &&
 			    find_pfxlist_reachable_router(pr) == NULL)
 				pr->ndpr_stateflags |= NDPRF_DETACHED;
 			if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0 &&
 			    find_pfxlist_reachable_router(pr) != NULL)
 				pr->ndpr_stateflags &= ~NDPRF_DETACHED;
 		}
 	} else {
 		/*
 		 * There is no prefix that has a reachable router: treat
 		 * every (non link-local, L and A bit) prefix as attached.
 		 */
 		LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 			if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr))
 				continue;
 
 			if (pr->ndpr_raf_onlink == 0)
 				continue;
 
 			if (pr->ndpr_raf_auto == 0)
 				continue;
 
 			if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0)
 				pr->ndpr_stateflags &= ~NDPRF_DETACHED;
 		}
 	}
 
 	/*
 	 * Remove each interface route associated with a (just) detached
 	 * prefix, and reinstall the interface route for a (just) attached
 	 * prefix.  Note that not every reinstallation attempt necessarily
 	 * succeeds when the same prefix is shared among multiple
 	 * interfaces.  Such cases will be handled in nd6_prefix_onlink,
 	 * so we don't have to care about them.
 	 */
 	LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 		int e;
 		char ip6buf[INET6_ADDRSTRLEN];
 
 		if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr))
 			continue;
 
 		if (pr->ndpr_raf_onlink == 0)
 			continue;
 
 		if (pr->ndpr_raf_auto == 0)
 			continue;
 
 		/* Detached but still has its route installed: remove it. */
 		if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0 &&
 		    (pr->ndpr_stateflags & NDPRF_ONLINK) != 0) {
 			if ((e = nd6_prefix_offlink(pr)) != 0) {
 				nd6log((LOG_ERR,
 				    "pfxlist_onlink_check: failed to "
 				    "make %s/%d offlink, errno=%d\n",
 				    ip6_sprintf(ip6buf,
 					    &pr->ndpr_prefix.sin6_addr),
 					    pr->ndpr_plen, e));
 			}
 		}
 		/* Attached but has no route installed: install it. */
 		if ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 &&
 		    (pr->ndpr_stateflags & NDPRF_ONLINK) == 0 &&
 		    pr->ndpr_raf_onlink) {
 			if ((e = nd6_prefix_onlink(pr)) != 0) {
 				nd6log((LOG_ERR,
 				    "pfxlist_onlink_check: failed to "
 				    "make %s/%d onlink, errno=%d\n",
 				    ip6_sprintf(ip6buf,
 					    &pr->ndpr_prefix.sin6_addr),
 					    pr->ndpr_plen, e));
 			}
 		}
 	}
 
 	/*
 	 * Changes on the prefix status might affect address status as well.
 	 * Make sure that all addresses derived from an attached prefix are
 	 * attached, and that all addresses derived from a detached prefix are
 	 * detached.  Note, however, that a manually configured address should
 	 * always be attached.
 	 * The precise detection logic is same as the one for prefixes.
 	 *
 	 * XXXRW: in6_ifaddrhead locking.
 	 */
 	TAILQ_FOREACH(ifa, &V_in6_ifaddrhead, ia_link) {
 		if (!(ifa->ia6_flags & IN6_IFF_AUTOCONF))
 			continue;
 
 		if (ifa->ia6_ndpr == NULL) {
 			/*
 			 * This can happen when we first configure the address
 			 * (i.e. the address exists, but the prefix does not).
 			 * XXX: complicated relationships...
 			 */
 			continue;
 		}
 
 		if (find_pfxlist_reachable_router(ifa->ia6_ndpr) != NULL)
 			break;
 	}
 	if (ifa != NULL) {
 		/*
 		 * At least one autoconfigured address has a prefix with a
 		 * reachable router: attach/detach each address according to
 		 * its own prefix.
 		 */
 		TAILQ_FOREACH(ifa, &V_in6_ifaddrhead, ia_link) {
 			if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0)
 				continue;
 
 			if (ifa->ia6_ndpr == NULL) /* XXX: see above. */
 				continue;
 
 			if (find_pfxlist_reachable_router(ifa->ia6_ndpr) !=
 			    NULL) {
 				if (ifa->ia6_flags & IN6_IFF_DETACHED) {
 					/* Re-attached: redo DAD first. */
 					ifa->ia6_flags &= ~IN6_IFF_DETACHED;
 					ifa->ia6_flags |= IN6_IFF_TENTATIVE;
 					nd6_dad_start((struct ifaddr *)ifa, 0);
 				}
 			} else {
 				ifa->ia6_flags |= IN6_IFF_DETACHED;
 			}
 		}
 	} else {
 		/* No address has a reachable router: attach all of them. */
 		TAILQ_FOREACH(ifa, &V_in6_ifaddrhead, ia_link) {
 			if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0)
 				continue;
 
 			if (ifa->ia6_flags & IN6_IFF_DETACHED) {
 				ifa->ia6_flags &= ~IN6_IFF_DETACHED;
 				ifa->ia6_flags |= IN6_IFF_TENTATIVE;
 				/* Do we need a delay in this case? */
 				nd6_dad_start((struct ifaddr *)ifa, 0);
 			}
 		}
 	}
 }
 
 /*
  * Install the interface route for prefix pr in every FIB, using ifa's
  * address as the initial gateway.  After each successful RTM_ADD the
  * gateway is replaced by an AF_LINK sockaddr carrying the route's
  * interface type and index, the addition is reported through nd6_rtmsg(),
  * and NDPRF_ONLINK is set on the prefix.
  *
  * Returns 0 if every FIB succeeded, otherwise the last error returned by
  * in6_rtrequest().
  */
 static int
 nd6_prefix_onlink_rtrequest(struct nd_prefix *pr, struct ifaddr *ifa)
 {
 	/* Template AF_LINK gateway; type and index are filled in per route. */
 	static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK};
 	struct rib_head *rh;
 	struct rtentry *rt;
 	struct sockaddr_in6 mask6;
 	u_long rtflags;
 	int error, a_failure, fibnum;
 
 	/*
 	 * in6_ifinit() sets nd6_rtrequest to ifa_rtrequest for all ifaddrs.
 	 * ifa->ifa_rtrequest = nd6_rtrequest;
 	 */
 	/* Build the netmask sockaddr; sin6_family is left zeroed here. */
 	bzero(&mask6, sizeof(mask6));
 	mask6.sin6_len = sizeof(mask6);
 	mask6.sin6_addr = pr->ndpr_mask;
 	/* Route flags come from the ifaddr flags minus IFA_RTSELF. */
 	rtflags = (ifa->ifa_flags & ~IFA_RTSELF) | RTF_UP;
 
 	a_failure = 0;
 	for (fibnum = 0; fibnum < rt_numfibs; fibnum++) {
 
 		rt = NULL;
 		error = in6_rtrequest(RTM_ADD,
 		    (struct sockaddr *)&pr->ndpr_prefix, ifa->ifa_addr,
 		    (struct sockaddr *)&mask6, rtflags, &rt, fibnum);
 		if (error == 0) {
 			KASSERT(rt != NULL, ("%s: in6_rtrequest return no "
 			    "error(%d) but rt is NULL, pr=%p, ifa=%p", __func__,
 			    error, pr, ifa));
 
 			rh = rt_tables_get_rnh(rt->rt_fibnum, AF_INET6);
 			/* XXX what if rh == NULL? */
 			/*
 			 * Lock order used here: RIB config lock, RIB lock,
 			 * then the route entry itself.
 			 */
+			RIB_CFG_WLOCK(rh);
 			RIB_WLOCK(rh);
 			RT_LOCK(rt);
 			/*
 			 * Swap the gateway for an AF_LINK sockaddr and stamp
 			 * it with the outgoing interface's type and index.
 			 */
 			if (rt_setgate(rt, rt_key(rt),
 			    (struct sockaddr *)&null_sdl) == 0) {
 				struct sockaddr_dl *dl;
 
 				dl = (struct sockaddr_dl *)rt->rt_gateway;
 				dl->sdl_type = rt->rt_ifp->if_type;
 				dl->sdl_index = rt->rt_ifp->if_index;
 			}
 			RIB_WUNLOCK(rh);
+			RIB_CFG_WUNLOCK(rh);
 			/* The route is still locked while the message is sent. */
 			nd6_rtmsg(RTM_ADD, rt);
 			RT_UNLOCK(rt);
 			pr->ndpr_stateflags |= NDPRF_ONLINK;
 		} else {
 			char ip6buf[INET6_ADDRSTRLEN];
 			char ip6bufg[INET6_ADDRSTRLEN];
 			char ip6bufm[INET6_ADDRSTRLEN];
 			struct sockaddr_in6 *sin6;
 
 			sin6 = (struct sockaddr_in6 *)ifa->ifa_addr;
 			nd6log((LOG_ERR, "nd6_prefix_onlink: failed to add "
 			    "route for a prefix (%s/%d) on %s, gw=%s, mask=%s, "
 			    "flags=%lx errno = %d\n",
 			    ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr),
 			    pr->ndpr_plen, if_name(pr->ndpr_ifp),
 			    ip6_sprintf(ip6bufg, &sin6->sin6_addr),
 			    ip6_sprintf(ip6bufm, &mask6.sin6_addr),
 			    rtflags, error));
 
 			/* Save last error to return, see rtinit(). */
 			a_failure = error;
 		}
 
 		/*
 		 * Drop one reference on the returned route (presumably the
 		 * one in6_rtrequest() handed back through &rt).
 		 */
 		if (rt != NULL) {
 			RT_LOCK(rt);
 			RT_REMREF(rt);
 			RT_UNLOCK(rt);
 		}
 	}
 
 	/* Return the last error we got. */
 	return (a_failure);
 }
 
 /*
  * Make a prefix on-link by installing its interface route.
  *
  * Returns EEXIST if the prefix is already flagged NDPRF_ONLINK, 0 when
  * nothing has to (or can) be done, and otherwise the result of
  * nd6_prefix_onlink_rtrequest().
  */
 static int
 nd6_prefix_onlink(struct nd_prefix *pr)
 {
 	struct ifnet *ifp = pr->ndpr_ifp;
 	struct nd_prefix *other;
 	struct ifaddr *ifa;
 	char ip6buf[INET6_ADDRSTRLEN];
 	int error;
 
 	/* Sanity check: the route must not be installed twice. */
 	if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) {
 		nd6log((LOG_ERR,
 		    "nd6_prefix_onlink: %s/%d is already on-link\n",
 		    ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr),
 		    pr->ndpr_plen));
 		return (EEXIST);
 	}
 
 	/*
 	 * The same prefix may be configured on another interface and may
 	 * already own the interface route.  That setup is rare but
 	 * explicitly allowed; in that case there is nothing to install.
 	 */
 	LIST_FOREACH(other, &V_nd_prefix, ndpr_entry) {
 		if (other == pr ||
 		    (other->ndpr_stateflags & NDPRF_ONLINK) == 0 ||
 		    other->ndpr_plen != pr->ndpr_plen)
 			continue;
 
 		if (in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr,
 		    &other->ndpr_prefix.sin6_addr, pr->ndpr_plen))
 			return (0);
 	}
 
 	/*
 	 * Pick the interface address to associate with the route.  A usable
 	 * link-local address is preferred; failing that, fall back to the
 	 * first AF_INET6 address on the interface (its flags are not
 	 * examined) and take a reference on it while the list is locked.
 	 */
 	ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp,
 	    IN6_IFF_NOTREADY | IN6_IFF_ANYCAST);
 	if (ifa == NULL) {
 		/* XXX: freebsd does not have ifa_ifwithaf */
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family == AF_INET6) {
 				ifa_ref(ifa);
 				break;
 			}
 		}
 		IF_ADDR_RUNLOCK(ifp);
 	}
 	if (ifa == NULL) {
 		/*
 		 * Still possible, e.g. when an RA carries a prefix with the
 		 * L bit set and the A bit clear after every IPv6 address has
 		 * been removed from the receiving interface.  Rare, and not
 		 * treated as an error.
 		 */
 		nd6log((LOG_NOTICE,
 		    "nd6_prefix_onlink: failed to find any ifaddr"
 		    " to add route for a prefix(%s/%d) on %s\n",
 		    ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr),
 		    pr->ndpr_plen, if_name(ifp)));
 		return (0);
 	}
 
 	error = nd6_prefix_onlink_rtrequest(pr, ifa);
 
 	/* ifa is known to be non-NULL here; release our reference. */
 	ifa_free(ifa);
 
 	return (error);
 }
 
 static int
 nd6_prefix_offlink(struct nd_prefix *pr)
 {
 	int error = 0;
 	struct ifnet *ifp = pr->ndpr_ifp;
 	struct nd_prefix *opr;
 	struct sockaddr_in6 sa6, mask6;
 	struct rtentry *rt;
 	char ip6buf[INET6_ADDRSTRLEN];
 	int fibnum, a_failure;
 
 	/* sanity check */
 	if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0) {
 		nd6log((LOG_ERR,
 		    "nd6_prefix_offlink: %s/%d is already off-link\n",
 		    ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr),
 		    pr->ndpr_plen));
 		return (EEXIST);
 	}
 
 	bzero(&sa6, sizeof(sa6));
 	sa6.sin6_family = AF_INET6;
 	sa6.sin6_len = sizeof(sa6);
 	bcopy(&pr->ndpr_prefix.sin6_addr, &sa6.sin6_addr,
 	    sizeof(struct in6_addr));
 	bzero(&mask6, sizeof(mask6));
 	mask6.sin6_family = AF_INET6;
 	mask6.sin6_len = sizeof(sa6);
 	bcopy(&pr->ndpr_mask, &mask6.sin6_addr, sizeof(struct in6_addr));
 
 	a_failure = 0;
 	for (fibnum = 0; fibnum < rt_numfibs; fibnum++) {
 		rt = NULL;
 		error = in6_rtrequest(RTM_DELETE, (struct sockaddr *)&sa6, NULL,
 		    (struct sockaddr *)&mask6, 0, &rt, fibnum);
 		if (error == 0) {
 			/* report the route deletion to the routing socket. */
 			if (rt != NULL)
 				nd6_rtmsg(RTM_DELETE, rt);
 		} else {
 			/* Save last error to return, see rtinit(). */
 			a_failure = error;
 		}
 		if (rt != NULL) {
 			RTFREE(rt);
 		}
 	}
 	error = a_failure;
 	a_failure = 1;
 	if (error == 0) {
 		pr->ndpr_stateflags &= ~NDPRF_ONLINK;
 
 		/*
 		 * There might be the same prefix on another interface,
 		 * the prefix which could not be on-link just because we have
 		 * the interface route (see comments in nd6_prefix_onlink).
 		 * If there's one, try to make the prefix on-link on the
 		 * interface.
 		 */
 		LIST_FOREACH(opr, &V_nd_prefix, ndpr_entry) {
 			if (opr == pr)
 				continue;
 
 			if ((opr->ndpr_stateflags & NDPRF_ONLINK) != 0)
 				continue;
 
 			/*
 			 * KAME specific: detached prefixes should not be
 			 * on-link.
 			 */
 			if ((opr->ndpr_stateflags & NDPRF_DETACHED) != 0)
 				continue;
 
 			if (opr->ndpr_plen == pr->ndpr_plen &&
 			    in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr,
 			    &opr->ndpr_prefix.sin6_addr, pr->ndpr_plen)) {
 				int e;
 
 				if ((e = nd6_prefix_onlink(opr)) != 0) {
 					nd6log((LOG_ERR,
 					    "nd6_prefix_offlink: failed to "
 					    "recover a prefix %s/%d from %s "
 					    "to %s (errno = %d)\n",
 					    ip6_sprintf(ip6buf,
 						&opr->ndpr_prefix.sin6_addr),
 					    opr->ndpr_plen, if_name(ifp),
 					    if_name(opr->ndpr_ifp), e));
 				} else
 					a_failure = 0;
 			}
 		}
 	} else {
 		/* XXX: can we still set the NDPRF_ONLINK flag? */
 		nd6log((LOG_ERR,
 		    "nd6_prefix_offlink: failed to delete route: "
 		    "%s/%d on %s (errno = %d)\n",
 		    ip6_sprintf(ip6buf, &sa6.sin6_addr), pr->ndpr_plen,
 		    if_name(ifp), error));
 	}
 
 	if (a_failure)
 		lltable_prefix_free(AF_INET6, (struct sockaddr *)&sa6,
 		    (struct sockaddr *)&mask6, LLE_STATIC);
 
 	return (error);
 }
 
 static struct in6_ifaddr *
 in6_ifadd(struct nd_prefixctl *pr, int mcast)
 {
 	struct ifnet *ifp = pr->ndpr_ifp;
 	struct ifaddr *ifa;
 	struct in6_aliasreq ifra;
 	struct in6_ifaddr *ia, *ib;
 	int error, plen0;
 	struct in6_addr mask;
 	int prefixlen = pr->ndpr_plen;
 	int updateflags;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	in6_prefixlen2mask(&mask, prefixlen);
 
 	/*
 	 * find a link-local address (will be interface ID).
 	 * Is it really mandatory? Theoretically, a global or a site-local
 	 * address can be configured without a link-local address, if we
 	 * have a unique interface identifier...
 	 *
 	 * it is not mandatory to have a link-local address, we can generate
 	 * interface identifier on the fly.  we do this because:
 	 * (1) it should be the easiest way to find interface identifier.
 	 * (2) RFC2462 5.4 suggesting the use of the same interface identifier
 	 * for multiple addresses on a single interface, and possible shortcut
 	 * of DAD.  we omitted DAD for this reason in the past.
 	 * (3) a user can prevent autoconfiguration of global address
 	 * by removing link-local address by hand (this is partly because we
 	 * don't have other way to control the use of IPv6 on an interface.
 	 * this has been our design choice - cf. NRL's "ifconfig auto").
 	 * (4) it is easier to manage when an interface has addresses
 	 * with the same interface identifier, than to have multiple addresses
 	 * with different interface identifiers.
 	 */
 	ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, 0); /* 0 is OK? */
 	if (ifa)
 		ib = (struct in6_ifaddr *)ifa;
 	else
 		return NULL;
 
 	/* prefixlen + ifidlen must be equal to 128 */
 	plen0 = in6_mask2len(&ib->ia_prefixmask.sin6_addr, NULL);
 	if (prefixlen != plen0) {
 		ifa_free(ifa);
 		nd6log((LOG_INFO, "in6_ifadd: wrong prefixlen for %s "
 		    "(prefix=%d ifid=%d)\n",
 		    if_name(ifp), prefixlen, 128 - plen0));
 		return NULL;
 	}
 
 	/* make ifaddr */
 	in6_prepare_ifra(&ifra, &pr->ndpr_prefix.sin6_addr, &mask);
 
 	IN6_MASK_ADDR(&ifra.ifra_addr.sin6_addr, &mask);
 	/* interface ID */
 	ifra.ifra_addr.sin6_addr.s6_addr32[0] |=
 	    (ib->ia_addr.sin6_addr.s6_addr32[0] & ~mask.s6_addr32[0]);
 	ifra.ifra_addr.sin6_addr.s6_addr32[1] |=
 	    (ib->ia_addr.sin6_addr.s6_addr32[1] & ~mask.s6_addr32[1]);
 	ifra.ifra_addr.sin6_addr.s6_addr32[2] |=
 	    (ib->ia_addr.sin6_addr.s6_addr32[2] & ~mask.s6_addr32[2]);
 	ifra.ifra_addr.sin6_addr.s6_addr32[3] |=
 	    (ib->ia_addr.sin6_addr.s6_addr32[3] & ~mask.s6_addr32[3]);
 	ifa_free(ifa);
 
 	/* lifetimes. */
 	ifra.ifra_lifetime.ia6t_vltime = pr->ndpr_vltime;
 	ifra.ifra_lifetime.ia6t_pltime = pr->ndpr_pltime;
 
 	/* XXX: scope zone ID? */
 
 	ifra.ifra_flags |= IN6_IFF_AUTOCONF; /* obey autoconf */
 
 	/*
 	 * Make sure that we do not have this address already.  This should
 	 * usually not happen, but we can still see this case, e.g., if we
 	 * have manually configured the exact address to be configured.
 	 */
 	ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp,
 	    &ifra.ifra_addr.sin6_addr);
 	if (ifa != NULL) {
 		ifa_free(ifa);
 		/* this should be rare enough to make an explicit log */
 		log(LOG_INFO, "in6_ifadd: %s is already configured\n",
 		    ip6_sprintf(ip6buf, &ifra.ifra_addr.sin6_addr));
 		return (NULL);
 	}
 
 	/*
 	 * Allocate ifaddr structure, link into chain, etc.
 	 * If we are going to create a new address upon receiving a multicasted
 	 * RA, we need to impose a random delay before starting DAD.
 	 * [draft-ietf-ipv6-rfc2462bis-02.txt, Section 5.4.2]
 	 */
 	updateflags = 0;
 	if (mcast)
 		updateflags |= IN6_IFAUPDATE_DADDELAY;
 	if ((error = in6_update_ifa(ifp, &ifra, NULL, updateflags)) != 0) {
 		nd6log((LOG_ERR,
 		    "in6_ifadd: failed to make ifaddr %s on %s (errno=%d)\n",
 		    ip6_sprintf(ip6buf, &ifra.ifra_addr.sin6_addr),
 		    if_name(ifp), error));
 		return (NULL);	/* ifaddr must not have been allocated. */
 	}
 
 	ia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr);
 	/*
 	 * XXXRW: Assumption of non-NULLness here might not be true with
 	 * fine-grained locking -- should we validate it?  Or just return
 	 * earlier ifa rather than looking it up again?
 	 */
 	return (ia);		/* this is always non-NULL  and referenced. */
 }
 
 /*
  * Create a temporary (privacy) address that accompanies a public
  * autoconfigured address.
  *
  * ia0      - corresponding public address
  * forcegen - handed to in6_get_tmpifid(); set to 1 for every retry
  * delay    - when non-zero, in6_update_ifa() is asked to delay DAD
  *
  * Returns 0 on success, and also when no address is created because the
  * computed preferred lifetime does not exceed V_ip6_temp_regen_advance.
  * Otherwise returns an errno value.
  */
 int
 in6_tmpifadd(const struct in6_ifaddr *ia0, int forcegen, int delay)
 {
 	struct ifnet *ifp = ia0->ia_ifa.ifa_ifp;
 	struct in6_ifaddr *newia;
 	struct in6_aliasreq ifra;
 	int error;
 	int trylimit = 3;	/* XXX: adhoc value */
 	int updateflags;
 	u_int32_t randid[2];
 	time_t vltime0, pltime0;
 
 	in6_prepare_ifra(&ifra, &ia0->ia_addr.sin6_addr,
 	    &ia0->ia_prefixmask.sin6_addr);
 
 	ifra.ifra_addr = ia0->ia_addr;	/* XXX: do we need this ? */
 
   again:
 	/*
 	 * Clear the interface ID before every attempt.  The random ID is
 	 * OR-ed into the address below, so without this a retry would mix
 	 * the new ID with the bits left over from the previous attempt.
 	 */
 	IN6_MASK_ADDR(&ifra.ifra_addr.sin6_addr,
 	    &ifra.ifra_prefixmask.sin6_addr);
 
 	if (in6_get_tmpifid(ifp, (u_int8_t *)randid,
 	    (const u_int8_t *)&ia0->ia_addr.sin6_addr.s6_addr[8], forcegen)) {
 		nd6log((LOG_NOTICE, "in6_tmpifadd: failed to find a good "
 		    "random IFID\n"));
 		return (EINVAL);
 	}
 	ifra.ifra_addr.sin6_addr.s6_addr32[2] |=
 	    (randid[0] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[2]));
 	ifra.ifra_addr.sin6_addr.s6_addr32[3] |=
 	    (randid[1] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[3]));
 
 	/*
 	 * in6_get_tmpifid() quite likely provided a unique interface ID.
 	 * However, we may still have a chance to see collision, because
 	 * there may be a time lag between generation of the ID and generation
 	 * of the address.  So, we'll do one more sanity check.
 	 */
 
 	if (in6_localip(&ifra.ifra_addr.sin6_addr) != 0) {
 		if (trylimit-- > 0) {
 			/* Collision: force a fresh ID and start over. */
 			forcegen = 1;
 			goto again;
 		}
 
 		/* Give up.  Something strange should have happened.  */
 		nd6log((LOG_NOTICE, "in6_tmpifadd: failed to "
 		    "find a unique random IFID\n"));
 		return (EEXIST);
 	}
 
 	/*
 	 * The Valid Lifetime is the lower of the Valid Lifetime of the
 	 * public address or TEMP_VALID_LIFETIME.
 	 * The Preferred Lifetime is the lower of the Preferred Lifetime
 	 * of the public address or TEMP_PREFERRED_LIFETIME -
 	 * DESYNC_FACTOR.
 	 * For the public address the remaining (not the stored) lifetime is
 	 * used, i.e. the time elapsed since ia6_updatetime is subtracted.
 	 */
 	if (ia0->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
 		vltime0 = IFA6_IS_INVALID(ia0) ? 0 :
 		    (ia0->ia6_lifetime.ia6t_vltime -
 		    (time_uptime - ia0->ia6_updatetime));
 		if (vltime0 > V_ip6_temp_valid_lifetime)
 			vltime0 = V_ip6_temp_valid_lifetime;
 	} else
 		vltime0 = V_ip6_temp_valid_lifetime;
 	if (ia0->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
 		pltime0 = IFA6_IS_DEPRECATED(ia0) ? 0 :
 		    (ia0->ia6_lifetime.ia6t_pltime -
 		    (time_uptime - ia0->ia6_updatetime));
 		if (pltime0 > V_ip6_temp_preferred_lifetime - V_ip6_desync_factor){
 			pltime0 = V_ip6_temp_preferred_lifetime -
 			    V_ip6_desync_factor;
 		}
 	} else
 		pltime0 = V_ip6_temp_preferred_lifetime - V_ip6_desync_factor;
 	ifra.ifra_lifetime.ia6t_vltime = vltime0;
 	ifra.ifra_lifetime.ia6t_pltime = pltime0;
 
 	/*
 	 * A temporary address is created only if this calculated Preferred
 	 * Lifetime is greater than REGEN_ADVANCE time units.
 	 */
 	if (ifra.ifra_lifetime.ia6t_pltime <= V_ip6_temp_regen_advance)
 		return (0);
 
 	/* XXX: scope zone ID? */
 
 	ifra.ifra_flags |= (IN6_IFF_AUTOCONF|IN6_IFF_TEMPORARY);
 
 	/* allocate ifaddr structure, link into chain, etc. */
 	updateflags = 0;
 	if (delay)
 		updateflags |= IN6_IFAUPDATE_DADDELAY;
 	if ((error = in6_update_ifa(ifp, &ifra, NULL, updateflags)) != 0)
 		return (error);
 
 	newia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr);
 	if (newia == NULL) {	/* XXX: can it happen? */
 		nd6log((LOG_ERR,
 		    "in6_tmpifadd: ifa update succeeded, but we got "
 		    "no ifaddr\n"));
 		return (EINVAL); /* XXX */
 	}
 	/* The new address shares (and references) the public one's prefix. */
 	newia->ia6_ndpr = ia0->ia6_ndpr;
 	newia->ia6_ndpr->ndpr_refcnt++;
 	ifa_free(&newia->ia_ifa);
 
 	/*
 	 * A newly added address might affect the status of other addresses.
 	 * XXX: when the temporary address is generated with a new public
 	 * address, the onlink check is redundant.  However, it would be safe
 	 * to do the check explicitly everywhere a new address is generated,
 	 * and, in fact, we surely need the check when we create a new
 	 * temporary address due to deprecation of an old temporary address.
 	 */
 	pfxlist_onlink_check();
 
 	return (0);
 }
 
 /*
  * Convert the relative preferred/valid lifetimes of a prefix into absolute
  * uptime-based deadlines.  A deadline of 0 is stored for an infinite
  * lifetime.  Always returns 0.
  */
 static int
 in6_init_prefix_ltimes(struct nd_prefix *ndpr)
 {
 
 	ndpr->ndpr_preferred =
 	    (ndpr->ndpr_pltime == ND6_INFINITE_LIFETIME) ?
 	    0 : time_uptime + ndpr->ndpr_pltime;
 
 	ndpr->ndpr_expire =
 	    (ndpr->ndpr_vltime == ND6_INFINITE_LIFETIME) ?
 	    0 : time_uptime + ndpr->ndpr_vltime;
 
 	return (0);
 }
 
 /*
  * Fill in the absolute deadlines (ia6t_expire, ia6t_preferred) of an
  * address lifetime from its relative valid/preferred lifetimes, measured
  * from the current uptime.  0 is stored for an infinite lifetime.
  * The prefix argument is not used.
  */
 static void
 in6_init_address_ltimes(struct nd_prefix *new, struct in6_addrlifetime *lt6)
 {
 
 	/* Valid lifetime -> ia6t_expire. */
 	lt6->ia6t_expire =
 	    (lt6->ia6t_vltime == ND6_INFINITE_LIFETIME) ?
 	    0 : time_uptime + lt6->ia6t_vltime;
 
 	/* Preferred lifetime -> ia6t_preferred. */
 	lt6->ia6t_preferred =
 	    (lt6->ia6t_pltime == ND6_INFINITE_LIFETIME) ?
 	    0 : time_uptime + lt6->ia6t_pltime;
 }
 
 /*
  * Delete the routing table entries that use the specified gateway; the
  * per-route decision is made by rt6_deleteroute().  Nothing is done unless
  * the gateway is a link-local address.  The ifp argument is not used.
  * XXX: this walks every entry of the routing tables, so it shouldn't be
  * called when acting as a router.
  */
 void
 rt6_flush(struct in6_addr *gateway, struct ifnet *ifp)
 {
 
 	/* We care only about link-local addresses. */
 	if (IN6_IS_ADDR_LINKLOCAL(gateway)) {
 		/* XXX Do we really need to walk any but the default FIB? */
 		rt_foreach_fib(AF_INET6, NULL, rt6_deleteroute, gateway);
 	}
 }
 
 static int
 rt6_deleteroute(struct rtentry *rt, void *arg)
 {
 #define SIN6(s)	((struct sockaddr_in6 *)s)
 	struct in6_addr *gate = (struct in6_addr *)arg;
 
 	if (rt->rt_gateway == NULL || rt->rt_gateway->sa_family != AF_INET6)
 		return (0);
 
 	if (!IN6_ARE_ADDR_EQUAL(gate, &SIN6(rt->rt_gateway)->sin6_addr)) {
 		return (0);
 	}
 
 	/*
 	 * Do not delete a static route.
 	 * XXX: this seems to be a bit ad-hoc. Should we consider the
 	 * 'cloned' bit instead?
 	 */
 	if ((rt->rt_flags & RTF_STATIC) != 0)
 		return (0);
 
 	/*
 	 * We delete only host route. This means, in particular, we don't
 	 * delete default route.
 	 */
 	if ((rt->rt_flags & RTF_HOST) == 0)
 		return (0);
 
 	return (in6_rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway,
 	    rt_mask(rt), rt->rt_flags, NULL, rt->rt_fibnum));
 #undef SIN6
 }
 
 /*
  * Select the default interface by index; 0 means "no default interface".
  * Returns EINVAL if the index is out of range or names no interface,
  * 0 otherwise.
  */
 int
 nd6_setdefaultiface(int ifindex)
 {
 
 	if (ifindex < 0 || ifindex > V_if_index)
 		return (EINVAL);
 	if (ifindex != 0 && ifnet_byindex(ifindex) == NULL)
 		return (EINVAL);
 
 	/* Nothing to do when the setting does not change. */
 	if (V_nd6_defifindex == ifindex)
 		return (0);
 
 	V_nd6_defifindex = ifindex;
 	V_nd6_defifp = (V_nd6_defifindex > 0) ?
 	    ifnet_byindex(V_nd6_defifindex) : NULL;
 
 	/*
 	 * The current implementation assumes a one-to-one mapping between
 	 * interfaces and links, so the default interface naturally serves
 	 * as the default link as well.
 	 */
 	scope6_setdefault(V_nd6_defifp);
 
 	return (0);
 }
Index: projects/routing/sys/netpfil/ipfw/ip_fw_table_algo.c
===================================================================
--- projects/routing/sys/netpfil/ipfw/ip_fw_table_algo.c	(revision 274335)
+++ projects/routing/sys/netpfil/ipfw/ip_fw_table_algo.c	(revision 274336)
@@ -1,4082 +1,4083 @@
 /*-
  * Copyright (c) 2014 Yandex LLC
  * Copyright (c) 2014 Alexander V. Chernikov
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Lookup table algorithms.
  *
  */
 
 #include "opt_ipfw.h"
 #include "opt_inet.h"
 #ifndef INET
 #error IPFIREWALL requires INET.
 #endif /* INET */
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/queue.h>
 #include <net/if.h>	/* ip_fw.h requires IFNAMSIZ */
 #include <net/radix.h>
 #include <net/route.h>
 #include <net/route_internal.h>
 
 #include <netinet/in.h>
 #include <netinet/ip_var.h>	/* struct ipfw_rule_ref */
 #include <netinet/ip_fw.h>
 
 #include <netpfil/ipfw/ip_fw_private.h>
 #include <netpfil/ipfw/ip_fw_table.h>
 
 
 /*
  * IPFW table lookup algorithms.
  *
  * What is needed to add another table algo?
  *
  * Algo init:
  * * struct table_algo has to be filled with:
  *   name: "type:algoname" format, e.g. "addr:radix". Currently
  *     there are the following types: "addr", "iface", "number" and "flow".
  *   type: one of IPFW_TABLE_* types
  *   flags: one or more TA_FLAGS_*
  *   ta_buf_size: size of structure used to store add/del item state.
  *     Needs to be less than TA_BUF_SZ.
  *   callbacks: see below for description.
  * * ipfw_add_table_algo / ipfw_del_table_algo has to be called
  *
  * Callbacks description:
  *
  * -init: request to initialize new table instance.
  * typedef int (ta_init)(struct ip_fw_chain *ch, void **ta_state,
  *     struct table_info *ti, char *data, uint8_t tflags);
  * MANDATORY, unlocked. (M_WAITOK). Returns 0 on success.
  *
  *  Allocate all structures needed for normal operations.
  *  * Caller may want to parse @data for some algo-specific
  *    options provided by userland.
  *  * Caller may want to save configuration state pointer to @ta_state
  *  * Caller needs to save desired runtime structure pointer(s)
  *    inside @ti fields. Note that it is not correct to save
  *    @ti pointer at this moment. Use -change_ti hook for that.
  *  * Caller has to fill in ti->lookup to appropriate function
  *    pointer.
  *
  *
  *
  * -destroy: request to destroy table instance.
  * typedef void (ta_destroy)(void *ta_state, struct table_info *ti);
  * MANDATORY, may be locked (UH+WLOCK). (M_NOWAIT).
  *
  * Frees all table entries and all tables structures allocated by -init.
  *
  *
  *
  * -prepare_add: request to allocate state for adding new entry.
  * typedef int (ta_prepare_add)(struct ip_fw_chain *ch, struct tentry_info *tei,
  *     void *ta_buf);
  * MANDATORY, unlocked. (M_WAITOK). Returns 0 on success.
  *
  * Allocates state and fills it in with all necessary data (EXCEPT value)
  * from @tei to minimize operations needed to be done under WLOCK.
  * "value" field has to be copied to new entry in @add callback.
  * Buffer ta_buf of size ta->ta_buf_sz may be used to store
  * allocated state.
  *
  *
  *
  * -prepare_del: request to set state for deleting existing entry.
  * typedef int (ta_prepare_del)(struct ip_fw_chain *ch, struct tentry_info *tei,
  *     void *ta_buf);
  * MANDATORY, locked, UH. (M_NOWAIT). Returns 0 on success.
  *
  * Buffer ta_buf of size ta->ta_buf_sz may be used to store
  * allocated state. Caller should use on-stack ta_buf allocation
  * instead of doing malloc().
  *
  *
  *
  * -add: request to insert new entry into runtime/config structures.
  *  typedef int (ta_add)(void *ta_state, struct table_info *ti,
  *     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
  * MANDATORY, UH+WLOCK. (M_NOWAIT). Returns 0 on success.
  *
  * Insert new entry using previously-allocated state in @ta_buf.
  * * @tei may have the following flags:
  *   TEI_FLAGS_UPDATE: request to add or update entry.
  *   TEI_FLAGS_DONTADD: request to update (but not add) entry.
  * * Caller is required to do the following:
  *   copy real entry value from @tei
  *   entry added: return 0, set 1 to @pnum
  *   entry updated: return 0, store 0 to @pnum, store old value in @tei,
  *     add TEI_FLAGS_UPDATED flag to @tei.
  *   entry exists: return EEXIST
  *   entry not found: return ENOENT
  *   other error: return non-zero error code.
  *
  *
  *
  * -del: request to delete existing entry from runtime/config structures.
  *  typedef int (ta_del)(void *ta_state, struct table_info *ti,
  *     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
  *  MANDATORY, UH+WLOCK. (M_NOWAIT). Returns 0 on success.
  *
  *  Delete entry using previously set up in @ta_buf.
  * * Caller is required to do the following:
  *   entry deleted: return 0, set 1 to @pnum, store old value in @tei.
  *   entry not found: return ENOENT
  *   other error: return non-zero error code.
  *
  *
  *
  * -flush_entry: flush entry state created by -prepare_add / -del / others
  *  typedef void (ta_flush_entry)(struct ip_fw_chain *ch,
  *      struct tentry_info *tei, void *ta_buf);
  *  MANDATORY, may be locked. (M_NOWAIT).
  *
  *  Delete state allocated by:
  *  -prepare_add (-add returned EEXIST|UPDATED)
  *  -prepare_del (if any)
  *  -del
  *  * Caller is required to handle empty @ta_buf correctly.
  *
  *
  * -find_tentry: finds entry specified by key @tei
  *  typedef int ta_find_tentry(void *ta_state, struct table_info *ti,
  *      ipfw_obj_tentry *tent);
  *  OPTIONAL, locked (UH). (M_NOWAIT). Returns 0 on success.
  *
  *  Finds entry specified by given key.
  *  * Caller is required to do the following:
  *    entry found: returns 0, export entry to @tent
  *    entry not found: returns ENOENT
  *
  *
  * -need_modify: checks if @ti has enough space to hold another @count items.
  *  typedef int (ta_need_modify)(void *ta_state, struct table_info *ti,
  *      uint32_t count, uint64_t *pflags);
  *  OPTIONAL, locked (UH). (M_NOWAIT). Returns 0 if has.
  *
  *  Checks if given table has enough space to add @count items without
  *  resize. Caller may use @pflags to store desired modification data.
  *
  *
  *
  * -prepare_mod: allocate structures for table modification.
  *  typedef int (ta_prepare_mod)(void *ta_buf, uint64_t *pflags);
  * OPTIONAL(need_modify), unlocked. (M_WAITOK). Returns 0 on success.
  *
  * Allocate all needed state for table modification. Caller
  * should use `struct mod_item` to store new state in @ta_buf.
  * Up to TA_BUF_SZ (128 bytes) can be stored in @ta_buf.
  * 
  *
  *
  * -fill_mod: copy some data to new state.
  *  typedef int (ta_fill_mod)(void *ta_state, struct table_info *ti,
  *      void *ta_buf, uint64_t *pflags);
  * OPTIONAL(need_modify), locked (UH). (M_NOWAIT). Returns 0 on success.
  *
  * Copy as much data as we can to minimize changes under WLOCK.
  * For example, array can be merged inside this callback.
  *
  *
  *
  * -modify: perform final modification.
  *  typedef void (ta_modify)(void *ta_state, struct table_info *ti,
  *      void *ta_buf, uint64_t pflags);
  * OPTIONAL(need_modify), locked (UH+WLOCK). (M_NOWAIT). 
  *
  * Performs all changes necessary to switch to new structures.
  * * Caller should save old pointers to @ta_buf storage.
  *
  *
  *
  * -flush_mod: flush table modification state.
  *  typedef void (ta_flush_mod)(void *ta_buf);
  * OPTIONAL(need_modify), unlocked. (M_WAITOK).
  *
  * Performs flush for the following:
  *   - prepare_mod (modification was not necessary)
  *   - modify (for the old state)
  *
  *
  *
  * -change_ti: monitor table info pointer changes
  * typedef void (ta_change_ti)(void *ta_state, struct table_info *ti);
  * OPTIONAL, locked (UH). (M_NOWAIT).
  *
  * Called on @ti pointer changed. Called immediately after -init
  * to set initial state.
  *
  *
  *
  * -foreach: calls @f for each table entry
  *  typedef void ta_foreach(void *ta_state, struct table_info *ti,
  *      ta_foreach_f *f, void *arg);
  * MANDATORY, locked(UH). (M_NOWAIT).
  *
  * Runs callback with specified argument for each table entry,
  * Typically used for dumping table entries.
  *
  *
  *
  * -dump_tentry: dump table entry in current @tentry format.
  *  typedef int ta_dump_tentry(void *ta_state, struct table_info *ti, void *e,
  *      ipfw_obj_tentry *tent);
  * MANDATORY, locked(UH). (M_NOWAIT). Returns 0 on success.
  *
  * Dumps entry @e to @tent.
  *
  *
  * -print_config: prints custom algorithm options into buffer.
  *  typedef void (ta_print_config)(void *ta_state, struct table_info *ti,
  *      char *buf, size_t bufsize);
  * OPTIONAL. locked(UH). (M_NOWAIT).
  *
  * Prints custom algorithm options in the format suitable to pass
  * back to -init callback.
  *
  *
  *
  * -dump_tinfo: dumps algo-specific info.
  *  typedef void ta_dump_tinfo(void *ta_state, struct table_info *ti,
  *      ipfw_ta_tinfo *tinfo);
  * OPTIONAL. locked(UH). (M_NOWAIT).
  *
  * Dumps options like items size/hash size, etc.
  */
 
 /* malloc(9) type used below for per-entry allocations (e.g. radix entries). */
 MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables");
 
 /*
  * Utility structures/functions common to more than one algo
  */
 
 /*
  * Table modification state; the callback description above recommends it
  * for carrying new state through @ta_buf in the -prepare_mod/-modify hooks.
  * NOTE(review): main_ptr/size vs. main_ptr6/size6 look like the IPv4 and
  * IPv6 halves of that state -- confirm against the users further down.
  */
 struct mod_item {
 	void	*main_ptr;
 	size_t	size;
 	void	*main_ptr6;
 	size_t	size6;
 };
 
 /*
  * Array helpers taking a bsearch()-style comparator; their definitions
  * are not part of this section.
  */
 static int badd(const void *key, void *item, void *base, size_t nmemb,
     size_t size, int (*compar) (const void *, const void *));
 static int bdel(const void *key, void *base, size_t nmemb, size_t size,
     int (*compar) (const void *, const void *));
 
 
 /*
  * ADDR implementation using radix
  *
  */
 
 /*
  * The radix code expects addr and mask to be array of bytes,
  * with the first byte being the length of the array. rn_inithead
  * is called with the offset in bits of the lookup key within the
  * array. If we use a sockaddr_in as the underlying type,
  * sin_len is conveniently located at offset 0, sin_addr is at
  * offset 4 and normally aligned.
  * But for portability, let's avoid assumption and make the code explicit
  */
 /* Leading byte of key/mask object @v (its length field), as an lvalue. */
 #define KEY_LEN(v)	*((uint8_t *)&(v))
 /*
  * Do not require radix to compare more than actual IPv4/IPv6 address
  */
 /* Key length in bytes: everything up to and including the address. */
 #define KEY_LEN_INET	(offsetof(struct sockaddr_in, sin_addr) + sizeof(in_addr_t))
 #define KEY_LEN_INET6	(offsetof(struct sa_in6, sin6_addr) + sizeof(struct in6_addr))
 
 /* Offset of the address inside the key, in bits; passed to rn_inithead(). */
 #define OFF_LEN_INET	(8 * offsetof(struct sockaddr_in, sin_addr))
 #define OFF_LEN_INET6	(8 * offsetof(struct sa_in6, sin6_addr))
 
 /*
  * IPv4 table entry stored in the radix tree.  Code in this file casts
  * radix node pointers returned by the tree directly to this structure,
  * so @rn has to stay the first member.
  */
 struct radix_addr_entry {
 	struct radix_node	rn[2];
 	struct sockaddr_in	addr;		/* masked key */
 	uint32_t		value;		/* value returned by lookups */
 	uint8_t			masklen;	/* prefix length of the key */
 };
 
 /*
  * Compact IPv6 key/mask used with the radix code: length byte first,
  * then family, padding, and the address itself.
  */
 struct sa_in6 {
 	uint8_t			sin6_len;
 	uint8_t			sin6_family;
 	uint8_t			pad[2];
 	struct in6_addr		sin6_addr;
 };
 
 /*
  * IPv6 counterpart of struct radix_addr_entry.  As with the IPv4 entry,
  * radix node pointers are cast to this type, so @rn stays first.
  */
 struct radix_addr_xentry {
 	struct radix_node	rn[2];
 	struct sa_in6		addr6;		/* masked key */
 	uint32_t		value;		/* value returned by lookups */
 	uint8_t			masklen;	/* prefix length of the key */
 };
 
 /*
  * Per-table configuration state for addr:radix, handed back to the
  * callbacks as @ta_state.
  */
 struct radix_cfg {
 	/* NOTE(review): head4/head6 are not referenced in this part of the file. */
 	struct radix_node_head	*head4;
 	struct radix_node_head	*head6;
 	/* Entry counters maintained by the add/del callbacks. */
 	size_t			count4;
 	size_t			count6;
 };
 
 /*
  * Layout of @ta_buf for addr:radix add/delete requests.
  */
 struct ta_buf_radix
 {
 	/* Entry allocated by prepare_add or unlinked by del; freed on flush. */
 	void *ent_ptr;
 	/* Key and (optional) mask handed to the radix lookup/add/del calls. */
 	struct sockaddr	*addr_ptr;
 	struct sockaddr	*mask_ptr;
 	/* Backing storage for the key/mask the pointers above may refer to. */
 	union {
 		struct {
 			struct sockaddr_in sa;
 			struct sockaddr_in ma;
 		} a4;
 		struct {
 			struct sa_in6 sa;
 			struct sa_in6 ma;
 		} a6;
 	} addr;
 };
 
 /* Forward declarations for the addr:radix implementation that follows. */
 static int ta_lookup_radix(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val);
 static int ta_init_radix(struct ip_fw_chain *ch, void **ta_state,
     struct table_info *ti, char *data, uint8_t tflags);
 static int flush_radix_entry(struct radix_node *rn, void *arg);
 static void ta_destroy_radix(void *ta_state, struct table_info *ti);
 static void ta_dump_radix_tinfo(void *ta_state, struct table_info *ti,
     ipfw_ta_tinfo *tinfo);
 static int ta_dump_radix_tentry(void *ta_state, struct table_info *ti,
     void *e, ipfw_obj_tentry *tent);
 static int ta_find_radix_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent);
 static void ta_foreach_radix(void *ta_state, struct table_info *ti,
     ta_foreach_f *f, void *arg);
 static void tei_to_sockaddr_ent(struct tentry_info *tei, struct sockaddr *sa,
     struct sockaddr *ma, int *set_mask);
 static int ta_prepare_add_radix(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf);
 static int ta_add_radix(void *ta_state, struct table_info *ti,
     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
 static int ta_prepare_del_radix(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf);
 static int ta_del_radix(void *ta_state, struct table_info *ti,
     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
 static void ta_flush_radix_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf);
 static int ta_need_modify_radix(void *ta_state, struct table_info *ti,
     uint32_t count, uint64_t *pflags);
 
 /*
  * Runtime lookup for addr:radix tables.
  *
  * A @keylen of sizeof(in_addr_t) selects the IPv4 tree (ti->state); any
  * other length is handled as an IPv6 key against ti->xstate.  On a match
  * the entry value is stored in @val and 1 is returned; otherwise 0 is
  * returned and @val is not written.
  */
 static int
 ta_lookup_radix(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val)
 {
 	struct radix_node_head *head;
 
 	if (keylen != sizeof(in_addr_t)) {
 		struct radix_addr_xentry *e6;
 		struct sa_in6 k6;
 
 		/* Only the length byte and the address take part in matching. */
 		KEY_LEN(k6) = KEY_LEN_INET6;
 		memcpy(&k6.sin6_addr, key, sizeof(struct in6_addr));
 		head = (struct radix_node_head *)ti->xstate;
 		e6 = (struct radix_addr_xentry *)
 		    (head->rnh_matchaddr(&k6, &head->rh));
 		if (e6 == NULL)
 			return (0);
 		*val = e6->value;
 		return (1);
 	} else {
 		struct radix_addr_entry *e4;
 		struct sockaddr_in k4;
 
 		KEY_LEN(k4) = KEY_LEN_INET;
 		k4.sin_addr.s_addr = *((in_addr_t *)key);
 		head = (struct radix_node_head *)ti->state;
 		e4 = (struct radix_addr_entry *)
 		    (head->rnh_matchaddr(&k4, &head->rh));
 		if (e4 == NULL)
 			return (0);
 		*val = e4->value;
 		return (1);
 	}
 }
 
 /*
  * Create a new addr:radix table instance.
  *
  * Sets up the IPv4 tree in ti->state and the IPv6 tree in ti->xstate,
  * allocates the zeroed configuration block returned through @ta_state
  * and installs the lookup handler.  Returns 0 on success or ENOMEM when
  * a radix head cannot be created.
  */
 static int
 ta_init_radix(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti,
     char *data, uint8_t tflags)
 {
 	struct radix_cfg *newcfg;
 
 	if (rn_inithead(&ti->state, OFF_LEN_INET) == 0)
 		return (ENOMEM);
 
 	if (rn_inithead(&ti->xstate, OFF_LEN_INET6) == 0) {
 		/* Undo the IPv4 head so nothing is left half-built. */
 		rn_detachhead(&ti->state);
 		return (ENOMEM);
 	}
 
 	newcfg = malloc(sizeof(*newcfg), M_IPFW, M_WAITOK | M_ZERO);
 
 	ti->lookup = ta_lookup_radix;
 	*ta_state = newcfg;
 
 	return (0);
 }
 
 /*
  * Tree-walk callback: unlink node @rn from the radix head passed in @arg
  * and free the entry that contained it.  Always returns 0.
  */
 static int
 flush_radix_entry(struct radix_node *rn, void *arg)
 {
 	struct radix_node_head *head;
 	void *removed;
 
 	head = (struct radix_node_head *)arg;
 
 	removed = head->rnh_deladdr(rn->rn_key, rn->rn_mask, &head->rh);
 	if (removed == NULL)
 		return (0);
 
 	free(removed, M_IPFW_TBL);
 	return (0);
 }
 
 /*
  * Destroy an addr:radix table instance: flush every entry from both
  * trees, detach the radix heads and release the configuration block
  * passed as @ta_state.
  */
 static void
 ta_destroy_radix(void *ta_state, struct table_info *ti)
 {
 	struct radix_node_head *head;
 
 	/* IPv4 tree */
 	head = (struct radix_node_head *)(ti->state);
 	head->rnh_walktree(&head->rh, flush_radix_entry, head);
 	rn_detachhead(&ti->state);
 
 	/* IPv6 tree */
 	head = (struct radix_node_head *)(ti->xstate);
 	head->rnh_walktree(&head->rh, flush_radix_entry, head);
 	rn_detachhead(&ti->xstate);
 
 	free(ta_state, M_IPFW);
 }
 
 /*
  * Report algo-specific table info: both address families use the radix
  * class, with per-family entry counts and entry sizes.
  */
 static void
 ta_dump_radix_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo)
 {
 	struct radix_cfg *rcfg = (struct radix_cfg *)ta_state;
 
 	tinfo->flags = IPFW_TATFLAGS_AFDATA | IPFW_TATFLAGS_AFITEM;
 
 	/* IPv4 */
 	tinfo->taclass4 = IPFW_TACLASS_RADIX;
 	tinfo->itemsize4 = sizeof(struct radix_addr_entry);
 	tinfo->count4 = rcfg->count4;
 
 	/* IPv6 */
 	tinfo->taclass6 = IPFW_TACLASS_RADIX;
 	tinfo->itemsize6 = sizeof(struct radix_addr_xentry);
 	tinfo->count6 = rcfg->count6;
 }
 
 /*
  * Export radix entry @e into @tent.
  *
  * The address family stored in the entry decides whether @e is read as
  * an IPv4 or (with INET6) an IPv6 entry.  Always returns 0.
  */
 static int
 ta_dump_radix_tentry(void *ta_state, struct table_info *ti, void *e,
     ipfw_obj_tentry *tent)
 {
 	struct radix_addr_entry *v4;
 #ifdef INET6
 	struct radix_addr_xentry *v6;
 #endif
 
 	v4 = (struct radix_addr_entry *)e;
 
 	/* Guess IPv4/IPv6 radix by sockaddr family */
 	if (v4->addr.sin_family == AF_INET) {
 		tent->subtype = AF_INET;
 		tent->masklen = v4->masklen;
 		tent->k.addr.s_addr = v4->addr.sin_addr.s_addr;
 		tent->v.kidx = v4->value;
 		return (0);
 	}
 
 #ifdef INET6
 	v6 = (struct radix_addr_xentry *)e;
 	tent->subtype = AF_INET6;
 	tent->masklen = v6->masklen;
 	memcpy(&tent->k, &v6->addr6.sin6_addr, sizeof(struct in6_addr));
 	tent->v.kidx = v6->value;
 #endif
 
 	return (0);
 }
 
 /*
  * Find the entry matching the address in @tent and export it back into
  * @tent.  An AF_INET subtype searches the IPv4 tree; anything else is
  * searched in the IPv6 tree.
  *
  * Returns 0 when an entry matched, ENOENT otherwise.
  */
 static int
 ta_find_radix_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent)
 {
 	struct radix_node_head *head;
 	void *match;
 
 	if (tent->subtype != AF_INET) {
 		struct sa_in6 k6;
 
 		KEY_LEN(k6) = KEY_LEN_INET6;
 		memcpy(&k6.sin6_addr, &tent->k.addr6, sizeof(struct in6_addr));
 		head = (struct radix_node_head *)ti->xstate;
 		match = head->rnh_matchaddr(&k6, &head->rh);
 	} else {
 		struct sockaddr_in k4;
 
 		KEY_LEN(k4) = KEY_LEN_INET;
 		k4.sin_addr.s_addr = tent->k.addr.s_addr;
 		head = (struct radix_node_head *)ti->state;
 		match = head->rnh_matchaddr(&k4, &head->rh);
 	}
 
 	if (match == NULL)
 		return (ENOENT);
 
 	ta_dump_radix_tentry(ta_state, ti, match, tent);
 	return (0);
 }
 
 /*
  * Invoke @f with @arg on every entry of the table: the IPv4 tree is
  * walked first, then the IPv6 tree.
  */
 static void
 ta_foreach_radix(void *ta_state, struct table_info *ti, ta_foreach_f *f,
     void *arg)
 {
 	walktree_f_t *walker = (walktree_f_t *)f;
 	struct radix_node_head *head4, *head6;
 
 	head4 = (struct radix_node_head *)(ti->state);
 	head4->rnh_walktree(&head4->rh, walker, arg);
 
 	head6 = (struct radix_node_head *)(ti->xstate);
 	head6->rnh_walktree(&head6->rh, walker, arg);
 }
 
 
 #ifdef INET6
 static inline void ipv6_writemask(struct in6_addr *addr6, uint8_t mask);
 
 /*
  * Write the IPv6 netmask for prefix length @mask into @addr6.
  *
  * All four 32-bit words of @addr6 are written: words fully covered by
  * the prefix become all-ones, the word holding the prefix boundary (if
  * any) gets its leading bits set in network byte order, and every
  * remaining word is cleared.  Nothing is ever written past the end of
  * @addr6; prefix lengths above 128 produce the same result as 128.
  */
 static inline void
 ipv6_writemask(struct in6_addr *addr6, uint8_t mask)
 {
 	uint32_t *cp;
 	int i;
 
 	cp = (uint32_t *)addr6;
 	for (i = 0; i < 4; i++) {
 		if (mask >= 32) {
 			cp[i] = 0xFFFFFFFF;
 			mask -= 32;
 		} else if (mask > 0) {
 			/* Unsigned shift: 1 << 31 does not fit a signed int. */
 			cp[i] = htonl(~((1U << (32 - mask)) - 1));
 			mask = 0;
 		} else {
 			/* Past the prefix: do not leave stale data behind. */
 			cp[i] = 0;
 		}
 	}
 }
 #endif
 
 /*
  * Convert the key described by @tei into radix key/mask form.
  *
  * For AF_INET (and, with INET6, AF_INET6) keys the length byte of both
  * @sa and @ma is set, the family is stored in @sa, the netmask for
  * tei->masklen is written to @ma and the masked address to @sa.
  * @set_mask is set to 1 when the prefix is shorter than a full host
  * address (so the mask is needed) and to 0 for host prefixes.
  * For any other subtype nothing is written.
  *
  * The prefix length is not validated here; the callers in this file
  * reject lengths above 32/128 before calling.
  */
 static void
 tei_to_sockaddr_ent(struct tentry_info *tei, struct sockaddr *sa,
     struct sockaddr *ma, int *set_mask)
 {
 	int mlen;
 #ifdef INET
 	struct sockaddr_in *addr, *mask;
 #endif
 #ifdef INET6
 	struct sa_in6 *addr6, *mask6;
 #endif
 	in_addr_t a4;
 
 	mlen = tei->masklen;
 
 	if (tei->subtype == AF_INET) {
 #ifdef INET
 		addr = (struct sockaddr_in *)sa;
 		mask = (struct sockaddr_in *)ma;
 		/* Set 'total' structure length */
 		KEY_LEN(*addr) = KEY_LEN_INET;
 		KEY_LEN(*mask) = KEY_LEN_INET;
 		addr->sin_family = AF_INET;
 		/* Unsigned shift: 1 << 31 (mlen == 1) overflows a signed int. */
 		mask->sin_addr.s_addr =
 		    htonl(mlen ? ~((1U << (32 - mlen)) - 1) : 0);
 		a4 = *((in_addr_t *)tei->paddr);
 		addr->sin_addr.s_addr = a4 & mask->sin_addr.s_addr;
 		if (mlen != 32)
 			*set_mask = 1;
 		else
 			*set_mask = 0;
 #endif
 #ifdef INET6
 	} else if (tei->subtype == AF_INET6) {
 		/* IPv6 case */
 		addr6 = (struct sa_in6 *)sa;
 		mask6 = (struct sa_in6 *)ma;
 		/* Set 'total' structure length */
 		KEY_LEN(*addr6) = KEY_LEN_INET6;
 		KEY_LEN(*mask6) = KEY_LEN_INET6;
 		addr6->sin6_family = AF_INET6;
 		ipv6_writemask(&mask6->sin6_addr, mlen);
 		memcpy(&addr6->sin6_addr, tei->paddr, sizeof(struct in6_addr));
 		APPLY_MASK(&addr6->sin6_addr, &mask6->sin6_addr);
 		if (mlen != 128)
 			*set_mask = 1;
 		else
 			*set_mask = 0;
 #endif
 	}
 }
 
 /*
  * -prepare_add callback for addr:radix.
  *
  * Validates the prefix length, allocates a zeroed table entry
  * (M_WAITOK) and fills in its key from @tei.  The entry is remembered
  * in tb->ent_ptr, the key pointer in tb->addr_ptr and, unless the key
  * is a host prefix, the mask built inside @ta_buf in tb->mask_ptr.
  *
  * Returns 0 on success, EINVAL for an over-long prefix or an
  * unsupported key subtype.
  */
 static int
 ta_prepare_add_radix(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_radix *tb;
 	struct radix_addr_entry *ent;
 #ifdef INET6
 	struct radix_addr_xentry *xent;
 #endif
 	struct sockaddr *addr, *mask;
 	int mlen, set_mask;
 
 	tb = (struct ta_buf_radix *)ta_buf;
 
 	mlen = tei->masklen;
 	set_mask = 0;
 	
 	if (tei->subtype == AF_INET) {
 #ifdef INET
 		if (mlen > 32)
 			return (EINVAL);
 		ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO);
 		ent->masklen = mlen;
 
 		/* The key lives inside the entry; the mask only in @ta_buf. */
 		addr = (struct sockaddr *)&ent->addr;
 		mask = (struct sockaddr *)&tb->addr.a4.ma;
 		tb->ent_ptr = ent;
 #endif
 #ifdef INET6
 	} else if (tei->subtype == AF_INET6) {
 		/* IPv6 case */
 		if (mlen > 128)
 			return (EINVAL);
 		xent = malloc(sizeof(*xent), M_IPFW_TBL, M_WAITOK | M_ZERO);
 		xent->masklen = mlen;
 
 		addr = (struct sockaddr *)&xent->addr6;
 		mask = (struct sockaddr *)&tb->addr.a6.ma;
 		tb->ent_ptr = xent;
 #endif
 	} else {
 		/* Unknown CIDR type */
 		return (EINVAL);
 	}
 
 	tei_to_sockaddr_ent(tei, addr, mask, &set_mask);
 	/* Set pointers */
 	tb->addr_ptr = addr;
 	/*
 	 * mask_ptr is written only when a mask is needed.
 	 * NOTE(review): presumably @ta_buf arrives zeroed so that mask_ptr
 	 * is NULL for host prefixes -- confirm against the caller.
 	 */
 	if (set_mask != 0)
 		tb->mask_ptr = mask;
 
 	return (0);
 }
 
 /*
  * -add callback for addr:radix: insert the entry prepared in @ta_buf,
  * or update the value of an already existing one.
  *
  * Returns:
  *   0       entry added (*pnum = 1, tb->ent_ptr cleared), or entry
  *           updated (*pnum = 0, old value stored in tei->value,
  *           TEI_FLAGS_UPDATED set);
  *   EEXIST  key already present and TEI_FLAGS_UPDATE not requested;
  *   EFBIG   key absent and TEI_FLAGS_DONTADD requested;
  *   EINVAL  the radix code refused the insertion.
  */
 static int
 ta_add_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei,
     void *ta_buf, uint32_t *pnum)
 {
 	struct radix_cfg *cfg;
 	struct radix_node_head *rnh;
 	struct radix_node *rn;
 	struct ta_buf_radix *tb;
 	uint32_t *pval, prev;
 	int is_v4;
 
 	cfg = (struct radix_cfg *)ta_state;
 	tb = (struct ta_buf_radix *)ta_buf;
 	is_v4 = (tei->subtype == AF_INET);
 
 	/* Pick the tree and copy the value from @tei into the new entry. */
 	if (is_v4) {
 		rnh = ti->state;
 		pval = &((struct radix_addr_entry *)tb->ent_ptr)->value;
 	} else {
 		rnh = ti->xstate;
 		pval = &((struct radix_addr_xentry *)tb->ent_ptr)->value;
 	}
 	*pval = tei->value;
 
 	/* Search for an entry first */
 	rn = rnh->rnh_lookup(tb->addr_ptr, tb->mask_ptr, &rnh->rh);
 	if (rn == NULL) {
 		/* No such key yet: this is a plain insertion. */
 		if ((tei->flags & TEI_FLAGS_DONTADD) != 0)
 			return (EFBIG);
 
 		rn = rnh->rnh_addaddr(tb->addr_ptr, tb->mask_ptr, &rnh->rh,
 		    tb->ent_ptr);
 		if (rn == NULL) {
 			/* Unknown error */
 			return (EINVAL);
 		}
 
 		if (is_v4)
 			cfg->count4++;
 		else
 			cfg->count6++;
 		/* The tree references the entry now; keep flush away from it. */
 		tb->ent_ptr = NULL;
 		*pnum = 1;
 
 		return (0);
 	}
 
 	/* Record already exists. Update value if we're asked to */
 	if ((tei->flags & TEI_FLAGS_UPDATE) == 0)
 		return (EEXIST);
 
 	if (is_v4)
 		pval = &((struct radix_addr_entry *)rn)->value;
 	else
 		pval = &((struct radix_addr_xentry *)rn)->value;
 
 	/* Swap: the tree gets the new value, @tei carries back the old one. */
 	prev = *pval;
 	*pval = tei->value;
 	tei->value = prev;
 
 	/* Indicate that update has happened instead of addition */
 	tei->flags |= TEI_FLAGS_UPDATED;
 	*pnum = 0;
 
 	return (0);
 }
 
 /*
  * -prepare_del callback for addr:radix.
  *
  * Builds the key (and, unless it is a host prefix, the mask) for the
  * entry to delete inside @ta_buf and records pointers to them in
  * tb->addr_ptr / tb->mask_ptr.  No memory is allocated.
  *
  * Returns 0 on success, EINVAL for an over-long prefix or an
  * unsupported key subtype.
  */
 static int
 ta_prepare_del_radix(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_radix *tb;
 	struct sockaddr *addr, *mask;
 	int mlen, set_mask;
 
 	tb = (struct ta_buf_radix *)ta_buf;
 
 	mlen = tei->masklen;
 	set_mask = 0;
 
 	if (tei->subtype == AF_INET) {
 		if (mlen > 32)
 			return (EINVAL);
 
 		addr = (struct sockaddr *)&tb->addr.a4.sa;
 		mask = (struct sockaddr *)&tb->addr.a4.ma;
 #ifdef INET6
 	} else if (tei->subtype == AF_INET6) {
 		if (mlen > 128)
 			return (EINVAL);
 
 		addr = (struct sockaddr *)&tb->addr.a6.sa;
 		mask = (struct sockaddr *)&tb->addr.a6.ma;
 #endif
 	} else
 		return (EINVAL);
 
 	tei_to_sockaddr_ent(tei, addr, mask, &set_mask);
 	tb->addr_ptr = addr;
 	/*
 	 * NOTE(review): mask_ptr is left untouched for host prefixes;
 	 * presumably @ta_buf arrives zeroed -- confirm against the caller.
 	 */
 	if (set_mask != 0)
 		tb->mask_ptr = mask;
 
 	return (0);
 }
 
 /*
  * -del callback for addr:radix: unlink the entry whose key/mask were set
  * up in @ta_buf.
  *
  * On success returns 0, stores the removed entry's value in tei->value,
  * keeps the unlinked entry in tb->ent_ptr (released later by the flush
  * callback), decrements the per-family counter and sets *pnum to 1.
  * Returns ENOENT when no such entry exists.
  */
 static int
 ta_del_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei,
     void *ta_buf, uint32_t *pnum)
 {
 	struct radix_cfg *cfg;
 	struct radix_node_head *rnh;
 	struct radix_node *victim;
 	struct ta_buf_radix *tb;
 	int is_v4;
 
 	cfg = (struct radix_cfg *)ta_state;
 	tb = (struct ta_buf_radix *)ta_buf;
 	is_v4 = (tei->subtype == AF_INET);
 
 	if (is_v4)
 		rnh = ti->state;
 	else
 		rnh = ti->xstate;
 
 	victim = rnh->rnh_deladdr(tb->addr_ptr, tb->mask_ptr, &rnh->rh);
 	if (victim == NULL)
 		return (ENOENT);
 
 	/* Save entry value to @tei and account for the removal. */
 	if (is_v4) {
 		tei->value = ((struct radix_addr_entry *)victim)->value;
 		cfg->count4--;
 	} else {
 		tei->value = ((struct radix_addr_xentry *)victim)->value;
 		cfg->count6--;
 	}
 
 	tb->ent_ptr = victim;
 	*pnum = 1;
 
 	return (0);
 }
 
 /*
  * -flush_entry callback for addr:radix: free the entry still referenced
  * by @ta_buf, if any.  That is either an entry that was prepared but
  * not inserted (a successful add clears ent_ptr) or one that the del
  * callback unlinked from the tree.
  */
 static void
 ta_flush_radix_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_radix *tb = (struct ta_buf_radix *)ta_buf;
 
 	if (tb->ent_ptr == NULL)
 		return;
 
 	free(tb->ent_ptr, M_IPFW_TBL);
 }
 
 /*
  * -need_modify callback for addr:radix: always returns 0, i.e. the
  * table never asks to be resized before @count items are added.
  */
 static int
 ta_need_modify_radix(void *ta_state, struct table_info *ti, uint32_t count,
     uint64_t *pflags)
 {
 
 	/*
 	 * radix does not require additional memory allocations
 	 * other than the nodes themselves. Adding new masks to the tree
 	 * does, but we don't have any API to call (and we don't know
 	 * which sizes we need).
 	 */
 	return (0);
 }
 
 /*
  * Algorithm descriptor for "addr:radix", flagged as a default algorithm
  * (TA_FLAG_DEFAULT).  It wires up the callbacks defined above; the
  * -prepare_mod/-fill_mod/-modify/-flush_mod hooks are not provided.
  */
 struct table_algo addr_radix = {
 	.name		= "addr:radix",
 	.type		= IPFW_TABLE_ADDR,
 	.flags		= TA_FLAG_DEFAULT,
 	.ta_buf_size	= sizeof(struct ta_buf_radix),
 	.init		= ta_init_radix,
 	.destroy	= ta_destroy_radix,
 	.prepare_add	= ta_prepare_add_radix,
 	.prepare_del	= ta_prepare_del_radix,
 	.add		= ta_add_radix,
 	.del		= ta_del_radix,
 	.flush_entry	= ta_flush_radix_entry,
 	.foreach	= ta_foreach_radix,
 	.dump_tentry	= ta_dump_radix_tentry,
 	.find_tentry	= ta_find_radix_tentry,
 	.dump_tinfo	= ta_dump_radix_tinfo,
 	.need_modify	= ta_need_modify_radix,
 };
 
 
 /*
  * addr:hash cmds
  *
  *
  * ti->data:
  * [inv.mask4][inv.mask6][log2hsize4][log2hsize6]
  * [        8][        8][         8][         8]
  *
  * inv.mask4: 32 - mask
  * inv.mask6:
  * 1) _slow lookup: mask
  * 2) _aligned: (128 - mask) / 8
  * 3) _64: 8
  *
  *
  * pflags:
  * [v4=1/v6=0][hsize]
  * [       32][   32]
  */
 
 struct chashentry;
 
 /* Hash bucket: head of a singly-linked list of entries. */
 SLIST_HEAD(chashbhead, chashentry);
 
 /*
  * Configuration state for addr:hash tables.
  * NOTE(review): none of the fields are used in this part of the file;
  * by name they look like per-family bucket arrays, bucket counts, item
  * counts and prefix lengths -- confirm against ta_init_chash().
  */
 struct chash_cfg {
 	struct chashbhead *head4;
 	struct chashbhead *head6;
 	size_t	size4;
 	size_t	size6;
 	size_t	items4;
 	size_t	items6;
 	uint8_t	mask4;
 	uint8_t	mask6;
 };
 
 /*
  * One addr:hash table entry, linked into its bucket list through @next.
  * The lookup routines compare @a against the (masked/shifted) packet
  * address and return @value on a match.
  */
 struct chashentry {
 	SLIST_ENTRY(chashentry)	next;
 	uint32_t	value;
 	/* NOTE(review): presumably the address family of @a -- confirm. */
 	uint32_t	type;
 	union {
 		uint32_t	a4;	/* Host format */
 		struct in6_addr	a6;	/* Network format */
 	} a;
 };
 
 /*
  * Layout of @ta_buf for addr:hash requests.
  * NOTE(review): the users are outside this section; by analogy with
  * struct ta_buf_radix, @ent_ptr presumably tracks an allocated/removed
  * entry and @ent holds the key for the request -- confirm.
  */
 struct ta_buf_chash
 {
 	void *ent_ptr;
 	struct chashentry ent;
 };
 
 /* Forward declarations for the addr:hash implementation. */
 #ifdef INET
 static __inline uint32_t hash_ip(uint32_t addr, int hsize);
 #endif
 #ifdef INET6
 static __inline uint32_t hash_ip6(struct in6_addr *addr6, int hsize);
 static __inline uint16_t hash_ip64(struct in6_addr *addr6, int hsize);
 static __inline uint32_t hash_ip6_slow(struct in6_addr *addr6, void *key,
     int mask, int hsize);
 static __inline uint32_t hash_ip6_al(struct in6_addr *addr6, void *key, int mask,
     int hsize);
 #endif
 static int ta_lookup_chash_slow(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val);
 static int ta_lookup_chash_aligned(struct table_info *ti, void *key,
     uint32_t keylen, uint32_t *val);
 static int ta_lookup_chash_64(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val);
 static int chash_parse_opts(struct chash_cfg *cfg, char *data);
 static void ta_print_chash_config(void *ta_state, struct table_info *ti,
     char *buf, size_t bufsize);
 static int ta_log2(uint32_t v);
 static int ta_init_chash(struct ip_fw_chain *ch, void **ta_state,
     struct table_info *ti, char *data, uint8_t tflags);
 static void ta_destroy_chash(void *ta_state, struct table_info *ti);
 static void ta_dump_chash_tinfo(void *ta_state, struct table_info *ti,
     ipfw_ta_tinfo *tinfo);
 static int ta_dump_chash_tentry(void *ta_state, struct table_info *ti,
     void *e, ipfw_obj_tentry *tent);
 static uint32_t hash_ent(struct chashentry *ent, int af, int mlen,
     uint32_t size);
 static int tei_to_chash_ent(struct tentry_info *tei, struct chashentry *ent);
 static int ta_find_chash_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent);
 static void ta_foreach_chash(void *ta_state, struct table_info *ti,
     ta_foreach_f *f, void *arg);
 static int ta_prepare_add_chash(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf);
 static int ta_add_chash(void *ta_state, struct table_info *ti,
     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
 static int ta_prepare_del_chash(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf);
 static int ta_del_chash(void *ta_state, struct table_info *ti,
     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
 static void ta_flush_chash_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf);
 static int ta_need_modify_chash(void *ta_state, struct table_info *ti,
     uint32_t count, uint64_t *pflags);
 static int ta_prepare_mod_chash(void *ta_buf, uint64_t *pflags);
 static int ta_fill_mod_chash(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t *pflags);
 static void ta_modify_chash(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t pflags);
 static void ta_flush_mod_chash(void *ta_buf);
 
 
 #ifdef INET
 /*
  * Bucket index for an IPv4 key: @addr modulo (@hsize - 1).
  */
 static __inline uint32_t
 hash_ip(uint32_t addr, int hsize)
 {
 	const uint32_t modulus = (uint32_t)(hsize - 1);
 
 	return (addr % modulus);
 }
 #endif
 
 #ifdef INET6
 /*
  * Bucket index for a full IPv6 key: XOR of the four 32-bit words of
  * @addr6, modulo (@hsize - 1).
  */
 static __inline uint32_t
 hash_ip6(struct in6_addr *addr6, int hsize)
 {
 	uint32_t folded;
 
 	folded = addr6->s6_addr32[0];
 	folded ^= addr6->s6_addr32[1];
 	folded ^= addr6->s6_addr32[2];
 	folded ^= addr6->s6_addr32[3];
 
 	return (folded % (hsize - 1));
 }
 
 
 /*
  * Bucket index for the upper 64 bits of an IPv6 key: XOR of the first
  * two 32-bit words of @addr6, modulo (@hsize - 1), returned as a
  * 16-bit value.
  */
 static __inline uint16_t
 hash_ip64(struct in6_addr *addr6, int hsize)
 {
 	uint32_t folded;
 
 	folded = addr6->s6_addr32[0];
 	folded ^= addr6->s6_addr32[1];
 
 	return (folded % (hsize - 1));
 }
 
 
 static __inline uint32_t
 hash_ip6_slow(struct in6_addr *addr6, void *key, int mask, int hsize)
 {
 	struct in6_addr mask6;
 
 	ipv6_writemask(&mask6, mask);
 	memcpy(addr6, key, sizeof(struct in6_addr));
 	APPLY_MASK(addr6, &mask6);
 	return (hash_ip6(addr6, hsize));
 }
 
 /*
  * Byte-aligned variant: clear @addr6, copy the first @mask bytes of
  * @key into it and return the bucket index of the result.  Note that
  * @mask is used as a byte count here, not as a prefix length in bits.
  */
 static __inline uint32_t
 hash_ip6_al(struct in6_addr *addr6, void *key, int mask, int hsize)
 {
 	uint64_t *halves = (uint64_t *)addr6;
 
 	halves[0] = 0;
 	halves[1] = 0;
 	memcpy(addr6, key, mask);
 
 	return (hash_ip6(addr6, hsize));
 }
 #endif
 
 /*
  * addr:hash lookup, generic variant (IPv6 prefix of arbitrary length).
  *
  * @keylen equal to sizeof(in_addr_t) selects the IPv4 buckets in
  * ti->state, any other length the IPv6 buckets in ti->xstate.  The
  * mask and hash-size parameters are unpacked from ti->data (see the
  * layout description above).  Returns 1 and stores the entry value in
  * @val on a match, 0 otherwise.
  */
 static int
 ta_lookup_chash_slow(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val)
 {
 	struct chashbhead *head;
 	struct chashentry *ent;
 	uint16_t hash, hsize;
 	uint8_t imask;
 
 	if (keylen == sizeof(in_addr_t)) {
 #ifdef INET
 		head = (struct chashbhead *)ti->state;
 		/* Top byte: number of low address bits to drop (32 - mask). */
 		imask = ti->data >> 24;
 		/* Bits 8..15: log2 of the IPv4 hash size. */
 		hsize = 1 << ((ti->data & 0xFFFF) >> 8);
 		uint32_t a;
 		/* Entries keep the address in host order, already shifted. */
 		a = ntohl(*((in_addr_t *)key));
 		a = a >> imask;
 		hash = hash_ip(a, hsize);
 		SLIST_FOREACH(ent, &head[hash], next) {
 			if (ent->a.a4 == a) {
 				*val = ent->value;
 				return (1);
 			}
 		}
 #endif
 	} else {
 #ifdef INET6
 		/* IPv6: worst scenario: non-round mask */
 		struct in6_addr addr6;
 		head = (struct chashbhead *)ti->xstate;
 		/* Bits 16..23: IPv6 prefix length; low byte: log2 hash size. */
 		imask = (ti->data & 0xFF0000) >> 16;
 		hsize = 1 << (ti->data & 0xFF);
 		/* Leaves the masked copy of the key in addr6. */
 		hash = hash_ip6_slow(&addr6, key, imask, hsize);
 		SLIST_FOREACH(ent, &head[hash], next) {
 			if (memcmp(&ent->a.a6, &addr6, 16) == 0) {
 				*val = ent->value;
 				return (1);
 			}
 		}
 #endif
 	}
 
 	return (0);
 }
 
 /*
  * addr:hash lookup, variant for IPv6 prefixes aligned to 8 bits.
  *
  * The IPv4 path is the same as in ta_lookup_chash_slow().  For IPv6 the
  * key is truncated with a plain byte copy (hash_ip6_al()) instead of a
  * bitwise mask, and entries are compared as two 64-bit words.
  * Returns 1 and stores the entry value in @val on a match, 0 otherwise.
  */
 static int
 ta_lookup_chash_aligned(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val)
 {
 	struct chashbhead *head;
 	struct chashentry *ent;
 	uint16_t hash, hsize;
 	uint8_t imask;
 
 	if (keylen == sizeof(in_addr_t)) {
 #ifdef INET
 		head = (struct chashbhead *)ti->state;
 		/* Top byte: number of low address bits to drop (32 - mask). */
 		imask = ti->data >> 24;
 		/* Bits 8..15: log2 of the IPv4 hash size. */
 		hsize = 1 << ((ti->data & 0xFFFF) >> 8);
 		uint32_t a;
 		a = ntohl(*((in_addr_t *)key));
 		a = a >> imask;
 		hash = hash_ip(a, hsize);
 		SLIST_FOREACH(ent, &head[hash], next) {
 			if (ent->a.a4 == a) {
 				*val = ent->value;
 				return (1);
 			}
 		}
 #endif
 	} else {
 #ifdef INET6
 		/* IPv6: aligned to 8bit mask */
 		struct in6_addr addr6;
 		uint64_t *paddr, *ptmp;
 		head = (struct chashbhead *)ti->xstate;
 		/* Bits 16..23 are handed to hash_ip6_al() as a byte count. */
 		imask = (ti->data & 0xFF0000) >> 16;
 		hsize = 1 << (ti->data & 0xFF);
 
 		hash = hash_ip6_al(&addr6, key, imask, hsize);
 		paddr = (uint64_t *)&addr6;
 		SLIST_FOREACH(ent, &head[hash], next) {
 			ptmp = (uint64_t *)&ent->a.a6;
 			if (paddr[0] == ptmp[0] && paddr[1] == ptmp[1]) {
 				*val = ent->value;
 				return (1);
 			}
 		}
 #endif
 	}
 
 	return (0);
 }
 
 /*
  * Lookup routine selected when the IPv6 mask is exactly /64.
  *
  * IPv4 keys (keylen == sizeof(in_addr_t)) are searched in ti->state.  For
  * any other key length only the first 64 bits of the key are hashed (via
  * hash_ip64()) and compared against the entries in ti->xstate.
  *
  * Returns 1 and stores the entry value in @val on a match, 0 otherwise.
  */
 static int
 ta_lookup_chash_64(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val)
 {
 	struct chashbhead *buckets;
 	struct chashentry *cur;
 	uint16_t slot, nbuckets;
 	uint8_t mparam;
 
 	if (keylen != sizeof(in_addr_t)) {
 #ifdef INET6
 		/* IPv6: /64 */
 		uint64_t prefix, *p64;
 
 		buckets = (struct chashbhead *)ti->xstate;
 		p64 = (uint64_t *)key;
 		nbuckets = 1 << (ti->data & 0xFF);
 		prefix = *p64;
 		slot = hash_ip64((struct in6_addr *)key, nbuckets);
 		SLIST_FOREACH(cur, &buckets[slot], next) {
 			p64 = (uint64_t *)&cur->a.a6;
 			if (prefix != *p64)
 				continue;
 			*val = cur->value;
 			return (1);
 		}
 #endif
 		return (0);
 	}
 
 #ifdef INET
 	{
 		uint32_t addr;
 
 		buckets = (struct chashbhead *)ti->state;
 		mparam = ti->data >> 24;
 		nbuckets = 1 << ((ti->data & 0xFFFF) >> 8);
 		addr = ntohl(*((in_addr_t *)key));
 		addr = addr >> mparam;
 		slot = hash_ip(addr, nbuckets);
 		SLIST_FOREACH(cur, &buckets[slot], next) {
 			if (cur->a.a4 != addr)
 				continue;
 			*val = cur->value;
 			return (1);
 		}
 	}
 #endif
 
 	return (0);
 }
 
 /*
  * Parses the optional "masks=/XX[,/YY]" suffix of the algorithm string
  * @data and stores the resulting IPv4/IPv6 mask lengths in @cfg.
  *
  * A NULL @data, or a string without a space-separated option part, leaves
  * @cfg untouched.  @data is modified in place: the first space following
  * the option is overwritten with '\0'.
  *
  * Returns 0 on success and EINVAL for a malformed option, a missing
  * number, or a mask outside 0..32 (IPv4) / 0..128 (IPv6).
  */
 static int
 chash_parse_opts(struct chash_cfg *cfg, char *data)
 {
 	char *pdel, *pend, *s;
 	long mask4, mask6;
 
 	/* Values not mentioned in the option string keep their setting. */
 	mask4 = cfg->mask4;
 	mask6 = cfg->mask6;
 
 	if (data == NULL)
 		return (0);
 	if ((pdel = strchr(data, ' ')) == NULL)
 		return (0);
 	while (*pdel == ' ')
 		pdel++;
 	if (strncmp(pdel, "masks=", 6) != 0)
 		return (EINVAL);
 	/* Cut off anything that follows the option. */
 	if ((s = strchr(pdel, ' ')) != NULL)
 		*s = '\0';
 
 	pdel += 6;
 	/* Need /XX[,/YY] */
 	if (*pdel++ != '/')
 		return (EINVAL);
 	mask4 = strtol(pdel, &pend, 10);
 	/* strtol() leaves pend == pdel when it consumed no digits. */
 	if (pend == pdel)
 		return (EINVAL);
 	if (*pend == ',') {
 		/* ,/YY */
 		pdel = pend + 1;
 		if (*pdel++ != '/')
 			return (EINVAL);
 		mask6 = strtol(pdel, &pend, 10);
 		if (pend == pdel || *pend != '\0')
 			return (EINVAL);
 	} else if (*pend != '\0')
 		return (EINVAL);
 
 	/*
 	 * Range-check the full-width strtol() results so that huge values
 	 * cannot wrap into the valid range when narrowed.
 	 */
 	if (mask4 < 0 || mask4 > 32 || mask6 < 0 || mask6 > 128)
 		return (EINVAL);
 
 	cfg->mask4 = mask4;
 	cfg->mask6 = mask6;
 
 	return (0);
 }
 
 /*
  * Writes the algorithm name into @buf (at most @bufsize bytes), followed
  * by the "masks=" option when either mask differs from /32 and /128.
  */
 static void
 ta_print_chash_config(void *ta_state, struct table_info *ti, char *buf,
     size_t bufsize)
 {
 	struct chash_cfg *conf;
 
 	conf = (struct chash_cfg *)ta_state;
 
 	/* Default masks: the bare algorithm name is enough. */
 	if (conf->mask4 == 32 && conf->mask6 == 128) {
 		snprintf(buf, bufsize, "%s", "addr:hash");
 		return;
 	}
 
 	snprintf(buf, bufsize, "%s masks=/%d,/%d", "addr:hash",
 	    conf->mask4, conf->mask6);
 }
 
 /*
  * Returns the position of the highest set bit of @v, i.e. floor(log2(v)).
  * Both 0 and 1 yield 0.
  */
 static int
 ta_log2(uint32_t v)
 {
 	uint32_t bits;
 
 	/* Count how many times @v can be halved before reaching zero. */
 	for (bits = 0; (v >>= 1) != 0; bits++)
 		continue;
 
 	return (bits);
 }
 
 /*
  * New table.
  * 'data' is expected to be either NULL or of the form
  * 'addr:hash [masks=/32[,/128]]'.
  *
  * Allocates the configuration and two 128-bucket arrays, publishes them
  * through @ta_state and @ti, and selects the lookup routine matching the
  * IPv6 mask length.  Returns 0, or the error from option parsing.
  */
 static int
 ta_init_chash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti,
     char *data, uint8_t tflags)
 {
 	struct chash_cfg *cfg;
 	uint32_t szbits;
 	int n, rc;
 
 	cfg = malloc(sizeof(struct chash_cfg), M_IPFW, M_WAITOK | M_ZERO);
 
 	/* Defaults, possibly overridden by the options in @data. */
 	cfg->mask4 = 32;
 	cfg->mask6 = 128;
 
 	rc = chash_parse_opts(cfg, data);
 	if (rc != 0) {
 		free(cfg, M_IPFW);
 		return (rc);
 	}
 
 	cfg->size4 = 128;
 	cfg->size6 = 128;
 
 	cfg->head4 = malloc(sizeof(struct chashbhead) * cfg->size4, M_IPFW,
 	    M_WAITOK | M_ZERO);
 	cfg->head6 = malloc(sizeof(struct chashbhead) * cfg->size6, M_IPFW,
 	    M_WAITOK | M_ZERO);
 	for (n = 0; n < cfg->size4; n++)
 		SLIST_INIT(&cfg->head4[n]);
 	for (n = 0; n < cfg->size6; n++)
 		SLIST_INIT(&cfg->head6[n]);
 
 	*ta_state = cfg;
 	ti->state = cfg->head4;
 	ti->xstate = cfg->head6;
 
 	/*
 	 * Store data depending on v6 mask length.
 	 * Bits 8-15 / 0-7 always hold log2 of the IPv4 / IPv6 bucket count.
 	 */
 	szbits = ta_log2(cfg->size4) << 8 | ta_log2(cfg->size6);
 
 	if (cfg->mask6 == 64) {
 		ti->data = (32 - cfg->mask4) << 24 | (128 - cfg->mask6) << 16|
 		    szbits;
 		ti->lookup = ta_lookup_chash_64;
 		return (0);
 	}
 
 	if ((cfg->mask6  % 8) == 0) {
 		/* "<< 13" leaves mask6 / 8 (a byte count) in bits 16-23. */
 		ti->data = (32 - cfg->mask4) << 24 |
 		    cfg->mask6 << 13 | szbits;
 		ti->lookup = ta_lookup_chash_aligned;
 		return (0);
 	}
 
 	/* don't do that! */
 	ti->data = (32 - cfg->mask4) << 24 |
 	    cfg->mask6 << 16 | szbits;
 	ti->lookup = ta_lookup_chash_slow;
 
 	return (0);
 }
 
 static void
 ta_destroy_chash(void *ta_state, struct table_info *ti)
 {
 	struct chash_cfg *cfg;
 	struct chashentry *ent, *ent_next;
 	int i;
 
 	cfg = (struct chash_cfg *)ta_state;
 
 	for (i = 0; i < cfg->size4; i++)
 		SLIST_FOREACH_SAFE(ent, &cfg->head4[i], next, ent_next)
 			free(ent, M_IPFW_TBL);
 
 	for (i = 0; i < cfg->size6; i++)
 		SLIST_FOREACH_SAFE(ent, &cfg->head6[i], next, ent_next)
 			free(ent, M_IPFW_TBL);
 
 	free(cfg->head4, M_IPFW);
 	free(cfg->head6, M_IPFW);
 
 	free(cfg, M_IPFW);
 }
 
 /*
  * Fills @tinfo with per-family statistics of the hash table: class,
  * number of buckets, number of items and per-item size.
  */
 static void
 ta_dump_chash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo)
 {
 	struct chash_cfg *conf;
 
 	conf = (struct chash_cfg *)ta_state;
 
 	tinfo->flags = IPFW_TATFLAGS_AFDATA | IPFW_TATFLAGS_AFITEM;
 
 	/* IPv4 part */
 	tinfo->taclass4 = IPFW_TACLASS_HASH;
 	tinfo->itemsize4 = sizeof(struct chashentry);
 	tinfo->size4 = conf->size4;
 	tinfo->count4 = conf->items4;
 
 	/* IPv6 part */
 	tinfo->taclass6 = IPFW_TACLASS_HASH;
 	tinfo->itemsize6 = sizeof(struct chashentry);
 	tinfo->size6 = conf->size6;
 	tinfo->count6 = conf->items6;
 }
 
 /*
  * Exports hash entry @e into the userland-visible @tent: key, mask
  * length, address family and value.  Always returns 0.
  */
 static int
 ta_dump_chash_tentry(void *ta_state, struct table_info *ti, void *e,
     ipfw_obj_tentry *tent)
 {
 	struct chash_cfg *conf;
 	struct chashentry *src;
 
 	conf = (struct chash_cfg *)ta_state;
 	src = (struct chashentry *)e;
 
 	if (src->type != AF_INET) {
 #ifdef INET6
 		memcpy(&tent->k, &src->a.a6, sizeof(struct in6_addr));
 		tent->masklen = conf->mask6;
 		tent->subtype = AF_INET6;
 		tent->v.kidx = src->value;
 #endif
 		return (0);
 	}
 
 	/* The stored key is right-aligned; shift it back before export. */
 	tent->k.addr.s_addr = htonl(src->a.a4 << (32 - conf->mask4));
 	tent->masklen = conf->mask4;
 	tent->subtype = AF_INET;
 	tent->v.kidx = src->value;
 
 	return (0);
 }
 
 /*
  * Computes the bucket index of @ent for a table of @size buckets.
  * IPv4 entries use hash_ip(); IPv6 entries use hash_ip64() when the mask
  * length @mlen is 64 and hash_ip6() otherwise.  Returns 0 when support
  * for the requested family is compiled out.
  */
 static uint32_t
 hash_ent(struct chashentry *ent, int af, int mlen, uint32_t size)
 {
 	uint32_t bucket;
 
 	bucket = 0;
 
 	if (af != AF_INET) {
 #ifdef INET6
 		if (mlen != 64)
 			bucket = hash_ip6(&ent->a.a6, size);
 		else
 			bucket = hash_ip64(&ent->a.a6, size);
 #endif
 		return (bucket);
 	}
 
 #ifdef INET
 	bucket = hash_ip(ent->a.a4, size);
 #endif
 
 	return (bucket);
 }
 
 /*
  * Converts the request in @tei into hash entry @ent: records the address
  * family and stores the address with the bits outside tei->masklen
  * removed.
  *
  * Returns 0 on success, EINVAL for an over-long mask or an unknown
  * address family.
  */
 static int
 tei_to_chash_ent(struct tentry_info *tei, struct chashentry *ent)
 {
 	int plen;
 #ifdef INET6
 	struct in6_addr mask6;
 #endif
 
 	plen = tei->masklen;
 
 	switch (tei->subtype) {
 	case AF_INET:
 #ifdef INET
 		if (plen > 32)
 			return (EINVAL);
 		ent->type = AF_INET;
 
 		/*
 		 * Calculate masked address: host order, right-aligned.
 		 * NOTE(review): plen == 0 makes this a 32-bit shift of a
 		 * 32-bit value - confirm whether /0 can reach this point.
 		 */
 		ent->a.a4 = ntohl(*((in_addr_t *)tei->paddr)) >> (32 - plen);
 #endif
 		break;
 #ifdef INET6
 	case AF_INET6:
 		/* IPv6 case */
 		if (plen > 128)
 			return (EINVAL);
 		ent->type = AF_INET6;
 
 		ipv6_writemask(&mask6, plen);
 		memcpy(&ent->a.a6, tei->paddr, sizeof(struct in6_addr));
 		APPLY_MASK(&ent->a.a6, &mask6);
 		break;
 #endif
 	default:
 		/* Unknown CIDR type */
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 /*
  * Looks up the address given in @tent using the table's configured mask
  * and, when found, overwrites @tent with the stored entry.
  *
  * Returns 0 on success, ENOENT when no entry matches, or the error from
  * tei_to_chash_ent().
  */
 static int
 ta_find_chash_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent)
 {
 	struct chash_cfg *cfg;
 	struct chashbhead *buckets;
 	struct chashentry probe, *hit;
 	struct tentry_info tei;
 	uint32_t slot;
 	int is_v4, rc;
 
 	cfg = (struct chash_cfg *)ta_state;
 
 	memset(&probe, 0, sizeof(probe));
 	memset(&tei, 0, sizeof(tei));
 
 	/* Describe the key the same way an add/delete request would. */
 	is_v4 = (tent->subtype == AF_INET);
 	if (is_v4) {
 		tei.paddr = &tent->k.addr;
 		tei.masklen = cfg->mask4;
 		tei.subtype = AF_INET;
 	} else {
 		tei.paddr = &tent->k.addr6;
 		tei.masklen = cfg->mask6;
 		tei.subtype = AF_INET6;
 	}
 
 	rc = tei_to_chash_ent(&tei, &probe);
 	if (rc != 0)
 		return (rc);
 
 	/* Check for existence */
 	if (is_v4) {
 		buckets = cfg->head4;
 		slot = hash_ent(&probe, AF_INET, cfg->mask4, cfg->size4);
 		SLIST_FOREACH(hit, &buckets[slot], next) {
 			if (hit->a.a4 == probe.a.a4)
 				break;
 		}
 	} else {
 		buckets = cfg->head6;
 		slot = hash_ent(&probe, AF_INET6, cfg->mask6, cfg->size6);
 		SLIST_FOREACH(hit, &buckets[slot], next) {
 			if (memcmp(&hit->a.a6, &probe.a.a6, 16) == 0)
 				break;
 		}
 	}
 
 	/* The iterator is NULL once the chain is exhausted. */
 	if (hit == NULL)
 		return (ENOENT);
 
 	ta_dump_chash_tentry(ta_state, ti, hit, tent);
 	return (0);
 }
 
 /*
  * Invokes @f(entry, @arg) for every IPv4 entry and then for every IPv6
  * entry of the table.
  */
 static void
 ta_foreach_chash(void *ta_state, struct table_info *ti, ta_foreach_f *f,
     void *arg)
 {
 	struct chash_cfg *conf;
 	struct chashentry *cur, *following;
 	int b;
 
 	conf = (struct chash_cfg *)ta_state;
 
 	for (b = 0; b < conf->size4; b++) {
 		SLIST_FOREACH_SAFE(cur, &conf->head4[b], next, following) {
 			f(cur, arg);
 		}
 	}
 
 	for (b = 0; b < conf->size6; b++) {
 		SLIST_FOREACH_SAFE(cur, &conf->head6[b], next, following) {
 			f(cur, arg);
 		}
 	}
 }
 
 /*
  * Prepares an addition: allocates a zeroed entry, fills it from @tei and
  * stashes it in @ta_buf.  On conversion failure the entry is released and
  * the error is returned.
  */
 static int
 ta_prepare_add_chash(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_chash *scratch;
 	struct chashentry *fresh;
 	int rc;
 
 	scratch = (struct ta_buf_chash *)ta_buf;
 
 	fresh = malloc(sizeof(*fresh), M_IPFW_TBL, M_WAITOK | M_ZERO);
 
 	rc = tei_to_chash_ent(tei, fresh);
 	if (rc == 0) {
 		scratch->ent_ptr = fresh;
 		return (0);
 	}
 
 	free(fresh, M_IPFW_TBL);
 	return (rc);
 }
 
 /*
  * Inserts the entry prepared in @ta_buf, or updates the value of an
  * already existing entry with the same key.
  *
  * On update the previous value is handed back in tei->value,
  * TEI_FLAGS_UPDATED is set and *pnum becomes 0.  On insertion the entry
  * is detached from @ta_buf and *pnum becomes 1.
  *
  * Returns 0 on success, EINVAL when the request mask differs from the
  * table mask, EEXIST for a duplicate without TEI_FLAGS_UPDATE, and EFBIG
  * when a new entry is needed but TEI_FLAGS_DONTADD is set.
  */
 static int
 ta_add_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei,
     void *ta_buf, uint32_t *pnum)
 {
 	struct chash_cfg *cfg;
 	struct chashbhead *buckets;
 	struct chashentry *fresh, *dup;
 	struct ta_buf_chash *scratch;
 	uint32_t slot, previous;
 
 	cfg = (struct chash_cfg *)ta_state;
 	scratch = (struct ta_buf_chash *)ta_buf;
 	fresh = (struct chashentry *)scratch->ent_ptr;
 
 	/* Read current value from @tei */
 	fresh->value = tei->value;
 
 	/* Locate the bucket and check for existence */
 	if (tei->subtype == AF_INET) {
 		if (tei->masklen != cfg->mask4)
 			return (EINVAL);
 		buckets = cfg->head4;
 		slot = hash_ent(fresh, AF_INET, cfg->mask4, cfg->size4);
 
 		SLIST_FOREACH(dup, &buckets[slot], next) {
 			if (dup->a.a4 == fresh->a.a4)
 				break;
 		}
 	} else {
 		if (tei->masklen != cfg->mask6)
 			return (EINVAL);
 		buckets = cfg->head6;
 		slot = hash_ent(fresh, AF_INET6, cfg->mask6, cfg->size6);
 
 		SLIST_FOREACH(dup, &buckets[slot], next) {
 			if (memcmp(&dup->a.a6, &fresh->a.a6, 16) == 0)
 				break;
 		}
 	}
 
 	/* @dup is NULL when the whole chain was walked without a match. */
 	if (dup != NULL) {
 		if ((tei->flags & TEI_FLAGS_UPDATE) == 0)
 			return (EEXIST);
 
 		/* Record already exists. Update value if we're asked to */
 		previous = dup->value;
 		dup->value = tei->value;
 		tei->value = previous;
 
 		/* Indicate that update has happened instead of addition */
 		tei->flags |= TEI_FLAGS_UPDATED;
 		*pnum = 0;
 		return (0);
 	}
 
 	if ((tei->flags & TEI_FLAGS_DONTADD) != 0)
 		return (EFBIG);
 
 	SLIST_INSERT_HEAD(&buckets[slot], fresh, next);
 	/* The table owns the entry now; keep flush from freeing it. */
 	scratch->ent_ptr = NULL;
 	*pnum = 1;
 
 	/* Update counters */
 	if (tei->subtype == AF_INET)
 		cfg->items4++;
 	else
 		cfg->items6++;
 
 	return (0);
 }
 
 /*
  * Prepares a deletion: converts @tei into the lookup entry embedded in
  * @ta_buf.  Returns the result of tei_to_chash_ent().
  */
 static int
 ta_prepare_del_chash(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_chash *scratch;
 	int rc;
 
 	scratch = (struct ta_buf_chash *)ta_buf;
 	rc = tei_to_chash_ent(tei, &scratch->ent);
 
 	return (rc);
 }
 
 static int
 ta_del_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei,
     void *ta_buf, uint32_t *pnum)
 {
 	struct chash_cfg *cfg;
 	struct chashbhead *head;
 	struct chashentry *tmp, *tmp_next, *ent;
 	struct ta_buf_chash *tb;
 	uint32_t hash;
 
 	cfg = (struct chash_cfg *)ta_state;
 	tb = (struct ta_buf_chash *)ta_buf;
 	ent = &tb->ent;
 
 	if (tei->subtype == AF_INET) {
 		if (tei->masklen != cfg->mask4)
 			return (EINVAL);
 		head = cfg->head4;
 		hash = hash_ent(ent, AF_INET, cfg->mask4, cfg->size4);
 
 		SLIST_FOREACH_SAFE(tmp, &head[hash], next, tmp_next) {
 			if (tmp->a.a4 != ent->a.a4)
 				continue;
 
 			SLIST_REMOVE(&head[hash], tmp, chashentry, next);
 			cfg->items4--;
 			tb->ent_ptr = tmp;
 			tei->value = tmp->value;
 			*pnum = 1;
 			return (0);
 		}
 	} else {
 		if (tei->masklen != cfg->mask6)
 			return (EINVAL);
 		head = cfg->head6;
 		hash = hash_ent(ent, AF_INET6, cfg->mask6, cfg->size6);
 		SLIST_FOREACH_SAFE(tmp, &head[hash], next, tmp_next) {
 			if (memcmp(&tmp->a.a6, &ent->a.a6, 16) != 0)
 				continue;
 
 			SLIST_REMOVE(&head[hash], tmp, chashentry, next);
 			cfg->items6--;
 			tb->ent_ptr = tmp;
 			tei->value = tmp->value;
 			*pnum = 1;
 			return (0);
 		}
 	}
 
 	return (ENOENT);
 }
 
 /*
  * Releases the entry still referenced by @ta_buf, if there is one.
  */
 static void
 ta_flush_chash_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_chash *scratch;
 
 	scratch = (struct ta_buf_chash *)ta_buf;
 
 	if (scratch->ent_ptr == NULL)
 		return;
 
 	free(scratch->ent_ptr, M_IPFW_TBL);
 }
 
 /*
  * Hash growing callbacks.
  */
 
 /*
  * Decides whether either hash needs to grow.
  *
  * A hash is grown (doubled) when it holds more items than buckets.  The
  * requested sizes are packed into *pflags: IPv4 size in bits 16-31, IPv6
  * size in bits 0-15; a zero field means "leave that hash alone".
  *
  * Returns 1 (with *pflags set) when a resize is wanted, 0 otherwise.
  */
 static int
 ta_need_modify_chash(void *ta_state, struct table_info *ti, uint32_t count,
     uint64_t *pflags)
 {
 	struct chash_cfg *cfg;
 	uint64_t data;
 
 	/*
 	 * Since we don't know exact number of IPv4/IPv6 records in @count,
 	 * ignore non-zero @count value at all. Check current hash sizes
 	 * and return appropriate data.
 	 */
 
 	cfg = (struct chash_cfg *)ta_state;
 
 	/*
 	 * Each requested size has to fit into its 16-bit field, since
 	 * ta_prepare_mod_chash() extracts the fields with "& 0xFFFF".
 	 * Doubling 32768 would yield 65536, which reads back as 0 and,
 	 * for IPv6, also spills a stray bit into the IPv4 field.
 	 */
 	data = 0;
 	if (cfg->items4 > cfg->size4 && cfg->size4 * 2 <= 0xFFFF)
 		data |= (uint64_t)(cfg->size4 * 2) << 16;
 	if (cfg->items6 > cfg->size6 && cfg->size6 * 2 <= 0xFFFF)
 		data |= cfg->size6 * 2;
 
 	if (data != 0) {
 		*pflags = data;
 		return (1);
 	}
 
 	return (0);
 }
 
 /*
  * Allocate new, larger chash.
  *
  * The requested IPv4 / IPv6 bucket counts are taken from bits 16-31 and
  * 0-15 of *pflags; an array is allocated only for non-zero counts.
  * Always returns 0.
  */
 static int
 ta_prepare_mod_chash(void *ta_buf, uint64_t *pflags)
 {
 	struct mod_item *mi;
 	struct chashbhead *buckets;
 	int n;
 
 	mi = (struct mod_item *)ta_buf;
 	memset(mi, 0, sizeof(struct mod_item));
 
 	/* IPv4 array */
 	mi->size = (*pflags >> 16) & 0xFFFF;
 	if (mi->size > 0) {
 		buckets = malloc(sizeof(struct chashbhead) * mi->size,
 		    M_IPFW, M_WAITOK | M_ZERO);
 		for (n = 0; n < mi->size; n++)
 			SLIST_INIT(&buckets[n]);
 		mi->main_ptr = buckets;
 	}
 
 	/* IPv6 array */
 	mi->size6 = *pflags & 0xFFFF;
 	if (mi->size6 > 0) {
 		buckets = malloc(sizeof(struct chashbhead) * mi->size6,
 		    M_IPFW, M_WAITOK | M_ZERO);
 		for (n = 0; n < mi->size6; n++)
 			SLIST_INIT(&buckets[n]);
 		mi->main_ptr6 = buckets;
 	}
 
 	return (0);
 }
 
 /*
  * Copy data from old runtime array to new one.
  *
  * Nothing is copied for the hash algorithm; this callback only reports
  * success.
  */
 static int
 ta_fill_mod_chash(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t *pflags)
 {
 
 	/* It is not possible to do rehash if we're not holding WLOCK. */
 	return (0);
 }
 
 /*
  * Switch old & new arrays.
  *
  * For each address family whose prepared array in @ta_buf is still larger
  * than the current one, every entry is re-hashed into the new array, the
  * new array is published through @ti and the configuration, and the old
  * array is stored back into @ta_buf.  Finally the log2 bucket counts kept
  * in ti->data are refreshed.
  */
 static void
 ta_modify_chash(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t pflags)
 {
 	struct mod_item *mi;
 	struct chash_cfg *cfg;
 	struct chashbhead *old_head, *new_head;
 	struct chashentry *ent, *ent_next;
 	int af, i, mlen;
 	uint32_t nhash;
 	size_t old_size, new_size;
 
 	mi = (struct mod_item *)ta_buf;
 	cfg = (struct chash_cfg *)ta_state;
 
 	/* Check which hash we need to grow and do we still need that */
 	if (mi->size > 0 && cfg->size4 < mi->size) {
 		new_head = (struct chashbhead *)mi->main_ptr;
 		new_size = mi->size;
 		old_size = cfg->size4;
 		old_head = ti->state;
 		mlen = cfg->mask4;
 		af = AF_INET;
 
 		/* Inserting rewrites the link field, hence the _SAFE walk. */
 		for (i = 0; i < old_size; i++) {
 			SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) {
 				nhash = hash_ent(ent, af, mlen, new_size);
 				SLIST_INSERT_HEAD(&new_head[nhash], ent, next);
 			}
 		}
 
 		ti->state = new_head;
 		cfg->head4 = new_head;
 		cfg->size4 = mi->size;
 		mi->main_ptr = old_head;
 	}
 
 	if (mi->size6 > 0 && cfg->size6 < mi->size6) {
 		new_head = (struct chashbhead *)mi->main_ptr6;
 		new_size = mi->size6;
 		old_size = cfg->size6;
 		old_head = ti->xstate;
 		mlen = cfg->mask6;
 		af = AF_INET6;
 
 		for (i = 0; i < old_size; i++) {
 			SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) {
 				nhash = hash_ent(ent, af, mlen, new_size);
 				SLIST_INSERT_HEAD(&new_head[nhash], ent, next);
 			}
 		}
 
 		ti->xstate = new_head;
 		cfg->head6 = new_head;
 		cfg->size6 = mi->size6;
 		mi->main_ptr6 = old_head;
 	}
 
 	/*
 	 * Refresh only the two log2(size) bytes in bits 0-15.  Bits 16-31
 	 * hold the mask parameters written by ta_init_chash() and read by
 	 * the lookup routines, so they must survive a resize.
 	 */
 	ti->data &= ~(uint64_t)0xFFFF;
 	ti->data |= ta_log2(cfg->size4) << 8 | ta_log2(cfg->size6);
 }
 
 /*
  * Free unneeded arrays: whatever bucket arrays are still referenced by
  * @ta_buf.
  */
 static void
 ta_flush_mod_chash(void *ta_buf)
 {
 	struct mod_item *leftover;
 
 	leftover = (struct mod_item *)ta_buf;
 
 	/* IPv4 array */
 	if (leftover->main_ptr != NULL) {
 		free(leftover->main_ptr, M_IPFW);
 	}
 
 	/* IPv6 array */
 	if (leftover->main_ptr6 != NULL) {
 		free(leftover->main_ptr6, M_IPFW);
 	}
 }
 
 struct table_algo addr_hash = {
 	.name		= "addr:hash",
 	.type		= IPFW_TABLE_ADDR,
 	.ta_buf_size	= sizeof(struct ta_buf_chash),
 	.init		= ta_init_chash,
 	.destroy	= ta_destroy_chash,
 	.prepare_add	= ta_prepare_add_chash,
 	.prepare_del	= ta_prepare_del_chash,
 	.add		= ta_add_chash,
 	.del		= ta_del_chash,
 	.flush_entry	= ta_flush_chash_entry,
 	.foreach	= ta_foreach_chash,
 	.dump_tentry	= ta_dump_chash_tentry,
 	.find_tentry	= ta_find_chash_tentry,
 	.print_config	= ta_print_chash_config,
 	.dump_tinfo	= ta_dump_chash_tinfo,
 	.need_modify	= ta_need_modify_chash,
 	.prepare_mod	= ta_prepare_mod_chash,
 	.fill_mod	= ta_fill_mod_chash,
 	.modify		= ta_modify_chash,
 	.flush_mod	= ta_flush_mod_chash,
 };
 
 
 /*
  * Iface table cmds.
  *
  * Implementation:
  *
  * Runtime part:
  * - sorted array of "struct ifidx" pointed by ti->state.
  *   Array is allocated with rounding up to IFIDX_CHUNK. Only existing
  *   interfaces are stored in array, however its allocated size is
  *   sufficient to hold all table records if needed.
  * - current array size is stored in ti->data
  *
  * Table data:
  * - "struct iftable_cfg" is allocated to store table state (ta_state).
  * - All table records are stored inside namedobj instance.
  *
  */
 
 /*
  * One element of the sorted runtime array searched by ifidx_find().
  */
 struct ifidx {
 	/* Sort/search key; if_notifier() stores the interface index here. */
 	uint16_t	kidx;
 	/* Set to zero by if_notifier(). */
 	uint16_t	spare;
 	/* Value handed back by ta_lookup_ifidx(). */
 	uint32_t	value;
 };
 /* Initial number of runtime array slots (see ta_init_ifidx()). */
 #define	DEFAULT_IFIDX_SIZE	64
 
 struct iftable_cfg;
 
 /*
  * One configured table record.  The named_object header comes first, so
  * the code casts between "struct named_object *" and "struct ifentry *".
  */
 struct ifentry {
 	struct named_object	no;
 	/* Interface notification handle; cbdata points back to this entry. */
 	struct ipfw_ifc		ic;
 	/* Owning table configuration, set in ta_add_ifidx(). */
 	struct iftable_cfg	*icfg;
 	/* Value associated with the interface. */
 	uint32_t		value;
 	/* Non-zero while the entry is present in the runtime array. */
 	int			linked;
 };
 
 /*
  * Per-table state of the iface:array algorithm (passed around as ta_state).
  */
 struct iftable_cfg {
 	/* Name-indexed storage holding every struct ifentry. */
 	struct namedobj_instance	*ii;
 	/* Chain the table belongs to, saved by ta_init_ifidx(). */
 	struct ip_fw_chain	*ch;
 	/* Current table_info, refreshed by ta_change_ti_ifidx(). */
 	struct table_info	*ti;
 	/* Runtime array of struct ifidx. */
 	void	*main_ptr;
 	size_t	size;	/* Number of items allocated in array */
 	size_t	count;	/* Number of all items */
 	size_t	used;	/* Number of items _active_ now */
 };
 
 /*
  * Scratch buffer passed between the prepare, add/del and flush callbacks.
  */
 struct ta_buf_ifidx
 {
 	/* Entry allocated by prepare_add or unlinked by del; freed by flush. */
 	struct ifentry *ife;
 	/* NOTE(review): not referenced by the callbacks visible here. */
 	uint32_t value;
 };
 
 /*
  * Forward declarations for the iface:array routines defined below.
  * compare_ifidx() is the only one declared without "static".
  */
 int compare_ifidx(const void *k, const void *v);
 static struct ifidx * ifidx_find(struct table_info *ti, void *key);
 static int ta_lookup_ifidx(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val);
 static int ta_init_ifidx(struct ip_fw_chain *ch, void **ta_state,
     struct table_info *ti, char *data, uint8_t tflags);
 static void ta_change_ti_ifidx(void *ta_state, struct table_info *ti);
 static void destroy_ifidx_locked(struct namedobj_instance *ii,
     struct named_object *no, void *arg);
 static void ta_destroy_ifidx(void *ta_state, struct table_info *ti);
 static void ta_dump_ifidx_tinfo(void *ta_state, struct table_info *ti,
     ipfw_ta_tinfo *tinfo);
 static int ta_prepare_add_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf);
 static int ta_add_ifidx(void *ta_state, struct table_info *ti,
     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
 static int ta_prepare_del_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf);
 static int ta_del_ifidx(void *ta_state, struct table_info *ti,
     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
 static void ta_flush_ifidx_entry(struct ip_fw_chain *ch,
     struct tentry_info *tei, void *ta_buf);
 static void if_notifier(struct ip_fw_chain *ch, void *cbdata, uint16_t ifindex);
 static int ta_need_modify_ifidx(void *ta_state, struct table_info *ti,
     uint32_t count, uint64_t *pflags);
 static int ta_prepare_mod_ifidx(void *ta_buf, uint64_t *pflags);
 static int ta_fill_mod_ifidx(void *ta_state, struct table_info *ti,
     void *ta_buf, uint64_t *pflags);
 static void ta_modify_ifidx(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t pflags);
 static void ta_flush_mod_ifidx(void *ta_buf);
 static int ta_dump_ifidx_tentry(void *ta_state, struct table_info *ti, void *e,
     ipfw_obj_tentry *tent);
 static int ta_find_ifidx_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent);
 static void foreach_ifidx(struct namedobj_instance *ii, struct named_object *no,
     void *arg);
 static void ta_foreach_ifidx(void *ta_state, struct table_info *ti,
     ta_foreach_f *f, void *arg);
 
 /*
  * Comparison callback for the sorted ifidx array: @k points to a uint16_t
  * key, @v to a struct ifidx element.  Returns -1, 0 or 1 when the key is
  * smaller than, equal to or greater than the element's kidx.
  */
 int
 compare_ifidx(const void *k, const void *v)
 {
 	const struct ifidx *elem;
 	uint16_t wanted;
 
 	wanted = *((const uint16_t *)k);
 	elem = (const struct ifidx *)v;
 
 	if (wanted == elem->kidx)
 		return (0);
 
 	return ((wanted < elem->kidx) ? -1 : 1);
 }
 
 /*
  * Inserts @item (of @size bytes) under key @key into the ascending-sorted
  * array @base currently holding @nmemb elements.
  * The caller must provide room for one more element in @base.
  *
  * Returns 1 on success, 0 on duplicate key.
  */
 static int
 badd(const void *key, void *item, void *base, size_t nmemb,
     size_t size, int (*compar) (const void *, const void *))
 {
 	int lo, hi, probe, pos, cmp;
 	caddr_t slot;
 
 	/* Empty array: the item simply becomes the first element. */
 	if (nmemb == 0) {
 		memcpy(base, item, size);
 		return (1);
 	}
 
 	/* Binary search */
 	probe = 0;
 	for (lo = 0, hi = nmemb - 1; lo <= hi; ) {
 		probe = (lo + hi) / 2;
 		cmp = compar(key, (const void *)((caddr_t)base + probe * size));
 		if (cmp == 0)
 			return (0);
 
 		if (cmp > 0)
 			lo = probe + 1;
 		else
 			hi = probe - 1;
 	}
 
 	/* Item not found: insert before or after the last probed slot. */
 	cmp = compar(key, (const void *)((caddr_t)base + probe * size));
 	pos = (cmp > 0) ? probe + 1 : probe;
 
 	/* Open a gap by moving the tail one element up. */
 	slot = (caddr_t)base + pos * size;
 	if (nmemb > pos)
 		memmove(slot + size, slot, (nmemb - pos) * size);
 
 	memcpy(slot, item, size);
 
 	return (1);
 }
 
 /*
  * Deletes item with key @key from ascending-sorted array @base holding
  * @nmemb elements of @size bytes each.  The elements following the
  * removed one are moved down by one slot; the last slot keeps its old
  * content.
  *
  * Returns 1 on success, 0 for non-existent key.
  */
 static int
 bdel(const void *key, void *base, size_t nmemb, size_t size,
     int (*compar) (const void *, const void *))
 {
 	caddr_t item, end;
 	size_t sz;
 
 	item = (caddr_t)bsearch(key, base, nmemb, size, compar);
 
 	if (item == NULL)
 		return (0);
 
 	/*
 	 * Move only the elements that follow @item.  Counting @item itself
 	 * would make memmove() read one element past the end of the array.
 	 */
 	end = (caddr_t)base + nmemb * size;
 	sz = end - (item + size);
 
 	if (sz > 0)
 		memmove(item, item + size, sz);
 
 	return (1);
 }
 
 /*
  * Binary-searches the runtime array at ti->state (ti->data elements) for
  * the key pointed to by @key.  Returns the matching element or NULL.
  */
 static struct ifidx *
 ifidx_find(struct table_info *ti, void *key)
 {
 
 	return (bsearch(key, ti->state, ti->data, sizeof(struct ifidx),
 	    compare_ifidx));
 }
 
 /*
  * Runtime lookup: returns 1 and stores the element's value in @val when
  * @key is found in the runtime array, 0 otherwise.
  */
 static int
 ta_lookup_ifidx(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val)
 {
 	struct ifidx *hit;
 
 	hit = ifidx_find(ti, key);
 	if (hit == NULL)
 		return (0);
 
 	*val = hit->value;
 	return (1);
 }
 
 /*
  * Creates a new interface table: allocates the configuration, the name
  * hash and a runtime array of DEFAULT_IFIDX_SIZE elements, then publishes
  * them through @ta_state and @ti.  Always returns 0.
  */
 static int
 ta_init_ifidx(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti,
     char *data, uint8_t tflags)
 {
 	struct iftable_cfg *conf;
 
 	conf = malloc(sizeof(*conf), M_IPFW, M_WAITOK | M_ZERO);
 
 	conf->ch = ch;
 	conf->ii = ipfw_objhash_create(DEFAULT_IFIDX_SIZE);
 	conf->size = DEFAULT_IFIDX_SIZE;
 	conf->main_ptr = malloc(sizeof(struct ifidx) * conf->size, M_IPFW,
 	    M_WAITOK | M_ZERO);
 
 	/* Publish the state and the runtime lookup routine. */
 	*ta_state = conf;
 	ti->state = conf->main_ptr;
 	ti->lookup = ta_lookup_ifidx;
 
 	return (0);
 }
 
 /*
  * Handle tableinfo @ti pointer change (on table array resize):
  * remember the new pointer in the table configuration.
  */
 static void
 ta_change_ti_ifidx(void *ta_state, struct table_info *ti)
 {
 	struct iftable_cfg *conf;
 
 	conf = (struct iftable_cfg *)ta_state;
 
 	conf->ti = ti;
 }
 
 /*
  * Per-object callback used while destroying a table: removes the
  * interface notifier of the entry and frees the entry.  @arg is the
  * chain pointer.
  */
 static void
 destroy_ifidx_locked(struct namedobj_instance *ii, struct named_object *no,
     void *arg)
 {
 	struct ip_fw_chain *chain;
 	struct ifentry *entry;
 
 	entry = (struct ifentry *)no;
 	chain = (struct ip_fw_chain *)arg;
 
 	ipfw_iface_del_notify(chain, &entry->ic);
 	free(entry, M_IPFW_TBL);
 }
 
 
 /*
  * Destroys table @ti: frees the runtime array, every entry, the name
  * hash and the configuration itself.
  */
 static void
 ta_destroy_ifidx(void *ta_state, struct table_info *ti)
 {
 	struct iftable_cfg *conf;
 	struct ip_fw_chain *chain;
 
 	conf = (struct iftable_cfg *)ta_state;
 	chain = conf->ch;
 
 	if (conf->main_ptr != NULL) {
 		free(conf->main_ptr, M_IPFW);
 	}
 
 	/* Drop notifiers and free the entries one by one. */
 	ipfw_objhash_foreach(conf->ii, destroy_ifidx_locked, chain);
 
 	ipfw_objhash_destroy(conf->ii);
 
 	free(conf, M_IPFW);
 }
 
 /*
  * Provide algo-specific table info: array class, allocated size, number
  * of active items and per-item size.
  */
 static void
 ta_dump_ifidx_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo)
 {
 	struct iftable_cfg *conf;
 
 	conf = (struct iftable_cfg *)ta_state;
 
 	tinfo->taclass4 = IPFW_TACLASS_ARRAY;
 	tinfo->itemsize4 = sizeof(struct ifidx);
 	tinfo->size4 = conf->size;
 	tinfo->count4 = conf->used;
 }
 
 /*
  * Prepare state to add to the table:
  * allocate ifentry and reference needed interface.
  *
  * Returns 0 on success, EINVAL when the interface name is not terminated
  * within IF_NAMESIZE bytes or the interface cannot be referenced.
  */
 static int
 ta_prepare_add_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_ifidx *scratch;
 	struct ifentry *entry;
 	char *name;
 
 	scratch = (struct ta_buf_ifidx *)ta_buf;
 	name = (char *)tei->paddr;
 
 	/* Check if string is terminated */
 	if (strnlen(name, IF_NAMESIZE) == IF_NAMESIZE)
 		return (EINVAL);
 
 	entry = malloc(sizeof(*entry), M_IPFW_TBL, M_WAITOK | M_ZERO);
 	entry->ic.cb = if_notifier;
 	entry->ic.cbdata = entry;
 
 	if (ipfw_iface_ref(ch, name, &entry->ic) != 0) {
 		free(entry, M_IPFW_TBL);
 		return (EINVAL);
 	}
 
 	/* Use ipfw_iface 'ifname' field as stable storage */
 	entry->no.name = entry->ic.iface->ifname;
 
 	scratch->ife = entry;
 
 	return (0);
 }
 
 /*
  * Adds the entry prepared in @ta_buf, or updates the value of an existing
  * record with the same interface name.
  *
  * On update the previous value is handed back in tei->value,
  * TEI_FLAGS_UPDATED is set and *pnum becomes 0.  On addition the entry is
  * linked into the name hash, its notifier is registered and *pnum
  * becomes 1.
  *
  * Returns 0 on success, EEXIST for a duplicate without TEI_FLAGS_UPDATE,
  * EFBIG when a new record is needed but TEI_FLAGS_DONTADD is set.
  */
 static int
 ta_add_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei,
     void *ta_buf, uint32_t *pnum)
 {
 	struct iftable_cfg *conf;
 	struct ifentry *fresh, *existing;
 	struct ta_buf_ifidx *scratch;
 	struct ipfw_iface *iface;
 	struct ifidx *slot;
 	char *name;
 	uint32_t previous;
 
 	conf = (struct iftable_cfg *)ta_state;
 	scratch = (struct ta_buf_ifidx *)ta_buf;
 	name = (char *)tei->paddr;
 	fresh = scratch->ife;
 
 	fresh->icfg = conf;
 	fresh->value = tei->value;
 
 	existing = (struct ifentry *)ipfw_objhash_lookup_name(conf->ii, 0,
 	    name);
 
 	if (existing == NULL) {
 		if ((tei->flags & TEI_FLAGS_DONTADD) != 0)
 			return (EFBIG);
 
 		/* Link to internal list */
 		ipfw_objhash_add(conf->ii, &fresh->no);
 
 		/* Link notifier (possible running its callback) */
 		ipfw_iface_add_notify(conf->ch, &fresh->ic);
 		conf->count++;
 
 		/* The table owns the entry now; keep flush from freeing it. */
 		scratch->ife = NULL;
 		*pnum = 1;
 
 		return (0);
 	}
 
 	if ((tei->flags & TEI_FLAGS_UPDATE) == 0)
 		return (EEXIST);
 
 	/* Exchange values in @existing and @tei */
 	previous = existing->value;
 	existing->value = tei->value;
 	tei->value = previous;
 
 	iface = existing->ic.iface;
 	if (iface->resolved != 0) {
 		/* We have to update runtime value, too */
 		slot = ifidx_find(ti, &iface->ifindex);
 		slot->value = fresh->value;
 	}
 
 	/* Indicate that update has happened instead of addition */
 	tei->flags |= TEI_FLAGS_UPDATED;
 	*pnum = 0;
 
 	return (0);
 }
 
 /*
  * Prepare to delete key from table.
  * Only verifies that the interface name is terminated within
  * IF_NAMESIZE bytes; returns EINVAL otherwise, 0 on success.
  */
 static int
 ta_prepare_del_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	char *name;
 
 	name = (char *)tei->paddr;
 
 	/* Check if string is terminated */
 	if (strnlen(name, IF_NAMESIZE) != IF_NAMESIZE)
 		return (0);
 
 	return (EINVAL);
 }
 
 /*
  * Remove key from both configuration list and runtime array, and
  * unregister the interface notification.
  *
  * On success the removed entry is stored in @ta_buf, its value is
  * reported through tei->value and *pnum is set to 1.
  * Returns ENOENT when no record with the given name exists.
  */
 static int
 ta_del_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei,
     void *ta_buf, uint32_t *pnum)
 {
 	struct iftable_cfg *conf;
 	struct ifentry *victim;
 	struct ta_buf_ifidx *scratch;
 	char *name;
 	uint16_t ifindex;
 	int res;
 
 	conf = (struct iftable_cfg *)ta_state;
 	scratch = (struct ta_buf_ifidx *)ta_buf;
 	name = (char *)tei->paddr;
 
 	victim = (struct ifentry *)ipfw_objhash_lookup_name(conf->ii, 0, name);
 	if (victim == NULL)
 		return (ENOENT);
 
 	if (victim->linked != 0) {
 		/* We have to remove item from runtime */
 		ifindex = victim->ic.iface->ifindex;
 
 		res = bdel(&ifindex, conf->main_ptr, conf->used,
 		    sizeof(struct ifidx), compare_ifidx);
 
 		KASSERT(res == 1, ("index %d does not exist", ifindex));
 		conf->used--;
 		ti->data = conf->used;
 		victim->linked = 0;
 	}
 
 	/* Unlink from local list */
 	ipfw_objhash_del(conf->ii, &victim->no);
 	/* Unlink notifier */
 	ipfw_iface_del_notify(conf->ch, &victim->ic);
 
 	conf->count--;
 	tei->value = victim->value;
 
 	/* Hand the entry over so that flush can release it. */
 	scratch->ife = victim;
 	*pnum = 1;
 
 	return (0);
 }
 
 /*
  * Flush deleted entry.
  * Drops interface reference and frees the entry left in @ta_buf, if any.
  */
 static void
 ta_flush_ifidx_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_ifidx *scratch;
 
 	scratch = (struct ta_buf_ifidx *)ta_buf;
 
 	if (scratch->ife == NULL)
 		return;
 
 	/* Unlink first */
 	ipfw_iface_unref(ch, &scratch->ife->ic);
 	free(scratch->ife, M_IPFW_TBL);
 }
 
 
 /*
  * Handle interface announce/withdrawal for particular table.
  * Every real runtime array modification happens here.
  */
 static void
 if_notifier(struct ip_fw_chain *ch, void *cbdata, uint16_t ifindex)
 {
 	struct ifentry *ife;
 	struct ifidx ifi;
 	struct iftable_cfg *icfg;
 	struct table_info *ti;
 	int res;
 
 	ife = (struct ifentry *)cbdata;
 	icfg = ife->icfg;
 	ti = icfg->ti;
 
 	KASSERT(ti != NULL, ("ti=NULL, check change_ti handler"));
 
 	if (ife->linked == 0 && ifindex != 0) {
 		/* Interface announce */
 		ifi.kidx = ifindex;
 		ifi.spare = 0;
 		ifi.value = ife->value;
 		res = badd(&ifindex, &ifi, icfg->main_ptr, icfg->used,
 		    sizeof(struct ifidx), compare_ifidx);
 		KASSERT(res == 1, ("index %d already exists", ifindex));
 		icfg->used++;
 		ti->data = icfg->used;
 		ife->linked = 1;
 	} else if (ife->linked != 0 && ifindex == 0) {
 		/* Interface withdrawal */
 		ifindex = ife->ic.iface->ifindex;
 
 		res = bdel(&ifindex, icfg->main_ptr, icfg->used,
 		    sizeof(struct ifidx), compare_ifidx);
 
 		KASSERT(res == 1, ("index %d does not exist", ifindex));
 		icfg->used--;
 		ti->data = icfg->used;
 		ife->linked = 0;
 	}
 }
 
 
 /*
  * Table growing callbacks.
  */
 
 /*
  * Checks whether the runtime array can hold @count more records.  If not,
  * the allocated size is doubled until it fits, the result is stored in
  * *pflags and 1 is returned.  Returns 0 when no resize is needed.
  */
 static int
 ta_need_modify_ifidx(void *ta_state, struct table_info *ti, uint32_t count,
     uint64_t *pflags)
 {
 	struct iftable_cfg *conf;
 	uint32_t wanted;
 
 	conf = (struct iftable_cfg *)ta_state;
 
 	for (wanted = conf->size; wanted < conf->count + count; wanted *= 2)
 		continue;
 
 	if (wanted == conf->size)
 		return (0);
 
 	*pflags = wanted;
 	return (1);
 }
 
 /*
  * Allocate new, larger runtime ifidx array.
  * The element count is taken from *pflags.  Always returns 0.
  */
 static int
 ta_prepare_mod_ifidx(void *ta_buf, uint64_t *pflags)
 {
 	struct mod_item *mod;
 
 	mod = (struct mod_item *)ta_buf;
 	memset(mod, 0, sizeof(struct mod_item));
 
 	mod->size = *pflags;
 	mod->main_ptr = malloc(sizeof(struct ifidx) * mod->size, M_IPFW,
 	    M_WAITOK | M_ZERO);
 
 	return (0);
 }
 
 /*
  * Copy data from old runtime array to new one.
  * When the current array is already at least as large as the prepared
  * one, *pflags is cleared and nothing is copied.  Always returns 0.
  */
 static int
 ta_fill_mod_ifidx(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t *pflags)
 {
 	struct iftable_cfg *conf;
 	struct mod_item *mod;
 
 	conf = (struct iftable_cfg *)ta_state;
 	mod = (struct mod_item *)ta_buf;
 
 	/* Check if we still need to grow array */
 	if (conf->size < mod->size) {
 		memcpy(mod->main_ptr, conf->main_ptr,
 		    conf->used * sizeof(struct ifidx));
 		return (0);
 	}
 
 	*pflags = 0;
 	return (0);
 }
 
 /*
  * Switch old & new arrays.
  * The prepared array becomes the runtime array; the previous one is
  * stored back into @ta_buf.
  */
 static void
 ta_modify_ifidx(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t pflags)
 {
 	struct iftable_cfg *conf;
 	struct mod_item *mod;
 	void *previous;
 
 	conf = (struct iftable_cfg *)ta_state;
 	mod = (struct mod_item *)ta_buf;
 
 	previous = conf->main_ptr;
 
 	conf->main_ptr = mod->main_ptr;
 	conf->size = mod->size;
 	ti->state = conf->main_ptr;
 
 	mod->main_ptr = previous;
 }
 
 /*
  * Free unneeded array: whatever array is still referenced by @ta_buf.
  */
 static void
 ta_flush_mod_ifidx(void *ta_buf)
 {
 	struct mod_item *mod;
 
 	mod = (struct mod_item *)ta_buf;
 
 	if (mod->main_ptr == NULL)
 		return;
 
 	free(mod->main_ptr, M_IPFW);
 }
 
 /*
  * Exports interface entry @e into @tent: IF_NAMESIZE bytes of the name as
  * the key, a mask length of 8 * IF_NAMESIZE and the entry's value.
  * Always returns 0.
  */
 static int
 ta_dump_ifidx_tentry(void *ta_state, struct table_info *ti, void *e,
     ipfw_obj_tentry *tent)
 {
 	struct ifentry *entry;
 
 	entry = (struct ifentry *)e;
 
 	memcpy(&tent->k, entry->no.name, IF_NAMESIZE);
 	tent->masklen = 8 * IF_NAMESIZE;
 	tent->v.kidx = entry->value;
 
 	return (0);
 }
 
 /*
  * Look up the interface named in @tent->k.iface.
  *
  * Returns 0 and fills @tent if an entry is found, EINVAL if the name is
  * not NUL-terminated within IF_NAMESIZE bytes, ENOENT if nothing matches.
  */
 static int
 ta_find_ifidx_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent)
 {
 	struct iftable_cfg *icfg = ta_state;
 	struct ifentry *entry;
 	char *name = tent->k.iface;
 
 	/* Refuse names that lack a terminator inside the buffer. */
 	if (strnlen(name, IF_NAMESIZE) == IF_NAMESIZE)
 		return (EINVAL);
 
 	entry = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, name);
 	if (entry == NULL)
 		return (ENOENT);
 
 	ta_dump_ifidx_tentry(ta_state, ti, entry, tent);
 	return (0);
 }
 
 /* Argument bundle passed from ta_foreach_ifidx() to foreach_ifidx(). */
 struct wa_ifidx {
 	ta_foreach_f	*f;	/* per-entry callback */
 	void		*arg;	/* opaque argument handed to @f */
 };
 
 /*
  * ipfw_objhash_foreach() callback: passes object @no, viewed as a
  * struct ifentry, to the callback stored in the struct wa_ifidx that
  * @arg points to.
  */
 static void
 foreach_ifidx(struct namedobj_instance *ii, struct named_object *no,
     void *arg)
 {
 	struct wa_ifidx *walk = arg;
 
 	walk->f((struct ifentry *)no, walk->arg);
 }
 
 /*
  * Invoke @f with @arg for every object in the table's name hash.
  */
 static void
 ta_foreach_ifidx(void *ta_state, struct table_info *ti, ta_foreach_f *f,
     void *arg)
 {
 	struct iftable_cfg *icfg = ta_state;
 	struct wa_ifidx walk = {
 		.f = f,
 		.arg = arg,
 	};
 
 	ipfw_objhash_foreach(icfg->ii, foreach_ifidx, &walk);
 }
 
 struct table_algo iface_idx = {
 	.name		= "iface:array",
 	.type		= IPFW_TABLE_INTERFACE,
 	.flags		= TA_FLAG_DEFAULT,
 	.ta_buf_size	= sizeof(struct ta_buf_ifidx),
 	.init		= ta_init_ifidx,
 	.destroy	= ta_destroy_ifidx,
 	.prepare_add	= ta_prepare_add_ifidx,
 	.prepare_del	= ta_prepare_del_ifidx,
 	.add		= ta_add_ifidx,
 	.del		= ta_del_ifidx,
 	.flush_entry	= ta_flush_ifidx_entry,
 	.foreach	= ta_foreach_ifidx,
 	.dump_tentry	= ta_dump_ifidx_tentry,
 	.find_tentry	= ta_find_ifidx_tentry,
 	.dump_tinfo	= ta_dump_ifidx_tinfo,
 	.need_modify	= ta_need_modify_ifidx,
 	.prepare_mod	= ta_prepare_mod_ifidx,
 	.fill_mod	= ta_fill_mod_ifidx,
 	.modify		= ta_modify_ifidx,
 	.flush_mod	= ta_flush_mod_ifidx,
 	.change_ti	= ta_change_ti_ifidx,
 };
 
 /*
  * Number array cmds.
  *
  * Implementation:
  *
  * Runtime part:
  * - sorted array of "struct numarray" pointed by ti->state.
  *   Array is allocated with rounding up to NUMARRAY_CHUNK.
  * - current array size is stored in ti->data
  *
  */
 
 /* One element of the sorted runtime array. */
 struct numarray {
 	uint32_t	number;	/* key compared by compare_numarray() */
 	uint32_t	value;	/* value returned by lookups for @number */
 };
 
 /* Per-table state of the number:array algorithm (kept in ta_state). */
 struct numarray_cfg {
 	void	*main_ptr;	/* Item array, also published as ti->state */
 	size_t	size;	/* Number of items allocated in array */
 	size_t	used;	/* Number of items _active_ now */
 };
 
 /* Per-request buffer: the item being added or deleted. */
 struct ta_buf_numarray
 {
 	struct numarray na;
 };
 
 /* Forward declarations of the number:array helpers and callbacks. */
 int compare_numarray(const void *k, const void *v);
 static struct numarray *numarray_find(struct table_info *ti, void *key);
 static int ta_lookup_numarray(struct table_info *ti, void *key,
     uint32_t keylen, uint32_t *val);
 static int ta_init_numarray(struct ip_fw_chain *ch, void **ta_state,
     struct table_info *ti, char *data, uint8_t tflags);
 static void ta_destroy_numarray(void *ta_state, struct table_info *ti);
 static void ta_dump_numarray_tinfo(void *ta_state, struct table_info *ti,
     ipfw_ta_tinfo *tinfo);
 static int ta_prepare_add_numarray(struct ip_fw_chain *ch,
     struct tentry_info *tei, void *ta_buf);
 static int ta_add_numarray(void *ta_state, struct table_info *ti,
     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
 static int ta_del_numarray(void *ta_state, struct table_info *ti,
     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
 static void ta_flush_numarray_entry(struct ip_fw_chain *ch,
     struct tentry_info *tei, void *ta_buf);
 static int ta_need_modify_numarray(void *ta_state, struct table_info *ti,
     uint32_t count, uint64_t *pflags);
 static int ta_prepare_mod_numarray(void *ta_buf, uint64_t *pflags);
 static int ta_fill_mod_numarray(void *ta_state, struct table_info *ti,
     void *ta_buf, uint64_t *pflags);
 static void ta_modify_numarray(void *ta_state, struct table_info *ti,
     void *ta_buf, uint64_t pflags);
 static void ta_flush_mod_numarray(void *ta_buf);
 static int ta_dump_numarray_tentry(void *ta_state, struct table_info *ti,
     void *e, ipfw_obj_tentry *tent);
 static int ta_find_numarray_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent);
 static void ta_foreach_numarray(void *ta_state, struct table_info *ti,
     ta_foreach_f *f, void *arg);
 
 /*
  * Comparator for the number array: @k points to a uint32_t key, @v to a
  * struct numarray element.  Returns -1, 0 or 1 when the key is less than,
  * equal to or greater than the element's number.
  */
 int
 compare_numarray(const void *k, const void *v)
 {
 	const uint32_t needle = *(const uint32_t *)k;
 	const struct numarray *elem = v;
 
 	if (needle == elem->number)
 		return (0);
 
 	return (needle < elem->number ? -1 : 1);
 }
 
 static struct numarray *
 numarray_find(struct table_info *ti, void *key)
 {
 	struct numarray *ri;
 
 	ri = bsearch(key, ti->state, ti->data, sizeof(struct numarray),
 	    compare_ifidx);
 
 	return (ri);
 }
 
 /*
  * Runtime lookup: returns 1 and stores the matching value in @val if @key
  * is present in the array, 0 otherwise.
  */
 static int
 ta_lookup_numarray(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val)
 {
 	struct numarray *hit;
 
 	hit = numarray_find(ti, key);
 	if (hit == NULL)
 		return (0);
 
 	*val = hit->value;
 	return (1);
 }
 
 /*
  * Create a new number:array table: allocates the configuration state and
  * an initial zeroed array of 16 items, returns the state via @ta_state and
  * publishes the array and the lookup routine through @ti.
  * Always returns 0.
  */
 static int
 ta_init_numarray(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti,
     char *data, uint8_t tflags)
 {
 	struct numarray_cfg *ncfg;
 
 	ncfg = malloc(sizeof(struct numarray_cfg), M_IPFW, M_WAITOK | M_ZERO);
 	ncfg->size = 16;
 	ncfg->main_ptr = malloc(ncfg->size * sizeof(struct numarray), M_IPFW,
 	    M_WAITOK | M_ZERO);
 
 	/* Hook everything up. */
 	*ta_state = ncfg;
 	ti->lookup = ta_lookup_numarray;
 	ti->state = ncfg->main_ptr;
 
 	return (0);
 }
 
 /*
  * Destroys table @ti: frees the item array (if any) and the configuration
  * state passed in @ta_state.
  */
 static void
 ta_destroy_numarray(void *ta_state, struct table_info *ti)
 {
 	struct numarray_cfg *ncfg = ta_state;
 	void *items = ncfg->main_ptr;
 
 	if (items != NULL)
 		free(items, M_IPFW);
 
 	free(ncfg, M_IPFW);
 }
 
 /*
  * Provide algo-specific table info: array class, allocated size, number
  * of active items and the size of a single item.
  */
 static void
 ta_dump_numarray_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo)
 {
 	struct numarray_cfg *ncfg = ta_state;
 
 	tinfo->taclass4 = IPFW_TACLASS_ARRAY;
 	tinfo->itemsize4 = sizeof(struct numarray);
 	tinfo->size4 = ncfg->size;
 	tinfo->count4 = ncfg->used;
 }
 
 /*
  * Prepare for addition/deletion to an array.
  *
  * Copies the uint32_t key referenced by @tei->paddr into the request
  * buffer @ta_buf.  Always returns 0.
  */
 static int
 ta_prepare_add_numarray(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_numarray *buf = ta_buf;
 	uint32_t *keyp = (uint32_t *)tei->paddr;
 
 	buf->na.number = *keyp;
 
 	return (0);
 }
 
 /*
  * Add the key prepared in @ta_buf with the value taken from @tei, or
  * update the value of an already existing key.
  *
  * Returns 0 on success with *@pnum set to the number of items added (1 for
  * a new key, 0 for an update).  On update the previous value is returned
  * in @tei->value and TEI_FLAGS_UPDATED is set in @tei->flags.
  * Returns EEXIST if the key exists and TEI_FLAGS_UPDATE is not set, and
  * EFBIG if the key is new and TEI_FLAGS_DONTADD is set.
  */
 static int
 ta_add_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei,
     void *ta_buf, uint32_t *pnum)
 {
 	struct numarray_cfg *cfg;
 	struct ta_buf_numarray *tb;
 	struct numarray *ri;
 	int res;
 	uint32_t value;
 
 	tb = (struct ta_buf_numarray *)ta_buf;
 	cfg = (struct numarray_cfg *)ta_state;
 
 	/* Read current value from @tei */
 	tb->na.value = tei->value;
 
 	ri = numarray_find(ti, &tb->na.number);
 
 	if (ri != NULL) {
 		if ((tei->flags & TEI_FLAGS_UPDATE) == 0)
 			return (EEXIST);
 
 		/* Exchange values between ri and @tei */
 		value = ri->value;
 		ri->value = tei->value;
 		tei->value = value;
 		/* Indicate that update has happened instead of addition */
 		tei->flags |= TEI_FLAGS_UPDATED;
 		*pnum = 0;
 		return (0);
 	}
 
 	if ((tei->flags & TEI_FLAGS_DONTADD) != 0)
 		return (EFBIG);
 
 	res = badd(&tb->na.number, &tb->na, cfg->main_ptr, cfg->used,
 	    sizeof(struct numarray), compare_numarray);
 
 	/* The key is a uint32_t, so print it unsigned as ta_del_numarray() does */
 	KASSERT(res == 1, ("number %u already exists", tb->na.number));
 	cfg->used++;
 	/* Publish the new item count for runtime lookups */
 	ti->data = cfg->used;
 	*pnum = 1;
 
 	return (0);
 }
 
 /*
  * Remove the key prepared in @ta_buf from the runtime array.
  *
  * Returns ENOENT if the key is not present.  Otherwise returns 0, stores
  * the removed item's value in @tei->value and sets *@pnum to 1.
  */
 static int
 ta_del_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei,
     void *ta_buf, uint32_t *pnum)
 {
 	struct numarray_cfg *ncfg = ta_state;
 	struct ta_buf_numarray *buf = ta_buf;
 	struct numarray *hit;
 	int res;
 
 	hit = numarray_find(ti, &buf->na.number);
 	if (hit == NULL)
 		return (ENOENT);
 
 	/* Hand the value being removed back to the caller. */
 	tei->value = hit->value;
 
 	res = bdel(&buf->na.number, ncfg->main_ptr, ncfg->used,
 	    sizeof(struct numarray), compare_numarray);
 	KASSERT(res == 1, ("number %u does not exist", buf->na.number));
 
 	/* One item less; publish the new count for runtime lookups. */
 	ncfg->used--;
 	ti->data = ncfg->used;
 	*pnum = 1;
 
 	return (0);
 }
 
 /*
  * Per-request cleanup hook.  Intentionally empty for number:array.
  */
 static void
 ta_flush_numarray_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 
 	/* The request buffer carries no state that needs releasing. */
 	return;
 }
 
 
 /*
  * Table growing callbacks.
  */
 
 /*
  * Checks whether the array can hold @count more items.
  *
  * Returns 1 and stores the required array size (the current size doubled
  * until it is sufficient) in @pflags if the array has to grow, 0 otherwise.
  */
 static int
 ta_need_modify_numarray(void *ta_state, struct table_info *ti, uint32_t count,
     uint64_t *pflags)
 {
 	struct numarray_cfg *ncfg = ta_state;
 	size_t newsize;
 
 	/* Keep doubling until active plus requested items fit. */
 	for (newsize = ncfg->size; newsize < ncfg->used + count; newsize *= 2)
 		continue;
 
 	if (newsize == ncfg->size)
 		return (0);
 
 	*pflags = newsize;
 	return (1);
 }
 
 /*
  * Allocate new, larger runtime array.
  *
  * @pflags carries the requested number of items.  The zeroed array and its
  * size are recorded in the mod_item stored in @ta_buf.  Always returns 0.
  */
 static int
 ta_prepare_mod_numarray(void *ta_buf, uint64_t *pflags)
 {
 	struct mod_item *item = ta_buf;
 
 	memset(item, 0, sizeof(*item));
 	item->size = *pflags;
 	item->main_ptr = malloc(sizeof(struct numarray) * item->size, M_IPFW,
 	    M_WAITOK | M_ZERO);
 
 	return (0);
 }
 
 /*
  * Copy data from old runtime array to new one.
  *
  * If the table's array is already at least as large as the prepared one,
  * nothing is copied and *@pflags is cleared.  Always returns 0.
  */
 static int
 ta_fill_mod_numarray(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t *pflags)
 {
 	struct numarray_cfg *ncfg = ta_state;
 	struct mod_item *item = ta_buf;
 
 	if (ncfg->size < item->size) {
 		/* Growth is still needed: carry over the active items. */
 		memcpy(item->main_ptr, ncfg->main_ptr,
 		    ncfg->used * sizeof(struct numarray));
 	} else {
 		/* Current array is big enough; report nothing to do. */
 		*pflags = 0;
 	}
 
 	return (0);
 }
 
 /*
  * Switch old & new arrays.
  *
  * The array prepared in @ta_buf becomes the table's runtime array (also
  * published as @ti->state); the previous array is handed back in @ta_buf.
  */
 static void
 ta_modify_numarray(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t pflags)
 {
 	struct numarray_cfg *ncfg = ta_state;
 	struct mod_item *item = ta_buf;
 	void *prev;
 
 	prev = ncfg->main_ptr;
 
 	/* Install the new array and its size. */
 	ncfg->main_ptr = item->main_ptr;
 	ncfg->size = item->size;
 	ti->state = ncfg->main_ptr;
 
 	/* Leave the old array in the buffer. */
 	item->main_ptr = prev;
 }
 
 /*
  * Free unneeded array.
  *
  * Releases the array pointer left in the mod_item in @ta_buf, if any.
  */
 static void
 ta_flush_mod_numarray(void *ta_buf)
 {
 	struct mod_item *item = ta_buf;
 
 	if (item->main_ptr == NULL)
 		return;
 
 	free(item->main_ptr, M_IPFW);
 }
 
 /*
  * Fill @tent from array element @e: the number becomes the key and the
  * element's value is stored in v.kidx.  Always returns 0.
  */
 static int
 ta_dump_numarray_tentry(void *ta_state, struct table_info *ti, void *e,
     ipfw_obj_tentry *tent)
 {
 	struct numarray *elem = e;
 
 	tent->v.kidx = elem->value;
 	tent->k.key = elem->number;
 
 	return (0);
 }
 
 /*
  * Look up the number stored in @tent->k.key.
  *
  * Returns 0 and fills @tent from the matching element, or ENOENT if the
  * number is not in the array.
  */
 static int
 ta_find_numarray_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent)
 {
 	struct numarray *ri;
 
 	ri = numarray_find(ti, &tent->k.key);
 
 	if (ri != NULL) {
 		ta_dump_numarray_tentry(ta_state, ti, ri, tent);
 		return (0);
 	}
 
 	return (ENOENT);
 }
 
 /*
  * Invoke @f with @arg for every active element of the array, in array
  * order.
  */
 static void
 ta_foreach_numarray(void *ta_state, struct table_info *ti, ta_foreach_f *f,
     void *arg)
 {
 	struct numarray_cfg *ncfg = ta_state;
 	struct numarray *items = ncfg->main_ptr;
 	int idx = 0;
 
 	while (idx < ncfg->used) {
 		f(&items[idx], arg);
 		idx++;
 	}
 }
 
 /*
  * "number:array" algorithm descriptor: binds the numarray callbacks into
  * the table_algo method table.
  */
 struct table_algo number_array = {
 	/* Identification */
 	.name		= "number:array",
 	.type		= IPFW_TABLE_NUMBER,
 	.ta_buf_size	= sizeof(struct ta_buf_numarray),
 
 	/* Table lifetime */
 	.init		= ta_init_numarray,
 	.destroy	= ta_destroy_numarray,
 
 	/* Entry addition and removal; both prepare hooks just copy the key */
 	.prepare_add	= ta_prepare_add_numarray,
 	.add		= ta_add_numarray,
 	.prepare_del	= ta_prepare_add_numarray,
 	.del		= ta_del_numarray,
 	.flush_entry	= ta_flush_numarray_entry,
 
 	/* Enumeration and export */
 	.foreach	= ta_foreach_numarray,
 	.find_tentry	= ta_find_numarray_tentry,
 	.dump_tentry	= ta_dump_numarray_tentry,
 	.dump_tinfo	= ta_dump_numarray_tinfo,
 
 	/* Array growing */
 	.need_modify	= ta_need_modify_numarray,
 	.prepare_mod	= ta_prepare_mod_numarray,
 	.fill_mod	= ta_fill_mod_numarray,
 	.modify		= ta_modify_numarray,
 	.flush_mod	= ta_flush_mod_numarray,
 };
 
 /*
  * flow:hash cmds
  *
  *
  * ti->data:
  * [inv.mask4][inv.mask6][log2hsize4][log2hsize6]
  * [        8][        8][         8][         8]
  *
  * inv.mask4: 32 - mask
  * inv.mask6:
  * 1) _slow lookup: mask
  * 2) _aligned: (128 - mask) / 8
  * 3) _64: 8
  *
  *
  * pflags:
  * [hsize4][hsize6]
  * [    16][    16]
  */
 
 struct fhashentry;
 
 /* Hash bucket head: singly-linked list of struct fhashentry. */
 SLIST_HEAD(fhashbhead, fhashentry);
 
 /*
  * Address-family independent part of a flow entry.  fhashentry4 and
  * fhashentry6 embed it as their first member and append the address pair.
  *
  * NOTE: cmp_flow_ent() compares the 8 bytes that follow @next (af through
  * sport) as a single uint64_t, so the order and sizes of these fields
  * must not be changed without updating it.
  */
 struct fhashentry {
 	SLIST_ENTRY(fhashentry)	next;	/* bucket chain linkage */
 	uint8_t		af;	/* address family of the entry */
 	uint8_t		proto;	/* protocol number */
 	uint16_t	spare0;
 	uint16_t	dport;	/* destination port */
 	uint16_t	sport;	/* source port */
 	uint32_t	value;	/* value returned by lookups */
 	uint32_t	spare1;
 };
 
 /* IPv4 flow entry: common header followed by the address pair. */
 struct fhashentry4 {
 	struct fhashentry	e;	/* common part, must stay first */
 	struct in_addr		dip;	/* destination address */
 	struct in_addr		sip;	/* source address */
 };
 
 /*
  * IPv6 flow entry: common header followed by the address pair.
  * ta_lookup_fhash() masks dip6 and sip6 as one contiguous run of four
  * 64-bit words, so sip6 has to directly follow dip6.
  */
 struct fhashentry6 {
 	struct fhashentry	e;	/* common part, must stay first */
 	struct in6_addr		dip6;	/* destination address */
 	struct in6_addr		sip6;	/* source address */
 };
 
 /*
  * Per-table state of the flow:hash algorithm (kept in ta_state).
  *
  * ti->xstate points at @fe4, and ta_lookup_fhash() reaches @fe6 as the
  * object right after it, so @fe6 must immediately follow @fe4.
  */
 struct fhash_cfg {
 	struct fhashbhead	*head;	/* bucket array */
 	size_t			size;	/* number of buckets */
 	size_t			items;	/* number of stored entries */
 	struct fhashentry4	fe4;	/* IPv4 field mask */
 	struct fhashentry6	fe6;	/* IPv6 field mask */
 };
 
 /* Per-request buffer of the flow:hash algorithm. */
 struct ta_buf_fhash {
 	void	*ent_ptr;	/* entry to insert, or removed entry to free */
 	struct fhashentry6 fe6;	/* key storage used by the delete path */
 };
 
 /* Forward declarations of the flow:hash helpers and callbacks. */
 static __inline int cmp_flow_ent(struct fhashentry *a,
     struct fhashentry *b, size_t sz);
 static __inline uint32_t hash_flow4(struct fhashentry4 *f, int hsize);
 static __inline uint32_t hash_flow6(struct fhashentry6 *f, int hsize);
 static uint32_t hash_flow_ent(struct fhashentry *ent, uint32_t size);
 static int ta_lookup_fhash(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val);
 static int ta_init_fhash(struct ip_fw_chain *ch, void **ta_state,
 struct table_info *ti, char *data, uint8_t tflags);
 static void ta_destroy_fhash(void *ta_state, struct table_info *ti);
 static void ta_dump_fhash_tinfo(void *ta_state, struct table_info *ti,
     ipfw_ta_tinfo *tinfo);
 static int ta_dump_fhash_tentry(void *ta_state, struct table_info *ti,
     void *e, ipfw_obj_tentry *tent);
 static int tei_to_fhash_ent(struct tentry_info *tei, struct fhashentry *ent);
 static int ta_find_fhash_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent);
 static void ta_foreach_fhash(void *ta_state, struct table_info *ti,
     ta_foreach_f *f, void *arg);
 static int ta_prepare_add_fhash(struct ip_fw_chain *ch,
     struct tentry_info *tei, void *ta_buf);
 static int ta_add_fhash(void *ta_state, struct table_info *ti,
     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
 static int ta_prepare_del_fhash(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf);
 static int ta_del_fhash(void *ta_state, struct table_info *ti,
     struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
 static void ta_flush_fhash_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf);
 static int ta_need_modify_fhash(void *ta_state, struct table_info *ti,
     uint32_t count, uint64_t *pflags);
 static int ta_prepare_mod_fhash(void *ta_buf, uint64_t *pflags);
 static int ta_fill_mod_fhash(void *ta_state, struct table_info *ti,
     void *ta_buf, uint64_t *pflags);
 static void ta_modify_fhash(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t pflags);
 static void ta_flush_mod_fhash(void *ta_buf);
 
 /*
  * Compare two flow entries.
  *
  * The 8 bytes stored right after the list linkage are compared as one
  * uint64_t; if they are equal, the @sz bytes following the common header
  * are compared with memcmp().  Returns 1 if the entries match, 0 otherwise.
  */
 static __inline int
 cmp_flow_ent(struct fhashentry *a, struct fhashentry *b, size_t sz)
 {
 	uint64_t *hdr_a, *hdr_b;
 
 	hdr_a = (uint64_t *)(&a->next + 1);
 	hdr_b = (uint64_t *)(&b->next + 1);
 
 	if (*hdr_a != *hdr_b)
 		return (0);
 
 	return (memcmp(a + 1, b + 1, sz) == 0 ? 1 : 0);
 }
 
 /*
  * Hash an IPv4 flow: XOR of both addresses and both ports, reduced
  * modulo (@hsize - 1).
  */
 static __inline uint32_t
 hash_flow4(struct fhashentry4 *f, int hsize)
 {
 	uint32_t mix;
 
 	mix = f->dip.s_addr ^ f->sip.s_addr;
 	mix ^= f->e.dport;
 	mix ^= f->e.sport;
 
 	return (mix % (hsize - 1));
 }
 
 /*
  * Hash an IPv6 flow: XOR of the last two 32-bit words of both addresses
  * and both ports, reduced modulo (@hsize - 1).
  */
 static __inline uint32_t
 hash_flow6(struct fhashentry6 *f, int hsize)
 {
 	uint32_t mix;
 
 	mix = f->dip6.__u6_addr.__u6_addr32[2];
 	mix ^= f->dip6.__u6_addr.__u6_addr32[3];
 	mix ^= f->sip6.__u6_addr.__u6_addr32[2];
 	mix ^= f->sip6.__u6_addr.__u6_addr32[3];
 	mix ^= f->e.dport;
 	mix ^= f->e.sport;
 
 	return (mix % (hsize - 1));
 }
 
 /*
  * Hash flow entry @ent for a table of @size buckets.  AF_INET entries are
  * hashed as IPv4 flows, everything else as IPv6 flows.
  */
 static uint32_t
 hash_flow_ent(struct fhashentry *ent, uint32_t size)
 {
 
 	if (ent->af == AF_INET)
 		return (hash_flow4((struct fhashentry4 *)ent, size));
 
 	return (hash_flow6((struct fhashentry6 *)ent, size));
 }
 
 /*
  * Runtime lookup of flow @key (a struct ipfw_flow_id).
  *
  * The flow is AND-ed with the per-family mask entry reachable through
  * @ti->xstate, hashed, and searched for in the matching bucket of
  * @ti->state.  Returns 1 and stores the entry's value in @val on a match,
  * 0 otherwise (including address types other than 4 and 6).
  */
 static int
 ta_lookup_fhash(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val)
 {
 	struct fhashbhead *head;
 	struct fhashentry *ent;
 	struct fhashentry4 *m4;
 	struct ipfw_flow_id *id;
 	uint32_t hash, hsize;
 
 	id = (struct ipfw_flow_id *)key;
 	head = (struct fhashbhead *)ti->state;
 	/*
 	 * Keep the bucket count in 32 bits: the table may grow to 65536
 	 * buckets (see ta_need_modify_fhash()), which does not fit into
 	 * uint16_t and would make lookups hash differently from insertions.
 	 */
 	hsize = ti->data;
 	m4 = (struct fhashentry4 *)ti->xstate;
 
 	if (id->addr_type == 4) {
 		struct fhashentry4 f;
 
 		/* Copy hash mask */
 		f = *m4;
 
 		f.dip.s_addr &= id->dst_ip;
 		f.sip.s_addr &= id->src_ip;
 		f.e.dport &= id->dst_port;
 		f.e.sport &= id->src_port;
 		f.e.proto &= id->proto;
 		hash = hash_flow4(&f, hsize);
 		SLIST_FOREACH(ent, &head[hash], next) {
 			if (cmp_flow_ent(ent, &f.e, 2 * 4) != 0) {
 				*val = ent->value;
 				return (1);
 			}
 		}
 	} else if (id->addr_type == 6) {
 		struct fhashentry6 f;
 		uint64_t *fp, *idp;
 
 		/* Copy hash mask; the IPv6 mask is stored right after the IPv4 one */
 		f = *((struct fhashentry6 *)(m4 + 1));
 
 		/* Handle lack of __u6_addr.__u6_addr64 */
 		fp = (uint64_t *)&f.dip6;
 		idp = (uint64_t *)&id->dst_ip6;
 		/* src IPv6 is stored after dst IPv6 */
 		*fp++ &= *idp++;
 		*fp++ &= *idp++;
 		*fp++ &= *idp++;
 		*fp &= *idp;
 		f.e.dport &= id->dst_port;
 		f.e.sport &= id->src_port;
 		f.e.proto &= id->proto;
 		hash = hash_flow6(&f, hsize);
 		SLIST_FOREACH(ent, &head[hash], next) {
 			if (cmp_flow_ent(ent, &f.e, 2 * 16) != 0) {
 				*val = ent->value;
 				return (1);
 			}
 		}
 	}
 
 	return (0);
 }
 
 /*
  * New table.
  *
  * Allocates the configuration state with 512 empty buckets and builds the
  * IPv4/IPv6 mask entries from @tflags: every selected field gets an
  * all-ones mask, the others are left zero.  Buckets, masks, bucket count
  * and the lookup routine are then published through @ti.
  * Always returns 0.
  */
 static int
 ta_init_fhash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti,
     char *data, uint8_t tflags)
 {
 	struct fhash_cfg *fcfg;
 	struct fhashentry4 *mask4;
 	struct fhashentry6 *mask6;
 	int i;
 
 	fcfg = malloc(sizeof(*fcfg), M_IPFW, M_WAITOK | M_ZERO);
 	fcfg->size = 512;
 	fcfg->head = malloc(fcfg->size * sizeof(struct fhashbhead), M_IPFW,
 	    M_WAITOK | M_ZERO);
 	for (i = 0; i < fcfg->size; i++)
 		SLIST_INIT(&fcfg->head[i]);
 
 	mask4 = &fcfg->fe4;
 	mask6 = &fcfg->fe6;
 
 	/* Address masks */
 	if ((tflags & IPFW_TFFLAG_SRCIP) != 0) {
 		memset(&mask4->sip, 0xFF, sizeof(mask4->sip));
 		memset(&mask6->sip6, 0xFF, sizeof(mask6->sip6));
 	}
 	if ((tflags & IPFW_TFFLAG_DSTIP) != 0) {
 		memset(&mask4->dip, 0xFF, sizeof(mask4->dip));
 		memset(&mask6->dip6, 0xFF, sizeof(mask6->dip6));
 	}
 
 	/* Port masks */
 	if ((tflags & IPFW_TFFLAG_SRCPORT) != 0) {
 		memset(&mask4->e.sport, 0xFF, sizeof(mask4->e.sport));
 		memset(&mask6->e.sport, 0xFF, sizeof(mask6->e.sport));
 	}
 	if ((tflags & IPFW_TFFLAG_DSTPORT) != 0) {
 		memset(&mask4->e.dport, 0xFF, sizeof(mask4->e.dport));
 		memset(&mask6->e.dport, 0xFF, sizeof(mask6->e.dport));
 	}
 
 	/* Protocol mask */
 	if ((tflags & IPFW_TFFLAG_PROTO) != 0) {
 		memset(&mask4->e.proto, 0xFF, sizeof(mask4->e.proto));
 		memset(&mask6->e.proto, 0xFF, sizeof(mask6->e.proto));
 	}
 
 	/* The address family is always part of the key. */
 	mask4->e.af = AF_INET;
 	mask6->e.af = AF_INET6;
 
 	*ta_state = fcfg;
 	ti->state = fcfg->head;
 	ti->xstate = &fcfg->fe4;
 	ti->data = fcfg->size;
 	ti->lookup = ta_lookup_fhash;
 
 	return (0);
 }
 
 static void
 ta_destroy_fhash(void *ta_state, struct table_info *ti)
 {
 	struct fhash_cfg *cfg;
 	struct fhashentry *ent, *ent_next;
 	int i;
 
 	cfg = (struct fhash_cfg *)ta_state;
 
 	for (i = 0; i < cfg->size; i++)
 		SLIST_FOREACH_SAFE(ent, &cfg->head[i], next, ent_next)
 			free(ent, M_IPFW_TBL);
 
 	free(cfg->head, M_IPFW);
 	free(cfg, M_IPFW);
 }
 
 /*
  * Provide algo-specific table info: hash class, bucket count, number of
  * stored entries and the per-family item sizes.
  */
 static void
 ta_dump_fhash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo)
 {
 	struct fhash_cfg *fcfg = ta_state;
 
 	tinfo->flags = IPFW_TATFLAGS_AFITEM;
 	tinfo->taclass4 = IPFW_TACLASS_HASH;
 
 	tinfo->size4 = fcfg->size;
 	tinfo->count4 = fcfg->items;
 
 	tinfo->itemsize4 = sizeof(struct fhashentry4);
 	tinfo->itemsize6 = sizeof(struct fhashentry6);
 }
 
 /*
  * Fill @tent from flow entry @e.
  *
  * Family, protocol and value are copied as-is.  Ports are converted with
  * htons() and IPv4 addresses with htonl(); IPv6 addresses are copied
  * unchanged.  The mask length is set to 32 for AF_INET entries and, when
  * INET6 is compiled in, to 128 for all others.  Always returns 0.
  */
 static int
 ta_dump_fhash_tentry(void *ta_state, struct table_info *ti, void *e,
     ipfw_obj_tentry *tent)
 {
 	struct fhashentry *ent;
 	struct fhashentry4 *fe4;
 #ifdef INET6
 	struct fhashentry6 *fe6;
 #endif
 	struct tflow_entry *tfe;
 
 	ent = (struct fhashentry *)e;
 	tfe = &tent->k.flow;
 
 	tfe->af = ent->af;
 	tfe->proto = ent->proto;
 	tfe->dport = htons(ent->dport);
 	tfe->sport = htons(ent->sport);
 	tent->v.kidx = ent->value;
 	tent->subtype = ent->af;
 
 	if (ent->af == AF_INET) {
 		fe4 = (struct fhashentry4 *)ent;
 		tfe->a.a4.sip.s_addr = htonl(fe4->sip.s_addr);
 		tfe->a.a4.dip.s_addr = htonl(fe4->dip.s_addr);
 		tent->masklen = 32;
 #ifdef INET6
 	} else {
 		fe6 = (struct fhashentry6 *)ent;
 		tfe->a.a6.sip6 = fe6->sip6;
 		tfe->a.a6.dip6 = fe6->dip6;
 		tent->masklen = 128;
 #endif
 	}
 
 	return (0);
 }
 
 /*
  * Convert the flow key referenced by @tei->paddr (a struct tflow_entry)
  * into hash entry @ent.  Ports are converted with ntohs() and IPv4
  * addresses with ntohl(); IPv6 addresses are copied unchanged.
  *
  * The addresses are written past the common header, so @ent has to point
  * to storage for the fhashentry4/fhashentry6 selected by @tei->subtype.
  *
  * Returns 0 on success, EINVAL if @tei->subtype is not handled.
  * NOTE(review): when INET is not defined the AF_INET branch is empty, so
  * the addresses are left untouched while 0 is still returned.
  */
 static int
 tei_to_fhash_ent(struct tentry_info *tei, struct fhashentry *ent)
 {
 #ifdef INET
 	struct fhashentry4 *fe4;
 #endif
 #ifdef INET6
 	struct fhashentry6 *fe6;
 #endif
 	struct tflow_entry *tfe;
 
 	tfe = (struct tflow_entry *)tei->paddr;
 
 	/* Fields shared by both address families */
 	ent->af = tei->subtype;
 	ent->proto = tfe->proto;
 	ent->dport = ntohs(tfe->dport);
 	ent->sport = ntohs(tfe->sport);
 
 	if (tei->subtype == AF_INET) {
 #ifdef INET
 		fe4 = (struct fhashentry4 *)ent;
 		fe4->sip.s_addr = ntohl(tfe->a.a4.sip.s_addr);
 		fe4->dip.s_addr = ntohl(tfe->a.a4.dip.s_addr);
 #endif
 #ifdef INET6
 	} else if (tei->subtype == AF_INET6) {
 		fe6 = (struct fhashentry6 *)ent;
 		fe6->sip6 = tfe->a.a6.sip6;
 		fe6->dip6 = tfe->a.a6.dip6;
 #endif
 	} else {
 		/* Unsupported address family */
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 
 /*
  * Look up the flow described by @tent->k.flow / @tent->subtype.
  *
  * Returns 0 and fills @tent from the stored entry on a match, the error
  * from tei_to_fhash_ent() if the key cannot be converted, or ENOENT if no
  * entry matches.
  */
 static int
 ta_find_fhash_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent)
 {
 	struct fhash_cfg *fcfg = ta_state;
 	struct fhashentry6 keybuf;
 	struct fhashentry *key, *cur;
 	struct tentry_info tei;
 	uint32_t bucket;
 	size_t addrlen;
 	int error;
 
 	/* The IPv6 entry serves as key storage for either family. */
 	memset(&keybuf, 0, sizeof(keybuf));
 	key = &keybuf.e;
 
 	/* Build a minimal tentry_info so the common converter can be used. */
 	memset(&tei, 0, sizeof(tei));
 	tei.paddr = &tent->k.flow;
 	tei.subtype = tent->subtype;
 
 	error = tei_to_fhash_ent(&tei, key);
 	if (error != 0)
 		return (error);
 
 	bucket = hash_flow_ent(key, fcfg->size);
 	addrlen = 2 * ((tei.subtype == AF_INET) ?
 	    sizeof(struct in_addr) : sizeof(struct in6_addr));
 
 	/* Check for existence */
 	SLIST_FOREACH(cur, &fcfg->head[bucket], next) {
 		if (cmp_flow_ent(cur, key, addrlen) == 0)
 			continue;
 
 		ta_dump_fhash_tentry(ta_state, ti, cur, tent);
 		return (0);
 	}
 
 	return (ENOENT);
 }
 
 /*
  * Invoke @f with @arg for every entry of every bucket.
  */
 static void
 ta_foreach_fhash(void *ta_state, struct table_info *ti, ta_foreach_f *f,
     void *arg)
 {
 	struct fhash_cfg *fcfg = ta_state;
 	struct fhashentry *cur, *nxt;
 	int i;
 
 	for (i = 0; i < fcfg->size; i++) {
 		SLIST_FOREACH_SAFE(cur, &fcfg->head[i], next, nxt) {
 			f(cur, arg);
 		}
 	}
 }
 
 static int
 ta_prepare_add_fhash(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_fhash *tb;
 	struct fhashentry *ent;
 	size_t sz;
 	int error;
 
 	tb = (struct ta_buf_fhash *)ta_buf;
 
 	if (tei->subtype == AF_INET)
 		sz = sizeof(struct fhashentry4);
 	else if (tei->subtype == AF_INET6)
 		sz = sizeof(struct fhashentry6);
 	else
 		return (EINVAL);
 
 	ent = malloc(sz, M_IPFW_TBL, M_WAITOK | M_ZERO);
 
 	error = tei_to_fhash_ent(tei, ent);
 	if (error != 0) {
 		free(ent, M_IPFW_TBL);
 		return (error);
 	}
 	tb->ent_ptr = ent;
 
 	return (0);
 }
 
 static int
 ta_add_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei,
     void *ta_buf, uint32_t *pnum)
 {
 	struct fhash_cfg *cfg;
 	struct fhashbhead *head;
 	struct fhashentry *ent, *tmp;
 	struct ta_buf_fhash *tb;
 	int exists;
 	uint32_t hash, value;
 	size_t sz;
 
 	cfg = (struct fhash_cfg *)ta_state;
 	tb = (struct ta_buf_fhash *)ta_buf;
 	ent = (struct fhashentry *)tb->ent_ptr;
 	exists = 0;
 
 	/* Read current value from @tei */
 	ent->value = tei->value;
 
 	head = cfg->head;
 	hash = hash_flow_ent(ent, cfg->size);
 
 	if (tei->subtype == AF_INET)
 		sz = 2 * sizeof(struct in_addr);
 	else
 		sz = 2 * sizeof(struct in6_addr);
 
 	/* Check for existence */
 	SLIST_FOREACH(tmp, &head[hash], next) {
 		if (cmp_flow_ent(tmp, ent, sz) != 0) {
 			exists = 1;
 			break;
 		}
 	}
 
 	if (exists == 1) {
 		if ((tei->flags & TEI_FLAGS_UPDATE) == 0)
 			return (EEXIST);
 		/* Record already exists. Update value if we're asked to */
 		/* Exchange values between tmp and @tei */
 		value = tmp->value;
 		tmp->value = tei->value;
 		tei->value = value;
 		/* Indicate that update has happened instead of addition */
 		tei->flags |= TEI_FLAGS_UPDATED;
 		*pnum = 0;
 	} else {
 		if ((tei->flags & TEI_FLAGS_DONTADD) != 0)
 			return (EFBIG);
 
 		SLIST_INSERT_HEAD(&head[hash], ent, next);
 		tb->ent_ptr = NULL;
 		*pnum = 1;
 
 		/* Update counters and check if we need to grow hash */
 		cfg->items++;
 	}
 
 	return (0);
 }
 
 /*
  * Convert the key in @tei into the key storage embedded in the request
  * buffer @ta_buf.  Returns the result of tei_to_fhash_ent().
  */
 static int
 ta_prepare_del_fhash(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_fhash *buf = ta_buf;
 	struct fhashentry *key = &buf->fe6.e;
 
 	return (tei_to_fhash_ent(tei, key));
 }
 
 static int
 ta_del_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei,
     void *ta_buf, uint32_t *pnum)
 {
 	struct fhash_cfg *cfg;
 	struct fhashbhead *head;
 	struct fhashentry *ent, *tmp;
 	struct ta_buf_fhash *tb;
 	uint32_t hash;
 	size_t sz;
 
 	cfg = (struct fhash_cfg *)ta_state;
 	tb = (struct ta_buf_fhash *)ta_buf;
 	ent = &tb->fe6.e;
 
 	head = cfg->head;
 	hash = hash_flow_ent(ent, cfg->size);
 
 	if (tei->subtype == AF_INET)
 		sz = 2 * sizeof(struct in_addr);
 	else
 		sz = 2 * sizeof(struct in6_addr);
 
 	/* Check for existence */
 	SLIST_FOREACH(tmp, &head[hash], next) {
 		if (cmp_flow_ent(tmp, ent, sz) == 0)
 			continue;
 
 		SLIST_REMOVE(&head[hash], tmp, fhashentry, next);
 		tei->value = tmp->value;
 		*pnum = 1;
 		cfg->items--;
 		tb->ent_ptr = tmp;
 		return (0);
 	}
 
 	return (ENOENT);
 }
 
 /*
  * Per-request cleanup: frees the entry left in the request buffer, if any.
  */
 static void
 ta_flush_fhash_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
     void *ta_buf)
 {
 	struct ta_buf_fhash *buf = ta_buf;
 
 	if (buf->ent_ptr == NULL)
 		return;
 
 	free(buf->ent_ptr, M_IPFW_TBL);
 }
 
 /*
  * Hash growing callbacks.
  */
 
 /*
  * Checks whether the hash should grow.
  *
  * Returns 1 and stores twice the current bucket count in @pflags if there
  * are more entries than buckets and the bucket count is below 65536;
  * returns 0 otherwise.
  */
 static int
 ta_need_modify_fhash(void *ta_state, struct table_info *ti, uint32_t count,
     uint64_t *pflags)
 {
 	struct fhash_cfg *fcfg = ta_state;
 
 	if (fcfg->items <= fcfg->size || fcfg->size >= 65536)
 		return (0);
 
 	*pflags = fcfg->size * 2;
 	return (1);
 }
 
 /*
  * Allocate new, larger fhash.
  *
  * @pflags carries the requested number of buckets.  The initialized bucket
  * array and its size are recorded in the mod_item stored in @ta_buf.
  * Always returns 0.
  */
 static int
 ta_prepare_mod_fhash(void *ta_buf, uint64_t *pflags)
 {
 	struct mod_item *item = ta_buf;
 	struct fhashbhead *buckets;
 	int i;
 
 	memset(item, 0, sizeof(*item));
 	item->size = *pflags;
 
 	buckets = malloc(item->size * sizeof(struct fhashbhead), M_IPFW,
 	    M_WAITOK | M_ZERO);
 	for (i = 0; i < item->size; i++)
 		SLIST_INIT(&buckets[i]);
 
 	item->main_ptr = buckets;
 
 	return (0);
 }
 
 /*
  * Copy data from old runtime array to new one.
  *
  * Nothing is copied at this stage for flow:hash; the entries are moved in
  * ta_modify_fhash().  Always returns 0.
  */
 static int
 ta_fill_mod_fhash(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t *pflags)
 {
 
 	/* It is not possible to do rehash if we're not holding WLOCK. */
 	return (0);
 }
 
 /*
  * Switch old & new arrays.
  */
 static void
 ta_modify_fhash(void *ta_state, struct table_info *ti, void *ta_buf,
     uint64_t pflags)
 {
 	struct mod_item *mi;
 	struct fhash_cfg *cfg;
 	struct fhashbhead *old_head, *new_head;
 	struct fhashentry *ent, *ent_next;
 	int i;
 	uint32_t nhash;
 	size_t old_size;
 
 	mi = (struct mod_item *)ta_buf;
 	cfg = (struct fhash_cfg *)ta_state;
 
 	old_size = cfg->size;
 	old_head = ti->state;
 
 	new_head = (struct fhashbhead *)mi->main_ptr;
 	for (i = 0; i < old_size; i++) {
 		SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) {
 			nhash = hash_flow_ent(ent, mi->size);
 			SLIST_INSERT_HEAD(&new_head[nhash], ent, next);
 		}
 	}
 
 	ti->state = new_head;
 	ti->data = mi->size;
 	cfg->head = new_head;
 	cfg->size = mi->size;
 
 	mi->main_ptr = old_head;
 }
 
 /*
  * Free unneeded array.
  *
  * Releases the bucket array left in the mod_item in @ta_buf, if any.
  */
 static void
 ta_flush_mod_fhash(void *ta_buf)
 {
 	struct mod_item *item = ta_buf;
 
 	if (item->main_ptr == NULL)
 		return;
 
 	free(item->main_ptr, M_IPFW);
 }
 
 /*
  * "flow:hash" algorithm descriptor: binds the fhash callbacks into the
  * table_algo method table.
  */
 struct table_algo flow_hash = {
 	/* Identification */
 	.name		= "flow:hash",
 	.type		= IPFW_TABLE_FLOW,
 	.flags		= TA_FLAG_DEFAULT,
 	.ta_buf_size	= sizeof(struct ta_buf_fhash),
 
 	/* Table lifetime */
 	.init		= ta_init_fhash,
 	.destroy	= ta_destroy_fhash,
 
 	/* Entry addition and removal */
 	.prepare_add	= ta_prepare_add_fhash,
 	.add		= ta_add_fhash,
 	.prepare_del	= ta_prepare_del_fhash,
 	.del		= ta_del_fhash,
 	.flush_entry	= ta_flush_fhash_entry,
 
 	/* Enumeration and export */
 	.foreach	= ta_foreach_fhash,
 	.find_tentry	= ta_find_fhash_tentry,
 	.dump_tentry	= ta_dump_fhash_tentry,
 	.dump_tinfo	= ta_dump_fhash_tinfo,
 
 	/* Hash growing */
 	.need_modify	= ta_need_modify_fhash,
 	.prepare_mod	= ta_prepare_mod_fhash,
 	.fill_mod	= ta_fill_mod_fhash,
 	.modify		= ta_modify_fhash,
 	.flush_mod	= ta_flush_mod_fhash,
 };
 
 /*
  * Kernel fibs bindings.
  *
  * Implementation:
  *
  * Runtime part:
  * - fully relies on route API
  * - fib number is stored in ti->data
  *
  */
 
 /* Forward declarations of the addr:kfib helpers and callbacks. */
 static struct rtentry *lookup_kfib(void *key, int keylen, int fib);
 static int ta_lookup_kfib(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val);
 static int kfib_parse_opts(int *pfib, char *data);
 static void ta_print_kfib_config(void *ta_state, struct table_info *ti,
     char *buf, size_t bufsize);
 static int ta_init_kfib(struct ip_fw_chain *ch, void **ta_state,
     struct table_info *ti, char *data, uint8_t tflags);
 static void ta_destroy_kfib(void *ta_state, struct table_info *ti);
 static void ta_dump_kfib_tinfo(void *ta_state, struct table_info *ti,
     ipfw_ta_tinfo *tinfo);
 static int contigmask(uint8_t *p, int len);
 static int ta_dump_kfib_tentry(void *ta_state, struct table_info *ti, void *e,
     ipfw_obj_tentry *tent);
 static int ta_find_kfib_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent);
 static void ta_foreach_kfib(void *ta_state, struct table_info *ti,
     ta_foreach_f *f, void *arg);
 
 /*
  * Look up address @key in routing table @fib.
  *
  * A @keylen of 4 selects an IPv4 lookup (@key points to an in_addr_t);
  * any other length is treated as IPv6 (@key points to a struct in6_addr).
  * Returns whatever rtalloc1_fib() returns for the resulting sockaddr.
  */
 static struct rtentry *
 lookup_kfib(void *key, int keylen, int fib)
 {
 	/*
 	 * The sockaddr storage lives at function scope: it is still
 	 * referenced through @s when rtalloc1_fib() is called, so it must
 	 * not be declared inside the branches below.
 	 */
 	struct sockaddr_in sin;
 	struct sockaddr_in6 sin6;
 	struct sockaddr *s;
 
 	if (keylen == 4) {
 		bzero(&sin, sizeof(sin));
 		sin.sin_len = sizeof(struct sockaddr_in);
 		sin.sin_family = AF_INET;
 		sin.sin_addr.s_addr = *(in_addr_t *)key;
 		s = (struct sockaddr *)&sin;
 	} else {
 		bzero(&sin6, sizeof(sin6));
 		sin6.sin6_len = sizeof(struct sockaddr_in6);
 		sin6.sin6_family = AF_INET6;
 		sin6.sin6_addr = *(struct in6_addr *)key;
 		s = (struct sockaddr *)&sin6;
 	}
 
 	return (rtalloc1_fib(s, 0, 0, fib));
 }
 
 /*
  * Runtime lookup: returns 1 with *@val set to 0 if lookup_kfib() finds an
  * entry for @key in the fib stored in @ti->data (the entry is released
  * again with RTFREE_LOCKED()), 0 otherwise.
  */
 static int
 ta_lookup_kfib(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val)
 {
 	struct rtentry *rte;
 
 	rte = lookup_kfib(key, keylen, ti->data);
 	if (rte == NULL)
 		return (0);
 
 	*val = 0;
 	RTFREE_LOCKED(rte);
 
 	return (1);
 }
 
 /*
  * Parse 'fib=%d'.
  *
  * @data is the algorithm configuration string: a name, optionally
  * followed by spaces and "fib=<number>".  The string is modified in place
  * (the option is NUL-terminated at the next space).
  *
  * Returns 0 if @data is NULL or contains no space (*@pfib is left
  * untouched), or if the fib number was parsed into *@pfib.  Returns EINVAL
  * if the option is not "fib=", or if its value is empty, negative, not a
  * decimal number or does not fit into an int.
  */
 static int
 kfib_parse_opts(int *pfib, char *data)
 {
 	char *pdel, *pend, *s;
 	long fibnum;
 
 	if (data == NULL)
 		return (0);
 	if ((pdel = strchr(data, ' ')) == NULL)
 		return (0);
 	while (*pdel == ' ')
 		pdel++;
 	if (strncmp(pdel, "fib=", 4) != 0)
 		return (EINVAL);
 	if ((s = strchr(pdel, ' ')) != NULL)
 		*s++ = '\0';
 
 	pdel += 4;
 	/* Need \d+: at least one digit and nothing after the number */
 	fibnum = strtol(pdel, &pend, 10);
 	if (pend == pdel || *pend != '\0')
 		return (EINVAL);
 	/* Fib numbers are non-negative and have to fit into an int */
 	if (fibnum < 0 || fibnum != (long)(int)fibnum)
 		return (EINVAL);
 
 	*pfib = (int)fibnum;
 
 	return (0);
 }
 
 /*
  * Format the algorithm configuration string into @buf (at most @bufsize
  * bytes): "addr:kfib", followed by " fib=<n>" when @ti->data is non-zero.
  */
 static void
 ta_print_kfib_config(void *ta_state, struct table_info *ti, char *buf,
     size_t bufsize)
 {
 
 	if (ti->data == 0) {
 		snprintf(buf, bufsize, "%s", "addr:kfib");
 		return;
 	}
 
 	snprintf(buf, bufsize, "%s fib=%lu", "addr:kfib", ti->data);
 }
 
 /*
  * Initialise an addr:kfib table instance.
  *
  * The fib number defaults to 0 and may be overridden by a 'fib=' option in
  * @data.  Returns the error from kfib_parse_opts() if parsing fails, E2BIG
  * if the fib number is not below rt_numfibs, and 0 after storing the fib
  * number and the lookup callback in @ti.
  */
 static int
 ta_init_kfib(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti,
     char *data, uint8_t tflags)
 {
 	int fibnum, error;
 
 	fibnum = 0;
 	error = kfib_parse_opts(&fibnum, data);
 	if (error != 0)
 		return (error);
 	if (fibnum >= rt_numfibs)
 		return (E2BIG);
 
 	ti->lookup = ta_lookup_kfib;
 	ti->data = fibnum;
 
 	return (0);
 }
 
 /*
  * Destroys table @ti.
  *
  * Intentionally empty: this function releases nothing.
  */
 static void
 ta_destroy_kfib(void *ta_state, struct table_info *ti)
 {
 
 }
 
 /*
  * Fill @tinfo with the algorithm-specific description of the table.
  *
  * Both address families are reported as radix-class with a zero entry
  * count and an item size of sizeof(struct rtentry).
  */
 static void
 ta_dump_kfib_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo)
 {
 
 	tinfo->flags = IPFW_TATFLAGS_AFDATA;
 
 	/* IPv4 part */
 	tinfo->taclass4 = IPFW_TACLASS_RADIX;
 	tinfo->itemsize4 = sizeof(struct rtentry);
 	tinfo->count4 = 0;
 
 	/* IPv6 part */
 	tinfo->taclass6 = IPFW_TACLASS_RADIX;
 	tinfo->itemsize6 = sizeof(struct rtentry);
 	tinfo->count6 = 0;
 }
 
 /*
  * Measure a netmask stored most-significant-bit first in @p.
  *
  * Only the first @len bits are examined.  Returns the number of leading
  * one bits, or -1 if a one bit appears anywhere after the first zero bit.
  */
 static int
 contigmask(uint8_t *p, int len)
 {
 	int ones, pos;
 
 	/* Count the leading run of set bits. */
 	ones = 0;
 	while (ones < len && (p[ones >> 3] & (0x80 >> (ones & 7))) != 0)
 		ones++;
 
 	/* Everything past the first clear bit has to be clear as well. */
 	pos = ones + 1;
 	while (pos < len) {
 		if ((p[pos >> 3] & (0x80 >> (pos & 7))) != 0)
 			return (-1);
 		pos++;
 	}
 
 	return (ones);
 }
 
 
 /*
  * Export routing entry @e into @tent.
  *
  * The address family is taken from the entry's key sockaddr.  For AF_INET
  * and AF_INET6 keys the address, prefix length and subtype are stored and
  * v.kidx is set to 0; for any other family @tent is left untouched.  A
  * missing mask yields the full host length (32 or 128) and a
  * non-contiguous mask yields length 0.  Always returns 0.
  */
 static int
 ta_dump_kfib_tentry(void *ta_state, struct table_info *ti, void *e,
     ipfw_obj_tentry *tent)
 {
 	struct rtentry *rte;
 	/*
 	 * Family-neutral views of key and mask.  They are declared outside of
 	 * the INET/INET6 conditionals so that the function also builds when
 	 * only one of the two options is enabled.
 	 */
 	struct sockaddr *key, *netmask;
 #ifdef INET
 	struct sockaddr_in *addr, *mask;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *addr6, *mask6;
 #endif
 	int len;
 
 	rte = (struct rtentry *)e;
 	key = (struct sockaddr *)rt_key(rte);
 	netmask = (struct sockaddr *)rt_mask(rte);
 	len = 0;
 
 	/* Guess IPv4/IPv6 radix by sockaddr family */
 #ifdef INET
 	if (key->sa_family == AF_INET) {
 		addr = (struct sockaddr_in *)key;
 		mask = (struct sockaddr_in *)netmask;
 		tent->k.addr.s_addr = addr->sin_addr.s_addr;
 		len = 32;
 		if (mask != NULL)
 			len = contigmask((uint8_t *)&mask->sin_addr, 32);
 		if (len == -1)
 			len = 0;
 		tent->masklen = len;
 		tent->subtype = AF_INET;
 		tent->v.kidx = 0; /* Do we need to put GW here? */
 	}
 #endif
 #ifdef INET6
 	if (key->sa_family == AF_INET6) {
 		addr6 = (struct sockaddr_in6 *)key;
 		mask6 = (struct sockaddr_in6 *)netmask;
 		memcpy(&tent->k, &addr6->sin6_addr, sizeof(struct in6_addr));
 		len = 128;
 		if (mask6 != NULL)
 			len = contigmask((uint8_t *)&mask6->sin6_addr, 128);
 		if (len == -1)
 			len = 0;
 		tent->masklen = len;
 		tent->subtype = AF_INET6;
 		tent->v.kidx = 0;
 	}
 #endif
 
 	return (0);
 }
 
 /*
  * Find the routing entry matching the key stored in @tent.
  *
  * tent->subtype selects the key: AF_INET uses k.addr, anything else uses
  * k.addr6.  On a hit the entry is exported into @tent through
  * ta_dump_kfib_tentry(), released, and 0 is returned.  On a miss ENOENT
  * is returned and @tent is left unchanged.
  */
 static int
 ta_find_kfib_tentry(void *ta_state, struct table_info *ti,
     ipfw_obj_tentry *tent)
 {
 	struct rtentry *rte;
 	void *key;
 	int keylen;
 
 	if (tent->subtype == AF_INET) {
 		key = &tent->k.addr;
 		keylen = sizeof(struct in_addr);
 	} else {
 		key = &tent->k.addr6;
 		keylen = sizeof(struct in6_addr);
 	}
 
 	/* A miss must be reported, not returned as success. */
 	rte = lookup_kfib(key, keylen, ti->data);
 	if (rte == NULL)
 		return (ENOENT);
 
 	ta_dump_kfib_tentry(ta_state, ti, rte, tent);
 	RTFREE_LOCKED(rte);
 
 	return (0);
 }
 
 /*
  * Call @f with @arg for every entry of the fib selected by ti->data.
  *
  * The AF_INET table is walked first, then the AF_INET6 table; a family
  * whose table head is NULL is skipped.  Each walk runs between
  * RIB_RLOCK() and RIB_RUNLOCK() on that table.  The value returned by
  * rnh_walktree() is stored in 'error' but not acted upon.
  *
  * NOTE(review): RIB_LOCK_READER is added by this revision; presumably it
  * declares the reader state RIB_RLOCK()/RIB_RUNLOCK() operate on - confirm
  * against the macro definition.
  */
 static void
 ta_foreach_kfib(void *ta_state, struct table_info *ti, ta_foreach_f *f,
     void *arg)
 {
 	struct rib_head *rh;
+	RIB_LOCK_READER;
 	int error;
 
 	rh = rt_tables_get_rnh(ti->data, AF_INET);
 	if (rh != NULL) {
 		RIB_RLOCK(rh); 
 		error = rh->rnh_walktree(&rh->head, (walktree_f_t *)f, arg);
 		RIB_RUNLOCK(rh);
 	}
 
 	rh = rt_tables_get_rnh(ti->data, AF_INET6);
 	if (rh != NULL) {
 		RIB_RLOCK(rh); 
 		error = rh->rnh_walktree(&rh->head, (walktree_f_t *)f, arg);
 		RIB_RUNLOCK(rh);
 	}
 }
 
 /*
  * Descriptor of the "addr:kfib" algorithm: an address table flagged
  * read-only (TA_FLAG_READONLY) that needs no per-request buffer
  * (ta_buf_size is 0).  Only the callbacks listed here are provided.
  */
 struct table_algo addr_kfib = {
 	.name		= "addr:kfib",
 	.type		= IPFW_TABLE_ADDR,
 	.flags		= TA_FLAG_READONLY,
 	.ta_buf_size	= 0,
 	.init		= ta_init_kfib,
 	.destroy	= ta_destroy_kfib,
 	.foreach	= ta_foreach_kfib,
 	.dump_tentry	= ta_dump_kfib_tentry,
 	.find_tentry	= ta_find_kfib_tentry,
 	.dump_tinfo	= ta_dump_kfib_tinfo,
 	.print_config	= ta_print_kfib_config,
 };
 
 /*
  * Register every table algorithm defined in this file with chain @ch.
  * Each call passes the descriptor, its size and the address of the
  * descriptor's own idx field.
  */
 void
 ipfw_table_algo_init(struct ip_fw_chain *ch)
 {
 	size_t algo_size;
 
 	algo_size = sizeof(struct table_algo);
 
 	/* Address tables. */
 	ipfw_add_table_algo(ch, &addr_radix, algo_size, &addr_radix.idx);
 	ipfw_add_table_algo(ch, &addr_hash, algo_size, &addr_hash.idx);
 	/* Interface, number and flow tables. */
 	ipfw_add_table_algo(ch, &iface_idx, algo_size, &iface_idx.idx);
 	ipfw_add_table_algo(ch, &number_array, algo_size, &number_array.idx);
 	ipfw_add_table_algo(ch, &flow_hash, algo_size, &flow_hash.idx);
 	/* Kernel fib backed address table. */
 	ipfw_add_table_algo(ch, &addr_kfib, algo_size, &addr_kfib.idx);
 }
 
 /*
  * Remove from chain @ch every algorithm that ipfw_table_algo_init()
  * registers, identifying each one by the idx stored in its descriptor.
  */
 void
 ipfw_table_algo_destroy(struct ip_fw_chain *ch)
 {
 
 	ipfw_del_table_algo(ch, addr_radix.idx);
 	ipfw_del_table_algo(ch, addr_hash.idx);
 	ipfw_del_table_algo(ch, iface_idx.idx);
 	ipfw_del_table_algo(ch, number_array.idx);
 	ipfw_del_table_algo(ch, flow_hash.idx);
 	ipfw_del_table_algo(ch, addr_kfib.idx);
 }
 
 
Index: projects/routing/sys/nfs/bootp_subr.c
===================================================================
--- projects/routing/sys/nfs/bootp_subr.c	(revision 274335)
+++ projects/routing/sys/nfs/bootp_subr.c	(revision 274336)
@@ -1,1866 +1,1866 @@
 /*-
  * Copyright (c) 1995 Gordon Ross, Adam Glass
  * Copyright (c) 1992 Regents of the University of California.
  * All rights reserved.
  *
  * This software was developed by the Computer Systems Engineering group
  * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
  * contributed to Berkeley.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Lawrence Berkeley Laboratory and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * based on:
  *      nfs/krpc_subr.c
  *	$NetBSD: krpc_subr.c,v 1.10 1995/08/08 20:43:43 gwr Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_bootp.h"
 #include "opt_nfs.h"
 #include "opt_rootdevname.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/sockio.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/reboot.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/uio.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <net/if_types.h>
 #include <net/if_dl.h>
 #include <net/vnet.h>
 
 #include <nfs/nfsproto.h>
 #include <nfsclient/nfs.h>
 #include <nfs/nfsdiskless.h>
 #include <nfs/krpc.h>
 #include <nfs/xdr_subs.h>
 
 
 #define BOOTP_MIN_LEN		300	/* Minimum size of bootp udp packet */
 
 #ifndef BOOTP_SETTLE_DELAY
 #define BOOTP_SETTLE_DELAY 3
 #endif
 
 /* 
  * Wait 10 seconds for interface appearance
  * USB ethernet adapters might require some time to pop up
  */
 #ifndef	BOOTP_IFACE_WAIT_TIMEOUT
 #define	BOOTP_IFACE_WAIT_TIMEOUT	10
 #endif
 
 /*
  * What is the longest we will wait before re-sending a request?
  * Note this is also the frequency of "RPC timeout" messages.
  * The re-send loop counts up linearly to this maximum, so the
  * first complaint will happen after (1+2+3+4+5)=15 seconds.
  */
 #define	MAX_RESEND_DELAY 5	/* seconds */
 
 /*
  * Definitions from RFC951.
  *
  * On-the-wire layout of a request/reply as sent and received by this file.
  * NOTE(review): vend[] is far larger than the 64-byte vendor area of
  * RFC 951; presumably sized to hold DHCP options - confirm.
  */
 struct bootp_packet {
 	u_int8_t op;			/* compared against BOOTP_REPLY on receive */
 	u_int8_t htype;
 	u_int8_t hlen;			/* number of chaddr bytes that are compared */
 	u_int8_t hops;
 	u_int32_t xid;			/* matches a reply to the query that was sent */
 	u_int16_t secs;			/* stored with htons() */
 	u_int16_t flags;
 	struct in_addr ciaddr;
 	struct in_addr yiaddr;
 	struct in_addr siaddr;		/* printed as the sender of a reply */
 	struct in_addr giaddr;		/* printed as "via" when not INADDR_ANY */
 	unsigned char chaddr[16];
 	char sname[64];
 	char file[128];
 	unsigned char vend[1222];
 };
 
 /*
  * Per-interface negotiation state; instances are linked on
  * bootpc_globalcontext.interfaces.
  */
 struct bootpc_ifcontext {
 	STAILQ_ENTRY(bootpc_ifcontext) next;
 	struct bootp_packet call;	/* query transmitted for this interface */
 	struct bootp_packet reply;	/* copy of the last accepted reply */
 	int replylen;			/* number of valid bytes in 'reply' */
 	int overload;
 	/*
 	 * ireq and iareq share the same storage: writing through one of them
 	 * overwrites the other.
 	 */
 	union {
 		struct ifreq _ifreq;
 		struct in_aliasreq _in_alias_req;
 	} _req;
 #define	ireq	_req._ifreq
 #define	iareq	_req._in_alias_req
 	struct ifnet *ifp;
 	struct sockaddr_dl *sdl;	/* link-level address, printed when sending */
 	struct sockaddr_in myaddr;
 	struct sockaddr_in netmask;
 	struct sockaddr_in gw;
 	int gotgw;			/* accepted reply carried TAG_ROUTERS */
 	int gotnetmask;			/* accepted reply carried TAG_SUBNETMASK */
 	int gotrootpath;		/* accepted reply carried TAG_ROOT */
 	int outstanding;		/* set while a reply is still awaited */
 	int sentmsg;			/* "Sending ..." was printed for this query */
 	u_int32_t xid;
 	enum {
 		IF_BOOTP_UNRESOLVED,
 		IF_BOOTP_RESOLVED,
 		IF_BOOTP_FAILED,
 		IF_DHCP_UNRESOLVED,
 		IF_DHCP_OFFERED,
 		IF_DHCP_RESOLVED,
 		IF_DHCP_FAILED,
 	} state;
 	int dhcpquerytype;		/* dhcp type sent */
 	struct in_addr dhcpserver;	/* copied from TAG_DHCP_SERVERID */
 	int gotdhcpserver;		/* 'dhcpserver' holds a valid address */
 };
 
 #define TAG_MAXLEN 1024
 /*
  * Scratch state handed to bootpc_tag() for each option lookup.
  */
 struct bootpc_tagcontext {
 	char buf[TAG_MAXLEN + 1];
 	int overload;
 	int badopt;	/* non-zero makes bootpc_received() ignore the packet */
 	int badtag;
 	int foundopt;
 	int taglen;	/* checked by callers, e.g. 4 for a DHCP server id */
 };
 
 /*
  * State shared by all interfaces during one BOOTP/DHCP run.
  */
 struct bootpc_globalcontext {
 	STAILQ_HEAD(, bootpc_ifcontext) interfaces;
 	u_int32_t xid;		/* next xid to hand out; advanced by 0x100 each */
 	int any_root_overrides;
 	int gotrootpath;	/* set once some interface obtained a root path */
 	int gotgw;
 	int ifnum;
 	int secs;		/* time_second - starttime, updated on receive */
 	int starttime;
 	struct bootp_packet reply;	/* receive buffer for incoming packets */
 	int replylen;			/* number of bytes received into 'reply' */
 	struct bootpc_ifcontext *setrootfs;
 	struct bootpc_ifcontext *sethostname;
 	struct bootpc_tagcontext tmptag;
 	struct bootpc_tagcontext tag;
 };
 
 #define IPPORT_BOOTPC 68
 #define IPPORT_BOOTPS 67
 
 #define BOOTP_REQUEST 1
 #define BOOTP_REPLY 2
 
 /* Common tags */
 #define TAG_PAD		  0  /* Pad option, implicit length 1 */
 #define TAG_SUBNETMASK	  1  /* RFC 950 subnet mask */
 #define TAG_ROUTERS	  3  /* Routers (in order of preference) */
 #define TAG_HOSTNAME	 12  /* Client host name */
 #define TAG_ROOT	 17  /* Root path */
 
 /* DHCP specific tags */
 #define TAG_OVERLOAD	 52  /* Option Overload */
 #define TAG_MAXMSGSIZE   57  /* Maximum DHCP Message Size */
 
 #define TAG_END		255  /* End Option (i.e. no more options) */
 
 /* Overload values */
 #define OVERLOAD_FILE     1
 #define OVERLOAD_SNAME    2
 
 /* Site specific tags: */
 #define TAG_ROOTOPTS	130
 #define TAG_COOKIE	134	/* ascii info for userland, via sysctl */
 
 #define TAG_DHCP_MSGTYPE 53
 #define TAG_DHCP_REQ_ADDR 50
 #define TAG_DHCP_SERVERID 54
 #define TAG_DHCP_LEASETIME 51
 
 #define TAG_VENDOR_INDENTIFIER 60
 
 #define DHCP_NOMSG    0
 #define DHCP_DISCOVER 1
 #define DHCP_OFFER    2
 #define DHCP_REQUEST  3
 #define DHCP_ACK      5
 
 /* NFS read/write block size */
 #ifndef BOOTP_BLOCKSIZE
 #define	BOOTP_BLOCKSIZE	8192
 #endif
 
 static char bootp_cookie[128];
 static struct socket *bootp_so;
 SYSCTL_STRING(_kern, OID_AUTO, bootp_cookie, CTLFLAG_RD,
 	bootp_cookie, 0, "Cookie (T134) supplied by bootp server");
 
 /* mountd RPC */
 static int	md_mount(struct sockaddr_in *mdsin, char *path, u_char *fhp,
 		    int *fhsizep, struct nfs_args *args, struct thread *td);
 static int	setfs(struct sockaddr_in *addr, char *path, char *p,
 		    const struct in_addr *siaddr);
 static int	getdec(char **ptr);
 static int	getip(char **ptr, struct in_addr *ip);
 static void	mountopts(struct nfs_args *args, char *p);
 static int	xdr_opaque_decode(struct mbuf **ptr, u_char *buf, int len);
 static int	xdr_int_decode(struct mbuf **ptr, int *iptr);
 static void	print_in_addr(struct in_addr addr);
 static void	print_sin_addr(struct sockaddr_in *addr);
 static void	clear_sinaddr(struct sockaddr_in *sin);
 static void	allocifctx(struct bootpc_globalcontext *gctx);
 static void	bootpc_compose_query(struct bootpc_ifcontext *ifctx,
 		    struct thread *td);
 static unsigned char *bootpc_tag(struct bootpc_tagcontext *tctx,
 		    struct bootp_packet *bp, int len, int tag);
 static void bootpc_tag_helper(struct bootpc_tagcontext *tctx,
 		    unsigned char *start, int len, int tag);
 
 #ifdef BOOTP_DEBUG
 void bootpboot_p_sa(struct sockaddr *sa, struct sockaddr *ma);
 void bootpboot_p_rtentry(struct rtentry *rt);
 void bootpboot_p_tree(struct radix_node *rn);
 void bootpboot_p_rtlist(void);
 void bootpboot_p_if(struct ifnet *ifp, struct ifaddr *ifa);
 void bootpboot_p_iflist(void);
 #endif
 
 static int	bootpc_call(struct bootpc_globalcontext *gctx,
 		    struct thread *td);
 
 static void	bootpc_fakeup_interface(struct bootpc_ifcontext *ifctx,
 		    struct thread *td);
 
 static int	bootpc_adjust_interface(struct bootpc_ifcontext *ifctx,
 		    struct bootpc_globalcontext *gctx, struct thread *td);
 
 static void	bootpc_decode_reply(struct nfsv3_diskless *nd,
 		    struct bootpc_ifcontext *ifctx,
 		    struct bootpc_globalcontext *gctx);
 
 static int	bootpc_received(struct bootpc_globalcontext *gctx,
 		    struct bootpc_ifcontext *ifctx);
 
 static __inline int bootpc_ifctx_isresolved(struct bootpc_ifcontext *ifctx);
 static __inline int bootpc_ifctx_isunresolved(struct bootpc_ifcontext *ifctx);
 static __inline int bootpc_ifctx_isfailed(struct bootpc_ifcontext *ifctx);
 
 /*
  * In order to have multiple active interfaces with address 0.0.0.0
  * and be able to send data to a selected interface, we first set
  * mask to /8 on all interfaces, and temporarily set it to /0 when
  * doing sosend().
  */
 
 #ifdef BOOTP_DEBUG
 /*
  * Debug helper: print socket address @sa, followed by mask @ma when @sa is
  * an AF_INET address and @ma is not NULL.  AF_LINK addresses are printed
  * as interface name plus colon-separated hex bytes; any other family is
  * printed as "af<number>".
  */
 void
 bootpboot_p_sa(struct sockaddr *sa, struct sockaddr *ma)
 {
 
 	if (sa == NULL) {
 		printf("(sockaddr *) <null>");
 		return;
 	}
 
 	if (sa->sa_family == AF_INET) {
 		printf("inet ");
 		print_sin_addr((struct sockaddr_in *) sa);
 		if (ma != NULL) {
 			printf(" mask ");
 			print_sin_addr((struct sockaddr_in *) ma);
 		}
 	} else if (sa->sa_family == AF_LINK) {
 		struct sockaddr_dl *link;
 		int idx;
 
 		link = (struct sockaddr_dl *) sa;
 		printf("link %.*s ", link->sdl_nlen, link->sdl_data);
 		for (idx = 0; idx < link->sdl_alen; idx++) {
 			/* Separator goes between bytes, not before the first. */
 			if (idx > 0)
 				printf(":");
 			printf("%x", ((unsigned char *) LLADDR(link))[idx]);
 		}
 	} else {
 		printf("af%d", sa->sa_family);
 	}
 }
 
 /*
  * Debug helper: print one routing entry on a single line - key and mask,
  * gateway, flags, expiry and interface name.
  */
 void
 bootpboot_p_rtentry(struct rtentry *rt)
 {
 	struct sockaddr *dst, *netmask;
 
 	dst = rt_key(rt);
 	netmask = rt_mask(rt);
 
 	bootpboot_p_sa(dst, netmask);
 	printf(" ");
 	bootpboot_p_sa(rt->rt_gateway, NULL);
 	printf(" ");
 	printf("flags %x", (unsigned short) rt->rt_flags);
 	printf(" %d", (int) rt->rt_expire);
 	printf(" %s\n", rt->rt_ifp->if_xname);
 }
 
 /*
  * Debug helper: print every routing entry reachable from radix node @rn.
  *
  * A node with a non-negative rn_bit is descended into (left, then right).
  * A node with a negative rn_bit is printed unless it carries RNF_ROOT,
  * and its rn_dupedkey chain is followed.
  */
 void
 bootpboot_p_tree(struct radix_node *rn)
 {
 
 	for (; rn != NULL; rn = rn->rn_dupedkey) {
 		if (rn->rn_bit >= 0) {
 			bootpboot_p_tree(rn->rn_left);
 			bootpboot_p_tree(rn->rn_right);
 			return;
 		}
 		if ((rn->rn_flags & RNF_ROOT) == 0)
 			bootpboot_p_rtentry((struct rtentry *) rn);
 	}
 }
 
 /*
  * Debug helper: print the AF_INET routing table of fib 0.
  *
  * Does nothing beyond printing the heading when that table does not
  * exist.  The walk runs between RIB_CFG_RLOCK() and RIB_CFG_RUNLOCK().
  * The table head is named 'rnh' throughout, matching the name the lock
  * calls of this revision use.
  */
 void
 bootpboot_p_rtlist(void)
 {
-	struct rib_head *rh;
+	struct rib_head *rnh;
 
 	printf("Routing table:\n");
 	rnh = rt_tables_get_rnh(0, AF_INET);
 	if (rnh == NULL)
 		return;
-	RIB_RLOCK(rnh);	/* could sleep XXX */
+	RIB_CFG_RLOCK(rnh);	/* could sleep XXX */
-	bootpboot_p_tree(rh->rnh_treetop);
+	bootpboot_p_tree(rnh->rnh_treetop);
-	RIB_RUNLOCK(rnh);
+	RIB_CFG_RUNLOCK(rnh);
 }
 
 /*
  * Debug helper: print name and flags of @ifp together with the address,
  * broadcast/destination address and netmask of @ifa, each treated as a
  * struct sockaddr_in.
  */
 void
 bootpboot_p_if(struct ifnet *ifp, struct ifaddr *ifa)
 {
 	struct sockaddr_in *addr, *bcast, *mask;
 
 	addr = (struct sockaddr_in *) ifa->ifa_addr;
 	bcast = (struct sockaddr_in *) ifa->ifa_dstaddr;
 	mask = (struct sockaddr_in *) ifa->ifa_netmask;
 
 	printf("%s flags %x, addr ", ifp->if_xname, ifp->if_flags);
 	print_sin_addr(addr);
 	printf(", broadcast ");
 	print_sin_addr(bcast);
 	printf(", netmask ");
 	print_sin_addr(mask);
 	printf("\n");
 }
 
 void
 bootpboot_p_iflist(void)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 
 	printf("Interface list:\n");
 	IFNET_RLOCK();
 	for (ifp = TAILQ_FIRST(&V_ifnet);
 	     ifp != NULL;
 	     ifp = TAILQ_NEXT(ifp, if_link)) {
 		for (ifa = TAILQ_FIRST(&ifp->if_addrhead);
 		     ifa != NULL;
 		     ifa = TAILQ_NEXT(ifa, ifa_link))
 			if (ifa->ifa_addr->sa_family == AF_INET)
 				bootpboot_p_if(ifp, ifa);
 	}
 	IFNET_RUNLOCK();
 }
 #endif /* defined(BOOTP_DEBUG) */
 
 /*
  * Reset @sin to an AF_INET socket address with address INADDR_ANY and
  * port 0; every other byte of the structure is zeroed.
  */
 static void
 clear_sinaddr(struct sockaddr_in *sin)
 {
 
 	bzero(sin, sizeof(struct sockaddr_in));
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(struct sockaddr_in);
 	sin->sin_port = 0;
 	sin->sin_addr.s_addr = INADDR_ANY; /* XXX: htonl(INADDR_ANY) ? */
 }
 
 static void
 allocifctx(struct bootpc_globalcontext *gctx)
 {
 	struct bootpc_ifcontext *ifctx;
 
 	ifctx = malloc(sizeof(*ifctx), M_TEMP, M_WAITOK | M_ZERO);
 	ifctx->xid = gctx->xid;
 #ifdef BOOTP_NO_DHCP
 	ifctx->state = IF_BOOTP_UNRESOLVED;
 #else
 	ifctx->state = IF_DHCP_UNRESOLVED;
 #endif
 	gctx->xid += 0x100;
 	STAILQ_INSERT_TAIL(&gctx->interfaces, ifctx, next);
 }
 
 /*
  * Returns 1 when @ifctx is in state IF_BOOTP_RESOLVED or
  * IF_DHCP_RESOLVED, 0 otherwise.
  */
 static __inline int
 bootpc_ifctx_isresolved(struct bootpc_ifcontext *ifctx)
 {
 
 	return (ifctx->state == IF_BOOTP_RESOLVED ||
 	    ifctx->state == IF_DHCP_RESOLVED);
 }
 
 static __inline int
 bootpc_ifctx_isunresolved(struct bootpc_ifcontext *ifctx)
 {
 
 	if (ifctx->state == IF_BOOTP_UNRESOLVED ||
 	    ifctx->state == IF_DHCP_UNRESOLVED)
 		return 1;
 	return 0;
 }
 
 /*
  * Returns 1 when @ifctx is in state IF_BOOTP_FAILED or IF_DHCP_FAILED,
  * 0 otherwise.
  */
 static __inline int
 bootpc_ifctx_isfailed(struct bootpc_ifcontext *ifctx)
 {
 
 	return (ifctx->state == IF_BOOTP_FAILED ||
 	    ifctx->state == IF_DHCP_FAILED);
 }
 
 /*
  * Decide whether the packet held in gctx->reply is accepted for @ifctx.
  *
  * Returns 0 when the packet is ignored: bad options, a DHCP message type
  * that does not answer the query that was sent, or nothing new for an
  * interface that already holds a reply.  Returns 1 after copying the
  * packet into ifctx->reply, advancing ifctx->state and recording either
  * the DHCP server id (while the interface is still negotiating) or which
  * of the root path / router / netmask tags are present.
  */
 static int
 bootpc_received(struct bootpc_globalcontext *gctx,
     struct bootpc_ifcontext *ifctx)
 {
 	unsigned char dhcpreplytype;
 	char *p;
 
 	/*
 	 * Need timeout for fallback to less
 	 * desirable alternative.
 	 */
 
 	/* This call used for the side effect (badopt flag) */
 	(void) bootpc_tag(&gctx->tmptag, &gctx->reply,
 			  gctx->replylen,
 			  TAG_END);
 
 	/* If packet is invalid, ignore it */
 	if (gctx->tmptag.badopt != 0)
 		return 0;
 
 	/* A packet without a message type tag is treated as plain BOOTP. */
 	p = bootpc_tag(&gctx->tmptag, &gctx->reply,
 		       gctx->replylen, TAG_DHCP_MSGTYPE);
 	if (p != NULL)
 		dhcpreplytype = *p;
 	else
 		dhcpreplytype = DHCP_NOMSG;
 
 	/* The reply type must fit the query type that was sent. */
 	switch (ifctx->dhcpquerytype) {
 	case DHCP_DISCOVER:
 		if (dhcpreplytype != DHCP_OFFER 	/* Normal DHCP offer */
 #ifndef BOOTP_FORCE_DHCP
 		    && dhcpreplytype != DHCP_NOMSG	/* Fallback to BOOTP */
 #endif
 			)
 			return 0;
 		break;
 	case DHCP_REQUEST:
 		if (dhcpreplytype != DHCP_ACK)
 			return 0;
 		/* falls through to the break below */
 	case DHCP_NOMSG:
 		break;
 	}
 
 	/* Ignore packet unless it gives us a root tag we didn't have */
 
 	if ((ifctx->state == IF_BOOTP_RESOLVED ||
 	     (ifctx->dhcpquerytype == DHCP_DISCOVER &&
 	      (ifctx->state == IF_DHCP_OFFERED ||
 	       ifctx->state == IF_DHCP_RESOLVED))) &&
 	    (bootpc_tag(&gctx->tmptag, &ifctx->reply,
 			ifctx->replylen,
 			TAG_ROOT) != NULL ||
 	     bootpc_tag(&gctx->tmptag, &gctx->reply,
 			gctx->replylen,
 			TAG_ROOT) == NULL))
 		return 0;
 
 	/* Accepted: keep a private copy of the packet for this interface. */
 	bcopy(&gctx->reply, &ifctx->reply, gctx->replylen);
 	ifctx->replylen = gctx->replylen;
 
 	/* XXX: Only reset if 'perfect' response */
 	if (ifctx->state == IF_BOOTP_UNRESOLVED)
 		ifctx->state = IF_BOOTP_RESOLVED;
 	else if (ifctx->state == IF_DHCP_UNRESOLVED &&
 		 ifctx->dhcpquerytype == DHCP_DISCOVER) {
 		if (dhcpreplytype == DHCP_OFFER)
 			ifctx->state = IF_DHCP_OFFERED;
 		else
 			ifctx->state = IF_BOOTP_RESOLVED;	/* Fallback */
 	} else if (ifctx->state == IF_DHCP_OFFERED &&
 		   ifctx->dhcpquerytype == DHCP_REQUEST)
 		ifctx->state = IF_DHCP_RESOLVED;
 
 
 	/*
 	 * Answer to a DISCOVER that did not fall back to BOOTP: remember
 	 * the server id (only a 4-byte value is accepted) and return without
 	 * evaluating the remaining tags.
 	 */
 	if (ifctx->dhcpquerytype == DHCP_DISCOVER &&
 	    ifctx->state != IF_BOOTP_RESOLVED) {
 		p = bootpc_tag(&gctx->tmptag, &ifctx->reply,
 			       ifctx->replylen, TAG_DHCP_SERVERID);
 		if (p != NULL && gctx->tmptag.taglen == 4) {
 			memcpy(&ifctx->dhcpserver, p, 4);
 			ifctx->gotdhcpserver = 1;
 		} else
 			ifctx->gotdhcpserver = 0;
 		return 1;
 	}
 
 	/* Record which of the interesting tags the stored reply carries. */
 	ifctx->gotrootpath = (bootpc_tag(&gctx->tmptag, &ifctx->reply,
 					 ifctx->replylen,
 					 TAG_ROOT) != NULL);
 	ifctx->gotgw = (bootpc_tag(&gctx->tmptag, &ifctx->reply,
 				   ifctx->replylen,
 				   TAG_ROUTERS) != NULL);
 	ifctx->gotnetmask = (bootpc_tag(&gctx->tmptag, &ifctx->reply,
 					ifctx->replylen,
 					TAG_SUBNETMASK) != NULL);
 	return 1;
 }
 
 /*
  * Run the BOOTP/DHCP exchange on socket bootp_so for every interface on
  * gctx->interfaces.
  *
  * The socket gets a one second receive timeout, broadcast enabled and
  * routing disabled, and is bound to the BOOTP client port.  Queries are
  * then broadcast repeatedly with a growing wait (capped at
  * MAX_RESEND_DELAY seconds) while replies are matched to interfaces and
  * handed to bootpc_received().
  *
  * Returns 0 when nothing is outstanding any more, when a root path was
  * obtained, or (unless BOOTP_NFSROOT is defined) when at least one
  * interface is resolved; ETIMEDOUT otherwise; or the error of a failing
  * sosetopt()/sobind()/soreceive() call.
  */
 static int
 bootpc_call(struct bootpc_globalcontext *gctx, struct thread *td)
 {
 	struct sockaddr_in *sin, dst;
 	struct uio auio;
 	struct sockopt sopt;
 	struct iovec aio;
 	int error, on, rcvflg, timo, len;
 	time_t atimo;
 	time_t rtimo;
 	struct timeval tv;
 	struct bootpc_ifcontext *ifctx;
 	int outstanding;
 	int gotrootpath;
 	int retry;
 	const char *s;
 
 	/* One second receive timeout, see the receive loop below. */
 	tv.tv_sec = 1;
 	tv.tv_usec = 0;
 	bzero(&sopt, sizeof(sopt));
 	sopt.sopt_dir = SOPT_SET;
 	sopt.sopt_level = SOL_SOCKET;
 	sopt.sopt_name = SO_RCVTIMEO;
 	sopt.sopt_val = &tv;
 	sopt.sopt_valsize = sizeof tv;
 
 	error = sosetopt(bootp_so, &sopt);
 	if (error != 0)
 		goto out;
 
 	/*
 	 * Enable broadcast.
 	 */
 	on = 1;
 	sopt.sopt_name = SO_BROADCAST;
 	sopt.sopt_val = &on;
 	sopt.sopt_valsize = sizeof on;
 
 	error = sosetopt(bootp_so, &sopt);
 	if (error != 0)
 		goto out;
 
 	/*
 	 * Disable routing.
 	 */
 
 	on = 1;
 	sopt.sopt_name = SO_DONTROUTE;
 	sopt.sopt_val = &on;
 	sopt.sopt_valsize = sizeof on;
 
 	error = sosetopt(bootp_so, &sopt);
 	if (error != 0)
 		goto out;
 
 	/*
 	 * Bind the local endpoint to a bootp client port.
 	 */
 	sin = &dst;
 	clear_sinaddr(sin);
 	sin->sin_port = htons(IPPORT_BOOTPC);
 	error = sobind(bootp_so, (struct sockaddr *)sin, td);
 	if (error != 0) {
 		printf("bind failed\n");
 		goto out;
 	}
 
 	/*
 	 * Setup socket address for the server.
 	 */
 	sin = &dst;
 	clear_sinaddr(sin);
 	sin->sin_addr.s_addr = INADDR_BROADCAST;
 	sin->sin_port = htons(IPPORT_BOOTPS);
 
 	/*
 	 * Send it, repeatedly, until a reply is received,
 	 * but delay each re-send by an increasing amount.
 	 * If the delay hits the maximum, start complaining.
 	 */
 	timo = 0;
 	rtimo = 0;
 	for (;;) {
 
 		outstanding = 0;
 		gotrootpath = 0;
 
 		/* Does any resolved interface already hold a root path? */
 		STAILQ_FOREACH(ifctx, &gctx->interfaces, next) {
 			if (bootpc_ifctx_isresolved(ifctx) != 0 &&
 			    bootpc_tag(&gctx->tmptag, &ifctx->reply,
 				       ifctx->replylen,
 				       TAG_ROOT) != NULL)
 				gotrootpath = 1;
 		}
 
 		/* Send (or re-send) a query on every interface that needs one. */
 		STAILQ_FOREACH(ifctx, &gctx->interfaces, next) {
 			struct in_aliasreq *ifra = &ifctx->iareq;
 			sin = (struct sockaddr_in *)&ifra->ifra_mask;
 
 			ifctx->outstanding = 0;
 			if (bootpc_ifctx_isresolved(ifctx)  != 0 &&
 			    gotrootpath != 0) {
 				continue;
 			}
 			if (bootpc_ifctx_isfailed(ifctx) != 0)
 				continue;
 
 			outstanding++;
 			ifctx->outstanding = 1;
 
 			/* Proceed to next step in DHCP negotiation */
 			if ((ifctx->state == IF_DHCP_OFFERED &&
 			     ifctx->dhcpquerytype != DHCP_REQUEST) ||
 			    (ifctx->state == IF_DHCP_UNRESOLVED &&
 			     ifctx->dhcpquerytype != DHCP_DISCOVER) ||
 			    (ifctx->state == IF_BOOTP_UNRESOLVED &&
 			     ifctx->dhcpquerytype != DHCP_NOMSG)) {
 				ifctx->sentmsg = 0;
 				bootpc_compose_query(ifctx, td);
 			}
 
 			/* Send BOOTP request (or re-send). */
 
 			/* The message is printed once per composed query. */
 			if (ifctx->sentmsg == 0) {
 				switch(ifctx->dhcpquerytype) {
 				case DHCP_DISCOVER:
 					s = "DHCP Discover";
 					break;
 				case DHCP_REQUEST:
 					s = "DHCP Request";
 					break;
 				case DHCP_NOMSG:
 				default:
 					s = "BOOTP Query";
 					break;
 				}
 				printf("Sending %s packet from "
 				       "interface %s (%*D)\n",
 				       s,
 				       ifctx->ireq.ifr_name,
 				       ifctx->sdl->sdl_alen,
 				       (unsigned char *) LLADDR(ifctx->sdl),
 				       ":");
 				ifctx->sentmsg = 1;
 			}
 
 			aio.iov_base = (caddr_t) &ifctx->call;
 			aio.iov_len = sizeof(ifctx->call);
 
 			auio.uio_iov = &aio;
 			auio.uio_iovcnt = 1;
 			auio.uio_segflg = UIO_SYSSPACE;
 			auio.uio_rw = UIO_WRITE;
 			auio.uio_offset = 0;
 			auio.uio_resid = sizeof(ifctx->call);
 			auio.uio_td = td;
 
 			/* Set netmask to 0.0.0.0 */
 			clear_sinaddr(sin);
 			error = ifioctl(bootp_so, SIOCAIFADDR, (caddr_t)ifra,
 			    td);
 			if (error != 0)
 				panic("%s: SIOCAIFADDR, error=%d", __func__,
 				    error);
 
 			/* A send failure is only reported; the loop carries on. */
 			error = sosend(bootp_so, (struct sockaddr *) &dst,
 				       &auio, NULL, NULL, 0, td);
 			if (error != 0)
 				printf("%s: sosend: %d state %08x\n", __func__,
 				    error, (int )bootp_so->so_state);
 
 			/* Set netmask to 255.0.0.0 */
 			sin->sin_addr.s_addr = htonl(IN_CLASSA_NET);
 			error = ifioctl(bootp_so, SIOCAIFADDR, (caddr_t)ifra,
 			    td);
 			if (error != 0)
 				panic("%s: SIOCAIFADDR, error=%d", __func__,
 				    error);
 		}
 
 		/* Nothing left to wait for and no settle period pending. */
 		if (outstanding == 0 &&
 		    (rtimo == 0 || time_second >= rtimo)) {
 			error = 0;
 			goto out;
 		}
 
 		/* Determine new timeout. */
 		if (timo < MAX_RESEND_DELAY)
 			timo++;
 		else {
 			printf("DHCP/BOOTP timeout for server ");
 			print_sin_addr(&dst);
 			printf("\n");
 		}
 
 		/*
 		 * Wait for up to timo seconds for a reply.
 		 * The socket receive timeout was set to 1 second.
 		 */
 		atimo = timo + time_second;
 		while (time_second < atimo) {
 			aio.iov_base = (caddr_t) &gctx->reply;
 			aio.iov_len = sizeof(gctx->reply);
 
 			auio.uio_iov = &aio;
 			auio.uio_iovcnt = 1;
 			auio.uio_segflg = UIO_SYSSPACE;
 			auio.uio_rw = UIO_READ;
 			auio.uio_offset = 0;
 			auio.uio_resid = sizeof(gctx->reply);
 			auio.uio_td = td;
 
 			rcvflg = 0;
 			error = soreceive(bootp_so, NULL, &auio,
 					  NULL, NULL, &rcvflg);
 			/* Refresh 'secs' in the queries that are still pending. */
 			gctx->secs = time_second - gctx->starttime;
 			STAILQ_FOREACH(ifctx, &gctx->interfaces, next) {
 				if (bootpc_ifctx_isresolved(ifctx) != 0 ||
 				    bootpc_ifctx_isfailed(ifctx) != 0)
 					continue;
 
 				ifctx->call.secs = htons(gctx->secs);
 			}
 			/* Receive timeout: try again until atimo is reached. */
 			if (error == EWOULDBLOCK)
 				continue;
 			if (error != 0)
 				goto out;
 			len = sizeof(gctx->reply) - auio.uio_resid;
 
 			/* Do we have the required number of bytes ? */
 			if (len < BOOTP_MIN_LEN)
 				continue;
 			gctx->replylen = len;
 
 			/* Is it a reply? */
 			if (gctx->reply.op != BOOTP_REPLY)
 				continue;
 
 			/* Is this an answer to our query */
 			STAILQ_FOREACH(ifctx, &gctx->interfaces, next) {
 				if (gctx->reply.xid != ifctx->call.xid)
 					continue;
 
 				/* Same HW address size ? */
 				if (gctx->reply.hlen != ifctx->call.hlen)
 					continue;
 
 				/* Correct HW address ? */
 				if (bcmp(gctx->reply.chaddr,
 					 ifctx->call.chaddr,
 					 ifctx->call.hlen) != 0)
 					continue;
 
 				break;
 			}
 
 			/* ifctx is NULL here when no interface matched. */
 			if (ifctx != NULL) {
 				s =  bootpc_tag(&gctx->tmptag,
 						&gctx->reply,
 						gctx->replylen,
 						TAG_DHCP_MSGTYPE);
 				if (s != NULL) {
 					switch (*s) {
 					case DHCP_OFFER:
 						s = "DHCP Offer";
 						break;
 					case DHCP_ACK:
 						s = "DHCP Ack";
 						break;
 					default:
 						s = "DHCP (unexpected)";
 						break;
 					}
 				} else
 					s = "BOOTP Reply";
 
 				printf("Received %s packet"
 				       " on %s from ",
 				       s,
 				       ifctx->ireq.ifr_name);
 				print_in_addr(gctx->reply.siaddr);
 				if (gctx->reply.giaddr.s_addr !=
 				    htonl(INADDR_ANY)) {
 					printf(" via ");
 					print_in_addr(gctx->reply.giaddr);
 				}
 				if (bootpc_received(gctx, ifctx) != 0) {
 					printf(" (accepted)");
 					if (ifctx->outstanding) {
 						ifctx->outstanding = 0;
 						outstanding--;
 					}
 					/* Network settle delay */
 					if (outstanding == 0)
 						atimo = time_second +
 							BOOTP_SETTLE_DELAY;
 				} else
 					printf(" (ignored)");
 				if (ifctx->gotrootpath || 
 				    gctx->any_root_overrides) {
 					gotrootpath = 1;
 					rtimo = time_second +
 						BOOTP_SETTLE_DELAY;
 					if (ifctx->gotrootpath)
 						printf(" (got root path)");
 				}
 				printf("\n");
 			}
 		} /* while secs */
 #ifdef BOOTP_TIMEOUT
 		if (gctx->secs > BOOTP_TIMEOUT && BOOTP_TIMEOUT > 0)
 			break;
 #endif
 		/* Force a retry if halfway in DHCP negotiation */
 		retry = 0;
 		STAILQ_FOREACH(ifctx, &gctx->interfaces, next)
 			if (ifctx->state == IF_DHCP_OFFERED) {
 				if (ifctx->dhcpquerytype == DHCP_DISCOVER)
 					retry = 1;
 				else
 					ifctx->state = IF_DHCP_UNRESOLVED;
 			}
 
 		if (retry != 0)
 			continue;
 
 		/* Stop once a root path is known and its settle time passed. */
 		if (gotrootpath != 0) {
 			gctx->gotrootpath = gotrootpath;
 			if (rtimo != 0 && time_second >= rtimo)
 				break;
 		}
 	} /* forever send/receive */
 
 	/*
 	 * XXX: These are errors of varying seriousness being silently
 	 * ignored
 	 */
 
 	STAILQ_FOREACH(ifctx, &gctx->interfaces, next)
 		if (bootpc_ifctx_isresolved(ifctx) == 0) {
 			printf("%s timeout for interface %s\n",
 			       ifctx->dhcpquerytype != DHCP_NOMSG ?
 			       "DHCP" : "BOOTP",
 			       ifctx->ireq.ifr_name);
 		}
 
 	if (gctx->gotrootpath != 0) {
 #if 0
 		printf("Got a root path, ignoring remaining timeout\n");
 #endif
 		error = 0;
 		goto out;
 	}
 #ifndef BOOTP_NFSROOT
 	/* Without BOOTP_NFSROOT one resolved interface counts as success. */
 	STAILQ_FOREACH(ifctx, &gctx->interfaces, next)
 		if (bootpc_ifctx_isresolved(ifctx) != 0) {
 			error = 0;
 			goto out;
 		}
 #endif
 	error = ETIMEDOUT;
 
 out:
 	return (error);
 }
 
 /*
  * Bring the interface of @ifctx up and give it the temporary address
  * 0.0.0.0 with netmask 255.0.0.0 and the local broadcast address.
  *
  * Every ioctl failure panics.  Note that 'ifr' and 'ifra' point into the
  * same union inside @ifctx, so the flag ioctls have to be finished before
  * the alias request is filled in.
  */
 static void
 bootpc_fakeup_interface(struct bootpc_ifcontext *ifctx, struct thread *td)
 {
 	struct ifreq *ifr;
 	struct in_aliasreq *ifra;
 	struct sockaddr_in *sin;
 	int error;
 
 	ifr = &ifctx->ireq;
 	ifra = &ifctx->iareq;
 
 	/*
 	 * Bring up the interface.
 	 *
 	 * Get the old interface flags and or IFF_UP into them; if
 	 * IFF_UP set blindly, interface selection can be clobbered.
 	 */
 	error = ifioctl(bootp_so, SIOCGIFFLAGS, (caddr_t)ifr, td);
 	if (error != 0)
 		panic("%s: SIOCGIFFLAGS, error=%d", __func__, error);
 	ifr->ifr_flags |= IFF_UP;
 	error = ifioctl(bootp_so, SIOCSIFFLAGS, (caddr_t)ifr, td);
 	if (error != 0)
 		panic("%s: SIOCSIFFLAGS, error=%d", __func__, error);
 
 	/*
 	 * Do enough of ifconfig(8) so that the chosen interface
 	 * can talk to the servers. Set address to 0.0.0.0/8 and
 	 * broadcast address to local broadcast.
 	 */
 	sin = (struct sockaddr_in *)&ifra->ifra_addr;
 	clear_sinaddr(sin);
 	sin = (struct sockaddr_in *)&ifra->ifra_mask;
 	clear_sinaddr(sin);
 	sin->sin_addr.s_addr = htonl(IN_CLASSA_NET);
 	sin = (struct sockaddr_in *)&ifra->ifra_broadaddr;
 	clear_sinaddr(sin);
 	sin->sin_addr.s_addr = htonl(INADDR_BROADCAST);
 	error = ifioctl(bootp_so, SIOCAIFADDR, (caddr_t)ifra, td);
 	if (error != 0)
 		panic("%s: SIOCAIFADDR, error=%d", __func__, error);
 }
 
 static void
 bootpc_shutdown_interface(struct bootpc_ifcontext *ifctx, struct thread *td)
 {
 	struct ifreq *ifr;
 	struct sockaddr_in *sin;
 	int error;
 
 	ifr = &ifctx->ireq;
 
 	printf("Shutdown interface %s\n", ifctx->ireq.ifr_name);
 	error = ifioctl(bootp_so, SIOCGIFFLAGS, (caddr_t)ifr, td);
 	if (error != 0)
 		panic("%s: SIOCGIFFLAGS, error=%d", __func__, error);
 	ifr->ifr_flags &= ~IFF_UP;
 	error = ifioctl(bootp_so, SIOCSIFFLAGS, (caddr_t)ifr, td);
 	if (error != 0)
 		panic("%s: SIOCSIFFLAGS, error=%d", __func__, error);
 
 	sin = (struct sockaddr_in *) &ifr->ifr_addr;
 	clear_sinaddr(sin);
 	error = ifioctl(bootp_so, SIOCDIFADDR, (caddr_t) ifr, td);
 	if (error != 0)
 		panic("%s: SIOCDIFADDR, error=%d", __func__, error);
 }
 
 /*
  * Apply the negotiated configuration to the interface of @ifctx.
  *
  * An interface that is not resolved is shut down and 0 is returned.
  * Otherwise the temporary 0.0.0.0 address is deleted, the address and
  * netmask stored in @ifctx are installed with a broadcast address derived
  * from them, and - when this interface got a gateway or no gateway is
  * recorded in @gctx - a default route via ifctx->gw is added to the
  * default fib.
  *
  * Returns 0 on success or the error from rtrequest_fib(); ioctl failures
  * panic.  'ifr' and 'ifra' point into the same union inside @ifctx, so the
  * delete request has to be issued before the alias request is filled in.
  */
 static int
 bootpc_adjust_interface(struct bootpc_ifcontext *ifctx,
     struct bootpc_globalcontext *gctx, struct thread *td)
 {
 	int error;
 	struct sockaddr_in defdst;
 	struct sockaddr_in defmask;
 	struct sockaddr_in *sin;
 	struct ifreq *ifr;
 	struct in_aliasreq *ifra;
 	struct sockaddr_in *myaddr;
 	struct sockaddr_in *netmask;
 	struct sockaddr_in *gw;
 
 	ifr = &ifctx->ireq;
 	ifra = &ifctx->iareq;
 	myaddr = &ifctx->myaddr;
 	netmask = &ifctx->netmask;
 	gw = &ifctx->gw;
 
 	if (bootpc_ifctx_isresolved(ifctx) == 0) {
 		/* Shutdown interfaces where BOOTP failed */
 		bootpc_shutdown_interface(ifctx, td);
 		return (0);
 	}
 
 	printf("Adjusted interface %s\n", ifctx->ireq.ifr_name);
 	/*
 	 * Do enough of ifconfig(8) so that the chosen interface
 	 * can talk to the servers.  (just set the address)
 	 */
 	sin = (struct sockaddr_in *) &ifr->ifr_addr;
 	clear_sinaddr(sin);
 	error = ifioctl(bootp_so, SIOCDIFADDR, (caddr_t) ifr, td);
 	if (error != 0)
 		panic("%s: SIOCDIFADDR, error=%d", __func__, error);
 
 	bcopy(myaddr, &ifra->ifra_addr, sizeof(*myaddr));
 	bcopy(netmask, &ifra->ifra_mask, sizeof(*netmask));
 	clear_sinaddr(&ifra->ifra_broadaddr);
 	/* Broadcast address: host bits of the own address all set. */
 	ifra->ifra_broadaddr.sin_addr.s_addr = myaddr->sin_addr.s_addr |
 	    ~netmask->sin_addr.s_addr;
 
 	error = ifioctl(bootp_so, SIOCAIFADDR, (caddr_t)ifra, td);
 	if (error != 0)
 		panic("%s: SIOCAIFADDR, error=%d", __func__, error);
 
 	/* Add new default route */
 
 	if (ifctx->gotgw != 0 || gctx->gotgw == 0) {
 		clear_sinaddr(&defdst);
 		clear_sinaddr(&defmask);
 		/* XXX MRT just table 0 */
 		error = rtrequest_fib(RTM_ADD,
 		    (struct sockaddr *) &defdst, (struct sockaddr *) gw,
 		    (struct sockaddr *) &defmask,
 		    (RTF_UP | RTF_GATEWAY | RTF_STATIC), NULL, RT_DEFAULT_FIB);
 		if (error != 0) {
 			printf("%s: RTM_ADD, error=%d\n", __func__, error);
 			return (error);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Split a "server-ip:path" or "/path" specification.
  *
  * With a leading dotted-quad address the address is stored in addr and
  * a ':' must follow it.  Without one, the string has to begin with '/'
  * and siaddr (when non-NULL) supplies the server address.  The remaining
  * text is copied into path, limited to MNAMELEN bytes.
  *
  * Returns 1 on success and 0 if the specification is not acceptable.
  */
 static int
 setfs(struct sockaddr_in *addr, char *path, char *p,
     const struct in_addr *siaddr)
 {
 
 	if (getip(&p, &addr->sin_addr) != 0) {
 		/* Explicit server address; skip the separating colon. */
 		if (*p != ':')
 			return 0;
 		p++;
 	} else if (siaddr != NULL && *p == '/') {
 		/* Bare path: fall back to the caller-supplied server. */
 		bcopy(siaddr, &addr->sin_addr, sizeof(struct in_addr));
 	} else {
 		return 0;
 	}
 
 	addr->sin_family = AF_INET;
 	addr->sin_len = sizeof(struct sockaddr_in);
 
 	strlcpy(path, p, MNAMELEN);
 	return 1;
 }
 
 /*
  * Parse a dotted-quad IPv4 address ("a.b.c.d", every part 0..255)
  * starting at *ptr.
  *
  * On success the address is stored in addr in network byte order, *ptr
  * is advanced past the last digit consumed and 1 is returned.  On
  * failure 0 is returned and neither *ptr nor addr is modified.
  */
 static int
 getip(char **ptr, struct in_addr *addr)
 {
 	char *p;
 	unsigned int ip;
 	int val;
 	int i;
 
 	p = *ptr;
 	ip = 0;
 	for (i = 0; i < 4; i++) {
 		/* Every octet after the first is introduced by a dot. */
 		if (i > 0) {
 			if (*p != '.')
 				return 0;
 			p++;
 		}
 		if (((val = getdec(&p)) < 0) || (val > 255))
 			return 0;
 		/*
 		 * Accumulate in unsigned arithmetic: shifting the int
 		 * value 255 left by 24 bits overflows a signed int.
 		 */
 		ip = (ip << 8) | (unsigned int)val;
 	}
 
 	addr->s_addr = htonl(ip);
 	*ptr = p;
 	return 1;
 }
 
 /*
  * Parse an unsigned decimal number at *ptr.
  *
  * Returns -1, leaving *ptr untouched, when *ptr does not start with a
  * digit.  Otherwise consumes every consecutive digit, advances *ptr
  * past them and returns the value.  A value too large for an int is
  * reported as the largest int instead of overflowing, so range checks
  * in callers stay reliable for arbitrarily long digit strings.
  */
 static int
 getdec(char **ptr)
 {
 	/* Largest int value, computed here to avoid a new header. */
 	const int maxval = (int)(~0U >> 1);
 	char *p;
 	int digit;
 	int ret;
 
 	p = *ptr;
 	ret = 0;
 	if ((*p < '0') || (*p > '9'))
 		return -1;
 	while ((*p >= '0') && (*p <= '9')) {
 		digit = *p - '0';
 		/* Saturate instead of overflowing the signed result. */
 		if (ret > (maxval - digit) / 10)
 			ret = maxval;
 		else
 			ret = ret * 10 + digit;
 		p++;
 	}
 	*ptr = p;
 	return ret;
 }
 
 /*
  * Fill args with the defaults used for the BOOTP root mount and, when
  * an option string p is supplied, let nfs_parse_options() adjust them.
  */
 static void
 mountopts(struct nfs_args *args, char *p)
 {
 	args->version = NFS_ARGSVERSION;
 	args->sotype = SOCK_DGRAM;
 	args->flags = NFSMNT_RSIZE | NFSMNT_WSIZE | NFSMNT_RESVPORT;
 	args->rsize = BOOTP_BLOCKSIZE;
 	args->wsize = BOOTP_BLOCKSIZE;
 
 	if (p == NULL)
 		return;
 	nfs_parse_options(p, args);
 }
 
 /*
  * Copy len bytes from the front of the mbuf chain *mptr into buf and
  * trim the consumed data, rounded up to a multiple of four bytes, off
  * the chain.
  *
  * Returns 0 on success.  If m_pullup() fails, *mptr is set to NULL and
  * EBADRPC is returned.
  */
 static int
 xdr_opaque_decode(struct mbuf **mptr, u_char *buf, int len)
 {
 	struct mbuf *mb;
 	int padded;
 
 	padded = (len + 3) & ~3;
 	mb = *mptr;
 
 	/* Make sure the first mbuf holds the whole padded item. */
 	if (mb->m_len < padded &&
 	    (mb = m_pullup(mb, padded)) == NULL) {
 		*mptr = NULL;
 		return EBADRPC;
 	}
 
 	bcopy(mtod(mb, u_char *), buf, len);
 	m_adj(mb, padded);
 	*mptr = mb;
 	return 0;
 }
 
 /*
  * Decode one 32-bit integer from the front of the mbuf chain *mptr
  * into *iptr.  Returns 0 on success and EBADRPC if the data could not
  * be extracted.
  */
 static int
 xdr_int_decode(struct mbuf **mptr, int *iptr)
 {
 	u_int32_t raw;
 	int rc;
 
 	rc = xdr_opaque_decode(mptr, (u_char *)&raw, sizeof(u_int32_t));
 	if (rc != 0)
 		return EBADRPC;
 
 	*iptr = fxdr_unsigned(u_int32_t, raw);
 	return 0;
 }
 
 /* Print the IPv4 address held in sin in dotted-quad form. */
 static void
 print_sin_addr(struct sockaddr_in *sin)
 {
 	struct in_addr ia;
 
 	ia = sin->sin_addr;
 	print_in_addr(ia);
 }
 
 /*
  * Print addr (stored in network byte order) as four decimal octets
  * separated by dots, without a trailing newline.
  */
 static void
 print_in_addr(struct in_addr addr)
 {
 	unsigned int host;
 	unsigned int o1, o2, o3, o4;
 
 	host = ntohl(addr.s_addr);
 	o1 = host >> 24;
 	o2 = (host >> 16) & 255;
 	o3 = (host >> 8) & 255;
 	o4 = host & 255;
 	printf("%d.%d.%d.%d", o1, o2, o3, o4);
 }
 
 static void
 bootpc_compose_query(struct bootpc_ifcontext *ifctx, struct thread *td)
 {
 	unsigned char *vendp;
 	unsigned char vendor_client[64];
 	uint32_t leasetime;
 	uint8_t vendor_client_len;
 
 	ifctx->gotrootpath = 0;
 
 	bzero((caddr_t) &ifctx->call, sizeof(ifctx->call));
 
 	/* bootpc part */
 	ifctx->call.op = BOOTP_REQUEST; 	/* BOOTREQUEST */
 	ifctx->call.htype = 1;			/* 10mb ethernet */
 	ifctx->call.hlen = ifctx->sdl->sdl_alen;/* Hardware address length */
 	ifctx->call.hops = 0;
 	if (bootpc_ifctx_isunresolved(ifctx) != 0)
 		ifctx->xid++;
 	ifctx->call.xid = txdr_unsigned(ifctx->xid);
 	bcopy(LLADDR(ifctx->sdl), &ifctx->call.chaddr, ifctx->sdl->sdl_alen);
 
 	vendp = ifctx->call.vend;
 	*vendp++ = 99;		/* RFC1048 cookie */
 	*vendp++ = 130;
 	*vendp++ = 83;
 	*vendp++ = 99;
 	*vendp++ = TAG_MAXMSGSIZE;
 	*vendp++ = 2;
 	*vendp++ = (sizeof(struct bootp_packet) >> 8) & 255;
 	*vendp++ = sizeof(struct bootp_packet) & 255;
 
 	snprintf(vendor_client, sizeof(vendor_client), "%s:%s:%s",
 		ostype, MACHINE, osrelease);
 	vendor_client_len = strlen(vendor_client);
 	*vendp++ = TAG_VENDOR_INDENTIFIER;
 	*vendp++ = vendor_client_len;
 	memcpy(vendp, vendor_client, vendor_client_len);
 	vendp += vendor_client_len;
 	ifctx->dhcpquerytype = DHCP_NOMSG;
 	switch (ifctx->state) {
 	case IF_DHCP_UNRESOLVED:
 		*vendp++ = TAG_DHCP_MSGTYPE;
 		*vendp++ = 1;
 		*vendp++ = DHCP_DISCOVER;
 		ifctx->dhcpquerytype = DHCP_DISCOVER;
 		ifctx->gotdhcpserver = 0;
 		break;
 	case IF_DHCP_OFFERED:
 		*vendp++ = TAG_DHCP_MSGTYPE;
 		*vendp++ = 1;
 		*vendp++ = DHCP_REQUEST;
 		ifctx->dhcpquerytype = DHCP_REQUEST;
 		*vendp++ = TAG_DHCP_REQ_ADDR;
 		*vendp++ = 4;
 		memcpy(vendp, &ifctx->reply.yiaddr, 4);
 		vendp += 4;
 		if (ifctx->gotdhcpserver != 0) {
 			*vendp++ = TAG_DHCP_SERVERID;
 			*vendp++ = 4;
 			memcpy(vendp, &ifctx->dhcpserver, 4);
 			vendp += 4;
 		}
 		*vendp++ = TAG_DHCP_LEASETIME;
 		*vendp++ = 4;
 		leasetime = htonl(300);
 		memcpy(vendp, &leasetime, 4);
 		vendp += 4;
 		break;
 	default:
 		break;
 	}
 	*vendp = TAG_END;
 
 	ifctx->call.secs = 0;
 	ifctx->call.flags = htons(0x8000); /* We need a broadcast answer */
 }
 
 /*
  * Return 1 when the vendor area of bp starts with the four cookie bytes
  * 99, 130, 83, 99 and 0 otherwise.
  */
 static int
 bootpc_hascookie(struct bootp_packet *bp)
 {
 
 	if (bp->vend[0] != 99)
 		return (0);
 	if (bp->vend[1] != 130)
 		return (0);
 	if (bp->vend[2] != 83)
 		return (0);
 	if (bp->vend[3] != 99)
 		return (0);
 	return (1);
 }
 
 /*
  * Scan one option area (len bytes starting at start) for option `tag'.
  *
  * Every occurrence of the tag has its payload appended to tctx->buf and
  * sets tctx->foundopt.  tctx->badopt is set when an option runs past
  * the end of the area, and tctx->badtag when the collected payload
  * would exceed TAG_MAXLEN.  The first payload byte of a TAG_OVERLOAD
  * option is recorded in tctx->overload.  Scanning stops at TAG_END or at
  * the end of the area; nothing is done if an earlier call already
  * flagged an error in tctx.
  */
 static void
 bootpc_tag_helper(struct bootpc_tagcontext *tctx,
     unsigned char *start, int len, int tag)
 {
 	unsigned char *j;
 	unsigned char *ej;
 	unsigned char code;
 
 	if (tctx->badtag != 0 || tctx->badopt != 0)
 		return;
 
 	j = start;
 	ej = j + len;
 
 	while (j < ej) {
 		code = *j++;
 		if (code == TAG_PAD)
 			continue;
 		if (code == TAG_END)
 			return;
 		/*
 		 * The length byte and the payload it announces must both
 		 * fit.  Compare against the remaining byte count rather
 		 * than forming a pointer beyond the end of the area.
 		 */
 		if (j >= ej || *j + 1 > ej - j) {
 			tctx->badopt = 1;
 			return;
 		}
 		len = *j++;
 		if (code == tag) {
 			if (tctx->taglen + len > TAG_MAXLEN) {
 				tctx->badtag = 1;
 				return;
 			}
 			tctx->foundopt = 1;
 			if (len > 0)
 				memcpy(tctx->buf + tctx->taglen,
 				       j, len);
 			tctx->taglen += len;
 		}
 		/*
 		 * Only read the overload flags when the option actually
 		 * has a payload; with a zero length *j belongs to the next
 		 * option or lies beyond the area.
 		 */
 		if (code == TAG_OVERLOAD && len > 0)
 			tctx->overload = *j;
 
 		j += len;
 	}
 }
 
 /*
  * Look up option `tag' in the BOOTP packet bp of total length len.
  *
  * The vendor area after the cookie is searched first, followed by the
  * file and sname fields when the overload flags collected so far say
  * they carry options.  tctx is reset on entry and receives the result.
  *
  * Returns tctx->buf, NUL-terminated, with the length in tctx->taglen,
  * or NULL when the packet has no cookie, the option is absent, or the
  * option data is malformed.
  */
 static unsigned char *
 bootpc_tag(struct bootpc_tagcontext *tctx,
     struct bootp_packet *bp, int len, int tag)
 {
 	unsigned char *opts;
 
 	tctx->taglen = 0;
 	tctx->foundopt = 0;
 	tctx->badtag = 0;
 	tctx->badopt = 0;
 	tctx->overload = 0;
 
 	if (bootpc_hascookie(bp) == 0)
 		return NULL;
 
 	/* Options start right after the four cookie bytes. */
 	opts = &bp->vend[4];
 	bootpc_tag_helper(tctx, opts,
 			  (unsigned char *) bp + len - opts, tag);
 
 	if ((tctx->overload & OVERLOAD_FILE) != 0)
 		bootpc_tag_helper(tctx, (unsigned char *) bp->file,
 				  sizeof(bp->file), tag);
 	if ((tctx->overload & OVERLOAD_SNAME) != 0)
 		bootpc_tag_helper(tctx, (unsigned char *) bp->sname,
 				  sizeof(bp->sname), tag);
 
 	if (tctx->badopt != 0)
 		return NULL;
 	if (tctx->badtag != 0)
 		return NULL;
 	if (tctx->foundopt == 0)
 		return NULL;
 
 	tctx->buf[tctx->taglen] = '\0';
 	return tctx->buf;
 }
 
 /*
  * Decode the BOOTP/DHCP reply stored in ifctx->reply.
  *
  * Fills in ifctx->myaddr, ifctx->netmask and ifctx->gw.  The root
  * filesystem location and mount options (stored in *nd) and the host
  * name are taken only from the first interface that supplies them;
  * later offers are printed as "(ignored)".  A cookie option, if present,
  * is copied into bootp_cookie.  Progress is reported with printf().
  *
  * A subnet mask whose length is not 4, a router option whose length is
  * not a multiple of 4, an over-long host name and a root path rejected
  * by setfs() all panic.
  *
  * When the reply carries no netmask a classful default is derived from
  * the address; when it carries no router, the interface's own address
  * is used as the gateway.
  */
 static void
 bootpc_decode_reply(struct nfsv3_diskless *nd, struct bootpc_ifcontext *ifctx,
     struct bootpc_globalcontext *gctx)
 {
 	char *p, *s;
 	char *envroot;	/* strdup()ed root path from the environment */
 
 	ifctx->gotgw = 0;
 	ifctx->gotnetmask = 0;
 
 	clear_sinaddr(&ifctx->myaddr);
 	clear_sinaddr(&ifctx->netmask);
 	clear_sinaddr(&ifctx->gw);
 
 	ifctx->myaddr.sin_addr = ifctx->reply.yiaddr;
 
 	printf("%s at ", ifctx->ireq.ifr_name);
 	print_sin_addr(&ifctx->myaddr);
 	printf(" server ");
 	print_in_addr(ifctx->reply.siaddr);
 
 	ifctx->gw.sin_addr = ifctx->reply.giaddr;
 	if (ifctx->reply.giaddr.s_addr != htonl(INADDR_ANY)) {
 		printf(" via gateway ");
 		print_in_addr(ifctx->reply.giaddr);
 	}
 
 	/* This call used for the side effect (overload flag) */
 	(void) bootpc_tag(&gctx->tmptag,
 			  &ifctx->reply, ifctx->replylen, TAG_END);
 
 	/* sname/file are only text when they are not used for options. */
 	if ((gctx->tmptag.overload & OVERLOAD_SNAME) == 0)
 		if (ifctx->reply.sname[0] != '\0')
 			printf(" server name %s", ifctx->reply.sname);
 	if ((gctx->tmptag.overload & OVERLOAD_FILE) == 0)
 		if (ifctx->reply.file[0] != '\0')
 			printf(" boot file %s", ifctx->reply.file);
 
 	printf("\n");
 
 	p = bootpc_tag(&gctx->tag, &ifctx->reply, ifctx->replylen,
 		       TAG_SUBNETMASK);
 	if (p != NULL) {
 		if (gctx->tag.taglen != 4)
 			panic("bootpc: subnet mask len is %d",
 			      gctx->tag.taglen);
 		bcopy(p, &ifctx->netmask.sin_addr, 4);
 		ifctx->gotnetmask = 1;
 		printf("subnet mask ");
 		print_sin_addr(&ifctx->netmask);
 		printf(" ");
 	}
 
 	p = bootpc_tag(&gctx->tag, &ifctx->reply, ifctx->replylen,
 		       TAG_ROUTERS);
 	if (p != NULL) {
 		/* Routers */
 		if (gctx->tag.taglen % 4)
 			panic("bootpc: Router Len is %d", gctx->tag.taglen);
 		if (gctx->tag.taglen > 0) {
 			/* Only the first router listed is used. */
 			bcopy(p, &ifctx->gw.sin_addr, 4);
 			printf("router ");
 			print_sin_addr(&ifctx->gw);
 			printf(" ");
 			ifctx->gotgw = 1;
 			gctx->gotgw = 1;
 		}
 	}
 
 	/*
 	 * Choose a root filesystem.  If a value is forced in the environment
 	 * and it contains "nfs:", use it unconditionally.  Otherwise, if the
 	 * kernel is compiled with the ROOTDEVNAME option, then use it if:
 	 *  - The server doesn't provide a pathname.
 	 *  - The boothowto flags include RB_DFLTROOT (user said to override
 	 *    the server value).
 	 */
 	p = NULL;
 	envroot = NULL;
 	if ((s = kern_getenv("vfs.root.mountfrom")) != NULL) {
 		if ((p = strstr(s, "nfs:")) != NULL) {
 			/* Remember the copy so it can be released below. */
 			envroot = strdup(p + 4, M_TEMP);
 			p = envroot;
 		}
 		freeenv(s);
 	}
 	if (p == NULL) {
 		p = bootpc_tag(&gctx->tag, &ifctx->reply, ifctx->replylen,
 		       TAG_ROOT);
 	}
 #ifdef ROOTDEVNAME
 	if ((p == NULL || (boothowto & RB_DFLTROOT) != 0) && 
 	    (p = strstr(ROOTDEVNAME, "nfs:")) != NULL) {
 		p += 4;
 	}
 #endif
 	if (p != NULL) {
 		if (gctx->setrootfs != NULL) {
 			printf("rootfs %s (ignored) ", p);
 		} else 	if (setfs(&nd->root_saddr,
 				  nd->root_hostnam, p, &ifctx->reply.siaddr)) {
 			if (*p == '/') {
 				printf("root_server ");
 				print_sin_addr(&nd->root_saddr);
 				printf(" ");
 			}
 			printf("rootfs %s ", p);
 			gctx->gotrootpath = 1;
 			ifctx->gotrootpath = 1;
 			gctx->setrootfs = ifctx;
 
 			p = bootpc_tag(&gctx->tag, &ifctx->reply,
 				       ifctx->replylen,
 				       TAG_ROOTOPTS);
 			if (p != NULL) {
 				mountopts(&nd->root_args, p);
 				printf("rootopts %s ", p);
 			}
 		} else
 			panic("Failed to set rootfs to %s", p);
 	}
 	/*
 	 * setfs() copied what it needed, so the environment copy is no
 	 * longer referenced.  free(9) accepts a NULL pointer.
 	 */
 	free(envroot, M_TEMP);
 
 	p = bootpc_tag(&gctx->tag, &ifctx->reply, ifctx->replylen,
 		       TAG_HOSTNAME);
 	if (p != NULL) {
 		if (gctx->tag.taglen >= MAXHOSTNAMELEN)
 			panic("bootpc: hostname >= %d bytes",
 			      MAXHOSTNAMELEN);
 		if (gctx->sethostname != NULL) {
 			printf("hostname %s (ignored) ", p);
 		} else {
 			strcpy(nd->my_hostnam, p);
 			mtx_lock(&prison0.pr_mtx);
 			strcpy(prison0.pr_hostname, p);
 			mtx_unlock(&prison0.pr_mtx);
 			printf("hostname %s ", p);
 			gctx->sethostname = ifctx;
 		}
 	}
 	p = bootpc_tag(&gctx->tag, &ifctx->reply, ifctx->replylen,
 			TAG_COOKIE);
 	if (p != NULL) {        /* store in a sysctl variable */
 		int i, l = sizeof(bootp_cookie) - 1;
 		for (i = 0; i < l && p[i] != '\0'; i++)
 			bootp_cookie[i] = p[i];
 		/*
 		 * Terminate the destination, not the source, so a shorter
 		 * cookie from a later reply does not keep a stale tail.
 		 */
 		bootp_cookie[i] = '\0';
 	}
 
 
 	printf("\n");
 
 	if (ifctx->gotnetmask == 0) {
 		/* No mask in the reply: fall back to the classful one. */
 		if (IN_CLASSA(ntohl(ifctx->myaddr.sin_addr.s_addr)))
 			ifctx->netmask.sin_addr.s_addr = htonl(IN_CLASSA_NET);
 		else if (IN_CLASSB(ntohl(ifctx->myaddr.sin_addr.s_addr)))
 			ifctx->netmask.sin_addr.s_addr = htonl(IN_CLASSB_NET);
 		else
 			ifctx->netmask.sin_addr.s_addr = htonl(IN_CLASSC_NET);
 	}
 	if (ifctx->gotgw == 0) {
 		/* Use proxyarp */
 		ifctx->gw.sin_addr.s_addr = ifctx->myaddr.sin_addr.s_addr;
 	}
 }
 
 void
 bootpc_init(void)
 {
 	struct bootpc_ifcontext *ifctx;		/* Interface BOOTP contexts */
 	struct bootpc_globalcontext *gctx; 	/* Global BOOTP context */
 	struct ifnet *ifp;
 	struct sockaddr_dl *sdl;
 	struct ifaddr *ifa;
 	int error;
 #ifndef BOOTP_WIRED_TO
 	int ifcnt;
 #endif
 	struct nfsv3_diskless *nd;
 	struct thread *td;
 	int timeout;
 	int delay;
 
 	timeout = BOOTP_IFACE_WAIT_TIMEOUT * hz;
 	delay = hz / 10;
 
 	nd = &nfsv3_diskless;
 	td = curthread;
 
 	/*
 	 * If already filled in, don't touch it here
 	 */
 	if (nfs_diskless_valid != 0)
 		return;
 
 	gctx = malloc(sizeof(*gctx), M_TEMP, M_WAITOK | M_ZERO);
 	STAILQ_INIT(&gctx->interfaces);
 	gctx->xid = ~0xFFFF;
 	gctx->starttime = time_second;
 
 	/*
 	 * If ROOTDEVNAME is defined or vfs.root.mountfrom is set then we have
 	 * root-path overrides that can potentially let us boot even if we don't
 	 * get a root path from the server, so we can treat that as a non-error.
 	 */
 #ifdef ROOTDEVNAME
 	gctx->any_root_overrides = 1;
 #else
 	gctx->any_root_overrides = testenv("vfs.root.mountfrom");
 #endif
 
 	/*
 	 * Find a network interface.
 	 */
 	CURVNET_SET(TD_TO_VNET(td));
 #ifdef BOOTP_WIRED_TO
 	printf("%s: wired to interface '%s'\n", __func__, 
 	       __XSTRING(BOOTP_WIRED_TO));
 	allocifctx(gctx);
 #else
 	/*
 	 * Preallocate interface context storage, if another interface
 	 * attaches and wins the race, it won't be eligible for bootp.
 	 */
 	ifcnt = 0;
 	IFNET_RLOCK();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if ((ifp->if_flags &
 		     (IFF_LOOPBACK | IFF_POINTOPOINT | IFF_BROADCAST)) !=
 		    IFF_BROADCAST)
 			continue;
 		switch (ifp->if_alloctype) {
 			case IFT_ETHER:
 			case IFT_FDDI:
 			case IFT_ISO88025:
 				break;
 			default:
 				continue;
 		}
 		ifcnt++;
 	}
 	IFNET_RUNLOCK();
 	if (ifcnt == 0)
 		panic("%s: no eligible interfaces", __func__);
 	for (; ifcnt > 0; ifcnt--)
 		allocifctx(gctx);
 #endif
 
 retry:
 	ifctx = STAILQ_FIRST(&gctx->interfaces);
 	IFNET_RLOCK();
 	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (ifctx == NULL)
 			break;
 #ifdef BOOTP_WIRED_TO
 		if (strcmp(ifp->if_xname, __XSTRING(BOOTP_WIRED_TO)) != 0)
 			continue;
 #else
 		if ((ifp->if_flags &
 		     (IFF_LOOPBACK | IFF_POINTOPOINT | IFF_BROADCAST)) !=
 		    IFF_BROADCAST)
 			continue;
 		switch (ifp->if_alloctype) {
 			case IFT_ETHER:
 			case IFT_FDDI:
 			case IFT_ISO88025:
 				break;
 			default:
 				continue;
 		}
 #endif
 		strlcpy(ifctx->ireq.ifr_name, ifp->if_xname,
 		    sizeof(ifctx->ireq.ifr_name));
 		ifctx->ifp = ifp;
 
 		/* Get HW address */
 		sdl = NULL;
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 			if (ifa->ifa_addr->sa_family == AF_LINK) {
 				sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 				if (sdl->sdl_type == IFT_ETHER)
 					break;
 			}
 		if (sdl == NULL)
 			panic("bootpc: Unable to find HW address for %s",
 			    ifctx->ireq.ifr_name);
 		ifctx->sdl = sdl;
 
 		ifctx = STAILQ_NEXT(ifctx, next);
 	}
 	IFNET_RUNLOCK();
 	CURVNET_RESTORE();
 
 	if (STAILQ_EMPTY(&gctx->interfaces) ||
 	    STAILQ_FIRST(&gctx->interfaces)->ifp == NULL) {
 		if (timeout > 0) {
 			pause("bootpc", delay);
 			timeout -= delay;
 			goto retry;
 		}
 #ifdef BOOTP_WIRED_TO
 		panic("%s: Could not find interface specified "
 		      "by BOOTP_WIRED_TO: "
 		      __XSTRING(BOOTP_WIRED_TO), __func__);
 #else
 		panic("%s: no suitable interface", __func__);
 #endif
 	}
 
 	error = socreate(AF_INET, &bootp_so, SOCK_DGRAM, 0, td->td_ucred, td);
 	if (error != 0)
 		panic("%s: socreate, error=%d", __func__, error);
 
 	STAILQ_FOREACH(ifctx, &gctx->interfaces, next)
 		bootpc_fakeup_interface(ifctx, td);
 
 	STAILQ_FOREACH(ifctx, &gctx->interfaces, next)
 		bootpc_compose_query(ifctx, td);
 
 	error = bootpc_call(gctx, td);
 	if (error != 0) {
 		printf("BOOTP call failed\n");
 	}
 
 	mountopts(&nd->root_args, NULL);
 
 	STAILQ_FOREACH(ifctx, &gctx->interfaces, next)
 		if (bootpc_ifctx_isresolved(ifctx) != 0)
 			bootpc_decode_reply(nd, ifctx, gctx);
 
 #ifdef BOOTP_NFSROOT
 	if (gctx->gotrootpath == 0 && gctx->any_root_overrides == 0)
 		panic("bootpc: No root path offered");
 #endif
 
 	STAILQ_FOREACH(ifctx, &gctx->interfaces, next)
 		bootpc_adjust_interface(ifctx, gctx, td);
 
 	soclose(bootp_so);
 
 	STAILQ_FOREACH(ifctx, &gctx->interfaces, next)
 		if (ifctx->gotrootpath != 0)
 			break;
 	if (ifctx == NULL) {
 		STAILQ_FOREACH(ifctx, &gctx->interfaces, next)
 			if (bootpc_ifctx_isresolved(ifctx) != 0)
 				break;
 	}
 	if (ifctx == NULL)
 		goto out;
 
 	if (gctx->gotrootpath != 0) {
 
 		kern_setenv("boot.netif.name", ifctx->ifp->if_xname);
 
 		error = md_mount(&nd->root_saddr, nd->root_hostnam,
 				 nd->root_fh, &nd->root_fhsize,
 				 &nd->root_args, td);
 		if (error != 0) {
 			if (gctx->any_root_overrides == 0)
 				panic("nfs_boot: mount root, error=%d", error);
 			else
 				goto out;
 		}
 		rootdevnames[0] = "nfs:";
 #ifdef NFSCLIENT
 		rootdevnames[1] = "oldnfs:";
 #endif
 		nfs_diskless_valid = 3;
 	}
 
 	strcpy(nd->myif.ifra_name, ifctx->ireq.ifr_name);
 	bcopy(&ifctx->myaddr, &nd->myif.ifra_addr, sizeof(ifctx->myaddr));
 	bcopy(&ifctx->myaddr, &nd->myif.ifra_broadaddr, sizeof(ifctx->myaddr));
 	((struct sockaddr_in *) &nd->myif.ifra_broadaddr)->sin_addr.s_addr =
 		ifctx->myaddr.sin_addr.s_addr |
 		~ ifctx->netmask.sin_addr.s_addr;
 	bcopy(&ifctx->netmask, &nd->myif.ifra_mask, sizeof(ifctx->netmask));
 
 out:
 	while((ifctx = STAILQ_FIRST(&gctx->interfaces)) != NULL) {
 		STAILQ_REMOVE_HEAD(&gctx->interfaces, next);
 		free(ifctx, M_TEMP);
 	}
 	free(gctx, M_TEMP);
 }
 
 /*
  * RPC: mountd/mount
  * Given a server pathname, get an NFS file handle.
  * Also, sets mdsin->sin_port to the NFS service port.
  *
  * mdsin	server address; its sin_port is overwritten by each
  *		krpc_portmap() call made here.
  * path	pathname sent to mountd.
  * fhp		receives the file handle bytes.
  * fhsizep	receives the file handle size (NFSX_V2FH for a v2 mount).
  * args	NFSMNT_NFSV3 is set in args->flags when the v3 mount call
  *		succeeded (only attempted when BOOTP_NFSV3 is defined).
  *
  * Returns 0 on success, the krpc error if a portmap or mount call fails,
  * or EBADRPC when the reply is malformed, reports a non-zero status, or
  * (v3) does not list AUTH_UNIX among its authentication flavors.
  */
 static int
 md_mount(struct sockaddr_in *mdsin, char *path, u_char *fhp, int *fhsizep,
     struct nfs_args *args, struct thread *td)
 {
 	struct mbuf *m;
 	int error;
 	int authunixok;
 	int authcount;
 	int authver;
 
 #define	RPCPROG_MNT	100005
 #define	RPCMNT_VER1	1
 #define RPCMNT_VER3	3
 #define	RPCMNT_MOUNT	1
 #define	AUTH_SYS	1		/* unix style (uid, gids) */
 #define AUTH_UNIX	AUTH_SYS
 
 	/* XXX honor v2/v3 flags in args->flags? */
 #ifdef BOOTP_NFSV3
 	/* First try NFS v3 */
 	/* Get port number for MOUNTD. */
 	error = krpc_portmap(mdsin, RPCPROG_MNT, RPCMNT_VER3,
 			     &mdsin->sin_port, td);
 	if (error == 0) {
 		m = xdr_string_encode(path, strlen(path));
 
 		/* Do RPC to mountd. */
 		error = krpc_call(mdsin, RPCPROG_MNT, RPCMNT_VER3,
 				  RPCMNT_MOUNT, &m, NULL, td);
 	}
 	if (error == 0) {
 		args->flags |= NFSMNT_NFSV3;
 	} else {
 #endif
 		/* Fallback to NFS v2 */
 
 		/* Get port number for MOUNTD. */
 		error = krpc_portmap(mdsin, RPCPROG_MNT, RPCMNT_VER1,
 				     &mdsin->sin_port, td);
 		if (error != 0)
 			return error;
 
 		m = xdr_string_encode(path, strlen(path));
 
 		/* Do RPC to mountd. */
 		error = krpc_call(mdsin, RPCPROG_MNT, RPCMNT_VER1,
 				  RPCMNT_MOUNT, &m, NULL, td);
 		if (error != 0)
 			return error;	/* message already freed */
 
 #ifdef BOOTP_NFSV3
 	}
 #endif
 
 	/*
 	 * The first word of the reply is decoded into error; a non-zero
 	 * value is reported to the caller as EBADRPC, not as that value.
 	 */
 	if (xdr_int_decode(&m, &error) != 0 || error != 0)
 		goto bad;
 
 	/* v3 replies carry an explicit handle size; v2 handles are fixed. */
 	if ((args->flags & NFSMNT_NFSV3) != 0) {
 		if (xdr_int_decode(&m, fhsizep) != 0 ||
 		    *fhsizep > NFSX_V3FHMAX ||
 		    *fhsizep <= 0)
 			goto bad;
 	} else
 		*fhsizep = NFSX_V2FH;
 
 	if (xdr_opaque_decode(&m, fhp, *fhsizep) != 0)
 		goto bad;
 
 	/*
 	 * For v3, walk the list of authentication flavors that follows
 	 * the handle and require AUTH_UNIX to be among them.  The count is
 	 * bounded to 0..100 before the loop runs.
 	 */
 	if (args->flags & NFSMNT_NFSV3) {
 		if (xdr_int_decode(&m, &authcount) != 0)
 			goto bad;
 		authunixok = 0;
 		if (authcount < 0 || authcount > 100)
 			goto bad;
 		while (authcount > 0) {
 			if (xdr_int_decode(&m, &authver) != 0)
 				goto bad;
 			if (authver == AUTH_UNIX)
 				authunixok = 1;
 			authcount--;
 		}
 		if (authunixok == 0)
 			goto bad;
 	}
 
 	/* Set port number for NFS use. */
 	error = krpc_portmap(mdsin, NFS_PROG,
 			     (args->flags &
 			      NFSMNT_NFSV3) ? NFS_VER3 : NFS_VER2,
 			     &mdsin->sin_port, td);
 
 	goto out;
 
 bad:
 	error = EBADRPC;
 
 out:
 	/* m may be NULL here if a decode step lost the chain. */
 	m_freem(m);
 	return error;
 }
 
 /*
  * Register bootpc_init() as a SYSINIT under the name bootp_rootconf, in
  * subsystem SI_SUB_ROOT_CONF with order SI_ORDER_FIRST.
  */
 SYSINIT(bootp_rootconf, SI_SUB_ROOT_CONF, SI_ORDER_FIRST, bootpc_init, NULL);