Index: head/sys/net/route.c
===================================================================
--- head/sys/net/route.c	(revision 297224)
+++ head/sys/net/route.c	(revision 297225)
@@ -1,2293 +1,2304 @@
 /*-
  * Copyright (c) 1980, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)route.c	8.3.1.1 (Berkeley) 2/23/95
  * $FreeBSD$
  */
 /************************************************************************
  * Note: In this file a 'fib' is a "forwarding information base"	*
  * Which is the new name for an in kernel routing (next hop) table.	*
  ***********************************************************************/
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_route.h"
 #include "opt_sctp.h"
 #include "opt_mrouting.h"
 #include "opt_mpath.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
 #include <sys/proc.h>
 #include <sys/domain.h>
 #include <sys/kernel.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/route_var.h>
 #include <net/vnet.h>
 #include <net/flowtable.h>
 
 #ifdef RADIX_MPATH
 #include <net/radix_mpath.h>
 #endif
 
 #include <netinet/in.h>
 #include <netinet/ip_mroute.h>
 
 #include <vm/uma.h>
 
 #define	RT_MAXFIBS	UINT16_MAX
 
 /* Kernel config default option. */
 #ifdef ROUTETABLES
 #if ROUTETABLES <= 0
 #error "ROUTETABLES defined too low"
 #endif
 #if ROUTETABLES > RT_MAXFIBS
 #error "ROUTETABLES defined too big"
 #endif
 #define	RT_NUMFIBS	ROUTETABLES
 #endif /* ROUTETABLES */
 /* Initialize to default if not otherwise set. */
 #ifndef	RT_NUMFIBS
 #define	RT_NUMFIBS	1
 #endif
 
 #if defined(INET) || defined(INET6)
 #ifdef SCTP
 extern void sctp_addr_change(struct ifaddr *ifa, int cmd);
 #endif /* SCTP */
 #endif
 
 
 /* This is read-only.. */
 u_int rt_numfibs = RT_NUMFIBS;
 SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RDTUN, &rt_numfibs, 0, "");
 
 /*
  * By default add routes to all fibs for new interfaces.
  * Once this is set to 0 then only allocate routes on interface
  * changes for the FIB of the caller when adding a new set of addresses
  * to an interface.  XXX this is a shotgun aproach to a problem that needs
  * a more fine grained solution.. that will come.
  * XXX also has the problems getting the FIB from curthread which will not
  * always work given the fib can be overridden and prefixes can be added
  * from the network stack context.
  */
 VNET_DEFINE(u_int, rt_add_addr_allfibs) = 1;
 SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RWTUN | CTLFLAG_VNET,
     &VNET_NAME(rt_add_addr_allfibs), 0, "");
 
 VNET_DEFINE(struct rtstat, rtstat);
 #define	V_rtstat	VNET(rtstat)
 
 VNET_DEFINE(struct rib_head *, rt_tables);
 #define	V_rt_tables	VNET(rt_tables)
 
 VNET_DEFINE(int, rttrash);		/* routes not in table but not freed */
 #define	V_rttrash	VNET(rttrash)
 
 
 /*
  * Convert a 'struct radix_node *' to a 'struct rtentry *'.
  * The operation can be done safely (in this code) because a
  * 'struct rtentry' starts with two 'struct radix_node''s, the first
  * one representing leaf nodes in the routing tree, which is
  * what the code in radix.c passes us as a 'struct radix_node'.
  *
  * But because there are a lot of assumptions in this conversion,
  * do not cast explicitly, but always use the macro below.
  */
 #define RNTORT(p)	((struct rtentry *)(p))
 
 static VNET_DEFINE(uma_zone_t, rtzone);		/* Routing table UMA zone. */
 #define	V_rtzone	VNET(rtzone)
 
 static int rtrequest1_fib_change(struct rib_head *, struct rt_addrinfo *,
     struct rtentry **, u_int);
 static void rt_setmetrics(const struct rt_addrinfo *, struct rtentry *);
 static int rt_ifdelroute(const struct rtentry *rt, void *arg);
 static struct rtentry *rt_unlinkrte(struct rib_head *rnh,
     struct rt_addrinfo *info, int *perror);
 static void rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info);
 #ifdef RADIX_MPATH
 static struct radix_node *rt_mpath_unlink(struct rib_head *rnh,
     struct rt_addrinfo *info, struct rtentry *rto, int *perror);
 #endif
 static int rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info,
     int flags);
 
 struct if_mtuinfo
 {
 	struct ifnet	*ifp;
 	int		mtu;
 };
 
 static int	if_updatemtu_cb(struct radix_node *, void *);
 
 /*
  * handler for net.my_fibnum
  */
 static int
 sysctl_my_fibnum(SYSCTL_HANDLER_ARGS)
 {
         int fibnum;
         int error;
  
         fibnum = curthread->td_proc->p_fibnum;
         error = sysctl_handle_int(oidp, &fibnum, 0, req);
         return (error);
 }
 
 SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD,
             NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller");
 
 static __inline struct rib_head **
 rt_tables_get_rnh_ptr(int table, int fam)
 {
 	struct rib_head **rnh;
 
 	KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds.",
 	    __func__));
 	KASSERT(fam >= 0 && fam < (AF_MAX+1), ("%s: fam out of bounds.",
 	    __func__));
 
 	/* rnh is [fib=0][af=0]. */
 	rnh = (struct rib_head **)V_rt_tables;
 	/* Get the offset to the requested table and fam. */
 	rnh += table * (AF_MAX+1) + fam;
 
 	return (rnh);
 }
 
 struct rib_head *
 rt_tables_get_rnh(int table, int fam)
 {
 
 	return (*rt_tables_get_rnh_ptr(table, fam));
 }
 
+rt_gen_t
+rt_tables_get_gen(int table, int fam)
+{
+	struct rib_head *rnh;
+
+	rnh = *rt_tables_get_rnh_ptr(table, fam);
+	return (rnh->rnh_gen);
+}
+
+
 /*
  * route initialization must occur before ip6_init2(), which happenas at
  * SI_ORDER_MIDDLE.
  */
 static void
 route_init(void)
 {
 
 	/* whack the tunable ints into  line. */
 	if (rt_numfibs > RT_MAXFIBS)
 		rt_numfibs = RT_MAXFIBS;
 	if (rt_numfibs == 0)
 		rt_numfibs = 1;
 }
 SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0);
 
 static int
 rtentry_zinit(void *mem, int size, int how)
 {
 	struct rtentry *rt = mem;
 
 	rt->rt_pksent = counter_u64_alloc(how);
 	if (rt->rt_pksent == NULL)
 		return (ENOMEM);
 
 	RT_LOCK_INIT(rt);
 
 	return (0);
 }
 
 static void
 rtentry_zfini(void *mem, int size)
 {
 	struct rtentry *rt = mem;
 
 	RT_LOCK_DESTROY(rt);
 	counter_u64_free(rt->rt_pksent);
 }
 
 static int
 rtentry_ctor(void *mem, int size, void *arg, int how)
 {
 	struct rtentry *rt = mem;
 
 	bzero(rt, offsetof(struct rtentry, rt_endzero));
 	counter_u64_zero(rt->rt_pksent);
 	rt->rt_chain = NULL;
 
 	return (0);
 }
 
 static void
 rtentry_dtor(void *mem, int size, void *arg)
 {
 	struct rtentry *rt = mem;
 
 	RT_UNLOCK_COND(rt);
 }
 
 static void
 vnet_route_init(const void *unused __unused)
 {
 	struct domain *dom;
 	struct rib_head **rnh;
 	int table;
 	int fam;
 
 	V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) *
 	    sizeof(struct rib_head *), M_RTABLE, M_WAITOK|M_ZERO);
 
 	V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry),
 	    rtentry_ctor, rtentry_dtor,
 	    rtentry_zinit, rtentry_zfini, UMA_ALIGN_PTR, 0);
 	for (dom = domains; dom; dom = dom->dom_next) {
 		if (dom->dom_rtattach == NULL)
 			continue;
 
 		for  (table = 0; table < rt_numfibs; table++) {
 			fam = dom->dom_family;
 			if (table != 0 && fam != AF_INET6 && fam != AF_INET)
 				break;
 
 			rnh = rt_tables_get_rnh_ptr(table, fam);
 			if (rnh == NULL)
 				panic("%s: rnh NULL", __func__);
 			dom->dom_rtattach((void **)rnh, 0);
 		}
 	}
 }
 VNET_SYSINIT(vnet_route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
     vnet_route_init, 0);
 
 #ifdef VIMAGE
 static void
 vnet_route_uninit(const void *unused __unused)
 {
 	int table;
 	int fam;
 	struct domain *dom;
 	struct rib_head **rnh;
 
 	for (dom = domains; dom; dom = dom->dom_next) {
 		if (dom->dom_rtdetach == NULL)
 			continue;
 
 		for (table = 0; table < rt_numfibs; table++) {
 			fam = dom->dom_family;
 
 			if (table != 0 && fam != AF_INET6 && fam != AF_INET)
 				break;
 
 			rnh = rt_tables_get_rnh_ptr(table, fam);
 			if (rnh == NULL)
 				panic("%s: rnh NULL", __func__);
 			dom->dom_rtdetach((void **)rnh, 0);
 		}
 	}
 
 	free(V_rt_tables, M_RTABLE);
 	uma_zdestroy(V_rtzone);
 }
 VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
     vnet_route_uninit, 0);
 #endif
 
 struct rib_head *
 rt_table_init(int offset)
 {
 	struct rib_head *rh;
 
 	rh = malloc(sizeof(struct rib_head), M_RTABLE, M_WAITOK | M_ZERO);
 
 	/* TODO: These details should be hidded inside radix.c */
 	/* Init masks tree */
 	rn_inithead_internal(&rh->head, rh->rnh_nodes, offset);
 	rn_inithead_internal(&rh->rmhead.head, rh->rmhead.mask_nodes, 0);
 	rh->head.rnh_masks = &rh->rmhead;
 
 	/* Init locks */
 	rw_init(&rh->rib_lock, "rib head lock");
 
 	/* Finally, set base callbacks */
 	rh->rnh_addaddr = rn_addroute;
 	rh->rnh_deladdr = rn_delete;
 	rh->rnh_matchaddr = rn_match;
 	rh->rnh_lookup = rn_lookup;
 	rh->rnh_walktree = rn_walktree;
 	rh->rnh_walktree_from = rn_walktree_from;
 
 	return (rh);
 }
 
 static int
 rt_freeentry(struct radix_node *rn, void *arg)
 {
 	struct radix_head * const rnh = arg;
 	struct radix_node *x;
 
 	x = (struct radix_node *)rn_delete(rn + 2, NULL, rnh);
 	if (x != NULL)
 		R_Free(x);
 	return (0);
 }
 
 void
 rt_table_destroy(struct rib_head *rh)
 {
 
 	rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head);
 
 	/* Assume table is already empty */
 	rw_destroy(&rh->rib_lock);
 	free(rh, M_RTABLE);
 }
 
 
 #ifndef _SYS_SYSPROTO_H_
 struct setfib_args {
 	int     fibnum;
 };
 #endif
 int
 sys_setfib(struct thread *td, struct setfib_args *uap)
 {
 	if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs)
 		return EINVAL;
 	td->td_proc->p_fibnum = uap->fibnum;
 	return (0);
 }
 
 /*
  * Packet routing routines.
  */
 void
 rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum)
 {
 	struct rtentry *rt;
 
 	if ((rt = ro->ro_rt) != NULL) {
 		if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
 			return;
 		RTFREE(rt);
 		ro->ro_rt = NULL;
 	}
 	ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum);
 	if (ro->ro_rt)
 		RT_UNLOCK(ro->ro_rt);
 }
 
 /*
  * Look up the route that matches the address given
  * Or, at least try.. Create a cloned route if needed.
  *
  * The returned route, if any, is locked.
  */
 struct rtentry *
 rtalloc1(struct sockaddr *dst, int report, u_long ignflags)
 {
 
 	return (rtalloc1_fib(dst, report, ignflags, RT_DEFAULT_FIB));
 }
 
 struct rtentry *
 rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags,
 		    u_int fibnum)
 {
 	struct rib_head *rh;
 	struct radix_node *rn;
 	struct rtentry *newrt;
 	struct rt_addrinfo info;
 	int err = 0, msgtype = RTM_MISS;
 
 	KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum"));
 	rh = rt_tables_get_rnh(fibnum, dst->sa_family);
 	newrt = NULL;
 	if (rh == NULL)
 		goto miss;
 
 	/*
 	 * Look up the address in the table for that Address Family
 	 */
 	RIB_RLOCK(rh);
 	rn = rh->rnh_matchaddr(dst, &rh->head);
 	if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
 		newrt = RNTORT(rn);
 		RT_LOCK(newrt);
 		RT_ADDREF(newrt);
 		RIB_RUNLOCK(rh);
 		return (newrt);
 
 	} else
 		RIB_RUNLOCK(rh);
 	
 	/*
 	 * Either we hit the root or couldn't find any match,
 	 * Which basically means
 	 * "caint get there frm here"
 	 */
 miss:
 	V_rtstat.rts_unreach++;
 
 	if (report) {
 		/*
 		 * If required, report the failure to the supervising
 		 * Authorities.
 		 * For a delete, this is not an error. (report == 0)
 		 */
 		bzero(&info, sizeof(info));
 		info.rti_info[RTAX_DST] = dst;
 		rt_missmsg_fib(msgtype, &info, 0, err, fibnum);
 	}
 	return (newrt);
 }
 
 /*
  * Remove a reference count from an rtentry.
  * If the count gets low enough, take it out of the routing table
  */
 void
 rtfree(struct rtentry *rt)
 {
 	struct rib_head *rnh;
 
 	KASSERT(rt != NULL,("%s: NULL rt", __func__));
 	rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family);
 	KASSERT(rnh != NULL,("%s: NULL rnh", __func__));
 
 	RT_LOCK_ASSERT(rt);
 
 	/*
 	 * The callers should use RTFREE_LOCKED() or RTFREE(), so
 	 * we should come here exactly with the last reference.
 	 */
 	RT_REMREF(rt);
 	if (rt->rt_refcnt > 0) {
 		log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt);
 		goto done;
 	}
 
 	/*
 	 * On last reference give the "close method" a chance
 	 * to cleanup private state.  This also permits (for
 	 * IPv4 and IPv6) a chance to decide if the routing table
 	 * entry should be purged immediately or at a later time.
 	 * When an immediate purge is to happen the close routine
 	 * typically calls rtexpunge which clears the RTF_UP flag
 	 * on the entry so that the code below reclaims the storage.
 	 */
 	if (rt->rt_refcnt == 0 && rnh->rnh_close)
 		rnh->rnh_close((struct radix_node *)rt, &rnh->head);
 
 	/*
 	 * If we are no longer "up" (and ref == 0)
 	 * then we can free the resources associated
 	 * with the route.
 	 */
 	if ((rt->rt_flags & RTF_UP) == 0) {
 		if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT))
 			panic("rtfree 2");
 		/*
 		 * the rtentry must have been removed from the routing table
 		 * so it is represented in rttrash.. remove that now.
 		 */
 		V_rttrash--;
 #ifdef	DIAGNOSTIC
 		if (rt->rt_refcnt < 0) {
 			printf("rtfree: %p not freed (neg refs)\n", rt);
 			goto done;
 		}
 #endif
 		/*
 		 * release references on items we hold them on..
 		 * e.g other routes and ifaddrs.
 		 */
 		if (rt->rt_ifa)
 			ifa_free(rt->rt_ifa);
 		/*
 		 * The key is separatly alloc'd so free it (see rt_setgate()).
 		 * This also frees the gateway, as they are always malloc'd
 		 * together.
 		 */
 		R_Free(rt_key(rt));
 
 		/*
 		 * and the rtentry itself of course
 		 */
 		uma_zfree(V_rtzone, rt);
 		return;
 	}
 done:
 	RT_UNLOCK(rt);
 }
 
 
 /*
  * Force a routing table entry to the specified
  * destination to go through the given gateway.
  * Normally called as a result of a routing redirect
  * message from the network layer.
  */
 void
 rtredirect_fib(struct sockaddr *dst,
 	struct sockaddr *gateway,
 	struct sockaddr *netmask,
 	int flags,
 	struct sockaddr *src,
 	u_int fibnum)
 {
 	struct rtentry *rt;
 	int error = 0;
 	short *stat = NULL;
 	struct rt_addrinfo info;
 	struct ifaddr *ifa;
 	struct rib_head *rnh;
 
 	ifa = NULL;
 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
 	if (rnh == NULL) {
 		error = EAFNOSUPPORT;
 		goto out;
 	}
 
 	/* verify the gateway is directly reachable */
 	if ((ifa = ifa_ifwithnet(gateway, 0, fibnum)) == NULL) {
 		error = ENETUNREACH;
 		goto out;
 	}
 	rt = rtalloc1_fib(dst, 0, 0UL, fibnum);	/* NB: rt is locked */
 	/*
 	 * If the redirect isn't from our current router for this dst,
 	 * it's either old or wrong.  If it redirects us to ourselves,
 	 * we have a routing loop, perhaps as a result of an interface
 	 * going down recently.
 	 */
 	if (!(flags & RTF_DONE) && rt) {
 		if (!sa_equal(src, rt->rt_gateway)) {
 			error = EINVAL;
 			goto done;
 		}
 		if (rt->rt_ifa != ifa && ifa->ifa_addr->sa_family != AF_LINK) {
 			error = EINVAL;
 			goto done;
 		}
 	}
 	if ((flags & RTF_GATEWAY) && ifa_ifwithaddr_check(gateway)) {
 		error = EHOSTUNREACH;
 		goto done;
 	}
 	/*
 	 * Create a new entry if we just got back a wildcard entry
 	 * or the lookup failed.  This is necessary for hosts
 	 * which use routing redirects generated by smart gateways
 	 * to dynamically build the routing tables.
 	 */
 	if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2))
 		goto create;
 	/*
 	 * Don't listen to the redirect if it's
 	 * for a route to an interface.
 	 */
 	if (rt->rt_flags & RTF_GATEWAY) {
 		if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) {
 			/*
 			 * Changing from route to net => route to host.
 			 * Create new route, rather than smashing route to net.
 			 */
 		create:
 			if (rt != NULL)
 				RTFREE_LOCKED(rt);
 		
 			flags |= RTF_DYNAMIC;
 			bzero((caddr_t)&info, sizeof(info));
 			info.rti_info[RTAX_DST] = dst;
 			info.rti_info[RTAX_GATEWAY] = gateway;
 			info.rti_info[RTAX_NETMASK] = netmask;
 			info.rti_ifa = ifa;
 			info.rti_flags = flags;
 			error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum);
 			if (rt != NULL) {
 				RT_LOCK(rt);
 				flags = rt->rt_flags;
 			}
 			
 			stat = &V_rtstat.rts_dynamic;
 		} else {
 
 			/*
 			 * Smash the current notion of the gateway to
 			 * this destination.  Should check about netmask!!!
 			 */
 			if ((flags & RTF_GATEWAY) == 0)
 				rt->rt_flags &= ~RTF_GATEWAY;
 			rt->rt_flags |= RTF_MODIFIED;
 			flags |= RTF_MODIFIED;
 			stat = &V_rtstat.rts_newgateway;
 			/*
 			 * add the key and gateway (in one malloc'd chunk).
 			 */
 			RT_UNLOCK(rt);
 			RIB_WLOCK(rnh);
 			RT_LOCK(rt);
 			rt_setgate(rt, rt_key(rt), gateway);
 			RIB_WUNLOCK(rnh);
 		}
 	} else
 		error = EHOSTUNREACH;
 done:
 	if (rt)
 		RTFREE_LOCKED(rt);
 out:
 	if (error)
 		V_rtstat.rts_badredirect++;
 	else if (stat != NULL)
 		(*stat)++;
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_DST] = dst;
 	info.rti_info[RTAX_GATEWAY] = gateway;
 	info.rti_info[RTAX_NETMASK] = netmask;
 	info.rti_info[RTAX_AUTHOR] = src;
 	rt_missmsg_fib(RTM_REDIRECT, &info, flags, error, fibnum);
 	if (ifa != NULL)
 		ifa_free(ifa);
 }
 
 /*
  * Routing table ioctl interface.
  */
 int
 rtioctl_fib(u_long req, caddr_t data, u_int fibnum)
 {
 
 	/*
 	 * If more ioctl commands are added here, make sure the proper
 	 * super-user checks are being performed because it is possible for
 	 * prison-root to make it this far if raw sockets have been enabled
 	 * in jails.
 	 */
 #ifdef INET
 	/* Multicast goop, grrr... */
 	return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP;
 #else /* INET */
 	return ENXIO;
 #endif /* INET */
 }
 
 struct ifaddr *
 ifa_ifwithroute(int flags, const struct sockaddr *dst, struct sockaddr *gateway,
 				u_int fibnum)
 {
 	struct ifaddr *ifa;
 	int not_found = 0;
 
 	if ((flags & RTF_GATEWAY) == 0) {
 		/*
 		 * If we are adding a route to an interface,
 		 * and the interface is a pt to pt link
 		 * we should search for the destination
 		 * as our clue to the interface.  Otherwise
 		 * we can use the local address.
 		 */
 		ifa = NULL;
 		if (flags & RTF_HOST)
 			ifa = ifa_ifwithdstaddr(dst, fibnum);
 		if (ifa == NULL)
 			ifa = ifa_ifwithaddr(gateway);
 	} else {
 		/*
 		 * If we are adding a route to a remote net
 		 * or host, the gateway may still be on the
 		 * other end of a pt to pt link.
 		 */
 		ifa = ifa_ifwithdstaddr(gateway, fibnum);
 	}
 	if (ifa == NULL)
 		ifa = ifa_ifwithnet(gateway, 0, fibnum);
 	if (ifa == NULL) {
 		struct rtentry *rt = rtalloc1_fib(gateway, 0, 0, fibnum);
 		if (rt == NULL)
 			return (NULL);
 		/*
 		 * dismiss a gateway that is reachable only
 		 * through the default router
 		 */
 		switch (gateway->sa_family) {
 		case AF_INET:
 			if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY)
 				not_found = 1;
 			break;
 		case AF_INET6:
 			if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr))
 				not_found = 1;
 			break;
 		default:
 			break;
 		}
 		if (!not_found && rt->rt_ifa != NULL) {
 			ifa = rt->rt_ifa;
 			ifa_ref(ifa);
 		}
 		RT_REMREF(rt);
 		RT_UNLOCK(rt);
 		if (not_found || ifa == NULL)
 			return (NULL);
 	}
 	if (ifa->ifa_addr->sa_family != dst->sa_family) {
 		struct ifaddr *oifa = ifa;
 		ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp);
 		if (ifa == NULL)
 			ifa = oifa;
 		else
 			ifa_free(oifa);
 	}
 	return (ifa);
 }
 
 /*
  * Do appropriate manipulations of a routing tree given
  * all the bits of info needed
  */
 int
 rtrequest_fib(int req,
 	struct sockaddr *dst,
 	struct sockaddr *gateway,
 	struct sockaddr *netmask,
 	int flags,
 	struct rtentry **ret_nrt,
 	u_int fibnum)
 {
 	struct rt_addrinfo info;
 
 	if (dst->sa_len == 0)
 		return(EINVAL);
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_flags = flags;
 	info.rti_info[RTAX_DST] = dst;
 	info.rti_info[RTAX_GATEWAY] = gateway;
 	info.rti_info[RTAX_NETMASK] = netmask;
 	return rtrequest1_fib(req, &info, ret_nrt, fibnum);
 }
 
 
 /*
  * Copy most of @rt data into @info.
  *
  * If @flags contains NHR_COPY, copies dst,netmask and gw to the
  * pointers specified by @info structure. Assume such pointers
  * are zeroed sockaddr-like structures with sa_len field initialized
  * to reflect size of the provided buffer. if no NHR_COPY is specified,
  * point dst,netmask and gw @info fields to appropriate @rt values.
  *
  * if @flags contains NHR_REF, do refcouting on rt_ifp.
  *
  * Returns 0 on success.
  */
 int
 rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, int flags)
 {
 	struct rt_metrics *rmx;
 	struct sockaddr *src, *dst;
 	int sa_len;
 
 	if (flags & NHR_COPY) {
 		/* Copy destination if dst is non-zero */
 		src = rt_key(rt);
 		dst = info->rti_info[RTAX_DST];
 		sa_len = src->sa_len;
 		if (dst != NULL) {
 			if (src->sa_len > dst->sa_len)
 				return (ENOMEM);
 			memcpy(dst, src, src->sa_len);
 			info->rti_addrs |= RTA_DST;
 		}
 
 		/* Copy mask if set && dst is non-zero */
 		src = rt_mask(rt);
 		dst = info->rti_info[RTAX_NETMASK];
 		if (src != NULL && dst != NULL) {
 
 			/*
 			 * Radix stores different value in sa_len,
 			 * assume rt_mask() to have the same length
 			 * as rt_key()
 			 */
 			if (sa_len > dst->sa_len)
 				return (ENOMEM);
 			memcpy(dst, src, src->sa_len);
 			info->rti_addrs |= RTA_NETMASK;
 		}
 
 		/* Copy gateway is set && dst is non-zero */
 		src = rt->rt_gateway;
 		dst = info->rti_info[RTAX_GATEWAY];
 		if ((rt->rt_flags & RTF_GATEWAY) && src != NULL && dst != NULL){
 			if (src->sa_len > dst->sa_len)
 				return (ENOMEM);
 			memcpy(dst, src, src->sa_len);
 			info->rti_addrs |= RTA_GATEWAY;
 		}
 	} else {
 		info->rti_info[RTAX_DST] = rt_key(rt);
 		info->rti_addrs |= RTA_DST;
 		if (rt_mask(rt) != NULL) {
 			info->rti_info[RTAX_NETMASK] = rt_mask(rt);
 			info->rti_addrs |= RTA_NETMASK;
 		}
 		if (rt->rt_flags & RTF_GATEWAY) {
 			info->rti_info[RTAX_GATEWAY] = rt->rt_gateway;
 			info->rti_addrs |= RTA_GATEWAY;
 		}
 	}
 
 	rmx = info->rti_rmx;
 	if (rmx != NULL) {
 		info->rti_mflags |= RTV_MTU;
 		rmx->rmx_mtu = rt->rt_mtu;
 	}
 
 	info->rti_flags = rt->rt_flags;
 	info->rti_ifp = rt->rt_ifp;
 	info->rti_ifa = rt->rt_ifa;
 
 	if (flags & NHR_REF) {
 		/* Do 'traditional' refcouting */
 		if_ref(info->rti_ifp);
 	}
 
 	return (0);
 }
 
 /*
  * Lookups up route entry for @dst in RIB database for fib @fibnum.
  * Exports entry data to @info using rt_exportinfo().
  *
  * if @flags contains NHR_REF, refcouting is performed on rt_ifp.
  *   All references can be released later by calling rib_free_info()
  *
  * Returns 0 on success.
  * Returns ENOENT for lookup failure, ENOMEM for export failure.
  */
 int
 rib_lookup_info(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags,
     uint32_t flowid, struct rt_addrinfo *info)
 {
 	struct rib_head *rh;
 	struct radix_node *rn;
 	struct rtentry *rt;
 	int error;
 
 	KASSERT((fibnum < rt_numfibs), ("rib_lookup_rte: bad fibnum"));
 	rh = rt_tables_get_rnh(fibnum, dst->sa_family);
 	if (rh == NULL)
 		return (ENOENT);
 
 	RIB_RLOCK(rh);
 	rn = rh->rnh_matchaddr(__DECONST(void *, dst), &rh->head);
 	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
 		rt = RNTORT(rn);
 		/* Ensure route & ifp is UP */
 		if (RT_LINK_IS_UP(rt->rt_ifp)) {
 			flags = (flags & NHR_REF) | NHR_COPY;
 			error = rt_exportinfo(rt, info, flags);
 			RIB_RUNLOCK(rh);
 
 			return (error);
 		}
 	}
 	RIB_RUNLOCK(rh);
 
 	return (ENOENT);
 }
 
 /*
  * Releases all references acquired by rib_lookup_info() when
  * called with NHR_REF flags.
  */
 void
 rib_free_info(struct rt_addrinfo *info)
 {
 
 	if_rele(info->rti_ifp);
 }
 
 /*
  * Iterates over all existing fibs in system calling
  *  @setwa_f function prior to traversing each fib.
  *  Calls @wa_f function for each element in current fib.
  * If af is not AF_UNSPEC, iterates over fibs in particular
  * address family.
  */
 void
 rt_foreach_fib_walk(int af, rt_setwarg_t *setwa_f, rt_walktree_f_t *wa_f,
     void *arg)
 {
 	struct rib_head *rnh;
 	uint32_t fibnum;
 	int i;
 
 	for (fibnum = 0; fibnum < rt_numfibs; fibnum++) {
 		/* Do we want some specific family? */
 		if (af != AF_UNSPEC) {
 			rnh = rt_tables_get_rnh(fibnum, af);
 			if (rnh == NULL)
 				continue;
 			if (setwa_f != NULL)
 				setwa_f(rnh, fibnum, af, arg);
 
 			RIB_WLOCK(rnh);
 			rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f,arg);
 			RIB_WUNLOCK(rnh);
 			continue;
 		}
 
 		for (i = 1; i <= AF_MAX; i++) {
 			rnh = rt_tables_get_rnh(fibnum, i);
 			if (rnh == NULL)
 				continue;
 			if (setwa_f != NULL)
 				setwa_f(rnh, fibnum, i, arg);
 
 			RIB_WLOCK(rnh);
 			rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f,arg);
 			RIB_WUNLOCK(rnh);
 		}
 	}
 }
 
 struct rt_delinfo
 {
 	struct rt_addrinfo info;
 	struct rib_head *rnh;
 	struct rtentry *head;
 };
 
 /*
  * Conditionally unlinks @rn from radix tree based
  * on info data passed in @arg.
  */
 static int
 rt_checkdelroute(struct radix_node *rn, void *arg)
 {
 	struct rt_delinfo *di;
 	struct rt_addrinfo *info;
 	struct rtentry *rt;
 	int error;
 
 	di = (struct rt_delinfo *)arg;
 	rt = (struct rtentry *)rn;
 	info = &di->info;
 	error = 0;
 
 	info->rti_info[RTAX_DST] = rt_key(rt);
 	info->rti_info[RTAX_NETMASK] = rt_mask(rt);
 	info->rti_info[RTAX_GATEWAY] = rt->rt_gateway;
 
 	rt = rt_unlinkrte(di->rnh, info, &error);
 	if (rt == NULL) {
 		/* Either not allowed or not matched. Skip entry */
 		return (0);
 	}
 
 	/* Entry was unlinked. Add to the list and return */
 	rt->rt_chain = di->head;
 	di->head = rt;
 
 	return (0);
 }
 
 /*
  * Iterates over all existing fibs in system.
  * Deletes each element for which @filter_f function returned
  * non-zero value.
  * If @af is not AF_UNSPEC, iterates over fibs in particular
  * address family.
  */
 void
 rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg)
 {
 	struct rib_head *rnh;
 	struct rt_delinfo di;
 	struct rtentry *rt;
 	uint32_t fibnum;
 	int i, start, end;
 
 	bzero(&di, sizeof(di));
 	di.info.rti_filter = filter_f;
 	di.info.rti_filterdata = arg;
 
 	for (fibnum = 0; fibnum < rt_numfibs; fibnum++) {
 		/* Do we want some specific family? */
 		if (af != AF_UNSPEC) {
 			start = af;
 			end = af;
 		} else {
 			start = 1;
 			end = AF_MAX;
 		}
 
 		for (i = start; i <= end; i++) {
 			rnh = rt_tables_get_rnh(fibnum, i);
 			if (rnh == NULL)
 				continue;
 			di.rnh = rnh;
 
 			RIB_WLOCK(rnh);
 			rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di);
 			RIB_WUNLOCK(rnh);
 
 			if (di.head == NULL)
 				continue;
 
 			/* We might have something to reclaim */
 			while (di.head != NULL) {
 				rt = di.head;
 				di.head = rt->rt_chain;
 				rt->rt_chain = NULL;
 
 				/* TODO std rt -> rt_addrinfo export */
 				di.info.rti_info[RTAX_DST] = rt_key(rt);
 				di.info.rti_info[RTAX_NETMASK] = rt_mask(rt);
 
 				rt_notifydelete(rt, &di.info);
 				RTFREE_LOCKED(rt);
 			}
 
 		}
 	}
 }
 
 /*
  * Delete Routes for a Network Interface
  *
  * Called for each routing entry via the rnh->rnh_walktree() call above
  * to delete all route entries referencing a detaching network interface.
  *
  * Arguments:
  *	rt	pointer to rtentry
  *	arg	argument passed to rnh->rnh_walktree() - detaching interface
  *
  * Returns:
  *	0	successful
  *	errno	failed - reason indicated
  */
 static int
 rt_ifdelroute(const struct rtentry *rt, void *arg)
 {
 	struct ifnet	*ifp = arg;
 
 	if (rt->rt_ifp != ifp)
 		return (0);
 
 	/*
 	 * Protect (sorta) against walktree recursion problems
 	 * with cloned routes
 	 */
 	if ((rt->rt_flags & RTF_UP) == 0)
 		return (0);
 
 	return (1);
 }
 
 /*
  * Delete all remaining routes using this interface
  * Unfortuneatly the only way to do this is to slog through
  * the entire routing table looking for routes which point
  * to this interface...oh well...
  */
 void
 rt_flushifroutes(struct ifnet *ifp)
 {
 
 	rt_foreach_fib_walk_del(AF_UNSPEC, rt_ifdelroute, ifp);
 }
 
 /*
  * Conditionally unlinks rtentry matching data inside @info from @rnh.
  * Returns unlinked, locked and referenced @rtentry on success,
  * Returns NULL and sets @perror to:
  * ESRCH - if prefix was not found,
  * EADDRINUSE - if trying to delete PINNED route without appropriate flag.
  * ENOENT - if supplied filter function returned 0 (not matched).
  */
 static struct rtentry *
 rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, int *perror)
 {
 	struct sockaddr *dst, *netmask;
 	struct rtentry *rt;
 	struct radix_node *rn;
 
 	dst = info->rti_info[RTAX_DST];
 	netmask = info->rti_info[RTAX_NETMASK];
 
 	rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head);
 	if (rt == NULL) {
 		*perror = ESRCH;
 		return (NULL);
 	}
 
 	if ((info->rti_flags & RTF_PINNED) == 0) {
 		/* Check if target route can be deleted */
 		if (rt->rt_flags & RTF_PINNED) {
 			*perror = EADDRINUSE;
 			return (NULL);
 		}
 	}
 
 	if (info->rti_filter != NULL) {
 		if (info->rti_filter(rt, info->rti_filterdata) == 0) {
 			/* Not matched */
 			*perror = ENOENT;
 			return (NULL);
 		}
 
 		/*
 		 * Filter function requested rte deletion.
 		 * Ease the caller work by filling in remaining info
 		 * from that particular entry.
 		 */
 		info->rti_info[RTAX_GATEWAY] = rt->rt_gateway;
 	}
 
 	/*
 	 * Remove the item from the tree and return it.
 	 * Complain if it is not there and do no more processing.
 	 */
 	*perror = ESRCH;
 #ifdef RADIX_MPATH
 	if (rt_mpath_capable(rnh))
 		rn = rt_mpath_unlink(rnh, info, rt, perror);
 	else
 #endif
 	rn = rnh->rnh_deladdr(dst, netmask, &rnh->head);
 	if (rn == NULL)
 		return (NULL);
 
 	if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
 		panic ("rtrequest delete");
 
 	rt = RNTORT(rn);
 	RT_LOCK(rt);
 	RT_ADDREF(rt);
 	rt->rt_flags &= ~RTF_UP;
 
 	*perror = 0;
 
 	return (rt);
 }
 
 static void
 rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info)
 {
 	struct ifaddr *ifa;
 
 	/*
 	 * give the protocol a chance to keep things in sync.
 	 */
 	ifa = rt->rt_ifa;
 	if (ifa != NULL && ifa->ifa_rtrequest != NULL)
 		ifa->ifa_rtrequest(RTM_DELETE, rt, info);
 
 	/*
 	 * One more rtentry floating around that is not
 	 * linked to the routing table. rttrash will be decremented
 	 * when RTFREE(rt) is eventually called.
 	 */
 	V_rttrash++;
 }
 
 
 /*
  * These (questionable) definitions of apparent local variables apply
  * to the next two functions.  XXXXXX!!!
  */
 #define	dst	info->rti_info[RTAX_DST]
 #define	gateway	info->rti_info[RTAX_GATEWAY]
 #define	netmask	info->rti_info[RTAX_NETMASK]
 #define	ifaaddr	info->rti_info[RTAX_IFA]
 #define	ifpaddr	info->rti_info[RTAX_IFP]
 #define	flags	info->rti_flags
 
 /*
  * Look up rt_addrinfo for a specific fib.  Note that if rti_ifa is defined,
  * it will be referenced so the caller must free it.
  */
 int
 rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum)
 {
 	struct ifaddr *ifa;
 	int error = 0;
 
 	/*
 	 * ifp may be specified by sockaddr_dl
 	 * when protocol address is ambiguous.
 	 */
 	if (info->rti_ifp == NULL && ifpaddr != NULL &&
 	    ifpaddr->sa_family == AF_LINK &&
 	    (ifa = ifa_ifwithnet(ifpaddr, 0, fibnum)) != NULL) {
 		info->rti_ifp = ifa->ifa_ifp;
 		ifa_free(ifa);
 	}
 	if (info->rti_ifa == NULL && ifaaddr != NULL)
 		info->rti_ifa = ifa_ifwithaddr(ifaaddr);
 	if (info->rti_ifa == NULL) {
 		struct sockaddr *sa;
 
 		sa = ifaaddr != NULL ? ifaaddr :
 		    (gateway != NULL ? gateway : dst);
 		if (sa != NULL && info->rti_ifp != NULL)
 			info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp);
 		else if (dst != NULL && gateway != NULL)
 			info->rti_ifa = ifa_ifwithroute(flags, dst, gateway,
 							fibnum);
 		else if (sa != NULL)
 			info->rti_ifa = ifa_ifwithroute(flags, sa, sa,
 							fibnum);
 	}
 	if ((ifa = info->rti_ifa) != NULL) {
 		if (info->rti_ifp == NULL)
 			info->rti_ifp = ifa->ifa_ifp;
 	} else
 		error = ENETUNREACH;
 	return (error);
 }
 
 static int
 if_updatemtu_cb(struct radix_node *rn, void *arg)
 {
 	struct rtentry *rt;
 	struct if_mtuinfo *ifmtu;
 
 	rt = (struct rtentry *)rn;
 	ifmtu = (struct if_mtuinfo *)arg;
 
 	if (rt->rt_ifp != ifmtu->ifp)
 		return (0);
 
 	if (rt->rt_mtu >= ifmtu->mtu) {
 		/* We have to decrease mtu regardless of flags */
 		rt->rt_mtu = ifmtu->mtu;
 		return (0);
 	}
 
 	/*
 	 * New MTU is bigger. Check if are allowed to alter it
 	 */
 	if ((rt->rt_flags & (RTF_FIXEDMTU | RTF_GATEWAY | RTF_HOST)) != 0) {
 
 		/*
 		 * Skip routes with user-supplied MTU and
 		 * non-interface routes
 		 */
 		return (0);
 	}
 
 	/* We are safe to update route MTU */
 	rt->rt_mtu = ifmtu->mtu;
 
 	return (0);
 }
 
 void
 rt_updatemtu(struct ifnet *ifp)
 {
 	struct if_mtuinfo ifmtu;
 	struct rib_head *rnh;
 	int i, j;
 
 	ifmtu.ifp = ifp;
 
 	/*
 	 * Try to update rt_mtu for all routes using this interface
 	 * Unfortunately the only way to do this is to traverse all
 	 * routing tables in all fibs/domains.
 	 */
 	for (i = 1; i <= AF_MAX; i++) {
 		ifmtu.mtu = if_getmtu_family(ifp, i);
 		for (j = 0; j < rt_numfibs; j++) {
 			rnh = rt_tables_get_rnh(j, i);
 			if (rnh == NULL)
 				continue;
 			RIB_WLOCK(rnh);
 			rnh->rnh_walktree(&rnh->head, if_updatemtu_cb, &ifmtu);
 			RIB_WUNLOCK(rnh);
 		}
 	}
 }
 
 
 #if 0
 int p_sockaddr(char *buf, int buflen, struct sockaddr *s);
 int rt_print(char *buf, int buflen, struct rtentry *rt);
 
 int
 p_sockaddr(char *buf, int buflen, struct sockaddr *s)
 {
 	void *paddr = NULL;
 
 	switch (s->sa_family) {
 	case AF_INET:
 		paddr = &((struct sockaddr_in *)s)->sin_addr;
 		break;
 	case AF_INET6:
 		paddr = &((struct sockaddr_in6 *)s)->sin6_addr;
 		break;
 	}
 
 	if (paddr == NULL)
 		return (0);
 
 	if (inet_ntop(s->sa_family, paddr, buf, buflen) == NULL)
 		return (0);
 	
 	return (strlen(buf));
 }
 
 int
 rt_print(char *buf, int buflen, struct rtentry *rt)
 {
 	struct sockaddr *addr, *mask;
 	int i = 0;
 
 	addr = rt_key(rt);
 	mask = rt_mask(rt);
 
 	i = p_sockaddr(buf, buflen, addr);
 	if (!(rt->rt_flags & RTF_HOST)) {
 		buf[i++] = '/';
 		i += p_sockaddr(buf + i, buflen - i, mask);
 	}
 
 	if (rt->rt_flags & RTF_GATEWAY) {
 		buf[i++] = '>';
 		i += p_sockaddr(buf + i, buflen - i, rt->rt_gateway);
 	}
 
 	return (i);
 }
 #endif
 
 #ifdef RADIX_MPATH
 /*
  * Deletes key for single-path routes, unlinks rtentry with
  * gateway specified in @info from multi-path routes.
  *
  * Returnes unlinked entry. In case of failure, returns NULL
  * and sets @perror to ESRCH.
  */
 static struct radix_node *
 rt_mpath_unlink(struct rib_head *rnh, struct rt_addrinfo *info,
     struct rtentry *rto, int *perror)
 {
 	/*
 	 * if we got multipath routes, we require users to specify
 	 * a matching RTAX_GATEWAY.
 	 */
 	struct rtentry *rt; // *rto = NULL;
 	struct radix_node *rn;
 	struct sockaddr *gw;
 
 	gw = info->rti_info[RTAX_GATEWAY];
 	rt = rt_mpath_matchgate(rto, gw);
 	if (rt == NULL) {
 		*perror = ESRCH;
 		return (NULL);
 	}
 
 	/*
 	 * this is the first entry in the chain
 	 */
 	if (rto == rt) {
 		rn = rn_mpath_next((struct radix_node *)rt);
 		/*
 		 * there is another entry, now it's active
 		 */
 		if (rn) {
 			rto = RNTORT(rn);
 			RT_LOCK(rto);
 			rto->rt_flags |= RTF_UP;
 			RT_UNLOCK(rto);
 		} else if (rt->rt_flags & RTF_GATEWAY) {
 			/*
 			 * For gateway routes, we need to 
 			 * make sure that we we are deleting
 			 * the correct gateway. 
 			 * rt_mpath_matchgate() does not 
 			 * check the case when there is only
 			 * one route in the chain.  
 			 */
 			if (gw &&
 			    (rt->rt_gateway->sa_len != gw->sa_len ||
 				memcmp(rt->rt_gateway, gw, gw->sa_len))) {
 				*perror = ESRCH;
 				return (NULL);
 			}
 		}
 
 		/*
 		 * use the normal delete code to remove
 		 * the first entry
 		 */
 		rn = rnh->rnh_deladdr(dst, netmask, &rnh->head);
 		*perror = 0;
 		return (rn);
 	}
 		
 	/*
 	 * if the entry is 2nd and on up
 	 */
 	if (rt_mpath_deldup(rto, rt) == 0)
 		panic ("rtrequest1: rt_mpath_deldup");
 	*perror = 0;
 	rn = (struct radix_node *)rt;
 	return (rn);
 }
 #endif
 
 #ifdef FLOWTABLE
 static struct rtentry *
 rt_flowtable_check_route(struct rib_head *rnh, struct rt_addrinfo *info)
 {
 #if defined(INET6) || defined(INET)
 	struct radix_node *rn;
 #endif
 	struct rtentry *rt0;
 
 	rt0 = NULL;
 	/* "flow-table" only supports IPv6 and IPv4 at the moment. */
 	switch (dst->sa_family) {
 #ifdef INET6
 	case AF_INET6:
 #endif
 #ifdef INET
 	case AF_INET:
 #endif
 #if defined(INET6) || defined(INET)
 		rn = rnh->rnh_matchaddr(dst, &rnh->head);
 		if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
 			struct sockaddr *mask;
 			u_char *m, *n;
 			int len;
 
 			/*
 			 * compare mask to see if the new route is
 			 * more specific than the existing one
 			 */
 			rt0 = RNTORT(rn);
 			RT_LOCK(rt0);
 			RT_ADDREF(rt0);
 			RT_UNLOCK(rt0);
 			/*
 			 * A host route is already present, so
 			 * leave the flow-table entries as is.
 			 */
 			if (rt0->rt_flags & RTF_HOST) {
 				RTFREE(rt0);
 				rt0 = NULL;
 			} else if (!(flags & RTF_HOST) && netmask) {
 				mask = rt_mask(rt0);
 				len = mask->sa_len;
 				m = (u_char *)mask;
 				n = (u_char *)netmask;
 				while (len-- > 0) {
 					if (*n != *m)
 						break;
 					n++;
 					m++;
 				}
 				if (len == 0 || (*n < *m)) {
 					RTFREE(rt0);
 					rt0 = NULL;
 				}
 			}
 		}
 #endif/* INET6 || INET */
 	}
 
 	return (rt0);
 }
 #endif
 
 int
 rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
 				u_int fibnum)
 {
 	int error = 0;
 	struct rtentry *rt, *rt_old;
 #ifdef FLOWTABLE
 	struct rtentry *rt0;
 #endif
 	struct radix_node *rn;
 	struct rib_head *rnh;
 	struct ifaddr *ifa;
 	struct sockaddr *ndst;
 	struct sockaddr_storage mdst;
 
 	KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum"));
 	KASSERT((flags & RTF_RNH_LOCKED) == 0, ("rtrequest1_fib: locked"));
 	switch (dst->sa_family) {
 	case AF_INET6:
 	case AF_INET:
 		/* We support multiple FIBs. */
 		break;
 	default:
 		fibnum = RT_DEFAULT_FIB;
 		break;
 	}
 
 	/*
 	 * Find the correct routing tree to use for this Address Family
 	 */
 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
 	if (rnh == NULL)
 		return (EAFNOSUPPORT);
 
 	/*
 	 * If we are adding a host route then we don't want to put
 	 * a netmask in the tree, nor do we want to clone it.
 	 */
 	if (flags & RTF_HOST)
 		netmask = NULL;
 
 	switch (req) {
 	case RTM_DELETE:
 		if (netmask) {
 			rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
 			dst = (struct sockaddr *)&mdst;
 		}
 
 		RIB_WLOCK(rnh);
 		rt = rt_unlinkrte(rnh, info, &error);
 		RIB_WUNLOCK(rnh);
 		if (error != 0)
 			return (error);
 
 		rt_notifydelete(rt, info);
 
 		/*
 		 * If the caller wants it, then it can have it,
 		 * but it's up to it to free the rtentry as we won't be
 		 * doing it.
 		 */
 		if (ret_nrt) {
 			*ret_nrt = rt;
 			RT_UNLOCK(rt);
 		} else
 			RTFREE_LOCKED(rt);
 		break;
 	case RTM_RESOLVE:
 		/*
 		 * resolve was only used for route cloning
 		 * here for compat
 		 */
 		break;
 	case RTM_ADD:
 		if ((flags & RTF_GATEWAY) && !gateway)
 			return (EINVAL);
 		if (dst && gateway && (dst->sa_family != gateway->sa_family) && 
 		    (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK))
 			return (EINVAL);
 
 		if (info->rti_ifa == NULL) {
 			error = rt_getifa_fib(info, fibnum);
 			if (error)
 				return (error);
 		} else
 			ifa_ref(info->rti_ifa);
 		ifa = info->rti_ifa;
 		rt = uma_zalloc(V_rtzone, M_NOWAIT);
 		if (rt == NULL) {
 			ifa_free(ifa);
 			return (ENOBUFS);
 		}
 		rt->rt_flags = RTF_UP | flags;
 		rt->rt_fibnum = fibnum;
 		/*
 		 * Add the gateway. Possibly re-malloc-ing the storage for it.
 		 */
 		if ((error = rt_setgate(rt, dst, gateway)) != 0) {
 			ifa_free(ifa);
 			uma_zfree(V_rtzone, rt);
 			return (error);
 		}
 
 		/*
 		 * point to the (possibly newly malloc'd) dest address.
 		 */
 		ndst = (struct sockaddr *)rt_key(rt);
 
 		/*
 		 * make sure it contains the value we want (masked if needed).
 		 */
 		if (netmask) {
 			rt_maskedcopy(dst, ndst, netmask);
 		} else
 			bcopy(dst, ndst, dst->sa_len);
 
 		/*
 		 * We use the ifa reference returned by rt_getifa_fib().
 		 * This moved from below so that rnh->rnh_addaddr() can
 		 * examine the ifa and  ifa->ifa_ifp if it so desires.
 		 */
 		rt->rt_ifa = ifa;
 		rt->rt_ifp = ifa->ifa_ifp;
 		rt->rt_weight = 1;
 
 		rt_setmetrics(info, rt);
 
 		RIB_WLOCK(rnh);
 		RT_LOCK(rt);
 #ifdef RADIX_MPATH
 		/* do not permit exactly the same dst/mask/gw pair */
 		if (rt_mpath_capable(rnh) &&
 			rt_mpath_conflict(rnh, rt, netmask)) {
 			RIB_WUNLOCK(rnh);
 
 			ifa_free(rt->rt_ifa);
 			R_Free(rt_key(rt));
 			uma_zfree(V_rtzone, rt);
 			return (EEXIST);
 		}
 #endif
 
 #ifdef FLOWTABLE
 		rt0 = rt_flowtable_check_route(rnh, info);
 #endif /* FLOWTABLE */
 
 		/* XXX mtu manipulation will be done in rnh_addaddr -- itojun */
 		rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes);
 
 		rt_old = NULL;
 		if (rn == NULL && (info->rti_flags & RTF_PINNED) != 0) {
 
 			/*
 			 * Force removal and re-try addition
 			 * TODO: better multipath&pinned support
 			 */
 			struct sockaddr *info_dst = info->rti_info[RTAX_DST];
 			info->rti_info[RTAX_DST] = ndst;
 			/* Do not delete existing PINNED(interface) routes */
 			info->rti_flags &= ~RTF_PINNED;
 			rt_old = rt_unlinkrte(rnh, info, &error);
 			info->rti_flags |= RTF_PINNED;
 			info->rti_info[RTAX_DST] = info_dst;
 			if (rt_old != NULL)
 				rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head,
 				    rt->rt_nodes);
 		}
 		RIB_WUNLOCK(rnh);
 
 		if (rt_old != NULL)
 			RT_UNLOCK(rt_old);
 
 		/*
 		 * If it still failed to go into the tree,
 		 * then un-make it (this should be a function)
 		 */
 		if (rn == NULL) {
 			ifa_free(rt->rt_ifa);
 			R_Free(rt_key(rt));
 			uma_zfree(V_rtzone, rt);
 #ifdef FLOWTABLE
 			if (rt0 != NULL)
 				RTFREE(rt0);
 #endif
 			return (EEXIST);
 		} 
 #ifdef FLOWTABLE
 		else if (rt0 != NULL) {
 			flowtable_route_flush(dst->sa_family, rt0);
 			RTFREE(rt0);
 		}
 #endif
 
 		if (rt_old != NULL) {
 			rt_notifydelete(rt_old, info);
 			RTFREE(rt_old);
 		}
 
 		/*
 		 * If this protocol has something to add to this then
 		 * allow it to do that as well.
 		 */
 		if (ifa->ifa_rtrequest)
 			ifa->ifa_rtrequest(req, rt, info);
 
 		/*
 		 * actually return a resultant rtentry and
 		 * give the caller a single reference.
 		 */
 		if (ret_nrt) {
 			*ret_nrt = rt;
 			RT_ADDREF(rt);
 		}
+		rnh->rnh_gen++;		/* Routing table updated */
 		RT_UNLOCK(rt);
 		break;
 	case RTM_CHANGE:
 		RIB_WLOCK(rnh);
 		error = rtrequest1_fib_change(rnh, info, ret_nrt, fibnum);
 		RIB_WUNLOCK(rnh);
 		break;
 	default:
 		error = EOPNOTSUPP;
 	}
 
 	return (error);
 }
 
 #undef dst
 #undef gateway
 #undef netmask
 #undef ifaaddr
 #undef ifpaddr
 #undef flags
 
 static int
 rtrequest1_fib_change(struct rib_head *rnh, struct rt_addrinfo *info,
     struct rtentry **ret_nrt, u_int fibnum)
 {
 	struct rtentry *rt = NULL;
 	int error = 0;
 	int free_ifa = 0;
 	int family, mtu;
 	struct if_mtuinfo ifmtu;
 
 	rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
 	    info->rti_info[RTAX_NETMASK], &rnh->head);
 
 	if (rt == NULL)
 		return (ESRCH);
 
 #ifdef RADIX_MPATH
 	/*
 	 * If we got multipath routes,
 	 * we require users to specify a matching RTAX_GATEWAY.
 	 */
 	if (rt_mpath_capable(rnh)) {
 		rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]);
 		if (rt == NULL)
 			return (ESRCH);
 	}
 #endif
 
 	RT_LOCK(rt);
 
 	rt_setmetrics(info, rt);
 
 	/*
 	 * New gateway could require new ifaddr, ifp;
 	 * flags may also be different; ifp may be specified
 	 * by ll sockaddr when protocol address is ambiguous
 	 */
 	if (((rt->rt_flags & RTF_GATEWAY) &&
 	    info->rti_info[RTAX_GATEWAY] != NULL) ||
 	    info->rti_info[RTAX_IFP] != NULL ||
 	    (info->rti_info[RTAX_IFA] != NULL &&
 	     !sa_equal(info->rti_info[RTAX_IFA], rt->rt_ifa->ifa_addr))) {
 
 		error = rt_getifa_fib(info, fibnum);
 		if (info->rti_ifa != NULL)
 			free_ifa = 1;
 
 		if (error != 0)
 			goto bad;
 	}
 
 	/* Check if outgoing interface has changed */
 	if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa &&
 	    rt->rt_ifa != NULL && rt->rt_ifa->ifa_rtrequest != NULL) {
 		rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt, info);
 		ifa_free(rt->rt_ifa);
 	}
 	/* Update gateway address */
 	if (info->rti_info[RTAX_GATEWAY] != NULL) {
 		error = rt_setgate(rt, rt_key(rt), info->rti_info[RTAX_GATEWAY]);
 		if (error != 0)
 			goto bad;
 
 		rt->rt_flags &= ~RTF_GATEWAY;
 		rt->rt_flags |= (RTF_GATEWAY & info->rti_flags);
 	}
 
 	if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa) {
 		ifa_ref(info->rti_ifa);
 		rt->rt_ifa = info->rti_ifa;
 		rt->rt_ifp = info->rti_ifp;
 	}
 	/* Allow some flags to be toggled on change. */
 	rt->rt_flags &= ~RTF_FMASK;
 	rt->rt_flags |= info->rti_flags & RTF_FMASK;
 
 	if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest != NULL)
 	       rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, info);
 
 	/* Alter route MTU if necessary */
 	if (rt->rt_ifp != NULL) {
 		family = info->rti_info[RTAX_DST]->sa_family;
 		mtu = if_getmtu_family(rt->rt_ifp, family);
 		/* Set default MTU */
 		if (rt->rt_mtu == 0)
 			rt->rt_mtu = mtu;
 		if (rt->rt_mtu != mtu) {
 			/* Check if we really need to update */
 			ifmtu.ifp = rt->rt_ifp;
 			ifmtu.mtu = mtu;
 			if_updatemtu_cb(rt->rt_nodes, &ifmtu);
 		}
 	}
 
 	if (ret_nrt) {
 		*ret_nrt = rt;
 		RT_ADDREF(rt);
 	}
 bad:
 	RT_UNLOCK(rt);
 	if (free_ifa != 0)
 		ifa_free(info->rti_ifa);
 	return (error);
 }
 
 static void
 rt_setmetrics(const struct rt_addrinfo *info, struct rtentry *rt)
 {
 
 	if (info->rti_mflags & RTV_MTU) {
 		if (info->rti_rmx->rmx_mtu != 0) {
 
 			/*
 			 * MTU was explicitly provided by user.
 			 * Keep it.
 			 */
 			rt->rt_flags |= RTF_FIXEDMTU;
 		} else {
 
 			/*
 			 * User explicitly sets MTU to 0.
 			 * Assume rollback to default.
 			 */
 			rt->rt_flags &= ~RTF_FIXEDMTU;
 		}
 		rt->rt_mtu = info->rti_rmx->rmx_mtu;
 	}
 	if (info->rti_mflags & RTV_WEIGHT)
 		rt->rt_weight = info->rti_rmx->rmx_weight;
 	/* Kernel -> userland timebase conversion. */
 	if (info->rti_mflags & RTV_EXPIRE)
 		rt->rt_expire = info->rti_rmx->rmx_expire ?
 		    info->rti_rmx->rmx_expire - time_second + time_uptime : 0;
 }
 
 int
 rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate)
 {
 	/* XXX dst may be overwritten, can we move this to below */
 	int dlen = SA_SIZE(dst), glen = SA_SIZE(gate);
 
 	/*
 	 * Prepare to store the gateway in rt->rt_gateway.
 	 * Both dst and gateway are stored one after the other in the same
 	 * malloc'd chunk. If we have room, we can reuse the old buffer,
 	 * rt_gateway already points to the right place.
 	 * Otherwise, malloc a new block and update the 'dst' address.
 	 */
 	if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) {
 		caddr_t new;
 
 		R_Malloc(new, caddr_t, dlen + glen);
 		if (new == NULL)
 			return ENOBUFS;
 		/*
 		 * XXX note, we copy from *dst and not *rt_key(rt) because
 		 * rt_setgate() can be called to initialize a newly
 		 * allocated route entry, in which case rt_key(rt) == NULL
 		 * (and also rt->rt_gateway == NULL).
 		 * Free()/free() handle a NULL argument just fine.
 		 */
 		bcopy(dst, new, dlen);
 		R_Free(rt_key(rt));	/* free old block, if any */
 		rt_key(rt) = (struct sockaddr *)new;
 		rt->rt_gateway = (struct sockaddr *)(new + dlen);
 	}
 
 	/*
 	 * Copy the new gateway value into the memory chunk.
 	 */
 	bcopy(gate, rt->rt_gateway, glen);
 
 	return (0);
 }
 
 void
 rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask)
 {
 	u_char *cp1 = (u_char *)src;
 	u_char *cp2 = (u_char *)dst;
 	u_char *cp3 = (u_char *)netmask;
 	u_char *cplim = cp2 + *cp3;
 	u_char *cplim2 = cp2 + *cp1;
 
 	*cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
 	cp3 += 2;
 	if (cplim > cplim2)
 		cplim = cplim2;
 	while (cp2 < cplim)
 		*cp2++ = *cp1++ & *cp3++;
 	if (cp2 < cplim2)
 		bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2));
 }
 
 /*
  * Set up a routing table entry, normally
  * for an interface.
  */
 #define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */
 static inline  int
 rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum)
 {
 	struct sockaddr *dst;
 	struct sockaddr *netmask;
 	struct rtentry *rt = NULL;
 	struct rt_addrinfo info;
 	int error = 0;
 	int startfib, endfib;
 	char tempbuf[_SOCKADDR_TMPSIZE];
 	int didwork = 0;
 	int a_failure = 0;
 	static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK};
 	struct rib_head *rnh;
 
 	if (flags & RTF_HOST) {
 		dst = ifa->ifa_dstaddr;
 		netmask = NULL;
 	} else {
 		dst = ifa->ifa_addr;
 		netmask = ifa->ifa_netmask;
 	}
 	if (dst->sa_len == 0)
 		return(EINVAL);
 	switch (dst->sa_family) {
 	case AF_INET6:
 	case AF_INET:
 		/* We support multiple FIBs. */
 		break;
 	default:
 		fibnum = RT_DEFAULT_FIB;
 		break;
 	}
 	if (fibnum == RT_ALL_FIBS) {
 		if (V_rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD)
 			startfib = endfib = ifa->ifa_ifp->if_fib;
 		else {
 			startfib = 0;
 			endfib = rt_numfibs - 1;
 		}
 	} else {
 		KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum"));
 		startfib = fibnum;
 		endfib = fibnum;
 	}
 
 	/*
 	 * If it's a delete, check that if it exists,
 	 * it's on the correct interface or we might scrub
 	 * a route to another ifa which would
 	 * be confusing at best and possibly worse.
 	 */
 	if (cmd == RTM_DELETE) {
 		/*
 		 * It's a delete, so it should already exist..
 		 * If it's a net, mask off the host bits
 		 * (Assuming we have a mask)
 		 * XXX this is kinda inet specific..
 		 */
 		if (netmask != NULL) {
 			rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask);
 			dst = (struct sockaddr *)tempbuf;
 		}
 	}
 	/*
 	 * Now go through all the requested tables (fibs) and do the
 	 * requested action. Realistically, this will either be fib 0
 	 * for protocols that don't do multiple tables or all the
 	 * tables for those that do.
 	 */
 	for ( fibnum = startfib; fibnum <= endfib; fibnum++) {
 		if (cmd == RTM_DELETE) {
 			struct radix_node *rn;
 			/*
 			 * Look up an rtentry that is in the routing tree and
 			 * contains the correct info.
 			 */
 			rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
 			if (rnh == NULL)
 				/* this table doesn't exist but others might */
 				continue;
 			RIB_RLOCK(rnh);
 			rn = rnh->rnh_lookup(dst, netmask, &rnh->head);
 #ifdef RADIX_MPATH
 			if (rt_mpath_capable(rnh)) {
 
 				if (rn == NULL) 
 					error = ESRCH;
 				else {
 					rt = RNTORT(rn);
 					/*
 					 * for interface route the
 					 * rt->rt_gateway is sockaddr_intf
 					 * for cloning ARP entries, so
 					 * rt_mpath_matchgate must use the
 					 * interface address
 					 */
 					rt = rt_mpath_matchgate(rt,
 					    ifa->ifa_addr);
 					if (rt == NULL) 
 						error = ESRCH;
 				}
 			}
 #endif
 			error = (rn == NULL ||
 			    (rn->rn_flags & RNF_ROOT) ||
 			    RNTORT(rn)->rt_ifa != ifa);
 			RIB_RUNLOCK(rnh);
 			if (error) {
 				/* this is only an error if bad on ALL tables */
 				continue;
 			}
 		}
 		/*
 		 * Do the actual request
 		 */
 		bzero((caddr_t)&info, sizeof(info));
 		info.rti_ifa = ifa;
 		info.rti_flags = flags |
 		    (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED;
 		info.rti_info[RTAX_DST] = dst;
 		/* 
 		 * doing this for compatibility reasons
 		 */
 		if (cmd == RTM_ADD)
 			info.rti_info[RTAX_GATEWAY] =
 			    (struct sockaddr *)&null_sdl;
 		else
 			info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr;
 		info.rti_info[RTAX_NETMASK] = netmask;
 		error = rtrequest1_fib(cmd, &info, &rt, fibnum);
 
 		if (error == 0 && rt != NULL) {
 			/*
 			 * notify any listening routing agents of the change
 			 */
 			RT_LOCK(rt);
 #ifdef RADIX_MPATH
 			/*
 			 * in case address alias finds the first address
 			 * e.g. ifconfig bge0 192.0.2.246/24
 			 * e.g. ifconfig bge0 192.0.2.247/24
 			 * the address set in the route is 192.0.2.246
 			 * so we need to replace it with 192.0.2.247
 			 */
 			if (memcmp(rt->rt_ifa->ifa_addr,
 			    ifa->ifa_addr, ifa->ifa_addr->sa_len)) {
 				ifa_free(rt->rt_ifa);
 				ifa_ref(ifa);
 				rt->rt_ifp = ifa->ifa_ifp;
 				rt->rt_ifa = ifa;
 			}
 #endif
 			/* 
 			 * doing this for compatibility reasons
 			 */
 			if (cmd == RTM_ADD) {
 			    ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type  =
 				rt->rt_ifp->if_type;
 			    ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index =
 				rt->rt_ifp->if_index;
 			}
 			RT_ADDREF(rt);
 			RT_UNLOCK(rt);
 			rt_newaddrmsg_fib(cmd, ifa, error, rt, fibnum);
 			RT_LOCK(rt);
 			RT_REMREF(rt);
 			if (cmd == RTM_DELETE) {
 				/*
 				 * If we are deleting, and we found an entry,
 				 * then it's been removed from the tree..
 				 * now throw it away.
 				 */
 				RTFREE_LOCKED(rt);
 			} else {
 				if (cmd == RTM_ADD) {
 					/*
 					 * We just wanted to add it..
 					 * we don't actually need a reference.
 					 */
 					RT_REMREF(rt);
 				}
 				RT_UNLOCK(rt);
 			}
 			didwork = 1;
 		}
 		if (error)
 			a_failure = error;
 	}
 	if (cmd == RTM_DELETE) {
 		if (didwork) {
 			error = 0;
 		} else {
 			/* we only give an error if it wasn't in any table */
 			error = ((flags & RTF_HOST) ?
 			    EHOSTUNREACH : ENETUNREACH);
 		}
 	} else {
 		if (a_failure) {
 			/* return an error if any of them failed */
 			error = a_failure;
 		}
 	}
 	return (error);
 }
 
 /*
  * Set up a routing table entry, normally
  * for an interface.
  */
 int
 rtinit(struct ifaddr *ifa, int cmd, int flags)
 {
 	struct sockaddr *dst;
 	int fib = RT_DEFAULT_FIB;
 
 	if (flags & RTF_HOST) {
 		dst = ifa->ifa_dstaddr;
 	} else {
 		dst = ifa->ifa_addr;
 	}
 
 	switch (dst->sa_family) {
 	case AF_INET6:
 	case AF_INET:
 		/* We do support multiple FIBs. */
 		fib = RT_ALL_FIBS;
 		break;
 	}
 	return (rtinit1(ifa, cmd, flags, fib));
 }
 
 /*
  * Announce interface address arrival/withdraw
  * Returns 0 on success.
  */
 int
 rt_addrmsg(int cmd, struct ifaddr *ifa, int fibnum)
 {
 
 	KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE,
 	    ("unexpected cmd %d", cmd));
 	
 	KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs),
 	    ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs));
 
 #if defined(INET) || defined(INET6)
 #ifdef SCTP
 	/*
 	 * notify the SCTP stack
 	 * this will only get called when an address is added/deleted
 	 * XXX pass the ifaddr struct instead if ifa->ifa_addr...
 	 */
 	sctp_addr_change(ifa, cmd);
 #endif /* SCTP */
 #endif
 	return (rtsock_addrmsg(cmd, ifa, fibnum));
 }
 
 /*
  * Announce route addition/removal.
  * Users of this function MUST validate input data BEFORE calling.
  * However we have to be able to handle invalid data:
  * if some userland app sends us "invalid" route message (invalid mask,
  * no dst, wrong address families, etc...) we need to pass it back
  * to app (and any other rtsock consumers) with rtm_errno field set to
  * non-zero value.
  * Returns 0 on success.
  */
 int
 rt_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt,
     int fibnum)
 {
 
 	KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE,
 	    ("unexpected cmd %d", cmd));
 	
 	KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs),
 	    ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs));
 
 	KASSERT(rt_key(rt) != NULL, (":%s: rt_key must be supplied", __func__));
 
 	return (rtsock_routemsg(cmd, ifp, error, rt, fibnum));
 }
 
 void
 rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt)
 {
 
 	rt_newaddrmsg_fib(cmd, ifa, error, rt, RT_ALL_FIBS);
 }
 
 /*
  * This is called to generate messages from the routing socket
  * indicating a network interface has had addresses associated with it.
  */
 void
 rt_newaddrmsg_fib(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt,
     int fibnum)
 {
 
 	KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE,
 		("unexpected cmd %u", cmd));
 	KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs),
 	    ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs));
 
 	if (cmd == RTM_ADD) {
 		rt_addrmsg(cmd, ifa, fibnum);
 		if (rt != NULL)
 			rt_routemsg(cmd, ifa->ifa_ifp, error, rt, fibnum);
 	} else {
 		if (rt != NULL)
 			rt_routemsg(cmd, ifa->ifa_ifp, error, rt, fibnum);
 		rt_addrmsg(cmd, ifa, fibnum);
 	}
 }
 
Index: head/sys/net/route.h
===================================================================
--- head/sys/net/route.h	(revision 297224)
+++ head/sys/net/route.h	(revision 297225)
@@ -1,465 +1,488 @@
 /*-
  * Copyright (c) 1980, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)route.h	8.4 (Berkeley) 1/9/95
  * $FreeBSD$
  */
 
 #ifndef _NET_ROUTE_H_
 #define _NET_ROUTE_H_
 
 #include <sys/counter.h>
 #include <net/vnet.h>
 
 /*
  * Kernel resident routing tables.
  *
  * The routing tables are initialized when interface addresses
  * are set by making entries for all directly connected interfaces.
  */
 
 /*
  * Struct route consiste of a destination address,
  * a route entry pointer, link-layer prepend data pointer along
  * with its length.
  */
 struct route {
 	struct	rtentry *ro_rt;
 	char		*ro_prepend;
 	uint16_t	ro_plen;
 	uint16_t	ro_flags;
 	uint16_t	ro_mtu;	/* saved ro_rt mtu */
 	uint16_t	spare;
 	struct	sockaddr ro_dst;
 };
 
 #define	RT_L2_ME_BIT		2	/* dst L2 addr is our address */
 #define	RT_MAY_LOOP_BIT		3	/* dst may require loop copy */
 #define	RT_HAS_HEADER_BIT	4	/* mbuf already have its header prepended */
 
 #define	RT_CACHING_CONTEXT	0x1	/* XXX: not used anywhere */
 #define	RT_NORTREF		0x2	/* doesn't hold reference on ro_rt */
 #define	RT_L2_ME		(1 << RT_L2_ME_BIT)		/* 0x0004 */
 #define	RT_MAY_LOOP		(1 << RT_MAY_LOOP_BIT)		/* 0x0008 */
 #define	RT_HAS_HEADER		(1 << RT_HAS_HEADER_BIT)	/* 0x0010 */
 
 #define	RT_REJECT		0x0020		/* Destination is reject */
 #define	RT_BLACKHOLE		0x0040		/* Destination is blackhole */
 #define	RT_HAS_GW		0x0080		/* Destination has GW  */
 
 struct rt_metrics {
 	u_long	rmx_locks;	/* Kernel must leave these values alone */
 	u_long	rmx_mtu;	/* MTU for this path */
 	u_long	rmx_hopcount;	/* max hops expected */
 	u_long	rmx_expire;	/* lifetime for route, e.g. redirect */
 	u_long	rmx_recvpipe;	/* inbound delay-bandwidth product */
 	u_long	rmx_sendpipe;	/* outbound delay-bandwidth product */
 	u_long	rmx_ssthresh;	/* outbound gateway buffer limit */
 	u_long	rmx_rtt;	/* estimated round trip time */
 	u_long	rmx_rttvar;	/* estimated rtt variance */
 	u_long	rmx_pksent;	/* packets sent using this route */
 	u_long	rmx_weight;	/* route weight */
 	u_long	rmx_filler[3];	/* will be used for T/TCP later */
 };
 
 /*
  * rmx_rtt and rmx_rttvar are stored as microseconds;
  * RTTTOPRHZ(rtt) converts to a value suitable for use
  * by a protocol slowtimo counter.
  */
 #define	RTM_RTTUNIT	1000000	/* units for rtt, rttvar, as units per sec */
 #define	RTTTOPRHZ(r)	((r) / (RTM_RTTUNIT / PR_SLOWHZ))
 
 /* lle state is exported in rmx_state rt_metrics field */
 #define	rmx_state	rmx_weight
 
+/*
+ * Keep a generation count of routing table, incremented on route addition,
+ * so we can invalidate caches.  This is accessed without a lock, as precision
+ * is not required.
+ */
+typedef volatile u_int rt_gen_t;	/* tree generation (for adds) */
+#define RT_GEN(fibnum, af)	rt_tables_get_gen(fibnum, af)
+
 #define	RT_DEFAULT_FIB	0	/* Explicitly mark fib=0 restricted cases */
 #define	RT_ALL_FIBS	-1	/* Announce event for every fib */
 #ifdef _KERNEL
 extern u_int rt_numfibs;	/* number of usable routing tables */
 VNET_DECLARE(u_int, rt_add_addr_allfibs); /* Announce interfaces to all fibs */
 #define	V_rt_add_addr_allfibs	VNET(rt_add_addr_allfibs)
 #endif
 
 /*
  * We distinguish between routes to hosts and routes to networks,
  * preferring the former if available.  For each route we infer
  * the interface to use from the gateway address supplied when
  * the route was entered.  Routes that forward packets through
  * gateways are marked so that the output routines know to address the
  * gateway rather than the ultimate destination.
  */
 #ifndef RNF_NORMAL
 #include <net/radix.h>
 #ifdef RADIX_MPATH
 #include <net/radix_mpath.h>
 #endif
 #endif
 
 #if defined(_KERNEL) || defined(_WANT_RTENTRY)
 struct rtentry {
 	struct	radix_node rt_nodes[2];	/* tree glue, and other values */
 	/*
 	 * XXX struct rtentry must begin with a struct radix_node (or two!)
 	 * because the code does some casts of a 'struct radix_node *'
 	 * to a 'struct rtentry *'
 	 */
 #define	rt_key(r)	(*((struct sockaddr **)(&(r)->rt_nodes->rn_key)))
 #define	rt_mask(r)	(*((struct sockaddr **)(&(r)->rt_nodes->rn_mask)))
 	struct	sockaddr *rt_gateway;	/* value */
 	struct	ifnet *rt_ifp;		/* the answer: interface to use */
 	struct	ifaddr *rt_ifa;		/* the answer: interface address to use */
 	int		rt_flags;	/* up/down?, host/net */
 	int		rt_refcnt;	/* # held references */
 	u_int		rt_fibnum;	/* which FIB */
 	u_long		rt_mtu;		/* MTU for this path */
 	u_long		rt_weight;	/* absolute weight */ 
 	u_long		rt_expire;	/* lifetime for route, e.g. redirect */
 #define	rt_endzero	rt_pksent
 	counter_u64_t	rt_pksent;	/* packets sent using this route */
 	struct mtx	rt_mtx;		/* mutex for routing entry */
 	struct rtentry	*rt_chain;	/* pointer to next rtentry to delete */
 };
 #endif /* _KERNEL || _WANT_RTENTRY */
 
 #define	RTF_UP		0x1		/* route usable */
 #define	RTF_GATEWAY	0x2		/* destination is a gateway */
 #define	RTF_HOST	0x4		/* host entry (net otherwise) */
 #define	RTF_REJECT	0x8		/* host or net unreachable */
 #define	RTF_DYNAMIC	0x10		/* created dynamically (by redirect) */
 #define	RTF_MODIFIED	0x20		/* modified dynamically (by redirect) */
 #define RTF_DONE	0x40		/* message confirmed */
 /*			0x80		   unused, was RTF_DELCLONE */
 /*			0x100		   unused, was RTF_CLONING */
 #define RTF_XRESOLVE	0x200		/* external daemon resolves name */
 #define RTF_LLINFO	0x400		/* DEPRECATED - exists ONLY for backward 
 					   compatibility */
 #define RTF_LLDATA	0x400		/* used by apps to add/del L2 entries */
 #define RTF_STATIC	0x800		/* manually added */
 #define RTF_BLACKHOLE	0x1000		/* just discard pkts (during updates) */
 #define RTF_PROTO2	0x4000		/* protocol specific routing flag */
 #define RTF_PROTO1	0x8000		/* protocol specific routing flag */
 /*			0x10000		   unused, was RTF_PRCLONING */
 /*			0x20000		   unused, was RTF_WASCLONED */
 #define RTF_PROTO3	0x40000		/* protocol specific routing flag */
 #define	RTF_FIXEDMTU	0x80000		/* MTU was explicitly specified */
 #define RTF_PINNED	0x100000	/* route is immutable */
 #define	RTF_LOCAL	0x200000 	/* route represents a local address */
 #define	RTF_BROADCAST	0x400000	/* route represents a bcast address */
 #define	RTF_MULTICAST	0x800000	/* route represents a mcast address */
 					/* 0x8000000 and up unassigned */
 #define	RTF_STICKY	 0x10000000	/* always route dst->src */
 
 #define	RTF_RNH_LOCKED	 0x40000000	/* unused */
 
 #define	RTF_GWFLAG_COMPAT 0x80000000	/* a compatibility bit for interacting
 					   with existing routing apps */
 
 /* Mask of RTF flags that are allowed to be modified by RTM_CHANGE. */
 #define RTF_FMASK	\
 	(RTF_PROTO1 | RTF_PROTO2 | RTF_PROTO3 | RTF_BLACKHOLE | \
 	 RTF_REJECT | RTF_STATIC | RTF_STICKY)
 
 /*
  * fib_ nexthop API flags.
  */
 
 /* Consumer-visible nexthop info flags */
 #define	NHF_REJECT		0x0010	/* RTF_REJECT */
 #define	NHF_BLACKHOLE		0x0020	/* RTF_BLACKHOLE */
 #define	NHF_REDIRECT		0x0040	/* RTF_DYNAMIC|RTF_MODIFIED */
 #define	NHF_DEFAULT		0x0080	/* Default route */
 #define	NHF_BROADCAST		0x0100	/* RTF_BROADCAST */
 #define	NHF_GATEWAY		0x0200	/* RTF_GATEWAY */
 
 /* Nexthop request flags */
 #define	NHR_IFAIF		0x01	/* Return ifa_ifp interface */
 #define	NHR_REF			0x02	/* For future use */
 
 /* Control plane route request flags */
 #define	NHR_COPY		0x100	/* Copy rte data */
 
 #ifdef _KERNEL
 /* rte<>ro_flags translation */
 static inline void
 rt_update_ro_flags(struct route *ro)
 {
 	int rt_flags = ro->ro_rt->rt_flags;
 
 	ro->ro_flags &= ~ (RT_REJECT|RT_BLACKHOLE|RT_HAS_GW);
 
 	ro->ro_flags |= (rt_flags & RTF_REJECT) ? RT_REJECT : 0;
 	ro->ro_flags |= (rt_flags & RTF_BLACKHOLE) ? RT_BLACKHOLE : 0;
 	ro->ro_flags |= (rt_flags & RTF_GATEWAY) ? RT_HAS_GW : 0;
 }
 #endif
 
 /*
  * Routing statistics.
  */
 struct	rtstat {
 	short	rts_badredirect;	/* bogus redirect calls */
 	short	rts_dynamic;		/* routes created by redirects */
 	short	rts_newgateway;		/* routes modified by redirects */
 	short	rts_unreach;		/* lookups which failed */
 	short	rts_wildcard;		/* lookups satisfied by a wildcard */
 };
 /*
  * Structures for routing messages.
  */
 struct rt_msghdr {
 	u_short	rtm_msglen;	/* to skip over non-understood messages */
 	u_char	rtm_version;	/* future binary compatibility */
 	u_char	rtm_type;	/* message type */
 	u_short	rtm_index;	/* index for associated ifp */
 	int	rtm_flags;	/* flags, incl. kern & message, e.g. DONE */
 	int	rtm_addrs;	/* bitmask identifying sockaddrs in msg */
 	pid_t	rtm_pid;	/* identify sender */
 	int	rtm_seq;	/* for sender to identify action */
 	int	rtm_errno;	/* why failed */
 	int	rtm_fmask;	/* bitmask used in RTM_CHANGE message */
 	u_long	rtm_inits;	/* which metrics we are initializing */
 	struct	rt_metrics rtm_rmx; /* metrics themselves */
 };
 
 #define RTM_VERSION	5	/* Up the ante and ignore older versions */
 
 /*
  * Message types.
  */
 #define RTM_ADD		0x1	/* Add Route */
 #define RTM_DELETE	0x2	/* Delete Route */
 #define RTM_CHANGE	0x3	/* Change Metrics or flags */
 #define RTM_GET		0x4	/* Report Metrics */
 #define RTM_LOSING	0x5	/* Kernel Suspects Partitioning */
 #define RTM_REDIRECT	0x6	/* Told to use different route */
 #define RTM_MISS	0x7	/* Lookup failed on this address */
 #define RTM_LOCK	0x8	/* fix specified metrics */
 		    /*	0x9  */
 		    /*	0xa  */
 #define RTM_RESOLVE	0xb	/* req to resolve dst to LL addr */
 #define RTM_NEWADDR	0xc	/* address being added to iface */
 #define RTM_DELADDR	0xd	/* address being removed from iface */
 #define RTM_IFINFO	0xe	/* iface going up/down etc. */
 #define	RTM_NEWMADDR	0xf	/* mcast group membership being added to if */
 #define	RTM_DELMADDR	0x10	/* mcast group membership being deleted */
 #define	RTM_IFANNOUNCE	0x11	/* iface arrival/departure */
 #define	RTM_IEEE80211	0x12	/* IEEE80211 wireless event */
 
 /*
  * Bitmask values for rtm_inits and rmx_locks.
  */
 #define RTV_MTU		0x1	/* init or lock _mtu */
 #define RTV_HOPCOUNT	0x2	/* init or lock _hopcount */
 #define RTV_EXPIRE	0x4	/* init or lock _expire */
 #define RTV_RPIPE	0x8	/* init or lock _recvpipe */
 #define RTV_SPIPE	0x10	/* init or lock _sendpipe */
 #define RTV_SSTHRESH	0x20	/* init or lock _ssthresh */
 #define RTV_RTT		0x40	/* init or lock _rtt */
 #define RTV_RTTVAR	0x80	/* init or lock _rttvar */
 #define RTV_WEIGHT	0x100	/* init or lock _weight */
 
 /*
  * Bitmask values for rtm_addrs.
  */
 #define RTA_DST		0x1	/* destination sockaddr present */
 #define RTA_GATEWAY	0x2	/* gateway sockaddr present */
 #define RTA_NETMASK	0x4	/* netmask sockaddr present */
 #define RTA_GENMASK	0x8	/* cloning mask sockaddr present */
 #define RTA_IFP		0x10	/* interface name sockaddr present */
 #define RTA_IFA		0x20	/* interface addr sockaddr present */
 #define RTA_AUTHOR	0x40	/* sockaddr for author of redirect */
 #define RTA_BRD		0x80	/* for NEWADDR, broadcast or p-p dest addr */
 
 /*
  * Index offsets for sockaddr array for alternate internal encoding.
  */
 #define RTAX_DST	0	/* destination sockaddr present */
 #define RTAX_GATEWAY	1	/* gateway sockaddr present */
 #define RTAX_NETMASK	2	/* netmask sockaddr present */
 #define RTAX_GENMASK	3	/* cloning mask sockaddr present */
 #define RTAX_IFP	4	/* interface name sockaddr present */
 #define RTAX_IFA	5	/* interface addr sockaddr present */
 #define RTAX_AUTHOR	6	/* sockaddr for author of redirect */
 #define RTAX_BRD	7	/* for NEWADDR, broadcast or p-p dest addr */
 #define RTAX_MAX	8	/* size of array to allocate */
 
 typedef int rt_filter_f_t(const struct rtentry *, void *);
 
 struct rt_addrinfo {
 	int	rti_addrs;			/* Route RTF_ flags */
 	int	rti_flags;			/* Route RTF_ flags */
 	struct	sockaddr *rti_info[RTAX_MAX];	/* Sockaddr data */
 	struct	ifaddr *rti_ifa;		/* value of rt_ifa addr */
 	struct	ifnet *rti_ifp;			/* route interface */
 	rt_filter_f_t	*rti_filter;		/* filter function */
 	void	*rti_filterdata;		/* filter paramenters */
 	u_long	rti_mflags;			/* metrics RTV_ flags */
 	u_long	rti_spare;			/* Will be used for fib */
 	struct	rt_metrics *rti_rmx;		/* Pointer to route metrics */
 };
 
 /*
  * This macro returns the size of a struct sockaddr when passed
  * through a routing socket. Basically we round up sa_len to
  * a multiple of sizeof(long), with a minimum of sizeof(long).
  * The check for a NULL pointer is just a convenience, probably never used.
  * The case sa_len == 0 should only apply to empty structures.
  */
 #define SA_SIZE(sa)						\
     (  (!(sa) || ((struct sockaddr *)(sa))->sa_len == 0) ?	\
 	sizeof(long)		:				\
 	1 + ( (((struct sockaddr *)(sa))->sa_len - 1) | (sizeof(long) - 1) ) )
 
 #define	sa_equal(a, b) (	\
     (((const struct sockaddr *)(a))->sa_len == ((const struct sockaddr *)(b))->sa_len) && \
     (bcmp((a), (b), ((const struct sockaddr *)(b))->sa_len) == 0))
 
 #ifdef _KERNEL
 
 #define RT_LINK_IS_UP(ifp)	(!((ifp)->if_capabilities & IFCAP_LINKSTATE) \
 				 || (ifp)->if_link_state == LINK_STATE_UP)
 
 #define	RT_LOCK_INIT(_rt) \
 	mtx_init(&(_rt)->rt_mtx, "rtentry", NULL, MTX_DEF | MTX_DUPOK)
 #define	RT_LOCK(_rt)		mtx_lock(&(_rt)->rt_mtx)
 #define	RT_UNLOCK(_rt)		mtx_unlock(&(_rt)->rt_mtx)
 #define	RT_LOCK_DESTROY(_rt)	mtx_destroy(&(_rt)->rt_mtx)
 #define	RT_LOCK_ASSERT(_rt)	mtx_assert(&(_rt)->rt_mtx, MA_OWNED)
 #define	RT_UNLOCK_COND(_rt)	do {				\
 	if (mtx_owned(&(_rt)->rt_mtx))				\
 		mtx_unlock(&(_rt)->rt_mtx);			\
 } while (0)
 
 #define	RT_ADDREF(_rt)	do {					\
 	RT_LOCK_ASSERT(_rt);					\
 	KASSERT((_rt)->rt_refcnt >= 0,				\
 		("negative refcnt %d", (_rt)->rt_refcnt));	\
 	(_rt)->rt_refcnt++;					\
 } while (0)
 
 #define	RT_REMREF(_rt)	do {					\
 	RT_LOCK_ASSERT(_rt);					\
 	KASSERT((_rt)->rt_refcnt > 0,				\
 		("bogus refcnt %d", (_rt)->rt_refcnt));	\
 	(_rt)->rt_refcnt--;					\
 } while (0)
 
 #define	RTFREE_LOCKED(_rt) do {					\
 	if ((_rt)->rt_refcnt <= 1)				\
 		rtfree(_rt);					\
 	else {							\
 		RT_REMREF(_rt);					\
 		RT_UNLOCK(_rt);					\
 	}							\
 	/* guard against invalid refs */			\
 	_rt = 0;						\
 } while (0)
 
 #define	RTFREE(_rt) do {					\
 	RT_LOCK(_rt);						\
 	RTFREE_LOCKED(_rt);					\
 } while (0)
 
 #define	RO_RTFREE(_ro) do {					\
 	if ((_ro)->ro_rt) {					\
 		if ((_ro)->ro_flags & RT_NORTREF) {		\
 			(_ro)->ro_flags &= ~RT_NORTREF;		\
 			(_ro)->ro_rt = NULL;			\
 		} else {					\
 			RT_LOCK((_ro)->ro_rt);			\
 			RTFREE_LOCKED((_ro)->ro_rt);		\
 		}						\
 	}							\
 } while (0)
 
+/*
+ * Validate a cached route based on a supplied cookie.  If there is an
+ * out-of-date cache, simply free it.  Update the generation number
+ * for the new allocation
+ */
+#define RT_VALIDATE(ro, cookiep, fibnum) do {				\
+	rt_gen_t cookie = RT_GEN(fibnum, (ro)->ro_dst.sa_family);	\
+	if (*(cookiep) != cookie && (ro)->ro_rt != NULL) {		\
+		RTFREE((ro)->ro_rt);					\
+		(ro)->ro_rt = NULL;					\
+		*(cookiep) = cookie;					\
+	}								\
+} while (0)
+
 struct ifmultiaddr;
 struct rib_head;
 
 void	 rt_ieee80211msg(struct ifnet *, int, void *, size_t);
 void	 rt_ifannouncemsg(struct ifnet *, int);
 void	 rt_ifmsg(struct ifnet *);
 void	 rt_missmsg(int, struct rt_addrinfo *, int, int);
 void	 rt_missmsg_fib(int, struct rt_addrinfo *, int, int, int);
 void	 rt_newaddrmsg(int, struct ifaddr *, int, struct rtentry *);
 void	 rt_newaddrmsg_fib(int, struct ifaddr *, int, struct rtentry *, int);
 int	 rt_addrmsg(int, struct ifaddr *, int);
 int	 rt_routemsg(int, struct ifnet *ifp, int, struct rtentry *, int);
 void	 rt_newmaddrmsg(int, struct ifmultiaddr *);
 int	 rt_setgate(struct rtentry *, struct sockaddr *, struct sockaddr *);
 void 	 rt_maskedcopy(struct sockaddr *, struct sockaddr *, struct sockaddr *);
 struct rib_head *rt_table_init(int);
 void	rt_table_destroy(struct rib_head *);
+rt_gen_t rt_tables_get_gen(int table, int fam);
 
 int	rtsock_addrmsg(int, struct ifaddr *, int);
 int	rtsock_routemsg(int, struct ifnet *ifp, int, struct rtentry *, int);
 
 /*
  * Note the following locking behavior:
  *
  *    rtalloc1() returns a locked rtentry
  *
  *    rtfree() and RTFREE_LOCKED() require a locked rtentry
  *
  *    RTFREE() uses an unlocked entry.
  */
 
 void	 rtfree(struct rtentry *);
 void	rt_updatemtu(struct ifnet *);
 
 typedef int rt_walktree_f_t(struct rtentry *, void *);
 typedef void rt_setwarg_t(struct rib_head *, uint32_t, int, void *);
 void	rt_foreach_fib_walk(int af, rt_setwarg_t *, rt_walktree_f_t *, void *);
 void	rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg);
 void	rt_flushifroutes(struct ifnet *ifp);
 
 /* XXX MRT COMPAT VERSIONS THAT SET UNIVERSE to 0 */
 /* Thes are used by old code not yet converted to use multiple FIBS */
 struct rtentry *rtalloc1(struct sockaddr *, int, u_long);
 int	 rtinit(struct ifaddr *, int, int);
 
 /* XXX MRT NEW VERSIONS THAT USE FIBs
  * For now the protocol indepedent versions are the same as the AF_INET ones
  * but this will change.. 
  */
 int	 rt_getifa_fib(struct rt_addrinfo *, u_int fibnum);
 void	 rtalloc_ign_fib(struct route *ro, u_long ignflags, u_int fibnum);
 struct rtentry *rtalloc1_fib(struct sockaddr *, int, u_long, u_int);
 int	 rtioctl_fib(u_long, caddr_t, u_int);
 void	 rtredirect_fib(struct sockaddr *, struct sockaddr *,
 	    struct sockaddr *, int, struct sockaddr *, u_int);
 int	 rtrequest_fib(int, struct sockaddr *,
 	    struct sockaddr *, struct sockaddr *, int, struct rtentry **, u_int);
 int	 rtrequest1_fib(int, struct rt_addrinfo *, struct rtentry **, u_int);
 int	rib_lookup_info(uint32_t, const struct sockaddr *, uint32_t, uint32_t,
 	    struct rt_addrinfo *);
 void	rib_free_info(struct rt_addrinfo *info);
 
 #endif
 
 #endif
Index: head/sys/net/route_var.h
===================================================================
--- head/sys/net/route_var.h	(revision 297224)
+++ head/sys/net/route_var.h	(revision 297225)
@@ -1,76 +1,76 @@
 /*-
  * Copyright (c) 2015-2016
  * 	Alexander V. Chernikov <melifaro@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _NET_ROUTE_VAR_H_
 #define _NET_ROUTE_VAR_H_
 
 struct rib_head {
 	struct radix_head	head;
 	rn_matchaddr_f_t	*rnh_matchaddr;	/* longest match for sockaddr */
 	rn_addaddr_f_t		*rnh_addaddr;	/* add based on sockaddr*/
 	rn_deladdr_f_t		*rnh_deladdr;	/* remove based on sockaddr */
 	rn_lookup_f_t		*rnh_lookup;	/* exact match for sockaddr */
 	rn_walktree_t		*rnh_walktree;	/* traverse tree */
 	rn_walktree_from_t	*rnh_walktree_from; /* traverse tree below a */
 	rn_close_t		*rnh_close;	/*do something when the last ref drops*/
-	u_int			rnh_gen;	/* generation counter */
+	rt_gen_t		rnh_gen;	/* generation counter */
 	int			rnh_multipath;	/* multipath capable ? */
 	struct radix_node	rnh_nodes[3];	/* empty tree for common case */
 	struct rwlock		rib_lock;	/* config/data path lock */
 	struct radix_mask_head	rmhead;		/* masks radix head */
 };
 
 #define	RIB_RLOCK(rh)		rw_rlock(&(rh)->rib_lock)
 #define	RIB_RUNLOCK(rh)		rw_runlock(&(rh)->rib_lock)
 #define	RIB_WLOCK(rh)		rw_wlock(&(rh)->rib_lock)
 #define	RIB_WUNLOCK(rh)		rw_wunlock(&(rh)->rib_lock)
 #define	RIB_LOCK_ASSERT(rh)	rw_assert(&(rh)->rib_lock, RA_LOCKED)
 #define	RIB_WLOCK_ASSERT(rh)	rw_assert(&(rh)->rib_lock, RA_WLOCKED)
 
 struct rib_head *rt_tables_get_rnh(int fib, int family);
 
 /* rte<>nhop translation */
 static inline uint16_t
 fib_rte_to_nh_flags(int rt_flags)
 {
 	uint16_t res;
 
 	res = (rt_flags & RTF_REJECT) ? NHF_REJECT : 0;
 	res |= (rt_flags & RTF_BLACKHOLE) ? NHF_BLACKHOLE : 0;
 	res |= (rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) ? NHF_REDIRECT : 0;
 	res |= (rt_flags & RTF_BROADCAST) ? NHF_BROADCAST : 0;
 	res |= (rt_flags & RTF_GATEWAY) ? NHF_GATEWAY : 0;
 
 	return (res);
 }
 
 
 #endif
Index: head/sys/netinet/in_pcb.c
===================================================================
--- head/sys/netinet/in_pcb.c	(revision 297224)
+++ head/sys/netinet/in_pcb.c	(revision 297225)
@@ -1,2658 +1,2680 @@
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993, 1995
  *	The Regents of the University of California.
  * Copyright (c) 2007-2009 Robert N. M. Watson
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_pcb.c	8.4 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_ipsec.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_pcbgroup.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/callout.h>
 #include <sys/eventhandler.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/refcount.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/tcp_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #endif
 #ifdef INET
 #include <netinet/in_var.h>
 #endif
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #endif /* INET6 */
 
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/key.h>
 #endif /* IPSEC */
 
 #include <security/mac/mac_framework.h>
 
 static struct callout	ipport_tick_callout;
 
 /*
  * These configure the range of local port addresses assigned to
  * "unspecified" outgoing connections/packets/whatever.
  */
 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;	/* 1023 */
 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;	/* 600 */
 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;	/* 10000 */
 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;	/* 65535 */
 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;	/* 49152 */
 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;	/* 65535 */
 
 /*
  * Reserved ports accessible only to root. There are significant
  * security considerations that must be accounted for when changing these,
  * but the security benefits can be great. Please be careful.
  */
 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;	/* 1023 */
 VNET_DEFINE(int, ipport_reservedlow);
 
 /* Variables dealing with random ephemeral port allocation. */
 VNET_DEFINE(int, ipport_randomized) = 1;	/* user controlled via sysctl */
 VNET_DEFINE(int, ipport_randomcps) = 10;	/* user controlled via sysctl */
 VNET_DEFINE(int, ipport_randomtime) = 45;	/* user controlled via sysctl */
 VNET_DEFINE(int, ipport_stoprandom);		/* toggled by ipport_tick */
 VNET_DEFINE(int, ipport_tcpallocs);
 static VNET_DEFINE(int, ipport_tcplastcount);
 
 #define	V_ipport_tcplastcount		VNET(ipport_tcplastcount)
 
 static void	in_pcbremlists(struct inpcb *inp);
 #ifdef INET
 static struct inpcb	*in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
 			    struct in_addr faddr, u_int fport_arg,
 			    struct in_addr laddr, u_int lport_arg,
 			    int lookupflags, struct ifnet *ifp);
 
 #define RANGECHK(var, min, max) \
 	if ((var) < (min)) { (var) = (min); } \
 	else if ((var) > (max)) { (var) = (max); }
 
 static int
 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 
 	error = sysctl_handle_int(oidp, arg1, arg2, req);
 	if (error == 0) {
 		RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
 		RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
 		RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
 		RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
 		RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
 		RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
 	}
 	return (error);
 }
 
 #undef RANGECHK
 
 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0,
     "IP Ports");
 
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
 	&VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
 	&VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
 	&VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
 	&VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
 	&VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
 	&VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", "");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
 	CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
 	&VNET_NAME(ipport_reservedhigh), 0, "");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
 	CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
 	CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps,
 	CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
 	"allocations before switching to a sequental one");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
 	CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ipport_randomtime), 0,
 	"Minimum time to keep sequental port "
 	"allocation before switching to a random one");
 #endif /* INET */
 
 /*
  * in_pcb.c: manage the Protocol Control Blocks.
  *
  * NOTE: It is assumed that most of these functions will be called with
  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
  * functions often modify hash chains or addresses in pcbs.
  */
 
 /*
  * Initialize an inpcbinfo -- we should be able to reduce the number of
  * arguments in time.
  */
 void
 in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
     struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
     char *inpcbzone_name, uma_init inpcbzone_init, uma_fini inpcbzone_fini,
     uint32_t inpcbzone_flags, u_int hashfields)
 {
 
 	INP_INFO_LOCK_INIT(pcbinfo, name);
 	INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash");	/* XXXRW: argument? */
 	INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist");
 #ifdef VIMAGE
 	pcbinfo->ipi_vnet = curvnet;
 #endif
 	pcbinfo->ipi_listhead = listhead;
 	LIST_INIT(pcbinfo->ipi_listhead);
 	pcbinfo->ipi_count = 0;
 	pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
 	    &pcbinfo->ipi_hashmask);
 	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
 	    &pcbinfo->ipi_porthashmask);
 #ifdef PCBGROUP
 	in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
 #endif
 	pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
 	    NULL, NULL, inpcbzone_init, inpcbzone_fini, UMA_ALIGN_PTR,
 	    inpcbzone_flags);
 	uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
 	uma_zone_set_warning(pcbinfo->ipi_zone,
 	    "kern.ipc.maxsockets limit reached");
 }
 
 /*
  * Destroy an inpcbinfo.
  */
 void
 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
 {
 
 	KASSERT(pcbinfo->ipi_count == 0,
 	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));
 
 	hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
 	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
 	    pcbinfo->ipi_porthashmask);
 #ifdef PCBGROUP
 	in_pcbgroup_destroy(pcbinfo);
 #endif
 	uma_zdestroy(pcbinfo->ipi_zone);
 	INP_LIST_LOCK_DESTROY(pcbinfo);
 	INP_HASH_LOCK_DESTROY(pcbinfo);
 	INP_INFO_LOCK_DESTROY(pcbinfo);
 }
 
 /*
  * Allocate a PCB and associate it with the socket.
  * On success return with the PCB locked.
  */
 int
 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
 {
 	struct inpcb *inp;
 	int error;
 
 #ifdef INVARIANTS
 	if (pcbinfo == &V_tcbinfo) {
 		INP_INFO_RLOCK_ASSERT(pcbinfo);
 	} else {
 		INP_INFO_WLOCK_ASSERT(pcbinfo);
 	}
 #endif
 
 	error = 0;
 	inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
 	if (inp == NULL)
 		return (ENOBUFS);
 	bzero(inp, inp_zero_size);
 	inp->inp_pcbinfo = pcbinfo;
 	inp->inp_socket = so;
 	inp->inp_cred = crhold(so->so_cred);
 	inp->inp_inc.inc_fibnum = so->so_fibnum;
 #ifdef MAC
 	error = mac_inpcb_init(inp, M_NOWAIT);
 	if (error != 0)
 		goto out;
 	mac_inpcb_create(so, inp);
 #endif
 #ifdef IPSEC
 	error = ipsec_init_policy(so, &inp->inp_sp);
 	if (error != 0) {
 #ifdef MAC
 		mac_inpcb_destroy(inp);
 #endif
 		goto out;
 	}
 #endif /*IPSEC*/
 #ifdef INET6
 	if (INP_SOCKAF(so) == AF_INET6) {
 		inp->inp_vflag |= INP_IPV6PROTO;
 		if (V_ip6_v6only)
 			inp->inp_flags |= IN6P_IPV6_V6ONLY;
 	}
 #endif
 	INP_WLOCK(inp);
 	INP_LIST_WLOCK(pcbinfo);
 	LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
 	pcbinfo->ipi_count++;
 	so->so_pcb = (caddr_t)inp;
 #ifdef INET6
 	if (V_ip6_auto_flowlabel)
 		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
 #endif
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 	refcount_init(&inp->inp_refcount, 1);	/* Reference from inpcbinfo */
 	INP_LIST_WUNLOCK(pcbinfo);
 #if defined(IPSEC) || defined(MAC)
 out:
 	if (error != 0) {
 		crfree(inp->inp_cred);
 		uma_zfree(pcbinfo->ipi_zone, inp);
 	}
 #endif
 	return (error);
 }
 
 #ifdef INET
 int
 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
 {
 	int anonport, error;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
 		return (EINVAL);
 	anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0;
 	error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
 	    &inp->inp_lport, cred);
 	if (error)
 		return (error);
 	if (in_pcbinshash(inp) != 0) {
 		inp->inp_laddr.s_addr = INADDR_ANY;
 		inp->inp_lport = 0;
 		return (EAGAIN);
 	}
 	if (anonport)
 		inp->inp_flags |= INP_ANONPORT;
 	return (0);
 }
 #endif
 
 /*
  * Select a local port (number) to use.
  */
 #if defined(INET) || defined(INET6)
 int
 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
     struct ucred *cred, int lookupflags)
 {
 	struct inpcbinfo *pcbinfo;
 	struct inpcb *tmpinp;
 	unsigned short *lastport;
 	int count, dorandom, error;
 	u_short aux, first, last, lport;
 #ifdef INET
 	struct in_addr laddr;
 #endif
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	/*
 	 * Because no actual state changes occur here, a global write lock on
 	 * the pcbinfo isn't required.
 	 */
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if (inp->inp_flags & INP_HIGHPORT) {
 		first = V_ipport_hifirstauto;	/* sysctl */
 		last  = V_ipport_hilastauto;
 		lastport = &pcbinfo->ipi_lasthi;
 	} else if (inp->inp_flags & INP_LOWPORT) {
 		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0);
 		if (error)
 			return (error);
 		first = V_ipport_lowfirstauto;	/* 1023 */
 		last  = V_ipport_lowlastauto;	/* 600 */
 		lastport = &pcbinfo->ipi_lastlow;
 	} else {
 		first = V_ipport_firstauto;	/* sysctl */
 		last  = V_ipport_lastauto;
 		lastport = &pcbinfo->ipi_lastport;
 	}
 	/*
 	 * For UDP(-Lite), use random port allocation as long as the user
 	 * allows it.  For TCP (and as of yet unknown) connections,
 	 * use random port allocation only if the user allows it AND
 	 * ipport_tick() allows it.
 	 */
 	if (V_ipport_randomized &&
 		(!V_ipport_stoprandom || pcbinfo == &V_udbinfo ||
 		pcbinfo == &V_ulitecbinfo))
 		dorandom = 1;
 	else
 		dorandom = 0;
 	/*
 	 * It makes no sense to do random port allocation if
 	 * we have the only port available.
 	 */
 	if (first == last)
 		dorandom = 0;
 	/* Make sure to not include UDP(-Lite) packets in the count. */
 	if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo)
 		V_ipport_tcpallocs++;
 	/*
 	 * Instead of having two loops further down counting up or down
 	 * make sure that first is always <= last and go with only one
 	 * code path implementing all logic.
 	 */
 	if (first > last) {
 		aux = first;
 		first = last;
 		last = aux;
 	}
 
 #ifdef INET
 	/* Make the compiler happy. */
 	laddr.s_addr = 0;
 	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
 		KASSERT(laddrp != NULL, ("%s: laddrp NULL for v4 inp %p",
 		    __func__, inp));
 		laddr = *laddrp;
 	}
 #endif
 	tmpinp = NULL;	/* Make compiler happy. */
 	lport = *lportp;
 
 	if (dorandom)
 		*lastport = first + (arc4random() % (last - first));
 
 	count = last - first;
 
 	do {
 		if (count-- < 0)	/* completely used? */
 			return (EADDRNOTAVAIL);
 		++*lastport;
 		if (*lastport < first || *lastport > last)
 			*lastport = first;
 		lport = htons(*lastport);
 
 #ifdef INET6
 		if ((inp->inp_vflag & INP_IPV6) != 0)
 			tmpinp = in6_pcblookup_local(pcbinfo,
 			    &inp->in6p_laddr, lport, lookupflags, cred);
 #endif
 #if defined(INET) && defined(INET6)
 		else
 #endif
 #ifdef INET
 			tmpinp = in_pcblookup_local(pcbinfo, laddr,
 			    lport, lookupflags, cred);
 #endif
 	} while (tmpinp != NULL);
 
 #ifdef INET
 	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4)
 		laddrp->s_addr = laddr.s_addr;
 #endif
 	*lportp = lport;
 
 	return (0);
 }
 
 /*
  * Return cached socket options.
  */
 short
 inp_so_options(const struct inpcb *inp)
 {
    short so_options;
 
    so_options = 0;
 
    if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
 	   so_options |= SO_REUSEPORT;
    if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
 	   so_options |= SO_REUSEADDR;
    return (so_options);
 }
 #endif /* INET || INET6 */
 
 /*
  * Check if a new BINDMULTI socket is allowed to be created.
  *
  * ni points to the new inp.
  * oi points to the exisitng inp.
  *
  * This checks whether the existing inp also has BINDMULTI and
  * whether the credentials match.
  */
 int
 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
 {
 	/* Check permissions match */
 	if ((ni->inp_flags2 & INP_BINDMULTI) &&
 	    (ni->inp_cred->cr_uid !=
 	    oi->inp_cred->cr_uid))
 		return (0);
 
 	/* Check the existing inp has BINDMULTI set */
 	if ((ni->inp_flags2 & INP_BINDMULTI) &&
 	    ((oi->inp_flags2 & INP_BINDMULTI) == 0))
 		return (0);
 
 	/*
 	 * We're okay - either INP_BINDMULTI isn't set on ni, or
 	 * it is and it matches the checks.
 	 */
 	return (1);
 }
 
 #ifdef INET
 /*
  * Set up a bind operation on a PCB, performing port allocation
  * as required, but do not actually modify the PCB. Callers can
  * either complete the bind by setting inp_laddr/inp_lport and
  * calling in_pcbinshash(), or they can just use the resulting
  * port and address to authorise the sending of a once-off packet.
  *
  * On error, the values of *laddrp and *lportp are not changed.
  */
 int
 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
     u_short *lportp, struct ucred *cred)
 {
 	struct socket *so = inp->inp_socket;
 	struct sockaddr_in *sin;
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct in_addr laddr;
 	u_short lport = 0;
 	int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
 	int error;
 
 	/*
 	 * No state changes, so read locks are sufficient here.
 	 */
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
 		return (EADDRNOTAVAIL);
 	laddr.s_addr = *laddrp;
 	if (nam != NULL && laddr.s_addr != INADDR_ANY)
 		return (EINVAL);
 	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
 		lookupflags = INPLOOKUP_WILDCARD;
 	if (nam == NULL) {
 		if ((error = prison_local_ip4(cred, &laddr)) != 0)
 			return (error);
 	} else {
 		sin = (struct sockaddr_in *)nam;
 		if (nam->sa_len != sizeof (*sin))
 			return (EINVAL);
 #ifdef notdef
 		/*
 		 * We should check the family, but old programs
 		 * incorrectly fail to initialize it.
 		 */
 		if (sin->sin_family != AF_INET)
 			return (EAFNOSUPPORT);
 #endif
 		error = prison_local_ip4(cred, &sin->sin_addr);
 		if (error)
 			return (error);
 		if (sin->sin_port != *lportp) {
 			/* Don't allow the port to change. */
 			if (*lportp != 0)
 				return (EINVAL);
 			lport = sin->sin_port;
 		}
 		/* NB: lport is left as 0 if the port isn't being changed. */
 		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
 			/*
 			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
 			 * allow complete duplication of binding if
 			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
 			 * and a multicast address is bound on both
 			 * new and duplicated sockets.
 			 */
 			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
 				reuseport = SO_REUSEADDR|SO_REUSEPORT;
 		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
 			sin->sin_port = 0;		/* yech... */
 			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
 			/*
 			 * Is the address a local IP address? 
 			 * If INP_BINDANY is set, then the socket may be bound
 			 * to any endpoint address, local or not.
 			 */
 			if ((inp->inp_flags & INP_BINDANY) == 0 &&
 			    ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) 
 				return (EADDRNOTAVAIL);
 		}
 		laddr = sin->sin_addr;
 		if (lport) {
 			struct inpcb *t;
 			struct tcptw *tw;
 
 			/* GROSS */
 			if (ntohs(lport) <= V_ipport_reservedhigh &&
 			    ntohs(lport) >= V_ipport_reservedlow &&
 			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT,
 			    0))
 				return (EACCES);
 			if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
 			    priv_check_cred(inp->inp_cred,
 			    PRIV_NETINET_REUSEPORT, 0) != 0) {
 				t = in_pcblookup_local(pcbinfo, sin->sin_addr,
 				    lport, INPLOOKUP_WILDCARD, cred);
 	/*
 	 * XXX
 	 * This entire block sorely needs a rewrite.
 	 */
 				if (t &&
 				    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
 				    ((t->inp_flags & INP_TIMEWAIT) == 0) &&
 				    (so->so_type != SOCK_STREAM ||
 				     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
 				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
 				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
 				     (t->inp_flags2 & INP_REUSEPORT) == 0) &&
 				    (inp->inp_cred->cr_uid !=
 				     t->inp_cred->cr_uid))
 					return (EADDRINUSE);
 
 				/*
 				 * If the socket is a BINDMULTI socket, then
 				 * the credentials need to match and the
 				 * original socket also has to have been bound
 				 * with BINDMULTI.
 				 */
 				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 					return (EADDRINUSE);
 			}
 			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
 			    lport, lookupflags, cred);
 			if (t && (t->inp_flags & INP_TIMEWAIT)) {
 				/*
 				 * XXXRW: If an incpb has had its timewait
 				 * state recycled, we treat the address as
 				 * being in use (for now).  This is better
 				 * than a panic, but not desirable.
 				 */
 				tw = intotw(t);
 				if (tw == NULL ||
 				    (reuseport & tw->tw_so_options) == 0)
 					return (EADDRINUSE);
 			} else if (t &&
 			    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
 			    (reuseport & inp_so_options(t)) == 0) {
 #ifdef INET6
 				if (ntohl(sin->sin_addr.s_addr) !=
 				    INADDR_ANY ||
 				    ntohl(t->inp_laddr.s_addr) !=
 				    INADDR_ANY ||
 				    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
 				    (t->inp_vflag & INP_IPV6PROTO) == 0)
 #endif
 				return (EADDRINUSE);
 				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 					return (EADDRINUSE);
 			}
 		}
 	}
 	if (*lportp != 0)
 		lport = *lportp;
 	if (lport == 0) {
 		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
 		if (error != 0)
 			return (error);
 
 	}
 	*laddrp = laddr.s_addr;
 	*lportp = lport;
 	return (0);
 }
 
 /*
  * Connect from a socket to a specified address.
  * Both address and port must be specified in argument sin.
  * If don't have a local address for this socket yet,
  * then pick one.
  */
 int
 in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam,
     struct ucred *cred, struct mbuf *m)
 {
 	u_short lport, fport;
 	in_addr_t laddr, faddr;
 	int anonport, error;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	lport = inp->inp_lport;
 	laddr = inp->inp_laddr.s_addr;
 	anonport = (lport == 0);
 	error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
 	    NULL, cred);
 	if (error)
 		return (error);
 
 	/* Do the initial binding of the local address if required. */
 	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
 		inp->inp_lport = lport;
 		inp->inp_laddr.s_addr = laddr;
 		if (in_pcbinshash(inp) != 0) {
 			inp->inp_laddr.s_addr = INADDR_ANY;
 			inp->inp_lport = 0;
 			return (EAGAIN);
 		}
 	}
 
 	/* Commit the remaining changes. */
 	inp->inp_lport = lport;
 	inp->inp_laddr.s_addr = laddr;
 	inp->inp_faddr.s_addr = faddr;
 	inp->inp_fport = fport;
 	in_pcbrehash_mbuf(inp, m);
 
 	if (anonport)
 		inp->inp_flags |= INP_ANONPORT;
 	return (0);
 }
 
 int
 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
 {
 
 	return (in_pcbconnect_mbuf(inp, nam, cred, NULL));
 }
 
 /*
  * Do proper source address selection on an unbound socket in case
  * of connect. Take jails into account as well.
  */
 int
 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
     struct ucred *cred)
 {
 	struct ifaddr *ifa;
 	struct sockaddr *sa;
 	struct sockaddr_in *sin;
 	struct route sro;
 	int error;
 
 	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
 
 	/*
 	 * Bypass source address selection and use the primary jail IP
 	 * if requested.
 	 */
 	if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
 		return (0);
 
 	error = 0;
 	bzero(&sro, sizeof(sro));
 
 	sin = (struct sockaddr_in *)&sro.ro_dst;
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(struct sockaddr_in);
 	sin->sin_addr.s_addr = faddr->s_addr;
 
 	/*
 	 * If route is known our src addr is taken from the i/f,
 	 * else punt.
 	 *
 	 * Find out route to destination.
 	 */
 	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
 		in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum);
 
 	/*
 	 * If we found a route, use the address corresponding to
 	 * the outgoing interface.
 	 * 
 	 * Otherwise assume faddr is reachable on a directly connected
 	 * network and try to find a corresponding interface to take
 	 * the source address from.
 	 */
 	if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) {
 		struct in_ifaddr *ia;
 		struct ifnet *ifp;
 
 		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
 					inp->inp_socket->so_fibnum));
 		if (ia == NULL)
 			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
 						inp->inp_socket->so_fibnum));
 		if (ia == NULL) {
 			error = ENETUNREACH;
 			goto done;
 		}
 
 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			ifa_free(&ia->ia_ifa);
 			goto done;
 		}
 
 		ifp = ia->ia_ifp;
 		ifa_free(&ia->ia_ifa);
 		ia = NULL;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			sin = (struct sockaddr_in *)sa;
 			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 				ia = (struct in_ifaddr *)ifa;
 				break;
 			}
 		}
 		if (ia != NULL) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			IF_ADDR_RUNLOCK(ifp);
 			goto done;
 		}
 		IF_ADDR_RUNLOCK(ifp);
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 	/*
 	 * If the outgoing interface on the route found is not
 	 * a loopback interface, use the address from that interface.
 	 * In case of jails do those three steps:
 	 * 1. check if the interface address belongs to the jail. If so use it.
 	 * 2. check if we have any address on the outgoing interface
 	 *    belonging to this jail. If so use it.
 	 * 3. as a last resort return the 'default' jail address.
 	 */
 	if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) {
 		struct in_ifaddr *ia;
 		struct ifnet *ifp;
 
 		/* If not jailed, use the default returned. */
 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 			ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* Jailed. */
 		/* 1. Check if the iface address belongs to the jail. */
 		sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr;
 		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 			ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/*
 		 * 2. Check if we have any address on the outgoing interface
 		 *    belonging to this jail.
 		 */
 		ia = NULL;
 		ifp = sro.ro_rt->rt_ifp;
 		IF_ADDR_RLOCK(ifp);
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			sin = (struct sockaddr_in *)sa;
 			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 				ia = (struct in_ifaddr *)ifa;
 				break;
 			}
 		}
 		if (ia != NULL) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			IF_ADDR_RUNLOCK(ifp);
 			goto done;
 		}
 		IF_ADDR_RUNLOCK(ifp);
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 	/*
 	 * The outgoing interface is marked with 'loopback net', so a route
 	 * to ourselves is here.
 	 * Try to find the interface of the destination address and then
 	 * take the address from there. That interface is not necessarily
 	 * a loopback interface.
 	 * In case of jails, check that it is an address of the jail
 	 * and if we cannot find, fall back to the 'default' jail address.
 	 */
 	if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
 		struct sockaddr_in sain;
 		struct in_ifaddr *ia;
 
 		bzero(&sain, sizeof(struct sockaddr_in));
 		sain.sin_family = AF_INET;
 		sain.sin_len = sizeof(struct sockaddr_in);
 		sain.sin_addr.s_addr = faddr->s_addr;
 
 		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain),
 					inp->inp_socket->so_fibnum));
 		if (ia == NULL)
 			ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0,
 						inp->inp_socket->so_fibnum));
 		if (ia == NULL)
 			ia = ifatoia(ifa_ifwithaddr(sintosa(&sain)));
 
 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 			if (ia == NULL) {
 				error = ENETUNREACH;
 				goto done;
 			}
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			ifa_free(&ia->ia_ifa);
 			goto done;
 		}
 
 		/* Jailed. */
 		if (ia != NULL) {
 			struct ifnet *ifp;
 
 			ifp = ia->ia_ifp;
 			ifa_free(&ia->ia_ifa);
 			ia = NULL;
 			IF_ADDR_RLOCK(ifp);
 			TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 
 				sa = ifa->ifa_addr;
 				if (sa->sa_family != AF_INET)
 					continue;
 				sin = (struct sockaddr_in *)sa;
 				if (prison_check_ip4(cred,
 				    &sin->sin_addr) == 0) {
 					ia = (struct in_ifaddr *)ifa;
 					break;
 				}
 			}
 			if (ia != NULL) {
 				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 				IF_ADDR_RUNLOCK(ifp);
 				goto done;
 			}
 			IF_ADDR_RUNLOCK(ifp);
 		}
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 done:
 	if (sro.ro_rt != NULL)
 		RTFREE(sro.ro_rt);
 	return (error);
 }
 
 /*
  * Set up for a connect from a socket to the specified address.
  * On entry, *laddrp and *lportp should contain the current local
  * address and port for the PCB; these are updated to the values
  * that should be placed in inp_laddr and inp_lport to complete
  * the connect.
  *
  * On success, *faddrp and *fportp will be set to the remote address
  * and port. These are not updated in the error case.
  *
  * If the operation fails because the connection already exists,
  * *oinpp will be set to the PCB of that connection so that the
  * caller can decide to override it. In all other cases, *oinpp
  * is set to NULL.
  */
 int
 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
     in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
     struct inpcb **oinpp, struct ucred *cred)
 {
 	struct rm_priotracker in_ifa_tracker;
 	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
 	struct in_ifaddr *ia;
 	struct inpcb *oinp;
 	struct in_addr laddr, faddr;
 	u_short lport, fport;
 	int error;
 
 	/*
 	 * Because a global state change doesn't actually occur here, a read
 	 * lock is sufficient.
 	 */
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
 
 	if (oinpp != NULL)
 		*oinpp = NULL;
 	if (nam->sa_len != sizeof (*sin))
 		return (EINVAL);
 	if (sin->sin_family != AF_INET)
 		return (EAFNOSUPPORT);
 	if (sin->sin_port == 0)
 		return (EADDRNOTAVAIL);
 	laddr.s_addr = *laddrp;
 	lport = *lportp;
 	faddr = sin->sin_addr;
 	fport = sin->sin_port;
 
 	if (!TAILQ_EMPTY(&V_in_ifaddrhead)) {
 		/*
 		 * If the destination address is INADDR_ANY,
 		 * use the primary local address.
 		 * If the supplied address is INADDR_BROADCAST,
 		 * and the primary interface supports broadcast,
 		 * choose the broadcast address for that interface.
 		 */
 		if (faddr.s_addr == INADDR_ANY) {
 			IN_IFADDR_RLOCK(&in_ifa_tracker);
 			faddr =
 			    IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 			if (cred != NULL &&
 			    (error = prison_get_ip4(cred, &faddr)) != 0)
 				return (error);
 		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
 			IN_IFADDR_RLOCK(&in_ifa_tracker);
 			if (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
 			    IFF_BROADCAST)
 				faddr = satosin(&TAILQ_FIRST(
 				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
 			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 		}
 	}
 	if (laddr.s_addr == INADDR_ANY) {
 		error = in_pcbladdr(inp, &faddr, &laddr, cred);
 		/*
 		 * If the destination address is multicast and an outgoing
 		 * interface has been set as a multicast option, prefer the
 		 * address of that interface as our source address.
 		 */
 		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
 		    inp->inp_moptions != NULL) {
 			struct ip_moptions *imo;
 			struct ifnet *ifp;
 
 			imo = inp->inp_moptions;
 			if (imo->imo_multicast_ifp != NULL) {
 				ifp = imo->imo_multicast_ifp;
 				IN_IFADDR_RLOCK(&in_ifa_tracker);
 				TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
 					if ((ia->ia_ifp == ifp) &&
 					    (cred == NULL ||
 					    prison_check_ip4(cred,
 					    &ia->ia_addr.sin_addr) == 0))
 						break;
 				}
 				if (ia == NULL)
 					error = EADDRNOTAVAIL;
 				else {
 					laddr = ia->ia_addr.sin_addr;
 					error = 0;
 				}
 				IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 			}
 		}
 		if (error)
 			return (error);
 	}
 	oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport,
 	    laddr, lport, 0, NULL);
 	if (oinp != NULL) {
 		if (oinpp != NULL)
 			*oinpp = oinp;
 		return (EADDRINUSE);
 	}
 	if (lport == 0) {
 		error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
 		    cred);
 		if (error)
 			return (error);
 	}
 	*laddrp = laddr.s_addr;
 	*lportp = lport;
 	*faddrp = faddr.s_addr;
 	*fportp = fport;
 	return (0);
 }
 
 void
 in_pcbdisconnect(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	inp->inp_faddr.s_addr = INADDR_ANY;
 	inp->inp_fport = 0;
 	in_pcbrehash(inp);
 }
 #endif /* INET */
 
 /*
  * in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
  * For most protocols, this will be invoked immediately prior to calling
  * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
  * socket, in which case in_pcbfree() is deferred.
  */
 void
 in_pcbdetach(struct inpcb *inp)
 {
 
 	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
 
 	inp->inp_socket->so_pcb = NULL;
 	inp->inp_socket = NULL;
 }
 
 /*
  * in_pcbref() bumps the reference count on an inpcb in order to maintain
  * stability of an inpcb pointer despite the inpcb lock being released.  This
  * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
  * but where the inpcb lock may already held, or when acquiring a reference
  * via a pcbgroup.
  *
  * in_pcbref() should be used only to provide brief memory stability, and
  * must always be followed by a call to INP_WLOCK() and in_pcbrele() to
  * garbage collect the inpcb if it has been in_pcbfree()'d from another
  * context.  Until in_pcbrele() has returned that the inpcb is still valid,
  * lock and rele are the *only* safe operations that may be performed on the
  * inpcb.
  *
  * While the inpcb will not be freed, releasing the inpcb lock means that the
  * connection's state may change, so the caller should be careful to
  * revalidate any cached state on reacquiring the lock.  Drop the reference
  * using in_pcbrele().
  */
 void
 in_pcbref(struct inpcb *inp)
 {
 
 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 
 	refcount_acquire(&inp->inp_refcount);
 }
 
 /*
  * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
  * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
  * return a flag indicating whether or not the inpcb remains valid.  If it is
  * valid, we return with the inpcb lock held.
  *
  * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a
  * reference on an inpcb.  Historically more work was done here (actually, in
  * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the
  * need for the pcbinfo lock in in_pcbrele().  Deferring the free is entirely
  * about memory stability (and continued use of the write lock).
  */
 int
 in_pcbrele_rlocked(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo;
 
 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 
 	INP_RLOCK_ASSERT(inp);
 
 	if (refcount_release(&inp->inp_refcount) == 0) {
 		/*
 		 * If the inpcb has been freed, let the caller know, even if
 		 * this isn't the last reference.
 		 */
 		if (inp->inp_flags2 & INP_FREED) {
 			INP_RUNLOCK(inp);
 			return (1);
 		}
 		return (0);
 	}
 
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 
 	INP_RUNLOCK(inp);
 	pcbinfo = inp->inp_pcbinfo;
 	uma_zfree(pcbinfo->ipi_zone, inp);
 	return (1);
 }
 
 int
 in_pcbrele_wlocked(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo;
 
 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 
 	INP_WLOCK_ASSERT(inp);
 
 	if (refcount_release(&inp->inp_refcount) == 0) {
 		/*
 		 * If the inpcb has been freed, let the caller know, even if
 		 * this isn't the last reference.
 		 */
 		if (inp->inp_flags2 & INP_FREED) {
 			INP_WUNLOCK(inp);
 			return (1);
 		}
 		return (0);
 	}
 
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 
 	INP_WUNLOCK(inp);
 	pcbinfo = inp->inp_pcbinfo;
 	uma_zfree(pcbinfo->ipi_zone, inp);
 	return (1);
 }
 
 /*
  * Temporary wrapper.
  */
 int
 in_pcbrele(struct inpcb *inp)
 {
 
 	return (in_pcbrele_wlocked(inp));
 }
 
 /*
  * Unconditionally schedule an inpcb to be freed by decrementing its
  * reference count, which should occur only after the inpcb has been detached
  * from its socket.  If another thread holds a temporary reference (acquired
  * using in_pcbref()) then the free is deferred until that reference is
  * released using in_pcbrele(), but the inpcb is still unlocked.  Almost all
  * work, including removal from global lists, is done in this context, where
  * the pcbinfo lock is held.
  */
 void
 in_pcbfree(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 
 #ifdef INVARIANTS
 	if (pcbinfo == &V_tcbinfo) {
 		INP_INFO_LOCK_ASSERT(pcbinfo);
 	} else {
 		INP_INFO_WLOCK_ASSERT(pcbinfo);
 	}
 #endif
 	INP_WLOCK_ASSERT(inp);
 
 	/* XXXRW: Do as much as possible here. */
 #ifdef IPSEC
 	if (inp->inp_sp != NULL)
 		ipsec_delete_pcbpolicy(inp);
 #endif
 	INP_LIST_WLOCK(pcbinfo);
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 	in_pcbremlists(inp);
 	INP_LIST_WUNLOCK(pcbinfo);
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6PROTO) {
 		ip6_freepcbopts(inp->in6p_outputopts);
 		if (inp->in6p_moptions != NULL)
 			ip6_freemoptions(inp->in6p_moptions);
 	}
 #endif
 	if (inp->inp_options)
 		(void)m_free(inp->inp_options);
 #ifdef INET
 	if (inp->inp_moptions != NULL)
 		inp_freemoptions(inp->inp_moptions);
 #endif
+	if (inp->inp_route.ro_rt) {
+		RTFREE(inp->inp_route.ro_rt);
+		inp->inp_route.ro_rt = (struct rtentry *)NULL;
+	}
+
 	inp->inp_vflag = 0;
 	inp->inp_flags2 |= INP_FREED;
 	crfree(inp->inp_cred);
 #ifdef MAC
 	mac_inpcb_destroy(inp);
 #endif
 	if (!in_pcbrele_wlocked(inp))
 		INP_WUNLOCK(inp);
 }
 
 /*
  * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
  * port reservation, and preventing it from being returned by inpcb lookups.
  *
  * It is used by TCP to mark an inpcb as unused and avoid future packet
  * delivery or event notification when a socket remains open but TCP has
  * closed.  This might occur as a result of a shutdown()-initiated TCP close
  * or a RST on the wire, and allows the port binding to be reused while still
  * maintaining the invariant that so_pcb always points to a valid inpcb until
  * in_pcbdetach().
  *
  * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
  * in_pcbnotifyall() and in_pcbpurgeif0()?
  */
 void
 in_pcbdrop(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * XXXRW: Possibly we should protect the setting of INP_DROPPED with
 	 * the hash lock...?
 	 */
 	inp->inp_flags |= INP_DROPPED;
 	if (inp->inp_flags & INP_INHASHLIST) {
 		struct inpcbport *phd = inp->inp_phd;
 
 		INP_HASH_WLOCK(inp->inp_pcbinfo);
 		LIST_REMOVE(inp, inp_hash);
 		LIST_REMOVE(inp, inp_portlist);
 		if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
 			LIST_REMOVE(phd, phd_hash);
 			free(phd, M_PCB);
 		}
 		INP_HASH_WUNLOCK(inp->inp_pcbinfo);
 		inp->inp_flags &= ~INP_INHASHLIST;
 #ifdef PCBGROUP
 		in_pcbgroup_remove(inp);
 #endif
 	}
 }
 
 #ifdef INET
 /*
  * Common routines to return the socket addresses associated with inpcbs.
  */
 struct sockaddr *
 in_sockaddr(in_port_t port, struct in_addr *addr_p)
 {
 	struct sockaddr_in *sin;
 
 	sin = malloc(sizeof *sin, M_SONAME,
 		M_WAITOK | M_ZERO);
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(*sin);
 	sin->sin_addr = *addr_p;
 	sin->sin_port = port;
 
 	return (struct sockaddr *)sin;
 }
 
 int
 in_getsockaddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_lport;
 	addr = inp->inp_laddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in_sockaddr(port, &addr);
 	return 0;
 }
 
 int
 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_fport;
 	addr = inp->inp_faddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in_sockaddr(port, &addr);
 	return 0;
 }
 
 void
 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
     struct inpcb *(*notify)(struct inpcb *, int))
 {
 	struct inpcb *inp, *inp_temp;
 
 	INP_INFO_WLOCK(pcbinfo);
 	LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
 		INP_WLOCK(inp);
 #ifdef INET6
 		if ((inp->inp_vflag & INP_IPV4) == 0) {
 			INP_WUNLOCK(inp);
 			continue;
 		}
 #endif
 		if (inp->inp_faddr.s_addr != faddr.s_addr ||
 		    inp->inp_socket == NULL) {
 			INP_WUNLOCK(inp);
 			continue;
 		}
 		if ((*notify)(inp, errno))
 			INP_WUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(pcbinfo);
 }
 
 void
 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
 {
 	struct inpcb *inp;
 	struct ip_moptions *imo;
 	int i, gap;
 
 	INP_INFO_WLOCK(pcbinfo);
 	LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
 		INP_WLOCK(inp);
 		imo = inp->inp_moptions;
 		if ((inp->inp_vflag & INP_IPV4) &&
 		    imo != NULL) {
 			/*
 			 * Unselect the outgoing interface if it is being
 			 * detached.
 			 */
 			if (imo->imo_multicast_ifp == ifp)
 				imo->imo_multicast_ifp = NULL;
 
 			/*
 			 * Drop multicast group membership if we joined
 			 * through the interface being detached.
 			 */
 			for (i = 0, gap = 0; i < imo->imo_num_memberships;
 			    i++) {
 				if (imo->imo_membership[i]->inm_ifp == ifp) {
 					in_delmulti(imo->imo_membership[i]);
 					gap++;
 				} else if (gap != 0)
 					imo->imo_membership[i - gap] =
 					    imo->imo_membership[i];
 			}
 			imo->imo_num_memberships -= gap;
 		}
 		INP_WUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(pcbinfo);
 }
 
 /*
  * Lookup a PCB based on the local address and port.  Caller must hold the
  * hash lock.  No inpcb locks or references are acquired.
  */
 #define INP_LOOKUP_MAPPED_PCB_COST	3
 struct inpcb *
 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
     u_short lport, int lookupflags, struct ucred *cred)
 {
 	struct inpcb *inp;
 #ifdef INET6
 	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
 #else
 	int matchwild = 3;
 #endif
 	int wildcard;
 
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
 		struct inpcbhead *head;
 		/*
 		 * Look for an unconnected (wildcard foreign addr) PCB that
 		 * matches the local address and port we're looking for.
 		 */
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
 		    0, pcbinfo->ipi_hashmask)];
 		LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr == INADDR_ANY &&
 			    inp->inp_laddr.s_addr == laddr.s_addr &&
 			    inp->inp_lport == lport) {
 				/*
 				 * Found?
 				 */
 				if (cred == NULL ||
 				    prison_equal_ip4(cred->cr_prison,
 					inp->inp_cred->cr_prison))
 					return (inp);
 			}
 		}
 		/*
 		 * Not found.
 		 */
 		return (NULL);
 	} else {
 		struct inpcbporthead *porthash;
 		struct inpcbport *phd;
 		struct inpcb *match = NULL;
 		/*
 		 * Best fit PCB lookup.
 		 *
 		 * First see if this local port is in use by looking on the
 		 * port hash list.
 		 */
 		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
 		    pcbinfo->ipi_porthashmask)];
 		LIST_FOREACH(phd, porthash, phd_hash) {
 			if (phd->phd_port == lport)
 				break;
 		}
 		if (phd != NULL) {
 			/*
 			 * Port is in use by one or more PCBs. Look for best
 			 * fit.
 			 */
 			LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
 				wildcard = 0;
 				if (cred != NULL &&
 				    !prison_equal_ip4(inp->inp_cred->cr_prison,
 					cred->cr_prison))
 					continue;
 #ifdef INET6
 				/* XXX inp locking */
 				if ((inp->inp_vflag & INP_IPV4) == 0)
 					continue;
 				/*
 				 * We never select the PCB that has
 				 * INP_IPV6 flag and is bound to :: if
 				 * we have another PCB which is bound
 				 * to 0.0.0.0.  If a PCB has the
 				 * INP_IPV6 flag, then we set its cost
 				 * higher than IPv4 only PCBs.
 				 *
 				 * Note that the case only happens
 				 * when a socket is bound to ::, under
 				 * the condition that the use of the
 				 * mapped address is allowed.
 				 */
 				if ((inp->inp_vflag & INP_IPV6) != 0)
 					wildcard += INP_LOOKUP_MAPPED_PCB_COST;
 #endif
 				if (inp->inp_faddr.s_addr != INADDR_ANY)
 					wildcard++;
 				if (inp->inp_laddr.s_addr != INADDR_ANY) {
 					if (laddr.s_addr == INADDR_ANY)
 						wildcard++;
 					else if (inp->inp_laddr.s_addr != laddr.s_addr)
 						continue;
 				} else {
 					if (laddr.s_addr != INADDR_ANY)
 						wildcard++;
 				}
 				if (wildcard < matchwild) {
 					match = inp;
 					matchwild = wildcard;
 					if (matchwild == 0)
 						break;
 				}
 			}
 		}
 		return (match);
 	}
 }
 #undef INP_LOOKUP_MAPPED_PCB_COST
 
 #ifdef PCBGROUP
 /*
  * Lookup PCB in hash list, using pcbgroup tables.
  */
 static struct inpcb *
 in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
     struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
     u_int lport_arg, int lookupflags, struct ifnet *ifp)
 {
 	struct inpcbhead *head;
 	struct inpcb *inp, *tmpinp;
 	u_short fport = fport_arg, lport = lport_arg;
 
 	/*
 	 * First look for an exact match.
 	 */
 	tmpinp = NULL;
 	INP_GROUP_LOCK(pcbgroup);
 	head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
 	    pcbgroup->ipg_hashmask)];
 	LIST_FOREACH(inp, head, inp_pcbgrouphash) {
 #ifdef INET6
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV4) == 0)
 			continue;
 #endif
 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
 		    inp->inp_laddr.s_addr == laddr.s_addr &&
 		    inp->inp_fport == fport &&
 		    inp->inp_lport == lport) {
 			/*
 			 * XXX We should be able to directly return
 			 * the inp here, without any checks.
 			 * Well unless both bound with SO_REUSEPORT?
 			 */
 			if (prison_flag(inp->inp_cred, PR_IP4))
 				goto found;
 			if (tmpinp == NULL)
 				tmpinp = inp;
 		}
 	}
 	if (tmpinp != NULL) {
 		inp = tmpinp;
 		goto found;
 	}
 
 #ifdef	RSS
 	/*
 	 * For incoming connections, we may wish to do a wildcard
 	 * match for an RSS-local socket.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 #ifdef INET6
 		struct inpcb *local_wild_mapped = NULL;
 #endif
 		struct inpcb *jail_wild = NULL;
 		struct inpcbhead *head;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 
 		head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY,
 		    lport, 0, pcbgroup->ipg_hashmask)];
 		LIST_FOREACH(inp, head, inp_pcbgrouphash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
 			    inp->inp_lport != lport)
 				continue;
 
 			injail = prison_flag(inp->inp_cred, PR_IP4);
 			if (injail) {
 				if (prison_check_ip4(inp->inp_cred,
 				    &laddr) != 0)
 					continue;
 			} else {
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
 				if (injail)
 					goto found;
 				else
 					local_exact = inp;
 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 #ifdef INET6
 				/* XXX inp locking, NULL check */
 				if (inp->inp_vflag & INP_IPV6PROTO)
 					local_wild_mapped = inp;
 				else
 #endif
 					if (injail)
 						jail_wild = inp;
 					else
 						local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 
 		inp = jail_wild;
 		if (inp == NULL)
 			inp = local_exact;
 		if (inp == NULL)
 			inp = local_wild;
 #ifdef INET6
 		if (inp == NULL)
 			inp = local_wild_mapped;
 #endif
 		if (inp != NULL)
 			goto found;
 	}
 #endif
 
 	/*
 	 * Then look for a wildcard match, if requested.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 #ifdef INET6
 		struct inpcb *local_wild_mapped = NULL;
 #endif
 		struct inpcb *jail_wild = NULL;
 		struct inpcbhead *head;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 		head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport,
 		    0, pcbinfo->ipi_wildmask)];
 		LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
 			    inp->inp_lport != lport)
 				continue;
 
 			injail = prison_flag(inp->inp_cred, PR_IP4);
 			if (injail) {
 				if (prison_check_ip4(inp->inp_cred,
 				    &laddr) != 0)
 					continue;
 			} else {
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
 				if (injail)
 					goto found;
 				else
 					local_exact = inp;
 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 #ifdef INET6
 				/* XXX inp locking, NULL check */
 				if (inp->inp_vflag & INP_IPV6PROTO)
 					local_wild_mapped = inp;
 				else
 #endif
 					if (injail)
 						jail_wild = inp;
 					else
 						local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 		inp = jail_wild;
 		if (inp == NULL)
 			inp = local_exact;
 		if (inp == NULL)
 			inp = local_wild;
 #ifdef INET6
 		if (inp == NULL)
 			inp = local_wild_mapped;
 #endif
 		if (inp != NULL)
 			goto found;
 	} /* if (lookupflags & INPLOOKUP_WILDCARD) */
 	INP_GROUP_UNLOCK(pcbgroup);
 	return (NULL);
 
 found:
 	in_pcbref(inp);
 	INP_GROUP_UNLOCK(pcbgroup);
 	if (lookupflags & INPLOOKUP_WLOCKPCB) {
 		INP_WLOCK(inp);
 		if (in_pcbrele_wlocked(inp))
 			return (NULL);
 	} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
 		INP_RLOCK(inp);
 		if (in_pcbrele_rlocked(inp))
 			return (NULL);
 	} else
 		panic("%s: locking bug", __func__);
 	return (inp);
 }
 #endif /* PCBGROUP */
 
 /*
  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
  * that the caller has locked the hash list, and will not perform any further
  * locking or reference operations on either the hash list or the connection.
  */
 static struct inpcb *
 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
     struct ifnet *ifp)
 {
 	struct inpcbhead *head;
 	struct inpcb *inp, *tmpinp;
 	u_short fport = fport_arg, lport = lport_arg;
 
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	/*
 	 * First look for an exact match.
 	 */
 	tmpinp = NULL;
 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
 	    pcbinfo->ipi_hashmask)];
 	LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV4) == 0)
 			continue;
 #endif
 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
 		    inp->inp_laddr.s_addr == laddr.s_addr &&
 		    inp->inp_fport == fport &&
 		    inp->inp_lport == lport) {
 			/*
 			 * XXX We should be able to directly return
 			 * the inp here, without any checks.
 			 * Well unless both bound with SO_REUSEPORT?
 			 */
 			if (prison_flag(inp->inp_cred, PR_IP4))
 				return (inp);
 			if (tmpinp == NULL)
 				tmpinp = inp;
 		}
 	}
 	if (tmpinp != NULL)
 		return (tmpinp);
 
 	/*
 	 * Then look for a wildcard match, if requested.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 #ifdef INET6
 		struct inpcb *local_wild_mapped = NULL;
 #endif
 		struct inpcb *jail_wild = NULL;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
 		    0, pcbinfo->ipi_hashmask)];
 		LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
 			    inp->inp_lport != lport)
 				continue;
 
 			injail = prison_flag(inp->inp_cred, PR_IP4);
 			if (injail) {
 				if (prison_check_ip4(inp->inp_cred,
 				    &laddr) != 0)
 					continue;
 			} else {
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
 				if (injail)
 					return (inp);
 				else
 					local_exact = inp;
 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 #ifdef INET6
 				/* XXX inp locking, NULL check */
 				if (inp->inp_vflag & INP_IPV6PROTO)
 					local_wild_mapped = inp;
 				else
 #endif
 					if (injail)
 						jail_wild = inp;
 					else
 						local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 		if (jail_wild != NULL)
 			return (jail_wild);
 		if (local_exact != NULL)
 			return (local_exact);
 		if (local_wild != NULL)
 			return (local_wild);
 #ifdef INET6
 		if (local_wild_mapped != NULL)
 			return (local_wild_mapped);
 #endif
 	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
 
 	return (NULL);
 }
 
 /*
  * Lookup PCB in hash list, using pcbinfo tables.  This variation locks the
  * hash list lock, and will return the inpcb locked (i.e., requires
  * INPLOOKUP_LOCKPCB).
  */
 static struct inpcb *
 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
     struct ifnet *ifp)
 {
 	struct inpcb *inp;
 
 	INP_HASH_RLOCK(pcbinfo);
 	inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
 	    (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp);
 	if (inp != NULL) {
 		in_pcbref(inp);
 		INP_HASH_RUNLOCK(pcbinfo);
 		if (lookupflags & INPLOOKUP_WLOCKPCB) {
 			INP_WLOCK(inp);
 			if (in_pcbrele_wlocked(inp))
 				return (NULL);
 		} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
 			INP_RLOCK(inp);
 			if (in_pcbrele_rlocked(inp))
 				return (NULL);
 		} else
 			panic("%s: locking bug", __func__);
 	} else
 		INP_HASH_RUNLOCK(pcbinfo);
 	return (inp);
 }
 
 /*
  * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
  * from which a pre-calculated hash value may be extracted.
  *
  * Possibly more of this logic should be in in_pcbgroup.c.
  */
 struct inpcb *
 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
     struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
 {
 #if defined(PCBGROUP) && !defined(RSS)
 	struct inpcbgroup *pcbgroup;
 #endif
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 	/*
 	 * When not using RSS, use connection groups in preference to the
 	 * reservation table when looking up 4-tuples.  When using RSS, just
 	 * use the reservation table, due to the cost of the Toeplitz hash
 	 * in software.
 	 *
 	 * XXXRW: This policy belongs in the pcbgroup code, as in principle
 	 * we could be doing RSS with a non-Toeplitz hash that is affordable
 	 * in software.
 	 */
 #if defined(PCBGROUP) && !defined(RSS)
 	if (in_pcbgroup_enabled(pcbinfo)) {
 		pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
 		    fport);
 		return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
 		    laddr, lport, lookupflags, ifp));
 	}
 #endif
 	return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp));
 }
 
 struct inpcb *
 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
     struct ifnet *ifp, struct mbuf *m)
 {
 #ifdef PCBGROUP
 	struct inpcbgroup *pcbgroup;
 #endif
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 #ifdef PCBGROUP
 	/*
 	 * If we can use a hardware-generated hash to look up the connection
 	 * group, use that connection group to find the inpcb.  Otherwise
 	 * fall back on a software hash -- or the reservation table if we're
 	 * using RSS.
 	 *
 	 * XXXRW: As above, that policy belongs in the pcbgroup code.
 	 */
 	if (in_pcbgroup_enabled(pcbinfo) &&
 	    !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) {
 		pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
 		    m->m_pkthdr.flowid);
 		if (pcbgroup != NULL)
 			return (in_pcblookup_group(pcbinfo, pcbgroup, faddr,
 			    fport, laddr, lport, lookupflags, ifp));
 #ifndef RSS
 		pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
 		    fport);
 		return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
 		    laddr, lport, lookupflags, ifp));
 #endif
 	}
 #endif
 	return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp));
 }
 #endif /* INET */
 
 /*
  * Insert PCB onto various hash lists.
  */
 static int
 in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update)
 {
 	struct inpcbhead *pcbhash;
 	struct inpcbporthead *pcbporthash;
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct inpcbport *phd;
 	u_int32_t hashkey_faddr;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
 	    ("in_pcbinshash: INP_INHASHLIST"));
 
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6)
 		hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
 	else
 #endif
 	hashkey_faddr = inp->inp_faddr.s_addr;
 
 	pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
 		 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 
 	pcbporthash = &pcbinfo->ipi_porthashbase[
 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
 
 	/*
 	 * Go through port list and look for a head for this lport.
 	 */
 	LIST_FOREACH(phd, pcbporthash, phd_hash) {
 		if (phd->phd_port == inp->inp_lport)
 			break;
 	}
 	/*
 	 * If none exists, malloc one and tack it on.
 	 */
 	if (phd == NULL) {
 		phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT);
 		if (phd == NULL) {
 			return (ENOBUFS); /* XXX */
 		}
 		phd->phd_port = inp->inp_lport;
 		LIST_INIT(&phd->phd_pcblist);
 		LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
 	}
 	inp->inp_phd = phd;
 	LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
 	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
 	inp->inp_flags |= INP_INHASHLIST;
 #ifdef PCBGROUP
 	if (do_pcbgroup_update)
 		in_pcbgroup_update(inp);
 #endif
 	return (0);
 }
 
 /*
  * For now, there are two public interfaces to insert an inpcb into the hash
  * lists -- one that does update pcbgroups, and one that doesn't.  The latter
  * is used only in the TCP syncache, where in_pcbinshash is called before the
  * full 4-tuple is set for the inpcb, and we don't want to install in the
  * pcbgroup until later.
  *
  * XXXRW: This seems like a misfeature.  in_pcbinshash should always update
  * connection groups, and partially initialised inpcbs should not be exposed
  * to either reservation hash tables or pcbgroups.
  */
 int
 in_pcbinshash(struct inpcb *inp)
 {
 
 	return (in_pcbinshash_internal(inp, 1));
 }
 
 int
 in_pcbinshash_nopcbgroup(struct inpcb *inp)
 {
 
 	return (in_pcbinshash_internal(inp, 0));
 }
 
 /*
  * Move PCB to the proper hash bucket when { faddr, fport } have  been
  * changed. NOTE: This does not handle the case of the lport changing (the
  * hashed port list would have to be updated as well), so the lport must
  * not change after in_pcbinshash() has been called.
  */
 void
 in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct inpcbhead *head;
 	u_int32_t hashkey_faddr;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	KASSERT(inp->inp_flags & INP_INHASHLIST,
 	    ("in_pcbrehash: !INP_INHASHLIST"));
 
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6)
 		hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
 	else
 #endif
 	hashkey_faddr = inp->inp_faddr.s_addr;
 
 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
 		inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 
 	LIST_REMOVE(inp, inp_hash);
 	LIST_INSERT_HEAD(head, inp, inp_hash);
 
 #ifdef PCBGROUP
 	if (m != NULL)
 		in_pcbgroup_update_mbuf(inp, m);
 	else
 		in_pcbgroup_update(inp);
 #endif
 }
 
 void
 in_pcbrehash(struct inpcb *inp)
 {
 
 	in_pcbrehash_mbuf(inp, NULL);
 }
 
 /*
  * Remove PCB from various lists.
  */
 static void
 in_pcbremlists(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 
 #ifdef INVARIANTS
 	if (pcbinfo == &V_tcbinfo) {
 		INP_INFO_RLOCK_ASSERT(pcbinfo);
 	} else {
 		INP_INFO_WLOCK_ASSERT(pcbinfo);
 	}
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 	INP_LIST_WLOCK_ASSERT(pcbinfo);
 
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 	if (inp->inp_flags & INP_INHASHLIST) {
 		struct inpcbport *phd = inp->inp_phd;
 
 		INP_HASH_WLOCK(pcbinfo);
 		LIST_REMOVE(inp, inp_hash);
 		LIST_REMOVE(inp, inp_portlist);
 		if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
 			LIST_REMOVE(phd, phd_hash);
 			free(phd, M_PCB);
 		}
 		INP_HASH_WUNLOCK(pcbinfo);
 		inp->inp_flags &= ~INP_INHASHLIST;
 	}
 	LIST_REMOVE(inp, inp_list);
 	pcbinfo->ipi_count--;
 #ifdef PCBGROUP
 	in_pcbgroup_remove(inp);
 #endif
+}
+
+/*
+ * Check for alternatives when higher level complains
+ * about service problems.  For now, invalidate cached
+ * routing information.  If the route was created dynamically
+ * (by a redirect), time to try a default gateway again.
+ */
+void
+in_losing(struct inpcb *inp)
+{
+
+	if (inp->inp_route.ro_rt) {
+		RTFREE(inp->inp_route.ro_rt);
+		inp->inp_route.ro_rt = (struct rtentry *)NULL;
+	}
+	return;
 }
 
 /*
  * A set label operation has occurred at the socket layer, propagate the
  * label change into the in_pcb for the socket.
  */
 void
 in_pcbsosetlabel(struct socket *so)
 {
 #ifdef MAC
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));
 
 	INP_WLOCK(inp);
 	SOCK_LOCK(so);
 	mac_inpcb_sosetlabel(so, inp);
 	SOCK_UNLOCK(so);
 	INP_WUNLOCK(inp);
 #endif
 }
 
 /*
  * ipport_tick runs once per second, determining if random port allocation
  * should be continued.  If more than ipport_randomcps ports have been
  * allocated in the last second, then we return to sequential port
  * allocation. We return to random allocation only once we drop below
  * ipport_randomcps for at least ipport_randomtime seconds.
  */
 static void
 ipport_tick(void *xtp)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);	/* XXX appease INVARIANTS here */
 		if (V_ipport_tcpallocs <=
 		    V_ipport_tcplastcount + V_ipport_randomcps) {
 			if (V_ipport_stoprandom > 0)
 				V_ipport_stoprandom--;
 		} else
 			V_ipport_stoprandom = V_ipport_randomtime;
 		V_ipport_tcplastcount = V_ipport_tcpallocs;
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 	callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
 }
 
 static void
 ip_fini(void *xtp)
 {
 
 	callout_stop(&ipport_tick_callout);
 }
 
 /* 
  * The ipport_callout should start running at about the time we attach the
  * inet or inet6 domains.
  */
 static void
 ipport_tick_init(const void *unused __unused)
 {
 
 	/* Start ipport_tick. */
 	callout_init(&ipport_tick_callout, 1);
 	callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
 		SHUTDOWN_PRI_DEFAULT);
 }
 SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, 
     ipport_tick_init, NULL);
 
 void
 inp_wlock(struct inpcb *inp)
 {
 
 	INP_WLOCK(inp);
 }
 
 void
 inp_wunlock(struct inpcb *inp)
 {
 
 	INP_WUNLOCK(inp);
 }
 
 void
 inp_rlock(struct inpcb *inp)
 {
 
 	INP_RLOCK(inp);
 }
 
 void
 inp_runlock(struct inpcb *inp)
 {
 
 	INP_RUNLOCK(inp);
 }
 
 #ifdef INVARIANTS
 void
 inp_lock_assert(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 }
 
 void
 inp_unlock_assert(struct inpcb *inp)
 {
 
 	INP_UNLOCK_ASSERT(inp);
 }
 #endif
 
 void
 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
 {
 	struct inpcb *inp;
 
 	INP_INFO_WLOCK(&V_tcbinfo);
 	LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
 		INP_WLOCK(inp);
 		func(inp, arg);
 		INP_WUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 }
 
 struct socket *
 inp_inpcbtosocket(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	return (inp->inp_socket);
 }
 
 struct tcpcb *
 inp_inpcbtotcpcb(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	return ((struct tcpcb *)inp->inp_ppcb);
 }
 
 int
 inp_ip_tos_get(const struct inpcb *inp)
 {
 
 	return (inp->inp_ip_tos);
 }
 
 void
 inp_ip_tos_set(struct inpcb *inp, int val)
 {
 
 	inp->inp_ip_tos = val;
 }
 
 void
 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
     uint32_t *faddr, uint16_t *fp)
 {
 
 	INP_LOCK_ASSERT(inp);
 	*laddr = inp->inp_laddr.s_addr;
 	*faddr = inp->inp_faddr.s_addr;
 	*lp = inp->inp_lport;
 	*fp = inp->inp_fport;
 }
 
 struct inpcb *
 so_sotoinpcb(struct socket *so)
 {
 
 	return (sotoinpcb(so));
 }
 
 struct tcpcb *
 so_sototcpcb(struct socket *so)
 {
 
 	return (sototcpcb(so));
 }
 
 #ifdef DDB
 static void
 db_print_indent(int indent)
 {
 	int i;
 
 	for (i = 0; i < indent; i++)
 		db_printf(" ");
 }
 
 static void
 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
 {
 	char faddr_str[48], laddr_str[48];
 
 	db_print_indent(indent);
 	db_printf("%s at %p\n", name, inc);
 
 	indent += 2;
 
 #ifdef INET6
 	if (inc->inc_flags & INC_ISIPV6) {
 		/* IPv6. */
 		ip6_sprintf(laddr_str, &inc->inc6_laddr);
 		ip6_sprintf(faddr_str, &inc->inc6_faddr);
 	} else
 #endif
 	{
 		/* IPv4. */
 		inet_ntoa_r(inc->inc_laddr, laddr_str);
 		inet_ntoa_r(inc->inc_faddr, faddr_str);
 	}
 	db_print_indent(indent);
 	db_printf("inc_laddr %s   inc_lport %u\n", laddr_str,
 	    ntohs(inc->inc_lport));
 	db_print_indent(indent);
 	db_printf("inc_faddr %s   inc_fport %u\n", faddr_str,
 	    ntohs(inc->inc_fport));
 }
 
 static void
 db_print_inpflags(int inp_flags)
 {
 	int comma;
 
 	comma = 0;
 	if (inp_flags & INP_RECVOPTS) {
 		db_printf("%sINP_RECVOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVRETOPTS) {
 		db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVDSTADDR) {
 		db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_HDRINCL) {
 		db_printf("%sINP_HDRINCL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_HIGHPORT) {
 		db_printf("%sINP_HIGHPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_LOWPORT) {
 		db_printf("%sINP_LOWPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_ANONPORT) {
 		db_printf("%sINP_ANONPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVIF) {
 		db_printf("%sINP_RECVIF", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_MTUDISC) {
 		db_printf("%sINP_MTUDISC", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVTTL) {
 		db_printf("%sINP_RECVTTL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_DONTFRAG) {
 		db_printf("%sINP_DONTFRAG", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVTOS) {
 		db_printf("%sINP_RECVTOS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_IPV6_V6ONLY) {
 		db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_PKTINFO) {
 		db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_HOPLIMIT) {
 		db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_HOPOPTS) {
 		db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_DSTOPTS) {
 		db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_RTHDR) {
 		db_printf("%sIN6P_RTHDR", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_RTHDRDSTOPTS) {
 		db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_TCLASS) {
 		db_printf("%sIN6P_TCLASS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_AUTOFLOWLABEL) {
 		db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_TIMEWAIT) {
 		db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & INP_ONESBCAST) {
 		db_printf("%sINP_ONESBCAST", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & INP_DROPPED) {
 		db_printf("%sINP_DROPPED", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & INP_SOCKREF) {
 		db_printf("%sINP_SOCKREF", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & IN6P_RFC2292) {
 		db_printf("%sIN6P_RFC2292", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_MTU) {
 		db_printf("IN6P_MTU%s", comma ? ", " : "");
 		comma = 1;
 	}
 }
 
 static void
 db_print_inpvflag(u_char inp_vflag)
 {
 	int comma;
 
 	comma = 0;
 	if (inp_vflag & INP_IPV4) {
 		db_printf("%sINP_IPV4", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_vflag & INP_IPV6) {
 		db_printf("%sINP_IPV6", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_vflag & INP_IPV6PROTO) {
 		db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
 		comma  = 1;
 	}
 }
 
 static void
 db_print_inpcb(struct inpcb *inp, const char *name, int indent)
 {
 
 	db_print_indent(indent);
 	db_printf("%s at %p\n", name, inp);
 
 	indent += 2;
 
 	db_print_indent(indent);
 	db_printf("inp_flow: 0x%x\n", inp->inp_flow);
 
 	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
 
 	db_print_indent(indent);
 	db_printf("inp_ppcb: %p   inp_pcbinfo: %p   inp_socket: %p\n",
 	    inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
 
 	db_print_indent(indent);
 	db_printf("inp_label: %p   inp_flags: 0x%x (",
 	   inp->inp_label, inp->inp_flags);
 	db_print_inpflags(inp->inp_flags);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("inp_sp: %p   inp_vflag: 0x%x (", inp->inp_sp,
 	    inp->inp_vflag);
 	db_print_inpvflag(inp->inp_vflag);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
 	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
 
 	db_print_indent(indent);
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6) {
 		db_printf("in6p_options: %p   in6p_outputopts: %p   "
 		    "in6p_moptions: %p\n", inp->in6p_options,
 		    inp->in6p_outputopts, inp->in6p_moptions);
 		db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
 		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
 		    inp->in6p_hops);
 	} else
 #endif
 	{
 		db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
 		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
 		    inp->inp_options, inp->inp_moptions);
 	}
 
 	db_print_indent(indent);
 	db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
 	    (uintmax_t)inp->inp_gencnt);
 }
 
 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
 {
 	struct inpcb *inp;
 
 	if (!have_addr) {
 		db_printf("usage: show inpcb <addr>\n");
 		return;
 	}
 	inp = (struct inpcb *)addr;
 
 	db_print_inpcb(inp, "inpcb", 0);
 }
 #endif /* DDB */
Index: head/sys/netinet/in_pcb.h
===================================================================
--- head/sys/netinet/in_pcb.h	(revision 297224)
+++ head/sys/netinet/in_pcb.h	(revision 297225)
@@ -1,733 +1,741 @@
 /*-
  * Copyright (c) 1982, 1986, 1990, 1993
  *	The Regents of the University of California.
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_pcb.h	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #ifndef _NETINET_IN_PCB_H_
 #define _NETINET_IN_PCB_H_
 
 #include <sys/queue.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 #include <sys/_rwlock.h>
+#include <net/route.h>
 
 #ifdef _KERNEL
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <net/vnet.h>
 #include <vm/uma.h>
 #endif
 
 #define	in6pcb		inpcb	/* for KAME src sync over BSD*'s */
 #define	in6p_sp		inp_sp	/* for KAME src sync over BSD*'s */
 struct inpcbpolicy;
 
 /*
  * struct inpcb is the common protocol control block structure used in most
  * IP transport protocols.
  *
  * Pointers to local and foreign host table entries, local and foreign socket
  * numbers, and pointers up (to a socket structure) and down (to a
  * protocol-specific control block) are stored here.
  */
 LIST_HEAD(inpcbhead, inpcb);
 LIST_HEAD(inpcbporthead, inpcbport);
 typedef	u_quad_t	inp_gen_t;
 
 /*
  * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet.
  * So, AF_INET6 null laddr is also used as AF_INET null laddr, by utilizing
  * the following structure.
  */
 struct in_addr_4in6 {
 	u_int32_t	ia46_pad32[3];
 	struct	in_addr	ia46_addr4;
 };
 
 /*
  * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553.  in_conninfo has
  * some extra padding to accomplish this.
  * NOTE 2: tcp_syncache.c uses first 5 32-bit words, which identify fport,
  * lport, faddr to generate hash, so these fields shouldn't be moved.
  */
 struct in_endpoints {
 	u_int16_t	ie_fport;		/* foreign port */
 	u_int16_t	ie_lport;		/* local port */
 	/* protocol dependent part, local and foreign addr */
 	union {
 		/* foreign host table entry */
 		struct	in_addr_4in6 ie46_foreign;
 		struct	in6_addr ie6_foreign;
 	} ie_dependfaddr;
 	union {
 		/* local host table entry */
 		struct	in_addr_4in6 ie46_local;
 		struct	in6_addr ie6_local;
 	} ie_dependladdr;
 	u_int32_t	ie6_zoneid;		/* scope zone id */
 };
 #define	ie_faddr	ie_dependfaddr.ie46_foreign.ia46_addr4
 #define	ie_laddr	ie_dependladdr.ie46_local.ia46_addr4
 #define	ie6_faddr	ie_dependfaddr.ie6_foreign
 #define	ie6_laddr	ie_dependladdr.ie6_local
 
 /*
  * XXX The defines for inc_* are hacks and should be changed to direct
  * references.
  */
 struct in_conninfo {
 	u_int8_t	inc_flags;
 	u_int8_t	inc_len;
 	u_int16_t	inc_fibnum;	/* XXX was pad, 16 bits is plenty */
 	/* protocol dependent part */
 	struct	in_endpoints inc_ie;
 };
 
 /*
  * Flags for inc_flags.
  */
 #define	INC_ISIPV6	0x01
 
 #define inc_isipv6	inc_flags	/* temp compatability */
 #define	inc_fport	inc_ie.ie_fport
 #define	inc_lport	inc_ie.ie_lport
 #define	inc_faddr	inc_ie.ie_faddr
 #define	inc_laddr	inc_ie.ie_laddr
 #define	inc6_faddr	inc_ie.ie6_faddr
 #define	inc6_laddr	inc_ie.ie6_laddr
 #define	inc6_zoneid	inc_ie.ie6_zoneid
 
 struct	icmp6_filter;
 
 /*-
  * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 and
  * IPv6 sockets.  In the case of TCP and UDP, further per-connection state is
  * hung off of inp_ppcb most of the time.  Almost all fields of struct inpcb
  * are static after creation or protected by a per-inpcb rwlock, inp_lock.  A
  * few fields are protected by multiple locks as indicated in the locking notes
  * below.  For these fields, all of the listed locks must be write-locked for
  * any modifications.  However, these fields can be safely read while any one of
  * the listed locks are read-locked.  This model can permit greater concurrency
  * for read operations.  For example, connections can be looked up while only
  * holding a read lock on the global pcblist lock.  This is important for
  * performance when attempting to find the connection for a packet given its IP
  * and port tuple.
  *
  * One noteworthy exception is that the global pcbinfo lock follows a different
  * set of rules in relation to the inp_list field.  Rather than being
  * write-locked for modifications and read-locked for list iterations, it must
  * be read-locked during modifications and write-locked during list iterations.
  * This ensures that the relatively rare global list iterations safely walk a
  * stable snapshot of connections while allowing more common list modifications
  * to safely grab the pcblist lock just while adding or removing a connection
  * from the global list.
  *
  * Key:
  * (c) - Constant after initialization
  * (g) - Protected by the pcbgroup lock
  * (i) - Protected by the inpcb lock
  * (p) - Protected by the pcbinfo lock for the inpcb
  * (l) - Protected by the pcblist lock for the inpcb
  * (h) - Protected by the pcbhash lock for the inpcb
  * (s) - Protected by another subsystem's locks
  * (x) - Undefined locking
  *
  * A few other notes:
  *
  * When a read lock is held, stability of the field is guaranteed; to write
  * to a field, a write lock must generally be held.
  *
  * netinet/netinet6-layer code should not assume that the inp_socket pointer
  * is safe to dereference without inp_lock being held, even for protocols
  * other than TCP (where the inpcb persists during TIMEWAIT even after the
  * socket has been freed), or there may be close(2)-related races.
  *
  * The inp_vflag field is overloaded, and would otherwise ideally be (c).
  *
  * TODO:  Currently only the TCP stack is leveraging the global pcbinfo lock
  * read-lock usage during modification, this model can be applied to other
  * protocols (especially SCTP).
  */
 struct inpcb {
 	LIST_ENTRY(inpcb) inp_hash;	/* (h/i) hash list */
 	LIST_ENTRY(inpcb) inp_pcbgrouphash;	/* (g/i) hash list */
 	LIST_ENTRY(inpcb) inp_list;	/* (p/l) list for all PCBs for proto */
 	                                /* (p[w]) for list iteration */
 	                                /* (p[r]/l) for addition/removal */
 	void	*inp_ppcb;		/* (i) pointer to per-protocol pcb */
 	struct	inpcbinfo *inp_pcbinfo;	/* (c) PCB list info */
 	struct	inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */
 	LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */
 	struct	socket *inp_socket;	/* (i) back pointer to socket */
 	struct	ucred	*inp_cred;	/* (c) cache of socket cred */
 	u_int32_t inp_flow;		/* (i) IPv6 flow information */
 	int	inp_flags;		/* (i) generic IP/datagram flags */
 	int	inp_flags2;		/* (i) generic IP/datagram flags #2*/
 	u_char	inp_vflag;		/* (i) IP version flag (v4/v6) */
 	u_char	inp_ip_ttl;		/* (i) time to live proto */
 	u_char	inp_ip_p;		/* (c) protocol proto */
 	u_char	inp_ip_minttl;		/* (i) minimum TTL or drop */
 	uint32_t inp_flowid;		/* (x) flow id / queue id */
 	u_int	inp_refcount;		/* (i) refcount */
 	void	*inp_pspare[5];		/* (x) route caching / general use */
 	uint32_t inp_flowtype;		/* (x) M_HASHTYPE value */
 	uint32_t inp_rss_listen_bucket;	/* (x) overridden RSS listen bucket */
 	u_int	inp_ispare[4];		/* (x) route caching / user cookie /
 					 *     general use */
 
 	/* Local and foreign ports, local and foreign addr. */
 	struct	in_conninfo inp_inc;	/* (i) list for PCB's local port */
 
 	/* MAC and IPSEC policy information. */
 	struct	label *inp_label;	/* (i) MAC label */
 	struct	inpcbpolicy *inp_sp;    /* (s) for IPSEC */
 
 	/* Protocol-dependent part; options. */
 	struct {
 		u_char	inp4_ip_tos;		/* (i) type of service proto */
 		struct	mbuf *inp4_options;	/* (i) IP options */
 		struct	ip_moptions *inp4_moptions; /* (i) IP mcast options */
 	} inp_depend4;
 	struct {
 		/* (i) IP options */
 		struct	mbuf *inp6_options;
 		/* (i) IP6 options for outgoing packets */
 		struct	ip6_pktopts *inp6_outputopts;
 		/* (i) IP multicast options */
 		struct	ip6_moptions *inp6_moptions;
 		/* (i) ICMPv6 code type filter */
 		struct	icmp6_filter *inp6_icmp6filt;
 		/* (i) IPV6_CHECKSUM setsockopt */
 		int	inp6_cksum;
 		short	inp6_hops;
 	} inp_depend6;
 	LIST_ENTRY(inpcb) inp_portlist;	/* (i/h) */
 	struct	inpcbport *inp_phd;	/* (i/h) head of this list */
 #define inp_zero_size offsetof(struct inpcb, inp_gencnt)
 	inp_gen_t	inp_gencnt;	/* (c) generation count */
 	struct llentry	*inp_lle;	/* cached L2 information */
-	struct rtentry	*inp_rt;	/* cached L3 information */
 	struct rwlock	inp_lock;
+	rt_gen_t	inp_rt_cookie;	/* generation for route entry */
+	union {				/* cached L3 information */
+		struct route inpu_route;
+		struct route_in6 inpu_route6;
+	} inp_rtu;
+#define inp_route inp_rtu.inpu_route
+#define inp_route6 inp_rtu.inpu_route6
 };
 #define	inp_fport	inp_inc.inc_fport
 #define	inp_lport	inp_inc.inc_lport
 #define	inp_faddr	inp_inc.inc_faddr
 #define	inp_laddr	inp_inc.inc_laddr
 #define	inp_ip_tos	inp_depend4.inp4_ip_tos
 #define	inp_options	inp_depend4.inp4_options
 #define	inp_moptions	inp_depend4.inp4_moptions
 
 #define	in6p_faddr	inp_inc.inc6_faddr
 #define	in6p_laddr	inp_inc.inc6_laddr
 #define	in6p_zoneid	inp_inc.inc6_zoneid
 #define	in6p_hops	inp_depend6.inp6_hops	/* default hop limit */
 #define	in6p_flowinfo	inp_flow
 #define	in6p_options	inp_depend6.inp6_options
 #define	in6p_outputopts	inp_depend6.inp6_outputopts
 #define	in6p_moptions	inp_depend6.inp6_moptions
 #define	in6p_icmp6filt	inp_depend6.inp6_icmp6filt
 #define	in6p_cksum	inp_depend6.inp6_cksum
 
 #define	inp_vnet	inp_pcbinfo->ipi_vnet
 
 /*
  * The range of the generation count, as used in this implementation, is 9e19.
  * We would have to create 300 billion connections per second for this number
  * to roll over in a year.  This seems sufficiently unlikely that we simply
  * don't concern ourselves with that possibility.
  */
 
 /*
  * Interface exported to userland by various protocols which use inpcbs.  Hack
  * alert -- only define if struct xsocket is in scope.
  */
 #ifdef _SYS_SOCKETVAR_H_
 struct	xinpcb {
 	size_t	xi_len;		/* length of this structure */
 	struct	inpcb xi_inp;
 	struct	xsocket xi_socket;
 	u_quad_t	xi_alignment_hack;
 };
 
 struct	xinpgen {
 	size_t	xig_len;	/* length of this structure */
 	u_int	xig_count;	/* number of PCBs at this time */
 	inp_gen_t xig_gen;	/* generation count at this time */
 	so_gen_t xig_sogen;	/* socket generation count at this time */
 };
 #endif /* _SYS_SOCKETVAR_H_ */
 
 struct inpcbport {
 	LIST_ENTRY(inpcbport) phd_hash;
 	struct inpcbhead phd_pcblist;
 	u_short phd_port;
 };
 
 /*-
  * Global data structure for each high-level protocol (UDP, TCP, ...) in both
  * IPv4 and IPv6.  Holds inpcb lists and information for managing them.
  *
  * Each pcbinfo is protected by three locks: ipi_lock, ipi_hash_lock and
  * ipi_list_lock:
  *  - ipi_lock covering the global pcb list stability during loop iteration,
  *  - ipi_hash_lock covering the hashed lookup tables,
  *  - ipi_list_lock covering mutable global fields (such as the global
  *    pcb list)
  *
  * The lock order is:
  *
  *    ipi_lock (before)
  *        inpcb locks (before)
  *            ipi_list locks (before)
  *                {ipi_hash_lock, pcbgroup locks}
  *
  * Locking key:
  *
  * (c) Constant or nearly constant after initialisation
  * (g) Locked by ipi_lock
  * (l) Locked by ipi_list_lock
  * (h) Read using either ipi_hash_lock or inpcb lock; write requires both
  * (p) Protected by one or more pcbgroup locks
  * (x) Synchronisation properties poorly defined
  */
 struct inpcbinfo {
 	/*
 	 * Global lock protecting full inpcb list traversal
 	 */
 	struct rwlock		 ipi_lock;
 
 	/*
 	 * Global list of inpcbs on the protocol.
 	 */
 	struct inpcbhead	*ipi_listhead;		/* (g/l) */
 	u_int			 ipi_count;		/* (l) */
 
 	/*
 	 * Generation count -- incremented each time a connection is allocated
 	 * or freed.
 	 */
 	u_quad_t		 ipi_gencnt;		/* (l) */
 
 	/*
 	 * Fields associated with port lookup and allocation.
 	 */
 	u_short			 ipi_lastport;		/* (x) */
 	u_short			 ipi_lastlow;		/* (x) */
 	u_short			 ipi_lasthi;		/* (x) */
 
 	/*
 	 * UMA zone from which inpcbs are allocated for this protocol.
 	 */
 	struct	uma_zone	*ipi_zone;		/* (c) */
 
 	/*
 	 * Connection groups associated with this protocol.  These fields are
 	 * constant, but pcbgroup structures themselves are protected by
 	 * per-pcbgroup locks.
 	 */
 	struct inpcbgroup	*ipi_pcbgroups;		/* (c) */
 	u_int			 ipi_npcbgroups;	/* (c) */
 	u_int			 ipi_hashfields;	/* (c) */
 
 	/*
 	 * Global lock protecting non-pcbgroup hash lookup tables.
 	 */
 	struct rwlock		 ipi_hash_lock;
 
 	/*
 	 * Global hash of inpcbs, hashed by local and foreign addresses and
 	 * port numbers.
 	 */
 	struct inpcbhead	*ipi_hashbase;		/* (h) */
 	u_long			 ipi_hashmask;		/* (h) */
 
 	/*
 	 * Global hash of inpcbs, hashed by only local port number.
 	 */
 	struct inpcbporthead	*ipi_porthashbase;	/* (h) */
 	u_long			 ipi_porthashmask;	/* (h) */
 
 	/*
 	 * List of wildcard inpcbs for use with pcbgroups.  In the past, was
 	 * per-pcbgroup but is now global.  All pcbgroup locks must be held
 	 * to modify the list, so any is sufficient to read it.
 	 */
 	struct inpcbhead	*ipi_wildbase;		/* (p) */
 	u_long			 ipi_wildmask;		/* (p) */
 
 	/*
 	 * Pointer to network stack instance
 	 */
 	struct vnet		*ipi_vnet;		/* (c) */
 
 	/*
 	 * general use 2
 	 */
 	void 			*ipi_pspare[2];
 
 	/*
 	 * Global lock protecting global inpcb list, inpcb count, etc.
 	 */
 	struct rwlock		 ipi_list_lock;
 };
 
 #ifdef _KERNEL
 /*
  * Connection groups hold sets of connections that have similar CPU/thread
  * affinity.  Each connection belongs to exactly one connection group.
  */
 struct inpcbgroup {
 	/*
 	 * Per-connection group hash of inpcbs, hashed by local and foreign
 	 * addresses and port numbers.
 	 */
 	struct inpcbhead	*ipg_hashbase;		/* (c) */
 	u_long			 ipg_hashmask;		/* (c) */
 
 	/*
 	 * Notional affinity of this pcbgroup.
 	 */
 	u_int			 ipg_cpu;		/* (p) */
 
 	/*
 	 * Per-connection group lock, not to be confused with ipi_lock.
 	 * Protects the hash table hung off the group, but also the global
 	 * wildcard list in inpcbinfo.
 	 */
 	struct mtx		 ipg_lock;
 } __aligned(CACHE_LINE_SIZE);
 
 #define INP_LOCK_INIT(inp, d, t) \
 	rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE |  RW_DUPOK)
 #define INP_LOCK_DESTROY(inp)	rw_destroy(&(inp)->inp_lock)
 #define INP_RLOCK(inp)		rw_rlock(&(inp)->inp_lock)
 #define INP_WLOCK(inp)		rw_wlock(&(inp)->inp_lock)
 #define INP_TRY_RLOCK(inp)	rw_try_rlock(&(inp)->inp_lock)
 #define INP_TRY_WLOCK(inp)	rw_try_wlock(&(inp)->inp_lock)
 #define INP_RUNLOCK(inp)	rw_runlock(&(inp)->inp_lock)
 #define INP_WUNLOCK(inp)	rw_wunlock(&(inp)->inp_lock)
 #define	INP_TRY_UPGRADE(inp)	rw_try_upgrade(&(inp)->inp_lock)
 #define	INP_DOWNGRADE(inp)	rw_downgrade(&(inp)->inp_lock)
 #define	INP_WLOCKED(inp)	rw_wowned(&(inp)->inp_lock)
 #define	INP_LOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_LOCKED)
 #define	INP_RLOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_RLOCKED)
 #define	INP_WLOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_WLOCKED)
 #define	INP_UNLOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_UNLOCKED)
 
 /*
  * These locking functions are for inpcb consumers outside of sys/netinet,
  * more specifically, they were added for the benefit of TOE drivers. The
  * macros are reserved for use by the stack.
  */
 void inp_wlock(struct inpcb *);
 void inp_wunlock(struct inpcb *);
 void inp_rlock(struct inpcb *);
 void inp_runlock(struct inpcb *);
 
 #ifdef INVARIANTS
 void inp_lock_assert(struct inpcb *);
 void inp_unlock_assert(struct inpcb *);
 #else
 static __inline void
 inp_lock_assert(struct inpcb *inp __unused)
 {
 }
 
 static __inline void
 inp_unlock_assert(struct inpcb *inp __unused)
 {
 }
 
 #endif
 
 void	inp_apply_all(void (*func)(struct inpcb *, void *), void *arg);
 int 	inp_ip_tos_get(const struct inpcb *inp);
 void 	inp_ip_tos_set(struct inpcb *inp, int val);
 struct socket *
 	inp_inpcbtosocket(struct inpcb *inp);
 struct tcpcb *
 	inp_inpcbtotcpcb(struct inpcb *inp);
 void 	inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
 		uint32_t *faddr, uint16_t *fp);
 short	inp_so_options(const struct inpcb *inp);
 
 #endif /* _KERNEL */
 
 #define INP_INFO_LOCK_INIT(ipi, d) \
 	rw_init_flags(&(ipi)->ipi_lock, (d), RW_RECURSE)
 #define INP_INFO_LOCK_DESTROY(ipi)  rw_destroy(&(ipi)->ipi_lock)
 #define INP_INFO_RLOCK(ipi)	rw_rlock(&(ipi)->ipi_lock)
 #define INP_INFO_WLOCK(ipi)	rw_wlock(&(ipi)->ipi_lock)
 #define INP_INFO_TRY_RLOCK(ipi)	rw_try_rlock(&(ipi)->ipi_lock)
 #define INP_INFO_TRY_WLOCK(ipi)	rw_try_wlock(&(ipi)->ipi_lock)
 #define INP_INFO_TRY_UPGRADE(ipi)	rw_try_upgrade(&(ipi)->ipi_lock)
 #define INP_INFO_WLOCKED(ipi)	rw_wowned(&(ipi)->ipi_lock)
 #define INP_INFO_RUNLOCK(ipi)	rw_runlock(&(ipi)->ipi_lock)
 #define INP_INFO_WUNLOCK(ipi)	rw_wunlock(&(ipi)->ipi_lock)
 #define	INP_INFO_LOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_lock, RA_LOCKED)
 #define INP_INFO_RLOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_lock, RA_RLOCKED)
 #define INP_INFO_WLOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_lock, RA_WLOCKED)
 #define INP_INFO_UNLOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED)
 
 #define INP_LIST_LOCK_INIT(ipi, d) \
         rw_init_flags(&(ipi)->ipi_list_lock, (d), 0)
 #define INP_LIST_LOCK_DESTROY(ipi)  rw_destroy(&(ipi)->ipi_list_lock)
 #define INP_LIST_RLOCK(ipi)     rw_rlock(&(ipi)->ipi_list_lock)
 #define INP_LIST_WLOCK(ipi)     rw_wlock(&(ipi)->ipi_list_lock)
 #define INP_LIST_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_list_lock)
 #define INP_LIST_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_list_lock)
 #define INP_LIST_TRY_UPGRADE(ipi)       rw_try_upgrade(&(ipi)->ipi_list_lock)
 #define INP_LIST_RUNLOCK(ipi)   rw_runlock(&(ipi)->ipi_list_lock)
 #define INP_LIST_WUNLOCK(ipi)   rw_wunlock(&(ipi)->ipi_list_lock)
 #define INP_LIST_LOCK_ASSERT(ipi) \
 	rw_assert(&(ipi)->ipi_list_lock, RA_LOCKED)
 #define INP_LIST_RLOCK_ASSERT(ipi) \
 	rw_assert(&(ipi)->ipi_list_lock, RA_RLOCKED)
 #define INP_LIST_WLOCK_ASSERT(ipi) \
 	rw_assert(&(ipi)->ipi_list_lock, RA_WLOCKED)
 #define INP_LIST_UNLOCK_ASSERT(ipi) \
 	rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED)
 
 #define	INP_HASH_LOCK_INIT(ipi, d) \
 	rw_init_flags(&(ipi)->ipi_hash_lock, (d), 0)
 #define	INP_HASH_LOCK_DESTROY(ipi)	rw_destroy(&(ipi)->ipi_hash_lock)
 #define	INP_HASH_RLOCK(ipi)		rw_rlock(&(ipi)->ipi_hash_lock)
 #define	INP_HASH_WLOCK(ipi)		rw_wlock(&(ipi)->ipi_hash_lock)
 #define	INP_HASH_RUNLOCK(ipi)		rw_runlock(&(ipi)->ipi_hash_lock)
 #define	INP_HASH_WUNLOCK(ipi)		rw_wunlock(&(ipi)->ipi_hash_lock)
 #define	INP_HASH_LOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_hash_lock, \
 					    RA_LOCKED)
 #define	INP_HASH_WLOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_hash_lock, \
 					    RA_WLOCKED)
 
 #define	INP_GROUP_LOCK_INIT(ipg, d)	mtx_init(&(ipg)->ipg_lock, (d), NULL, \
 					    MTX_DEF | MTX_DUPOK)
 #define	INP_GROUP_LOCK_DESTROY(ipg)	mtx_destroy(&(ipg)->ipg_lock)
 
 #define	INP_GROUP_LOCK(ipg)		mtx_lock(&(ipg)->ipg_lock)
 #define	INP_GROUP_LOCK_ASSERT(ipg)	mtx_assert(&(ipg)->ipg_lock, MA_OWNED)
 #define	INP_GROUP_UNLOCK(ipg)		mtx_unlock(&(ipg)->ipg_lock)
 
 #define INP_PCBHASH(faddr, lport, fport, mask) \
 	(((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask))
 #define INP_PCBPORTHASH(lport, mask) \
 	(ntohs((lport)) & (mask))
 #define	INP6_PCBHASHKEY(faddr)	((faddr)->s6_addr32[3])
 
 /*
  * Flags for inp_vflags -- historically version flags only
  */
 #define	INP_IPV4	0x1
 #define	INP_IPV6	0x2
 #define	INP_IPV6PROTO	0x4		/* opened under IPv6 protocol */
 
 /*
  * Flags for inp_flags.
  */
 #define	INP_RECVOPTS		0x00000001 /* receive incoming IP options */
 #define	INP_RECVRETOPTS		0x00000002 /* receive IP options for reply */
 #define	INP_RECVDSTADDR		0x00000004 /* receive IP dst address */
 #define	INP_HDRINCL		0x00000008 /* user supplies entire IP header */
 #define	INP_HIGHPORT		0x00000010 /* user wants "high" port binding */
 #define	INP_LOWPORT		0x00000020 /* user wants "low" port binding */
 #define	INP_ANONPORT		0x00000040 /* port chosen for user */
 #define	INP_RECVIF		0x00000080 /* receive incoming interface */
 #define	INP_MTUDISC		0x00000100 /* user can do MTU discovery */
 				   	   /* 0x000200 unused: was INP_FAITH */
 #define	INP_RECVTTL		0x00000400 /* receive incoming IP TTL */
 #define	INP_DONTFRAG		0x00000800 /* don't fragment packet */
 #define	INP_BINDANY		0x00001000 /* allow bind to any address */
 #define	INP_INHASHLIST		0x00002000 /* in_pcbinshash() has been called */
 #define	INP_RECVTOS		0x00004000 /* receive incoming IP TOS */
 #define	IN6P_IPV6_V6ONLY	0x00008000 /* restrict AF_INET6 socket for v6 */
 #define	IN6P_PKTINFO		0x00010000 /* receive IP6 dst and I/F */
 #define	IN6P_HOPLIMIT		0x00020000 /* receive hoplimit */
 #define	IN6P_HOPOPTS		0x00040000 /* receive hop-by-hop options */
 #define	IN6P_DSTOPTS		0x00080000 /* receive dst options after rthdr */
 #define	IN6P_RTHDR		0x00100000 /* receive routing header */
 #define	IN6P_RTHDRDSTOPTS	0x00200000 /* receive dstoptions before rthdr */
 #define	IN6P_TCLASS		0x00400000 /* receive traffic class value */
 #define	IN6P_AUTOFLOWLABEL	0x00800000 /* attach flowlabel automatically */
 #define	INP_TIMEWAIT		0x01000000 /* in TIMEWAIT, ppcb is tcptw */
 #define	INP_ONESBCAST		0x02000000 /* send all-ones broadcast */
 #define	INP_DROPPED		0x04000000 /* protocol drop flag */
 #define	INP_SOCKREF		0x08000000 /* strong socket reference */
 #define	INP_RESERVED_0          0x10000000 /* reserved field */
 #define	INP_RESERVED_1          0x20000000 /* reserved field */
 #define	IN6P_RFC2292		0x40000000 /* used RFC2292 API on the socket */
 #define	IN6P_MTU		0x80000000 /* receive path MTU */
 
 #define	INP_CONTROLOPTS		(INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\
 				 INP_RECVIF|INP_RECVTTL|INP_RECVTOS|\
 				 IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\
 				 IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\
 				 IN6P_TCLASS|IN6P_AUTOFLOWLABEL|IN6P_RFC2292|\
 				 IN6P_MTU)
 
 /*
  * Flags for inp_flags2.
  */
 #define	INP_LLE_VALID		0x00000001 /* cached lle is valid */	
 #define	INP_RT_VALID		0x00000002 /* cached rtentry is valid */
 #define	INP_PCBGROUPWILD	0x00000004 /* in pcbgroup wildcard list */
 #define	INP_REUSEPORT		0x00000008 /* SO_REUSEPORT option is set */
 #define	INP_FREED		0x00000010 /* inp itself is not valid */
 #define	INP_REUSEADDR		0x00000020 /* SO_REUSEADDR option is set */
 #define	INP_BINDMULTI		0x00000040 /* IP_BINDMULTI option is set */
 #define	INP_RSS_BUCKET_SET	0x00000080 /* IP_RSS_LISTEN_BUCKET is set */
 #define	INP_RECVFLOWID		0x00000100 /* populate recv datagram with flow info */
 #define	INP_RECVRSSBUCKETID	0x00000200 /* populate recv datagram with bucket id */
 
 /*
  * Flags passed to in_pcblookup*() functions.
  */
 #define	INPLOOKUP_WILDCARD	0x00000001	/* Allow wildcard sockets. */
 #define	INPLOOKUP_RLOCKPCB	0x00000002	/* Return inpcb read-locked. */
 #define	INPLOOKUP_WLOCKPCB	0x00000004	/* Return inpcb write-locked. */
 
 #define	INPLOOKUP_MASK	(INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \
 			    INPLOOKUP_WLOCKPCB)
 
 #define	sotoinpcb(so)	((struct inpcb *)(so)->so_pcb)
 #define	sotoin6pcb(so)	sotoinpcb(so) /* for KAME src sync over BSD*'s */
 
 #define	INP_SOCKAF(so) so->so_proto->pr_domain->dom_family
 
 #define	INP_CHECK_SOCKAF(so, af)	(INP_SOCKAF(so) == af)
 
 /*
  * Constants for pcbinfo.ipi_hashfields.
  */
 #define	IPI_HASHFIELDS_NONE	0
 #define	IPI_HASHFIELDS_2TUPLE	1
 #define	IPI_HASHFIELDS_4TUPLE	2
 
 #ifdef _KERNEL
 VNET_DECLARE(int, ipport_reservedhigh);
 VNET_DECLARE(int, ipport_reservedlow);
 VNET_DECLARE(int, ipport_lowfirstauto);
 VNET_DECLARE(int, ipport_lowlastauto);
 VNET_DECLARE(int, ipport_firstauto);
 VNET_DECLARE(int, ipport_lastauto);
 VNET_DECLARE(int, ipport_hifirstauto);
 VNET_DECLARE(int, ipport_hilastauto);
 VNET_DECLARE(int, ipport_randomized);
 VNET_DECLARE(int, ipport_randomcps);
 VNET_DECLARE(int, ipport_randomtime);
 VNET_DECLARE(int, ipport_stoprandom);
 VNET_DECLARE(int, ipport_tcpallocs);
 
 #define	V_ipport_reservedhigh	VNET(ipport_reservedhigh)
 #define	V_ipport_reservedlow	VNET(ipport_reservedlow)
 #define	V_ipport_lowfirstauto	VNET(ipport_lowfirstauto)
 #define	V_ipport_lowlastauto	VNET(ipport_lowlastauto)
 #define	V_ipport_firstauto	VNET(ipport_firstauto)
 #define	V_ipport_lastauto	VNET(ipport_lastauto)
 #define	V_ipport_hifirstauto	VNET(ipport_hifirstauto)
 #define	V_ipport_hilastauto	VNET(ipport_hilastauto)
 #define	V_ipport_randomized	VNET(ipport_randomized)
 #define	V_ipport_randomcps	VNET(ipport_randomcps)
 #define	V_ipport_randomtime	VNET(ipport_randomtime)
 #define	V_ipport_stoprandom	VNET(ipport_stoprandom)
 #define	V_ipport_tcpallocs	VNET(ipport_tcpallocs)
 
 void	in_pcbinfo_destroy(struct inpcbinfo *);
 void	in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *,
 	    int, int, char *, uma_init, uma_fini, uint32_t, u_int);
 
 int	in_pcbbind_check_bindmulti(const struct inpcb *ni,
 	    const struct inpcb *oi);
 
 struct inpcbgroup *
 	in_pcbgroup_byhash(struct inpcbinfo *, u_int, uint32_t);
 struct inpcbgroup *
 	in_pcbgroup_byinpcb(struct inpcb *);
 struct inpcbgroup *
 	in_pcbgroup_bytuple(struct inpcbinfo *, struct in_addr, u_short,
 	    struct in_addr, u_short);
 void	in_pcbgroup_destroy(struct inpcbinfo *);
 int	in_pcbgroup_enabled(struct inpcbinfo *);
 void	in_pcbgroup_init(struct inpcbinfo *, u_int, int);
 void	in_pcbgroup_remove(struct inpcb *);
 void	in_pcbgroup_update(struct inpcb *);
 void	in_pcbgroup_update_mbuf(struct inpcb *, struct mbuf *);
 
 void	in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *);
 int	in_pcballoc(struct socket *, struct inpcbinfo *);
 int	in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *);
 int	in_pcb_lport(struct inpcb *, struct in_addr *, u_short *,
 	    struct ucred *, int);
 int	in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *,
 	    u_short *, struct ucred *);
 int	in_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *);
 int	in_pcbconnect_mbuf(struct inpcb *, struct sockaddr *, struct ucred *,
 	    struct mbuf *);
 int	in_pcbconnect_setup(struct inpcb *, struct sockaddr *, in_addr_t *,
 	    u_short *, in_addr_t *, u_short *, struct inpcb **,
 	    struct ucred *);
 void	in_pcbdetach(struct inpcb *);
 void	in_pcbdisconnect(struct inpcb *);
 void	in_pcbdrop(struct inpcb *);
 void	in_pcbfree(struct inpcb *);
 int	in_pcbinshash(struct inpcb *);
 int	in_pcbinshash_nopcbgroup(struct inpcb *);
 int	in_pcbladdr(struct inpcb *, struct in_addr *, struct in_addr *,
 	    struct ucred *);
 struct inpcb *
 	in_pcblookup_local(struct inpcbinfo *,
 	    struct in_addr, u_short, int, struct ucred *);
 struct inpcb *
 	in_pcblookup(struct inpcbinfo *, struct in_addr, u_int,
 	    struct in_addr, u_int, int, struct ifnet *);
 struct inpcb *
 	in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int,
 	    struct in_addr, u_int, int, struct ifnet *, struct mbuf *);
 void	in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr,
 	    int, struct inpcb *(*)(struct inpcb *, int));
 void	in_pcbref(struct inpcb *);
 void	in_pcbrehash(struct inpcb *);
 void	in_pcbrehash_mbuf(struct inpcb *, struct mbuf *);
 int	in_pcbrele(struct inpcb *);
 int	in_pcbrele_rlocked(struct inpcb *);
 int	in_pcbrele_wlocked(struct inpcb *);
+void	in_losing(struct inpcb *);
 void	in_pcbsetsolabel(struct socket *so);
 int	in_getpeeraddr(struct socket *so, struct sockaddr **nam);
 int	in_getsockaddr(struct socket *so, struct sockaddr **nam);
 struct sockaddr *
 	in_sockaddr(in_port_t port, struct in_addr *addr);
 void	in_pcbsosetlabel(struct socket *so);
 #endif /* _KERNEL */
 
 #endif /* !_NETINET_IN_PCB_H_ */
Index: head/sys/netinet/ip_output.c
===================================================================
--- head/sys/netinet/ip_output.c	(revision 297224)
+++ head/sys/netinet/ip_output.c	(revision 297225)
@@ -1,1392 +1,1411 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_ipfw.h"
 #include "opt_ipsec.h"
 #include "opt_mbuf_stress_test.h"
 #include "opt_mpath.h"
 #include "opt_route.h"
 #include "opt_sctp.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/rmlock.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/ucred.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_llatbl.h>
 #include <net/netisr.h>
 #include <net/pfil.h>
 #include <net/route.h>
 #include <net/flowtable.h>
 #ifdef RADIX_MPATH
 #include <net/radix_mpath.h>
 #endif
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_rss.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #ifdef SCTP
 #include <netinet/sctp.h>
 #include <netinet/sctp_crc32.h>
 #endif
 
 #ifdef IPSEC
 #include <netinet/ip_ipsec.h>
 #include <netipsec/ipsec.h>
 #endif /* IPSEC*/
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifdef MBUF_STRESS_TEST
 static int mbuf_frag_size = 0;
 SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
 	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
 #endif
 
 static void	ip_mloopback(struct ifnet *, const struct mbuf *, int);
 
 
 extern int in_mcast_loop;
 extern	struct protosw inetsw[];
 
 static inline int
 ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, struct inpcb *inp,
     struct sockaddr_in *dst, int *fibnum, int *error)
 {
 	struct m_tag *fwd_tag = NULL;
 	struct mbuf *m;
 	struct in_addr odst;
 	struct ip *ip;
 
 	m = *mp;
 	ip = mtod(m, struct ip *);
 
 	/* Run through list of hooks for output packets. */
 	odst.s_addr = ip->ip_dst.s_addr;
 	*error = pfil_run_hooks(&V_inet_pfil_hook, mp, ifp, PFIL_OUT, inp);
 	m = *mp;
 	if ((*error) != 0 || m == NULL)
 		return 1; /* Finished */
 
 	ip = mtod(m, struct ip *);
 
 	/* See if destination IP address was changed by packet filter. */
 	if (odst.s_addr != ip->ip_dst.s_addr) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		/* If destination is now ourself drop to ip_input(). */
 		if (in_localip(ip->ip_dst)) {
 			m->m_flags |= M_FASTFWD_OURS;
 			if (m->m_pkthdr.rcvif == NULL)
 				m->m_pkthdr.rcvif = V_loif;
 			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 				m->m_pkthdr.csum_flags |=
 					CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 				m->m_pkthdr.csum_data = 0xffff;
 			}
 			m->m_pkthdr.csum_flags |=
 				CSUM_IP_CHECKED | CSUM_IP_VALID;
 #ifdef SCTP
 			if (m->m_pkthdr.csum_flags & CSUM_SCTP)
 				m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 			*error = netisr_queue(NETISR_IP, m);
 			return 1; /* Finished */
 		}
 
 		bzero(dst, sizeof(*dst));
 		dst->sin_family = AF_INET;
 		dst->sin_len = sizeof(*dst);
 		dst->sin_addr = ip->ip_dst;
 
 		return -1; /* Reloop */
 	}
 	/* See if fib was changed by packet filter. */
 	if ((*fibnum) != M_GETFIB(m)) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		*fibnum = M_GETFIB(m);
 		return -1; /* Reloop for FIB change */
 	}
 
 	/* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
 	if (m->m_flags & M_FASTFWD_OURS) {
 		if (m->m_pkthdr.rcvif == NULL)
 			m->m_pkthdr.rcvif = V_loif;
 		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 			m->m_pkthdr.csum_flags |=
 				CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 			m->m_pkthdr.csum_data = 0xffff;
 		}
 #ifdef SCTP
 		if (m->m_pkthdr.csum_flags & CSUM_SCTP)
 			m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 		m->m_pkthdr.csum_flags |=
 			CSUM_IP_CHECKED | CSUM_IP_VALID;
 
 		*error = netisr_queue(NETISR_IP, m);
 		return 1; /* Finished */
 	}
 	/* Or forward to some other address? */
 	if ((m->m_flags & M_IP_NEXTHOP) &&
 	    ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) {
 		bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
 		m->m_flags |= M_SKIP_FIREWALL;
 		m->m_flags &= ~M_IP_NEXTHOP;
 		m_tag_delete(m, fwd_tag);
 
 		return -1; /* Reloop for CHANGE of dst */
 	}
 
 	return 0;
 }
 
 /*
  * IP output.  The packet in mbuf chain m contains a skeletal IP
  * header (with len, off, ttl, proto, tos, src, dst).
  * The mbuf chain containing the packet will be freed.
  * The mbuf opt, if present, will not be freed.
  * If route ro is present and has ro_rt initialized, route lookup would be
  * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL,
  * then result of route lookup is stored in ro->ro_rt.
  *
  * In the IP forwarding case, the packet will arrive with options already
  * inserted, so must have a NULL opt pointer.
  */
 int
 ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
     struct ip_moptions *imo, struct inpcb *inp)
 {
 	struct rm_priotracker in_ifa_tracker;
 	struct ip *ip;
 	struct ifnet *ifp = NULL;	/* keep compiler happy */
 	struct mbuf *m0;
 	int hlen = sizeof (struct ip);
 	int mtu;
 	int error = 0;
 	struct sockaddr_in *dst;
 	const struct sockaddr_in *gw;
 	struct in_ifaddr *ia;
 	int isbroadcast;
 	uint16_t ip_len, ip_off;
 	struct route iproute;
 	struct rtentry *rte;	/* cache for ro->ro_rt */
 	uint32_t fibnum;
 	int have_ia_ref;
 #ifdef IPSEC
 	int no_route_but_check_spd = 0;
 #endif
 	M_ASSERTPKTHDR(m);
 
 	if (inp != NULL) {
 		INP_LOCK_ASSERT(inp);
 		M_SETFIB(m, inp->inp_inc.inc_fibnum);
 		if ((flags & IP_NODEFAULTFLOWID) == 0) {
 			m->m_pkthdr.flowid = inp->inp_flowid;
 			M_HASHTYPE_SET(m, inp->inp_flowtype);
 		}
 	}
 
 	if (ro == NULL) {
 		ro = &iproute;
 		bzero(ro, sizeof (*ro));
 	}
 
 #ifdef FLOWTABLE
 	if (ro->ro_rt == NULL)
 		(void )flowtable_lookup(AF_INET, m, ro);
 #endif
 
 	if (opt) {
 		int len = 0;
 		m = ip_insertoptions(m, opt, &len);
 		if (len != 0)
 			hlen = len; /* ip->ip_hl is updated above */
 	}
 	ip = mtod(m, struct ip *);
 	ip_len = ntohs(ip->ip_len);
 	ip_off = ntohs(ip->ip_off);
 
 	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = hlen >> 2;
 		ip_fillid(ip);
 		IPSTAT_INC(ips_localout);
 	} else {
 		/* Header already set, fetch hlen from there */
 		hlen = ip->ip_hl << 2;
 	}
 
 	/*
 	 * dst/gw handling:
 	 *
 	 * dst can be rewritten but always points to &ro->ro_dst.
 	 * gw is readonly but can point either to dst OR rt_gateway,
 	 * therefore we need restore gw if we're redoing lookup.
 	 */
 	gw = dst = (struct sockaddr_in *)&ro->ro_dst;
 	fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m);
 	rte = ro->ro_rt;
-	/*
-	 * The address family should also be checked in case of sharing
-	 * the cache with IPv6.
-	 */
-	if (rte == NULL || dst->sin_family != AF_INET) {
+	if (rte == NULL) {
 		bzero(dst, sizeof(*dst));
 		dst->sin_family = AF_INET;
 		dst->sin_len = sizeof(*dst);
 		dst->sin_addr = ip->ip_dst;
 	}
 again:
+	/*
+	 * Validate route against routing table additions;
+	 * a better/more specific route might have been added.
+	 */
+	if (inp)
+		RT_VALIDATE(ro, &inp->inp_rt_cookie, fibnum);
+	/*
+	 * If there is a cached route,
+	 * check that it is to the same destination
+	 * and is still up.  If not, free it and try again.
+	 * The address family should also be checked in case of sharing the
+	 * cache with IPv6.
+	 * Also check whether routing cache needs invalidation.
+	 */
+	rte = ro->ro_rt;
+	if (rte && ((rte->rt_flags & RTF_UP) == 0 ||
+		    rte->rt_ifp == NULL ||
+		    !RT_LINK_IS_UP(rte->rt_ifp) ||
+			  dst->sin_family != AF_INET ||
+			  dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
+		RTFREE(rte);
+		rte = ro->ro_rt = (struct rtentry *)NULL;
+	}
 	ia = NULL;
 	have_ia_ref = 0;
 	/*
 	 * If routing to interface only, short circuit routing lookup.
 	 * The use of an all-ones broadcast address implies this; an
 	 * interface is specified by the broadcast address of an interface,
 	 * or the destination address of a ptp interface.
 	 */
 	if (flags & IP_SENDONES) {
 		if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst),
 						      M_GETFIB(m)))) == NULL &&
 		    (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
 						    M_GETFIB(m)))) == NULL) {
 			IPSTAT_INC(ips_noroute);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		have_ia_ref = 1;
 		ip->ip_dst.s_addr = INADDR_BROADCAST;
 		dst->sin_addr = ip->ip_dst;
 		ifp = ia->ia_ifp;
 		ip->ip_ttl = 1;
 		isbroadcast = 1;
 	} else if (flags & IP_ROUTETOIF) {
 		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
 						    M_GETFIB(m)))) == NULL &&
 		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0,
 						M_GETFIB(m)))) == NULL) {
 			IPSTAT_INC(ips_noroute);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		have_ia_ref = 1;
 		ifp = ia->ia_ifp;
 		ip->ip_ttl = 1;
 		isbroadcast = in_broadcast(dst->sin_addr, ifp);
 	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
 	    imo != NULL && imo->imo_multicast_ifp != NULL) {
 		/*
 		 * Bypass the normal routing lookup for multicast
 		 * packets if the interface is specified.
 		 */
 		ifp = imo->imo_multicast_ifp;
 		IFP_TO_IA(ifp, ia, &in_ifa_tracker);
 		if (ia)
 			have_ia_ref = 1;
 		isbroadcast = 0;	/* fool gcc */
 	} else {
 		/*
 		 * We want to do any cloning requested by the link layer,
 		 * as this is probably required in all cases for correct
 		 * operation (as it is for ARP).
 		 */
 		if (rte == NULL) {
 #ifdef RADIX_MPATH
 			rtalloc_mpath_fib(ro,
 			    ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
 			    fibnum);
 #else
 			in_rtalloc_ign(ro, 0, fibnum);
 #endif
 			rte = ro->ro_rt;
 		}
 		if (rte == NULL ||
 		    (rte->rt_flags & RTF_UP) == 0 ||
 		    rte->rt_ifp == NULL ||
 		    !RT_LINK_IS_UP(rte->rt_ifp)) {
 #ifdef IPSEC
 			/*
 			 * There is no route for this packet, but it is
 			 * possible that a matching SPD entry exists.
 			 */
 			no_route_but_check_spd = 1;
 			mtu = 0; /* Silence GCC warning. */
 			goto sendit;
 #endif
 			IPSTAT_INC(ips_noroute);
 			error = EHOSTUNREACH;
 			goto bad;
 		}
 		ia = ifatoia(rte->rt_ifa);
 		ifp = rte->rt_ifp;
 		counter_u64_add(rte->rt_pksent, 1);
 		rt_update_ro_flags(ro);
 		if (rte->rt_flags & RTF_GATEWAY)
 			gw = (struct sockaddr_in *)rte->rt_gateway;
 		if (rte->rt_flags & RTF_HOST)
 			isbroadcast = (rte->rt_flags & RTF_BROADCAST);
 		else
 			isbroadcast = in_broadcast(gw->sin_addr, ifp);
 	}
 
 	/*
 	 * Calculate MTU.  If we have a route that is up, use that,
 	 * otherwise use the interface's MTU.
 	 */
 	if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST)))
 		mtu = rte->rt_mtu;
 	else
 		mtu = ifp->if_mtu;
 	/* Catch a possible divide by zero later. */
 	KASSERT(mtu > 0, ("%s: mtu %d <= 0, rte=%p (rt_flags=0x%08x) ifp=%p",
 	    __func__, mtu, rte, (rte != NULL) ? rte->rt_flags : 0, ifp));
 
 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
 		m->m_flags |= M_MCAST;
 		/*
 		 * IP destination address is multicast.  Make sure "gw"
 		 * still points to the address in "ro".  (It may have been
 		 * changed to point to a gateway address, above.)
 		 */
 		gw = dst;
 		/*
 		 * See if the caller provided any multicast options
 		 */
 		if (imo != NULL) {
 			ip->ip_ttl = imo->imo_multicast_ttl;
 			if (imo->imo_multicast_vif != -1)
 				ip->ip_src.s_addr =
 				    ip_mcast_src ?
 				    ip_mcast_src(imo->imo_multicast_vif) :
 				    INADDR_ANY;
 		} else
 			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
 		/*
 		 * Confirm that the outgoing interface supports multicast.
 		 */
 		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
 			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
 				IPSTAT_INC(ips_noroute);
 				error = ENETUNREACH;
 				goto bad;
 			}
 		}
 		/*
 		 * If source address not specified yet, use address
 		 * of outgoing interface.
 		 */
 		if (ip->ip_src.s_addr == INADDR_ANY) {
 			/* Interface may have no addresses. */
 			if (ia != NULL)
 				ip->ip_src = IA_SIN(ia)->sin_addr;
 		}
 
 		if ((imo == NULL && in_mcast_loop) ||
 		    (imo && imo->imo_multicast_loop)) {
 			/*
 			 * Loop back multicast datagram if not expressly
 			 * forbidden to do so, even if we are not a member
 			 * of the group; ip_input() will filter it later,
 			 * thus deferring a hash lookup and mutex acquisition
 			 * at the expense of a cheap copy using m_copym().
 			 */
 			ip_mloopback(ifp, m, hlen);
 		} else {
 			/*
 			 * If we are acting as a multicast router, perform
 			 * multicast forwarding as if the packet had just
 			 * arrived on the interface to which we are about
 			 * to send.  The multicast forwarding function
 			 * recursively calls this function, using the
 			 * IP_FORWARDING flag to prevent infinite recursion.
 			 *
 			 * Multicasts that are looped back by ip_mloopback(),
 			 * above, will be forwarded by the ip_input() routine,
 			 * if necessary.
 			 */
 			if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) {
 				/*
 				 * If rsvp daemon is not running, do not
 				 * set ip_moptions. This ensures that the packet
 				 * is multicast and not just sent down one link
 				 * as prescribed by rsvpd.
 				 */
 				if (!V_rsvp_on)
 					imo = NULL;
 				if (ip_mforward &&
 				    ip_mforward(ip, ifp, m, imo) != 0) {
 					m_freem(m);
 					goto done;
 				}
 			}
 		}
 
 		/*
 		 * Multicasts with a time-to-live of zero may be looped-
 		 * back, above, but must not be transmitted on a network.
 		 * Also, multicasts addressed to the loopback interface
 		 * are not sent -- the above call to ip_mloopback() will
 		 * loop back a copy. ip_input() will drop the copy if
 		 * this host does not belong to the destination group on
 		 * the loopback interface.
 		 */
 		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
 			m_freem(m);
 			goto done;
 		}
 
 		goto sendit;
 	}
 
 	/*
 	 * If the source address is not specified yet, use the address
 	 * of the outoing interface.
 	 */
 	if (ip->ip_src.s_addr == INADDR_ANY) {
 		/* Interface may have no addresses. */
 		if (ia != NULL) {
 			ip->ip_src = IA_SIN(ia)->sin_addr;
 		}
 	}
 
 	/*
 	 * Look for broadcast address and
 	 * verify user is allowed to send
 	 * such a packet.
 	 */
 	if (isbroadcast) {
 		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
 			error = EADDRNOTAVAIL;
 			goto bad;
 		}
 		if ((flags & IP_ALLOWBROADCAST) == 0) {
 			error = EACCES;
 			goto bad;
 		}
 		/* don't allow broadcast messages to be fragmented */
 		if (ip_len > mtu) {
 			error = EMSGSIZE;
 			goto bad;
 		}
 		m->m_flags |= M_BCAST;
 	} else {
 		m->m_flags &= ~M_BCAST;
 	}
 
 sendit:
 #ifdef IPSEC
 	switch(ip_ipsec_output(&m, inp, &error)) {
 	case 1:
 		goto bad;
 	case -1:
 		goto done;
 	case 0:
 	default:
 		break;	/* Continue with packet processing. */
 	}
 	/*
 	 * Check if there was a route for this packet; return error if not.
 	 */
 	if (no_route_but_check_spd) {
 		IPSTAT_INC(ips_noroute);
 		error = EHOSTUNREACH;
 		goto bad;
 	}
 	/* Update variables that are affected by ipsec4_output(). */
 	ip = mtod(m, struct ip *);
 	hlen = ip->ip_hl << 2;
 #endif /* IPSEC */
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (PFIL_HOOKED(&V_inet_pfil_hook)) {
 		switch (ip_output_pfil(&m, ifp, inp, dst, &fibnum, &error)) {
 		case 1: /* Finished */
 			goto done;
 
 		case 0: /* Continue normally */
 			ip = mtod(m, struct ip *);
 			break;
 
 		case -1: /* Need to try again */
 			/* Reset everything for a new round */
 			RO_RTFREE(ro);
 			if (have_ia_ref)
 				ifa_free(&ia->ia_ifa);
 			ro->ro_prepend = NULL;
 			rte = NULL;
 			gw = dst;
 			ip = mtod(m, struct ip *);
 			goto again;
 
 		}
 	}
 
 	/* 127/8 must not appear on wire - RFC1122. */
 	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
 	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
 		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
 			IPSTAT_INC(ips_badaddr);
 			error = EADDRNOTAVAIL;
 			goto bad;
 		}
 	}
 
 	m->m_pkthdr.csum_flags |= CSUM_IP;
 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
 		in_delayed_cksum(m);
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
 #ifdef SCTP
 	if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
 		sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
 		m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 	}
 #endif
 
 	/*
 	 * If small enough for interface, or the interface will take
 	 * care of the fragmentation for us, we can just send directly.
 	 */
 	if (ip_len <= mtu ||
 	    (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) {
 		ip->ip_sum = 0;
 		if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
 			ip->ip_sum = in_cksum(m, hlen);
 			m->m_pkthdr.csum_flags &= ~CSUM_IP;
 		}
 
 		/*
 		 * Record statistics for this interface address.
 		 * With CSUM_TSO the byte/packet count will be slightly
 		 * incorrect because we count the IP+TCP headers only
 		 * once instead of for every generated packet.
 		 */
 		if (!(flags & IP_FORWARDING) && ia) {
 			if (m->m_pkthdr.csum_flags & CSUM_TSO)
 				counter_u64_add(ia->ia_ifa.ifa_opackets,
 				    m->m_pkthdr.len / m->m_pkthdr.tso_segsz);
 			else
 				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
 
 			counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len);
 		}
 #ifdef MBUF_STRESS_TEST
 		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
 			m = m_fragment(m, M_NOWAIT, mbuf_frag_size);
 #endif
 		/*
 		 * Reset layer specific mbuf flags
 		 * to avoid confusing lower layers.
 		 */
 		m_clrprotoflags(m);
 		IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
 		error = (*ifp->if_output)(ifp, m,
 		    (const struct sockaddr *)gw, ro);
 		goto done;
 	}
 
 	/* Balk when DF bit is set or the interface didn't support TSO. */
 	if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
 		error = EMSGSIZE;
 		IPSTAT_INC(ips_cantfrag);
 		goto bad;
 	}
 
 	/*
 	 * Too large for interface; fragment if possible. If successful,
 	 * on return, m will point to a list of packets to be sent.
 	 */
 	error = ip_fragment(ip, &m, mtu, ifp->if_hwassist);
 	if (error)
 		goto bad;
 	for (; m; m = m0) {
 		m0 = m->m_nextpkt;
 		m->m_nextpkt = 0;
 		if (error == 0) {
 			/* Record statistics for this interface address. */
 			if (ia != NULL) {
 				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_obytes,
 				    m->m_pkthdr.len);
 			}
 			/*
 			 * Reset layer specific mbuf flags
 			 * to avoid confusing upper layers.
 			 */
 			m_clrprotoflags(m);
 
 			IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
 			error = (*ifp->if_output)(ifp, m,
 			    (const struct sockaddr *)gw, ro);
 		} else
 			m_freem(m);
 	}
 
 	if (error == 0)
 		IPSTAT_INC(ips_fragmented);
 
 done:
 	if (ro == &iproute)
 		RO_RTFREE(ro);
 	else if (rte == NULL)
 		/*
 		 * If the caller supplied a route but somehow the reference
 		 * to it has been released need to prevent the caller
 		 * calling RTFREE on it again.
 		 */
 		ro->ro_rt = NULL;
 	if (have_ia_ref)
 		ifa_free(&ia->ia_ifa);
 	return (error);
 bad:
 	m_freem(m);
 	goto done;
 }
 
 /*
  * Create a chain of fragments which fit the given mtu. m_frag points to the
  * mbuf to be fragmented; on return it points to the chain with the fragments.
  * Return 0 if no error. If error, m_frag may contain a partially built
  * chain of fragments that should be freed by the caller.
  *
  * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
  */
 int
 ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
     u_long if_hwassist_flags)
 {
 	int error = 0;
 	int hlen = ip->ip_hl << 2;
 	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
 	int off;
 	struct mbuf *m0 = *m_frag;	/* the original packet		*/
 	int firstlen;
 	struct mbuf **mnext;
 	int nfrags;
 	uint16_t ip_len, ip_off;
 
 	ip_len = ntohs(ip->ip_len);
 	ip_off = ntohs(ip->ip_off);
 
 	if (ip_off & IP_DF) {	/* Fragmentation not allowed */
 		IPSTAT_INC(ips_cantfrag);
 		return EMSGSIZE;
 	}
 
 	/*
 	 * Must be able to put at least 8 bytes per fragment.
 	 */
 	if (len < 8)
 		return EMSGSIZE;
 
 	/*
 	 * If the interface will not calculate checksums on
 	 * fragmented packets, then do it here.
 	 */
 	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 		in_delayed_cksum(m0);
 		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
 #ifdef SCTP
 	if (m0->m_pkthdr.csum_flags & CSUM_SCTP) {
 		sctp_delayed_cksum(m0, hlen);
 		m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 	}
 #endif
 	if (len > PAGE_SIZE) {
 		/*
 		 * Fragment large datagrams such that each segment
 		 * contains a multiple of PAGE_SIZE amount of data,
 		 * plus headers. This enables a receiver to perform
 		 * page-flipping zero-copy optimizations.
 		 *
 		 * XXX When does this help given that sender and receiver
 		 * could have different page sizes, and also mtu could
 		 * be less than the receiver's page size ?
 		 */
 		int newlen;
 
 		off = MIN(mtu, m0->m_pkthdr.len);
 
 		/*
 		 * firstlen (off - hlen) must be aligned on an
 		 * 8-byte boundary
 		 */
 		if (off < hlen)
 			goto smart_frag_failure;
 		off = ((off - hlen) & ~7) + hlen;
 		newlen = (~PAGE_MASK) & mtu;
 		if ((newlen + sizeof (struct ip)) > mtu) {
 			/* we failed, go back the default */
 smart_frag_failure:
 			newlen = len;
 			off = hlen + len;
 		}
 		len = newlen;
 
 	} else {
 		off = hlen + len;
 	}
 
 	firstlen = off - hlen;
 	mnext = &m0->m_nextpkt;		/* pointer to next packet */
 
 	/*
 	 * Loop through length of segment after first fragment,
 	 * make new header and copy data of each part and link onto chain.
 	 * Here, m0 is the original packet, m is the fragment being created.
 	 * The fragments are linked off the m_nextpkt of the original
 	 * packet, which after processing serves as the first fragment.
 	 */
 	for (nfrags = 1; off < ip_len; off += len, nfrags++) {
 		struct ip *mhip;	/* ip header on the fragment */
 		struct mbuf *m;
 		int mhlen = sizeof (struct ip);
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			error = ENOBUFS;
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		/*
 		 * Make sure the complete packet header gets copied
 		 * from the originating mbuf to the newly created
 		 * mbuf. This also ensures that existing firewall
 		 * classification(s), VLAN tags and so on get copied
 		 * to the resulting fragmented packet(s):
 		 */
 		if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) {
 			m_free(m);
 			error = ENOBUFS;
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		/*
 		 * In the first mbuf, leave room for the link header, then
 		 * copy the original IP header including options. The payload
 		 * goes into an additional mbuf chain returned by m_copym().
 		 */
 		m->m_data += max_linkhdr;
 		mhip = mtod(m, struct ip *);
 		*mhip = *ip;
 		if (hlen > sizeof (struct ip)) {
 			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
 			mhip->ip_v = IPVERSION;
 			mhip->ip_hl = mhlen >> 2;
 		}
 		m->m_len = mhlen;
 		/* XXX do we need to add ip_off below ? */
 		mhip->ip_off = ((off - hlen) >> 3) + ip_off;
 		if (off + len >= ip_len)
 			len = ip_len - off;
 		else
 			mhip->ip_off |= IP_MF;
 		mhip->ip_len = htons((u_short)(len + mhlen));
 		m->m_next = m_copym(m0, off, len, M_NOWAIT);
 		if (m->m_next == NULL) {	/* copy failed */
 			m_free(m);
 			error = ENOBUFS;	/* ??? */
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		m->m_pkthdr.len = mhlen + len;
 #ifdef MAC
 		mac_netinet_fragment(m0, m);
 #endif
 		mhip->ip_off = htons(mhip->ip_off);
 		mhip->ip_sum = 0;
 		if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
 			mhip->ip_sum = in_cksum(m, mhlen);
 			m->m_pkthdr.csum_flags &= ~CSUM_IP;
 		}
 		*mnext = m;
 		mnext = &m->m_nextpkt;
 	}
 	IPSTAT_ADD(ips_ofragments, nfrags);
 
 	/*
 	 * Update first fragment by trimming what's been copied out
 	 * and updating header.
 	 */
 	m_adj(m0, hlen + firstlen - ip_len);
 	m0->m_pkthdr.len = hlen + firstlen;
 	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
 	ip->ip_off = htons(ip_off | IP_MF);
 	ip->ip_sum = 0;
 	if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
 		ip->ip_sum = in_cksum(m0, hlen);
 		m0->m_pkthdr.csum_flags &= ~CSUM_IP;
 	}
 
 done:
 	*m_frag = m0;
 	return error;
 }
 
 void
 in_delayed_cksum(struct mbuf *m)
 {
 	struct ip *ip;
 	uint16_t csum, offset, ip_len;
 
 	ip = mtod(m, struct ip *);
 	offset = ip->ip_hl << 2 ;
 	ip_len = ntohs(ip->ip_len);
 	csum = in_cksum_skip(m, ip_len, offset);
 	if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
 		csum = 0xffff;
 	offset += m->m_pkthdr.csum_data;	/* checksum offset */
 
 	/* find the mbuf in the chain where the checksum starts*/
 	while ((m != NULL) && (offset >= m->m_len)) {
 		offset -= m->m_len;
 		m = m->m_next;
 	}
 	KASSERT(m != NULL, ("in_delayed_cksum: checksum outside mbuf chain."));
 	KASSERT(offset + sizeof(u_short) <= m->m_len, ("in_delayed_cksum: checksum split between mbufs."));
 	*(u_short *)(m->m_data + offset) = csum;
 }
 
 /*
  * IP socket option processing.
  */
 int
 ip_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct	inpcb *inp = sotoinpcb(so);
 	int	error, optval;
 #ifdef	RSS
 	uint32_t rss_bucket;
 	int retval;
 #endif
 
 	error = optval = 0;
 	if (sopt->sopt_level != IPPROTO_IP) {
 		error = EINVAL;
 
 		if (sopt->sopt_level == SOL_SOCKET &&
 		    sopt->sopt_dir == SOPT_SET) {
 			switch (sopt->sopt_name) {
 			case SO_REUSEADDR:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEADDR) != 0)
 					inp->inp_flags2 |= INP_REUSEADDR;
 				else
 					inp->inp_flags2 &= ~INP_REUSEADDR;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_REUSEPORT:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEPORT) != 0)
 					inp->inp_flags2 |= INP_REUSEPORT;
 				else
 					inp->inp_flags2 &= ~INP_REUSEPORT;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_SETFIB:
 				INP_WLOCK(inp);
 				inp->inp_inc.inc_fibnum = so->so_fibnum;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			default:
 				break;
 			}
 		}
 		return (error);
 	}
 
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case IP_OPTIONS:
 #ifdef notyet
 		case IP_RETOPTS:
 #endif
 		{
 			struct mbuf *m;
 			if (sopt->sopt_valsize > MLEN) {
 				error = EMSGSIZE;
 				break;
 			}
 			m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
 			if (m == NULL) {
 				error = ENOBUFS;
 				break;
 			}
 			m->m_len = sopt->sopt_valsize;
 			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
 					    m->m_len);
 			if (error) {
 				m_free(m);
 				break;
 			}
 			INP_WLOCK(inp);
 			error = ip_pcbopts(inp, sopt->sopt_name, m);
 			INP_WUNLOCK(inp);
 			return (error);
 		}
 
 		case IP_BINDANY:
 			if (sopt->sopt_td != NULL) {
 				error = priv_check(sopt->sopt_td,
 				    PRIV_NETINET_BINDANY);
 				if (error)
 					break;
 			}
 			/* FALLTHROUGH */
 		case IP_BINDMULTI:
 #ifdef	RSS
 		case IP_RSS_LISTEN_BUCKET:
 #endif
 		case IP_TOS:
 		case IP_TTL:
 		case IP_MINTTL:
 		case IP_RECVOPTS:
 		case IP_RECVRETOPTS:
 		case IP_RECVDSTADDR:
 		case IP_RECVTTL:
 		case IP_RECVIF:
 		case IP_ONESBCAST:
 		case IP_DONTFRAG:
 		case IP_RECVTOS:
 		case IP_RECVFLOWID:
 #ifdef	RSS
 		case IP_RECVRSSBUCKETID:
 #endif
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 
 			switch (sopt->sopt_name) {
 			case IP_TOS:
 				inp->inp_ip_tos = optval;
 				break;
 
 			case IP_TTL:
 				inp->inp_ip_ttl = optval;
 				break;
 
 			case IP_MINTTL:
 				if (optval >= 0 && optval <= MAXTTL)
 					inp->inp_ip_minttl = optval;
 				else
 					error = EINVAL;
 				break;
 
 #define	OPTSET(bit) do {						\
 	INP_WLOCK(inp);							\
 	if (optval)							\
 		inp->inp_flags |= bit;					\
 	else								\
 		inp->inp_flags &= ~bit;					\
 	INP_WUNLOCK(inp);						\
 } while (0)
 
 #define	OPTSET2(bit, val) do {						\
 	INP_WLOCK(inp);							\
 	if (val)							\
 		inp->inp_flags2 |= bit;					\
 	else								\
 		inp->inp_flags2 &= ~bit;				\
 	INP_WUNLOCK(inp);						\
 } while (0)
 
 			case IP_RECVOPTS:
 				OPTSET(INP_RECVOPTS);
 				break;
 
 			case IP_RECVRETOPTS:
 				OPTSET(INP_RECVRETOPTS);
 				break;
 
 			case IP_RECVDSTADDR:
 				OPTSET(INP_RECVDSTADDR);
 				break;
 
 			case IP_RECVTTL:
 				OPTSET(INP_RECVTTL);
 				break;
 
 			case IP_RECVIF:
 				OPTSET(INP_RECVIF);
 				break;
 
 			case IP_ONESBCAST:
 				OPTSET(INP_ONESBCAST);
 				break;
 			case IP_DONTFRAG:
 				OPTSET(INP_DONTFRAG);
 				break;
 			case IP_BINDANY:
 				OPTSET(INP_BINDANY);
 				break;
 			case IP_RECVTOS:
 				OPTSET(INP_RECVTOS);
 				break;
 			case IP_BINDMULTI:
 				OPTSET2(INP_BINDMULTI, optval);
 				break;
 			case IP_RECVFLOWID:
 				OPTSET2(INP_RECVFLOWID, optval);
 				break;
 #ifdef	RSS
 			case IP_RSS_LISTEN_BUCKET:
 				if ((optval >= 0) &&
 				    (optval < rss_getnumbuckets())) {
 					inp->inp_rss_listen_bucket = optval;
 					OPTSET2(INP_RSS_BUCKET_SET, 1);
 				} else {
 					error = EINVAL;
 				}
 				break;
 			case IP_RECVRSSBUCKETID:
 				OPTSET2(INP_RECVRSSBUCKETID, optval);
 				break;
 #endif
 			}
 			break;
 #undef OPTSET
 #undef OPTSET2
 
 		/*
 		 * Multicast socket options are processed by the in_mcast
 		 * module.
 		 */
 		case IP_MULTICAST_IF:
 		case IP_MULTICAST_VIF:
 		case IP_MULTICAST_TTL:
 		case IP_MULTICAST_LOOP:
 		case IP_ADD_MEMBERSHIP:
 		case IP_DROP_MEMBERSHIP:
 		case IP_ADD_SOURCE_MEMBERSHIP:
 		case IP_DROP_SOURCE_MEMBERSHIP:
 		case IP_BLOCK_SOURCE:
 		case IP_UNBLOCK_SOURCE:
 		case IP_MSFILTER:
 		case MCAST_JOIN_GROUP:
 		case MCAST_LEAVE_GROUP:
 		case MCAST_JOIN_SOURCE_GROUP:
 		case MCAST_LEAVE_SOURCE_GROUP:
 		case MCAST_BLOCK_SOURCE:
 		case MCAST_UNBLOCK_SOURCE:
 			error = inp_setmoptions(inp, sopt);
 			break;
 
 		case IP_PORTRANGE:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 
 			INP_WLOCK(inp);
 			switch (optval) {
 			case IP_PORTRANGE_DEFAULT:
 				inp->inp_flags &= ~(INP_LOWPORT);
 				inp->inp_flags &= ~(INP_HIGHPORT);
 				break;
 
 			case IP_PORTRANGE_HIGH:
 				inp->inp_flags &= ~(INP_LOWPORT);
 				inp->inp_flags |= INP_HIGHPORT;
 				break;
 
 			case IP_PORTRANGE_LOW:
 				inp->inp_flags &= ~(INP_HIGHPORT);
 				inp->inp_flags |= INP_LOWPORT;
 				break;
 
 			default:
 				error = EINVAL;
 				break;
 			}
 			INP_WUNLOCK(inp);
 			break;
 
 #ifdef IPSEC
 		case IP_IPSEC_POLICY:
 		{
 			caddr_t req;
 			struct mbuf *m;
 
 			if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
 				break;
 			if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
 				break;
 			req = mtod(m, caddr_t);
 			error = ipsec_set_policy(inp, sopt->sopt_name, req,
 			    m->m_len, (sopt->sopt_td != NULL) ?
 			    sopt->sopt_td->td_ucred : NULL);
 			m_freem(m);
 			break;
 		}
 #endif /* IPSEC */
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case IP_OPTIONS:
 		case IP_RETOPTS:
 			if (inp->inp_options)
 				error = sooptcopyout(sopt,
 						     mtod(inp->inp_options,
 							  char *),
 						     inp->inp_options->m_len);
 			else
 				sopt->sopt_valsize = 0;
 			break;
 
 		case IP_TOS:
 		case IP_TTL:
 		case IP_MINTTL:
 		case IP_RECVOPTS:
 		case IP_RECVRETOPTS:
 		case IP_RECVDSTADDR:
 		case IP_RECVTTL:
 		case IP_RECVIF:
 		case IP_PORTRANGE:
 		case IP_ONESBCAST:
 		case IP_DONTFRAG:
 		case IP_BINDANY:
 		case IP_RECVTOS:
 		case IP_BINDMULTI:
 		case IP_FLOWID:
 		case IP_FLOWTYPE:
 		case IP_RECVFLOWID:
 #ifdef	RSS
 		case IP_RSSBUCKETID:
 		case IP_RECVRSSBUCKETID:
 #endif
 			switch (sopt->sopt_name) {
 
 			case IP_TOS:
 				optval = inp->inp_ip_tos;
 				break;
 
 			case IP_TTL:
 				optval = inp->inp_ip_ttl;
 				break;
 
 			case IP_MINTTL:
 				optval = inp->inp_ip_minttl;
 				break;
 
 #define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
 #define	OPTBIT2(bit)	(inp->inp_flags2 & bit ? 1 : 0)
 
 			case IP_RECVOPTS:
 				optval = OPTBIT(INP_RECVOPTS);
 				break;
 
 			case IP_RECVRETOPTS:
 				optval = OPTBIT(INP_RECVRETOPTS);
 				break;
 
 			case IP_RECVDSTADDR:
 				optval = OPTBIT(INP_RECVDSTADDR);
 				break;
 
 			case IP_RECVTTL:
 				optval = OPTBIT(INP_RECVTTL);
 				break;
 
 			case IP_RECVIF:
 				optval = OPTBIT(INP_RECVIF);
 				break;
 
 			case IP_PORTRANGE:
 				if (inp->inp_flags & INP_HIGHPORT)
 					optval = IP_PORTRANGE_HIGH;
 				else if (inp->inp_flags & INP_LOWPORT)
 					optval = IP_PORTRANGE_LOW;
 				else
 					optval = 0;
 				break;
 
 			case IP_ONESBCAST:
 				optval = OPTBIT(INP_ONESBCAST);
 				break;
 			case IP_DONTFRAG:
 				optval = OPTBIT(INP_DONTFRAG);
 				break;
 			case IP_BINDANY:
 				optval = OPTBIT(INP_BINDANY);
 				break;
 			case IP_RECVTOS:
 				optval = OPTBIT(INP_RECVTOS);
 				break;
 			case IP_FLOWID:
 				optval = inp->inp_flowid;
 				break;
 			case IP_FLOWTYPE:
 				optval = inp->inp_flowtype;
 				break;
 			case IP_RECVFLOWID:
 				optval = OPTBIT2(INP_RECVFLOWID);
 				break;
 #ifdef	RSS
 			case IP_RSSBUCKETID:
 				retval = rss_hash2bucket(inp->inp_flowid,
 				    inp->inp_flowtype,
 				    &rss_bucket);
 				if (retval == 0)
 					optval = rss_bucket;
 				else
 					error = EINVAL;
 				break;
 			case IP_RECVRSSBUCKETID:
 				optval = OPTBIT2(INP_RECVRSSBUCKETID);
 				break;
 #endif
 			case IP_BINDMULTI:
 				optval = OPTBIT2(INP_BINDMULTI);
 				break;
 			}
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 
 		/*
 		 * Multicast socket options are processed by the in_mcast
 		 * module.
 		 */
 		case IP_MULTICAST_IF:
 		case IP_MULTICAST_VIF:
 		case IP_MULTICAST_TTL:
 		case IP_MULTICAST_LOOP:
 		case IP_MSFILTER:
 			error = inp_getmoptions(inp, sopt);
 			break;
 
 #ifdef IPSEC
 		case IP_IPSEC_POLICY:
 		{
 			struct mbuf *m = NULL;
 			caddr_t req = NULL;
 			size_t len = 0;
 
 			if (m != 0) {
 				req = mtod(m, caddr_t);
 				len = m->m_len;
 			}
 			error = ipsec_get_policy(sotoinpcb(so), req, len, &m);
 			if (error == 0)
 				error = soopt_mcopyout(sopt, m); /* XXX */
 			if (error == 0)
 				m_freem(m);
 			break;
 		}
 #endif /* IPSEC */
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	}
 	return (error);
 }
 
 /*
  * Routine called from ip_output() to loop back a copy of an IP multicast
  * packet to the input queue of a specified interface.  Note that this
  * calls the output routine of the loopback "driver", but with an interface
  * pointer that might NOT be a loopback interface -- evil, but easier than
  * replicating that code here.
  */
 static void
 ip_mloopback(struct ifnet *ifp, const struct mbuf *m, int hlen)
 {
 	struct ip *ip;
 	struct mbuf *copym;
 
 	/*
 	 * Make a deep copy of the packet because we're going to
 	 * modify the pack in order to generate checksums.
 	 */
 	copym = m_dup(m, M_NOWAIT);
 	if (copym != NULL && (!M_WRITABLE(copym) || copym->m_len < hlen))
 		copym = m_pullup(copym, hlen);
 	if (copym != NULL) {
 		/* If needed, compute the checksum and mark it as valid. */
 		if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 			in_delayed_cksum(copym);
 			copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 			copym->m_pkthdr.csum_flags |=
 			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 			copym->m_pkthdr.csum_data = 0xffff;
 		}
 		/*
 		 * We don't bother to fragment if the IP length is greater
 		 * than the interface's MTU.  Can this possibly matter?
 		 */
 		ip = mtod(copym, struct ip *);
 		ip->ip_sum = 0;
 		ip->ip_sum = in_cksum(copym, hlen);
 		if_simloop(ifp, copym, AF_INET, 0);
 	}
 }
Index: head/sys/netinet/tcp_output.c
===================================================================
--- head/sys/netinet/tcp_output.c	(revision 297224)
+++ head/sys/netinet/tcp_output.c	(revision 297225)
@@ -1,1810 +1,1806 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_output.c	8.4 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/domain.h>
 #include <sys/hhook.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #ifdef INET6
 #include <netinet6/in6_pcb.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #endif
 #ifdef TCP_RFC7413
 #include <netinet/tcp_fastopen.h>
 #endif
 #include <netinet/tcp.h>
 #define	TCPOUTFLAGS
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
 #include <netinet/cc/cc.h>
 #ifdef TCPPCAP
 #include <netinet/tcp_pcap.h>
 #endif
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #endif /*IPSEC*/
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 VNET_DEFINE(int, path_mtu_discovery) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(path_mtu_discovery), 1,
 	"Enable Path MTU Discovery");
 
 VNET_DEFINE(int, tcp_do_tso) = 1;
 #define	V_tcp_do_tso		VNET(tcp_do_tso)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(tcp_do_tso), 0,
 	"Enable TCP Segmentation Offload");
 
 VNET_DEFINE(int, tcp_sendspace) = 1024*32;
 #define	V_tcp_sendspace	VNET(tcp_sendspace)
 SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size");
 
 VNET_DEFINE(int, tcp_do_autosndbuf) = 1;
 #define	V_tcp_do_autosndbuf	VNET(tcp_do_autosndbuf)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(tcp_do_autosndbuf), 0,
 	"Enable automatic send buffer sizing");
 
 VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024;
 #define	V_tcp_autosndbuf_inc	VNET(tcp_autosndbuf_inc)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(tcp_autosndbuf_inc), 0,
 	"Incrementor step size of automatic send buffer");
 
 VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024;
 #define	V_tcp_autosndbuf_max	VNET(tcp_autosndbuf_max)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(tcp_autosndbuf_max), 0,
 	"Max size of automatic send buffer");
 
 static void inline	hhook_run_tcp_est_out(struct tcpcb *tp,
 			    struct tcphdr *th, struct tcpopt *to,
 			    long len, int tso);
 static void inline	cc_after_idle(struct tcpcb *tp);
 
 /*
  * Wrapper for the TCP established output helper hook.
  */
 static void inline
 hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th,
     struct tcpopt *to, long len, int tso)
 {
 	struct tcp_hhook_data hhook_data;
 
 	if (V_tcp_hhh[HHOOK_TCP_EST_OUT]->hhh_nhooks > 0) {
 		hhook_data.tp = tp;
 		hhook_data.th = th;
 		hhook_data.to = to;
 		hhook_data.len = len;
 		hhook_data.tso = tso;
 
 		hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_OUT], &hhook_data,
 		    tp->osd);
 	}
 }
 
 /*
  * CC wrapper hook functions
  */
 static void inline
 cc_after_idle(struct tcpcb *tp)
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	if (CC_ALGO(tp)->after_idle != NULL)
 		CC_ALGO(tp)->after_idle(tp->ccv);
 }
 
 /*
  * Tcp output routine: figure out what should be sent and send it.
  */
 int
 tcp_output(struct tcpcb *tp)
 {
 	struct socket *so = tp->t_inpcb->inp_socket;
 	long len, recwin, sendwin;
 	int off, flags, error = 0;	/* Keep compiler happy */
 	struct mbuf *m;
 	struct ip *ip = NULL;
 	struct ipovly *ipov = NULL;
 	struct tcphdr *th;
 	u_char opt[TCP_MAXOLEN];
 	unsigned ipoptlen, optlen, hdrlen;
 #ifdef IPSEC
 	unsigned ipsec_optlen = 0;
 #endif
 	int idle, sendalot;
 	int sack_rxmit, sack_bytes_rxmt;
 	struct sackhole *p;
 	int tso, mtu;
 	struct tcpopt to;
 #if 0
 	int maxburst = TCP_MAXBURST;
 #endif
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 	int isipv6;
 
 	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
 #endif
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE)
 		return (tcp_offload_output(tp));
 #endif
 
 #ifdef TCP_RFC7413
 	/*
 	 * For TFO connections in SYN_RECEIVED, only allow the initial
 	 * SYN|ACK and those sent by the retransmit timer.
 	 */
 	if ((tp->t_flags & TF_FASTOPEN) &&
 	    (tp->t_state == TCPS_SYN_RECEIVED) &&
 	    SEQ_GT(tp->snd_max, tp->snd_una) &&    /* inital SYN|ACK sent */
 	    (tp->snd_nxt != tp->snd_una))          /* not a retransmit */
 		return (0);
 #endif
 	/*
 	 * Determine length of data that should be transmitted,
 	 * and flags that will be used.
 	 * If there is some data or critical controls (SYN, RST)
 	 * to send, then transmit; otherwise, investigate further.
 	 */
 	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
 	if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur)
 		cc_after_idle(tp);
 	tp->t_flags &= ~TF_LASTIDLE;
 	if (idle) {
 		if (tp->t_flags & TF_MORETOCOME) {
 			tp->t_flags |= TF_LASTIDLE;
 			idle = 0;
 		}
 	}
 again:
 	/*
 	 * If we've recently taken a timeout, snd_max will be greater than
 	 * snd_nxt.  There may be SACK information that allows us to avoid
 	 * resending already delivered data.  Adjust snd_nxt accordingly.
 	 */
 	if ((tp->t_flags & TF_SACK_PERMIT) &&
 	    SEQ_LT(tp->snd_nxt, tp->snd_max))
 		tcp_sack_adjust(tp);
 	sendalot = 0;
 	tso = 0;
 	mtu = 0;
 	off = tp->snd_nxt - tp->snd_una;
 	sendwin = min(tp->snd_wnd, tp->snd_cwnd);
 
 	flags = tcp_outflags[tp->t_state];
 	/*
 	 * Send any SACK-generated retransmissions.  If we're explicitly trying
 	 * to send out new data (when sendalot is 1), bypass this function.
 	 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
 	 * we're replacing a (future) new transmission with a retransmission
 	 * now, and we previously incremented snd_cwnd in tcp_input().
 	 */
 	/*
 	 * Still in sack recovery , reset rxmit flag to zero.
 	 */
 	sack_rxmit = 0;
 	sack_bytes_rxmt = 0;
 	len = 0;
 	p = NULL;
 	if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) &&
 	    (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
 		long cwin;
 		
 		cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
 		if (cwin < 0)
 			cwin = 0;
 		/* Do not retransmit SACK segments beyond snd_recover */
 		if (SEQ_GT(p->end, tp->snd_recover)) {
 			/*
 			 * (At least) part of sack hole extends beyond
 			 * snd_recover. Check to see if we can rexmit data
 			 * for this hole.
 			 */
 			if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
 				/*
 				 * Can't rexmit any more data for this hole.
 				 * That data will be rexmitted in the next
 				 * sack recovery episode, when snd_recover
 				 * moves past p->rxmit.
 				 */
 				p = NULL;
 				goto after_sack_rexmit;
 			} else
 				/* Can rexmit part of the current hole */
 				len = ((long)ulmin(cwin,
 						   tp->snd_recover - p->rxmit));
 		} else
 			len = ((long)ulmin(cwin, p->end - p->rxmit));
 		off = p->rxmit - tp->snd_una;
 		KASSERT(off >= 0,("%s: sack block to the left of una : %d",
 		    __func__, off));
 		if (len > 0) {
 			sack_rxmit = 1;
 			sendalot = 1;
 			TCPSTAT_INC(tcps_sack_rexmits);
 			TCPSTAT_ADD(tcps_sack_rexmit_bytes,
 			    min(len, tp->t_maxseg));
 		}
 	}
 after_sack_rexmit:
 	/*
 	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
 	 * state flags.
 	 */
 	if (tp->t_flags & TF_NEEDFIN)
 		flags |= TH_FIN;
 	if (tp->t_flags & TF_NEEDSYN)
 		flags |= TH_SYN;
 
 	SOCKBUF_LOCK(&so->so_snd);
 	/*
 	 * If in persist timeout with window of 0, send 1 byte.
 	 * Otherwise, if window is small but nonzero
 	 * and timer expired, we will send what we can
 	 * and go to transmit state.
 	 */
 	if (tp->t_flags & TF_FORCEDATA) {
 		if (sendwin == 0) {
 			/*
 			 * If we still have some data to send, then
 			 * clear the FIN bit.  Usually this would
 			 * happen below when it realizes that we
 			 * aren't sending all the data.  However,
 			 * if we have exactly 1 byte of unsent data,
 			 * then it won't clear the FIN bit below,
 			 * and if we are in persist state, we wind
 			 * up sending the packet without recording
 			 * that we sent the FIN bit.
 			 *
 			 * We can't just blindly clear the FIN bit,
 			 * because if we don't have any more data
 			 * to send then the probe will be the FIN
 			 * itself.
 			 */
 			if (off < sbused(&so->so_snd))
 				flags &= ~TH_FIN;
 			sendwin = 1;
 		} else {
 			tcp_timer_activate(tp, TT_PERSIST, 0);
 			tp->t_rxtshift = 0;
 		}
 	}
 
 	/*
 	 * If snd_nxt == snd_max and we have transmitted a FIN, the
 	 * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
 	 * a negative length.  This can also occur when TCP opens up
 	 * its congestion window while receiving additional duplicate
 	 * acks after fast-retransmit because TCP will reset snd_nxt
 	 * to snd_max after the fast-retransmit.
 	 *
 	 * In the normal retransmit-FIN-only case, however, snd_nxt will
 	 * be set to snd_una, the offset will be 0, and the length may
 	 * wind up 0.
 	 *
 	 * If sack_rxmit is true we are retransmitting from the scoreboard
 	 * in which case len is already set.
 	 */
 	if (sack_rxmit == 0) {
 		if (sack_bytes_rxmt == 0)
 			len = ((long)ulmin(sbavail(&so->so_snd), sendwin) -
 			    off);
 		else {
 			long cwin;
 
                         /*
 			 * We are inside of a SACK recovery episode and are
 			 * sending new data, having retransmitted all the
 			 * data possible in the scoreboard.
 			 */
 			len = ((long)ulmin(sbavail(&so->so_snd), tp->snd_wnd) -
 			    off);
 			/*
 			 * Don't remove this (len > 0) check !
 			 * We explicitly check for len > 0 here (although it 
 			 * isn't really necessary), to work around a gcc 
 			 * optimization issue - to force gcc to compute
 			 * len above. Without this check, the computation
 			 * of len is bungled by the optimizer.
 			 */
 			if (len > 0) {
 				cwin = tp->snd_cwnd - 
 					(tp->snd_nxt - tp->sack_newdata) -
 					sack_bytes_rxmt;
 				if (cwin < 0)
 					cwin = 0;
 				len = lmin(len, cwin);
 			}
 		}
 	}
 
 	/*
 	 * Lop off SYN bit if it has already been sent.  However, if this
 	 * is SYN-SENT state and if segment contains data and if we don't
 	 * know that foreign host supports TAO, suppress sending segment.
 	 */
 	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
 		if (tp->t_state != TCPS_SYN_RECEIVED)
 			flags &= ~TH_SYN;
 #ifdef TCP_RFC7413
 		/*
 		 * When sending additional segments following a TFO SYN|ACK,
 		 * do not include the SYN bit.
 		 */
 		if ((tp->t_flags & TF_FASTOPEN) &&
 		    (tp->t_state == TCPS_SYN_RECEIVED))
 			flags &= ~TH_SYN;
 #endif
 		off--, len++;
 	}
 
 	/*
 	 * Be careful not to send data and/or FIN on SYN segments.
 	 * This measure is needed to prevent interoperability problems
 	 * with not fully conformant TCP implementations.
 	 */
 	if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
 		len = 0;
 		flags &= ~TH_FIN;
 	}
 
 #ifdef TCP_RFC7413
 	/*
 	 * When retransmitting SYN|ACK on a passively-created TFO socket,
 	 * don't include data, as the presence of data may have caused the
 	 * original SYN|ACK to have been dropped by a middlebox.
 	 */
 	if ((tp->t_flags & TF_FASTOPEN) &&
 	    (((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0)) ||
 	     (flags & TH_RST)))
 		len = 0;
 #endif
 	if (len <= 0) {
 		/*
 		 * If FIN has been sent but not acked,
 		 * but we haven't been called to retransmit,
 		 * len will be < 0.  Otherwise, window shrank
 		 * after we sent into it.  If window shrank to 0,
 		 * cancel pending retransmit, pull snd_nxt back
 		 * to (closed) window, and set the persist timer
 		 * if it isn't already going.  If the window didn't
 		 * close completely, just wait for an ACK.
 		 *
 		 * We also do a general check here to ensure that
 		 * we will set the persist timer when we have data
 		 * to send, but a 0-byte window. This makes sure
 		 * the persist timer is set even if the packet
 		 * hits one of the "goto send" lines below.
 		 */
 		len = 0;
 		if ((sendwin == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) &&
 			(off < (int) sbavail(&so->so_snd))) {
 			tcp_timer_activate(tp, TT_REXMT, 0);
 			tp->t_rxtshift = 0;
 			tp->snd_nxt = tp->snd_una;
 			if (!tcp_timer_active(tp, TT_PERSIST))
 				tcp_setpersist(tp);
 		}
 	}
 
 	/* len will be >= 0 after this point. */
 	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
 
 	/*
 	 * Automatic sizing of send socket buffer.  Often the send buffer
 	 * size is not optimally adjusted to the actual network conditions
 	 * at hand (delay bandwidth product).  Setting the buffer size too
 	 * small limits throughput on links with high bandwidth and high
 	 * delay (eg. trans-continental/oceanic links).  Setting the
 	 * buffer size too big consumes too much real kernel memory,
 	 * especially with many connections on busy servers.
 	 *
 	 * The criteria to step up the send buffer one notch are:
 	 *  1. receive window of remote host is larger than send buffer
 	 *     (with a fudge factor of 5/4th);
 	 *  2. send buffer is filled to 7/8th with data (so we actually
 	 *     have data to make use of it);
 	 *  3. send buffer fill has not hit maximal automatic size;
 	 *  4. our send window (slow start and cogestion controlled) is
 	 *     larger than sent but unacknowledged data in send buffer.
 	 *
 	 * The remote host receive window scaling factor may limit the
 	 * growing of the send buffer before it reaches its allowed
 	 * maximum.
 	 *
 	 * It scales directly with slow start or congestion window
 	 * and does at most one step per received ACK.  This fast
 	 * scaling has the drawback of growing the send buffer beyond
 	 * what is strictly necessary to make full use of a given
 	 * delay*bandwith product.  However testing has shown this not
 	 * to be much of an problem.  At worst we are trading wasting
 	 * of available bandwith (the non-use of it) for wasting some
 	 * socket buffer memory.
 	 *
 	 * TODO: Shrink send buffer during idle periods together
 	 * with congestion window.  Requires another timer.  Has to
 	 * wait for upcoming tcp timer rewrite.
 	 *
 	 * XXXGL: should there be used sbused() or sbavail()?
 	 */
 	if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
 		if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
 		    sbused(&so->so_snd) >= (so->so_snd.sb_hiwat / 8 * 7) &&
 		    sbused(&so->so_snd) < V_tcp_autosndbuf_max &&
 		    sendwin >= (sbused(&so->so_snd) -
 		    (tp->snd_nxt - tp->snd_una))) {
 			if (!sbreserve_locked(&so->so_snd,
 			    min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc,
 			     V_tcp_autosndbuf_max), so, curthread))
 				so->so_snd.sb_flags &= ~SB_AUTOSIZE;
 		}
 	}
 
 	/*
 	 * Decide if we can use TCP Segmentation Offloading (if supported by
 	 * hardware).
 	 *
 	 * TSO may only be used if we are in a pure bulk sending state.  The
 	 * presence of TCP-MD5, SACK retransmits, SACK advertizements and
 	 * IP options prevent using TSO.  With TSO the TCP header is the same
 	 * (except for the sequence number) for all generated packets.  This
 	 * makes it impossible to transmit any options which vary per generated
 	 * segment or packet.
 	 */
 #ifdef IPSEC
 	/*
 	 * Pre-calculate here as we save another lookup into the darknesses
 	 * of IPsec that way and can actually decide if TSO is ok.
 	 */
 	ipsec_optlen = ipsec_hdrsiz_tcp(tp);
 #endif
 	if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
 	    ((tp->t_flags & TF_SIGNATURE) == 0) &&
 	    tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
 #ifdef IPSEC
 	    ipsec_optlen == 0 &&
 #endif
 	    tp->t_inpcb->inp_options == NULL &&
 	    tp->t_inpcb->in6p_options == NULL)
 		tso = 1;
 
 	if (sack_rxmit) {
 		if (SEQ_LT(p->rxmit + len, tp->snd_una + sbused(&so->so_snd)))
 			flags &= ~TH_FIN;
 	} else {
 		if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
 		    sbused(&so->so_snd)))
 			flags &= ~TH_FIN;
 	}
 
 	recwin = sbspace(&so->so_rcv);
 
 	/*
 	 * Sender silly window avoidance.   We transmit under the following
 	 * conditions when len is non-zero:
 	 *
 	 *	- We have a full segment (or more with TSO)
 	 *	- This is the last buffer in a write()/send() and we are
 	 *	  either idle or running NODELAY
 	 *	- we've timed out (e.g. persist timer)
 	 *	- we have more then 1/2 the maximum send window's worth of
 	 *	  data (receiver may be limited the window size)
 	 *	- we need to retransmit
 	 */
 	if (len) {
 		if (len >= tp->t_maxseg)
 			goto send;
 		/*
 		 * NOTE! on localhost connections an 'ack' from the remote
 		 * end may occur synchronously with the output and cause
 		 * us to flush a buffer queued with moretocome.  XXX
 		 *
 		 * note: the len + off check is almost certainly unnecessary.
 		 */
 		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
 		    (idle || (tp->t_flags & TF_NODELAY)) &&
 		    len + off >= sbavail(&so->so_snd) &&
 		    (tp->t_flags & TF_NOPUSH) == 0) {
 			goto send;
 		}
 		if (tp->t_flags & TF_FORCEDATA)		/* typ. timeout case */
 			goto send;
 		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
 			goto send;
 		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
 			goto send;
 		if (sack_rxmit)
 			goto send;
 	}
 
 	/*
 	 * Sending of standalone window updates.
 	 *
 	 * Window updates are important when we close our window due to a
 	 * full socket buffer and are opening it again after the application
 	 * reads data from it.  Once the window has opened again and the
 	 * remote end starts to send again the ACK clock takes over and
 	 * provides the most current window information.
 	 *
 	 * We must avoid the silly window syndrome whereas every read
 	 * from the receive buffer, no matter how small, causes a window
 	 * update to be sent.  We also should avoid sending a flurry of
 	 * window updates when the socket buffer had queued a lot of data
 	 * and the application is doing small reads.
 	 *
 	 * Prevent a flurry of pointless window updates by only sending
 	 * an update when we can increase the advertized window by more
 	 * than 1/4th of the socket buffer capacity.  When the buffer is
 	 * getting full or is very small be more aggressive and send an
 	 * update whenever we can increase by two mss sized segments.
 	 * In all other situations the ACK's to new incoming data will
 	 * carry further window increases.
 	 *
 	 * Don't send an independent window update if a delayed
 	 * ACK is pending (it will get piggy-backed on it) or the
 	 * remote side already has done a half-close and won't send
 	 * more data.  Skip this if the connection is in T/TCP
 	 * half-open state.
 	 */
 	if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
 	    !(tp->t_flags & TF_DELACK) &&
 	    !TCPS_HAVERCVDFIN(tp->t_state)) {
 		/*
 		 * "adv" is the amount we could increase the window,
 		 * taking into account that we are limited by
 		 * TCP_MAXWIN << tp->rcv_scale.
 		 */
 		long adv;
 		int oldwin;
 
 		adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale);
 		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
 			oldwin = (tp->rcv_adv - tp->rcv_nxt);
 			adv -= oldwin;
 		} else
 			oldwin = 0;
 
 		/* 
 		 * If the new window size ends up being the same as the old
 		 * size when it is scaled, then don't force a window update.
 		 */
 		if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
 			goto dontupdate;
 
 		if (adv >= (long)(2 * tp->t_maxseg) &&
 		    (adv >= (long)(so->so_rcv.sb_hiwat / 4) ||
 		     recwin <= (long)(so->so_rcv.sb_hiwat / 8) ||
 		     so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg))
 			goto send;
 	}
 dontupdate:
 
 	/*
 	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
 	 * is also a catch-all for the retransmit timer timeout case.
 	 */
 	if (tp->t_flags & TF_ACKNOW)
 		goto send;
 	if ((flags & TH_RST) ||
 	    ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
 		goto send;
 	if (SEQ_GT(tp->snd_up, tp->snd_una))
 		goto send;
 	/*
 	 * If our state indicates that FIN should be sent
 	 * and we have not yet done so, then we need to send.
 	 */
 	if (flags & TH_FIN &&
 	    ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
 		goto send;
 	/*
 	 * In SACK, it is possible for tcp_output to fail to send a segment
 	 * after the retransmission timer has been turned off.  Make sure
 	 * that the retransmission timer is set.
 	 */
 	if ((tp->t_flags & TF_SACK_PERMIT) &&
 	    SEQ_GT(tp->snd_max, tp->snd_una) &&
 	    !tcp_timer_active(tp, TT_REXMT) &&
 	    !tcp_timer_active(tp, TT_PERSIST)) {
 		tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 		goto just_return;
 	} 
 	/*
 	 * TCP window updates are not reliable, rather a polling protocol
 	 * using ``persist'' packets is used to insure receipt of window
 	 * updates.  The three ``states'' for the output side are:
 	 *	idle			not doing retransmits or persists
 	 *	persisting		to move a small or zero window
 	 *	(re)transmitting	and thereby not persisting
 	 *
 	 * tcp_timer_active(tp, TT_PERSIST)
 	 *	is true when we are in persist state.
 	 * (tp->t_flags & TF_FORCEDATA)
 	 *	is set when we are called to send a persist packet.
 	 * tcp_timer_active(tp, TT_REXMT)
 	 *	is set when we are retransmitting
 	 * The output side is idle when both timers are zero.
 	 *
 	 * If send window is too small, there is data to transmit, and no
 	 * retransmit or persist is pending, then go to persist state.
 	 * If nothing happens soon, send when timer expires:
 	 * if window is nonzero, transmit what we can,
 	 * otherwise force out a byte.
 	 */
 	if (sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) &&
 	    !tcp_timer_active(tp, TT_PERSIST)) {
 		tp->t_rxtshift = 0;
 		tcp_setpersist(tp);
 	}
 
 	/*
 	 * No reason to send a segment, just return.
 	 */
 just_return:
 	SOCKBUF_UNLOCK(&so->so_snd);
 	return (0);
 
 send:
 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
 	if (len > 0) {
 		if (len >= tp->t_maxseg)
 			tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
 		else
 			tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
 	}
 	/*
 	 * Before ESTABLISHED, force sending of initial options
 	 * unless TCP set not to do any options.
 	 * NOTE: we assume that the IP/TCP header plus TCP options
 	 * always fit in a single mbuf, leaving room for a maximum
 	 * link header, i.e.
 	 *	max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
 	 */
 	optlen = 0;
 #ifdef INET6
 	if (isipv6)
 		hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
 	else
 #endif
 		hdrlen = sizeof (struct tcpiphdr);
 
 	/*
 	 * Compute options for segment.
 	 * We only have to care about SYN and established connection
 	 * segments.  Options for SYN-ACK segments are handled in TCP
 	 * syncache.
 	 */
 	to.to_flags = 0;
 	if ((tp->t_flags & TF_NOOPT) == 0) {
 		/* Maximum segment size. */
 		if (flags & TH_SYN) {
 			tp->snd_nxt = tp->iss;
 			to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc);
 			to.to_flags |= TOF_MSS;
 #ifdef TCP_RFC7413
 			/*
 			 * Only include the TFO option on the first
 			 * transmission of the SYN|ACK on a
 			 * passively-created TFO socket, as the presence of
 			 * the TFO option may have caused the original
 			 * SYN|ACK to have been dropped by a middlebox.
 			 */
 			if ((tp->t_flags & TF_FASTOPEN) &&
 			    (tp->t_state == TCPS_SYN_RECEIVED) &&
 			    (tp->t_rxtshift == 0)) {
 				to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
 				to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie;
 				to.to_flags |= TOF_FASTOPEN;
 			}
 #endif
 		}
 		/* Window scaling. */
 		if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
 			to.to_wscale = tp->request_r_scale;
 			to.to_flags |= TOF_SCALE;
 		}
 		/* Timestamps. */
 		if ((tp->t_flags & TF_RCVD_TSTMP) ||
 		    ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
 			to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
 			to.to_tsecr = tp->ts_recent;
 			to.to_flags |= TOF_TS;
 			/* Set receive buffer autosizing timestamp. */
 			if (tp->rfbuf_ts == 0 &&
 			    (so->so_rcv.sb_flags & SB_AUTOSIZE))
 				tp->rfbuf_ts = tcp_ts_getticks();
 		}
 		/* Selective ACK's. */
 		if (tp->t_flags & TF_SACK_PERMIT) {
 			if (flags & TH_SYN)
 				to.to_flags |= TOF_SACKPERM;
 			else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 			    (tp->t_flags & TF_SACK_PERMIT) &&
 			    tp->rcv_numsacks > 0) {
 				to.to_flags |= TOF_SACK;
 				to.to_nsacks = tp->rcv_numsacks;
 				to.to_sacks = (u_char *)tp->sackblks;
 			}
 		}
 #ifdef TCP_SIGNATURE
 		/* TCP-MD5 (RFC2385). */
 		if (tp->t_flags & TF_SIGNATURE)
 			to.to_flags |= TOF_SIGNATURE;
 #endif /* TCP_SIGNATURE */
 
 		/* Processing the options. */
 		hdrlen += optlen = tcp_addoptions(&to, opt);
 	}
 
 #ifdef INET6
 	if (isipv6)
 		ipoptlen = ip6_optlen(tp->t_inpcb);
 	else
 #endif
 	if (tp->t_inpcb->inp_options)
 		ipoptlen = tp->t_inpcb->inp_options->m_len -
 				offsetof(struct ipoption, ipopt_list);
 	else
 		ipoptlen = 0;
 #ifdef IPSEC
 	ipoptlen += ipsec_optlen;
 #endif
 
 	/*
 	 * Adjust data length if insertion of options will
 	 * bump the packet length beyond the t_maxseg length.
 	 * Clear the FIN bit because we cut off the tail of
 	 * the segment.
 	 */
 	if (len + optlen + ipoptlen > tp->t_maxseg) {
 		flags &= ~TH_FIN;
 
 		if (tso) {
 			u_int if_hw_tsomax;
 			u_int if_hw_tsomaxsegcount;
 			u_int if_hw_tsomaxsegsize;
 			struct mbuf *mb;
 			u_int moff;
 			int max_len;
 
 			/* extract TSO information */
 			if_hw_tsomax = tp->t_tsomax;
 			if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
 			if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
 
 			/*
 			 * Limit a TSO burst to prevent it from
 			 * overflowing or exceeding the maximum length
 			 * allowed by the network interface:
 			 */
 			KASSERT(ipoptlen == 0,
 			    ("%s: TSO can't do IP options", __func__));
 
 			/*
 			 * Check if we should limit by maximum payload
 			 * length:
 			 */
 			if (if_hw_tsomax != 0) {
 				/* compute maximum TSO length */
 				max_len = (if_hw_tsomax - hdrlen -
 				    max_linkhdr);
 				if (max_len <= 0) {
 					len = 0;
 				} else if (len > max_len) {
 					sendalot = 1;
 					len = max_len;
 				}
 			}
 
 			/*
 			 * Check if we should limit by maximum segment
 			 * size and count:
 			 */
 			if (if_hw_tsomaxsegcount != 0 &&
 			    if_hw_tsomaxsegsize != 0) {
 				/*
 				 * Subtract one segment for the LINK
 				 * and TCP/IP headers mbuf that will
 				 * be prepended to this mbuf chain
 				 * after the code in this section
 				 * limits the number of mbufs in the
 				 * chain to if_hw_tsomaxsegcount.
 				 */
 				if_hw_tsomaxsegcount -= 1;
 				max_len = 0;
 				mb = sbsndmbuf(&so->so_snd, off, &moff);
 
 				while (mb != NULL && max_len < len) {
 					u_int mlen;
 					u_int frags;
 
 					/*
 					 * Get length of mbuf fragment
 					 * and how many hardware frags,
 					 * rounded up, it would use:
 					 */
 					mlen = (mb->m_len - moff);
 					frags = howmany(mlen,
 					    if_hw_tsomaxsegsize);
 
 					/* Handle special case: Zero Length Mbuf */
 					if (frags == 0)
 						frags = 1;
 
 					/*
 					 * Check if the fragment limit
 					 * will be reached or exceeded:
 					 */
 					if (frags >= if_hw_tsomaxsegcount) {
 						max_len += min(mlen,
 						    if_hw_tsomaxsegcount *
 						    if_hw_tsomaxsegsize);
 						break;
 					}
 					max_len += mlen;
 					if_hw_tsomaxsegcount -= frags;
 					moff = 0;
 					mb = mb->m_next;
 				}
 				if (max_len <= 0) {
 					len = 0;
 				} else if (len > max_len) {
 					sendalot = 1;
 					len = max_len;
 				}
 			}
 
 			/*
 			 * Prevent the last segment from being
 			 * fractional unless the send sockbuf can be
 			 * emptied:
 			 */
 			max_len = (tp->t_maxseg - optlen);
 			if ((off + len) < sbavail(&so->so_snd)) {
 				moff = len % max_len;
 				if (moff != 0) {
 					len -= moff;
 					sendalot = 1;
 				}
 			}
 
 			/*
 			 * In case there are too many small fragments
 			 * don't use TSO:
 			 */
 			if (len <= max_len) {
 				len = max_len;
 				sendalot = 1;
 				tso = 0;
 			}
 
 			/*
 			 * Send the FIN in a separate segment
 			 * after the bulk sending is done.
 			 * We don't trust the TSO implementations
 			 * to clear the FIN flag on all but the
 			 * last segment.
 			 */
 			if (tp->t_flags & TF_NEEDFIN)
 				sendalot = 1;
 
 		} else {
 			len = tp->t_maxseg - optlen - ipoptlen;
 			sendalot = 1;
 		}
 	} else
 		tso = 0;
 
 	KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
 	    ("%s: len > IP_MAXPACKET", __func__));
 
 /*#ifdef DIAGNOSTIC*/
 #ifdef INET6
 	if (max_linkhdr + hdrlen > MCLBYTES)
 #else
 	if (max_linkhdr + hdrlen > MHLEN)
 #endif
 		panic("tcphdr too big");
 /*#endif*/
 
 	/*
 	 * This KASSERT is here to catch edge cases at a well defined place.
 	 * Before, those had triggered (random) panic conditions further down.
 	 */
 	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
 
 	/*
 	 * Grab a header mbuf, attaching a copy of data to
 	 * be transmitted, and initialize the header from
 	 * the template for sends on this connection.
 	 */
 	if (len) {
 		struct mbuf *mb;
 		u_int moff;
 
 		if ((tp->t_flags & TF_FORCEDATA) && len == 1)
 			TCPSTAT_INC(tcps_sndprobe);
 		else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
 			tp->t_sndrexmitpack++;
 			TCPSTAT_INC(tcps_sndrexmitpack);
 			TCPSTAT_ADD(tcps_sndrexmitbyte, len);
 		} else {
 			TCPSTAT_INC(tcps_sndpack);
 			TCPSTAT_ADD(tcps_sndbyte, len);
 		}
 #ifdef INET6
 		if (MHLEN < hdrlen + max_linkhdr)
 			m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		else
 #endif
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 
 		if (m == NULL) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = ENOBUFS;
 			sack_rxmit = 0;
 			goto out;
 		}
 
 		m->m_data += max_linkhdr;
 		m->m_len = hdrlen;
 
 		/*
 		 * Start the m_copy functions from the closest mbuf
 		 * to the offset in the socket buffer chain.
 		 */
 		mb = sbsndptr(&so->so_snd, off, len, &moff);
 
 		if (len <= MHLEN - hdrlen - max_linkhdr) {
 			m_copydata(mb, moff, (int)len,
 			    mtod(m, caddr_t) + hdrlen);
 			m->m_len += len;
 		} else {
 			m->m_next = m_copy(mb, moff, (int)len);
 			if (m->m_next == NULL) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				(void) m_free(m);
 				error = ENOBUFS;
 				sack_rxmit = 0;
 				goto out;
 			}
 		}
 
 		/*
 		 * If we're sending everything we've got, set PUSH.
 		 * (This will keep happy those implementations which only
 		 * give data to the user when a buffer fills or
 		 * a PUSH comes in.)
 		 */
 		if ((off + len == sbused(&so->so_snd)) && !(flags & TH_SYN))
 			flags |= TH_PUSH;
 		SOCKBUF_UNLOCK(&so->so_snd);
 	} else {
 		SOCKBUF_UNLOCK(&so->so_snd);
 		if (tp->t_flags & TF_ACKNOW)
 			TCPSTAT_INC(tcps_sndacks);
 		else if (flags & (TH_SYN|TH_FIN|TH_RST))
 			TCPSTAT_INC(tcps_sndctrl);
 		else if (SEQ_GT(tp->snd_up, tp->snd_una))
 			TCPSTAT_INC(tcps_sndurg);
 		else
 			TCPSTAT_INC(tcps_sndwinup);
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			error = ENOBUFS;
 			sack_rxmit = 0;
 			goto out;
 		}
 #ifdef INET6
 		if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
 		    MHLEN >= hdrlen) {
 			M_ALIGN(m, hdrlen);
 		} else
 #endif
 		m->m_data += max_linkhdr;
 		m->m_len = hdrlen;
 	}
 	SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
 	m->m_pkthdr.rcvif = (struct ifnet *)0;
 #ifdef MAC
 	mac_inpcb_create_mbuf(tp->t_inpcb, m);
 #endif
 #ifdef INET6
 	if (isipv6) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		th = (struct tcphdr *)(ip6 + 1);
 		tcpip_fillheaders(tp->t_inpcb, ip6, th);
 	} else
 #endif /* INET6 */
 	{
 		ip = mtod(m, struct ip *);
 		ipov = (struct ipovly *)ip;
 		th = (struct tcphdr *)(ip + 1);
 		tcpip_fillheaders(tp->t_inpcb, ip, th);
 	}
 
 	/*
 	 * Fill in fields, remembering maximum advertised
 	 * window for use in delaying messages about window sizes.
 	 * If resending a FIN, be sure not to use a new sequence number.
 	 */
 	if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
 	    tp->snd_nxt == tp->snd_max)
 		tp->snd_nxt--;
 	/*
 	 * If we are starting a connection, send ECN setup
 	 * SYN packet. If we are on a retransmit, we may
 	 * resend those bits a number of times as per
 	 * RFC 3168.
 	 */
 	if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) {
 		if (tp->t_rxtshift >= 1) {
 			if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
 				flags |= TH_ECE|TH_CWR;
 		} else
 			flags |= TH_ECE|TH_CWR;
 	}
 	
 	if (tp->t_state == TCPS_ESTABLISHED &&
 	    (tp->t_flags & TF_ECN_PERMIT)) {
 		/*
 		 * If the peer has ECN, mark data packets with
 		 * ECN capable transmission (ECT).
 		 * Ignore pure ack packets, retransmissions and window probes.
 		 */
 		if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
 		    !((tp->t_flags & TF_FORCEDATA) && len == 1)) {
 #ifdef INET6
 			if (isipv6)
 				ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
 			else
 #endif
 				ip->ip_tos |= IPTOS_ECN_ECT0;
 			TCPSTAT_INC(tcps_ecn_ect0);
 		}
 		
 		/*
 		 * Reply with proper ECN notifications.
 		 */
 		if (tp->t_flags & TF_ECN_SND_CWR) {
 			flags |= TH_CWR;
 			tp->t_flags &= ~TF_ECN_SND_CWR;
 		} 
 		if (tp->t_flags & TF_ECN_SND_ECE)
 			flags |= TH_ECE;
 	}
 	
 	/*
 	 * If we are doing retransmissions, then snd_nxt will
 	 * not reflect the first unsent octet.  For ACK only
 	 * packets, we do not want the sequence number of the
 	 * retransmitted packet, we want the sequence number
 	 * of the next unsent octet.  So, if there is no data
 	 * (and no SYN or FIN), use snd_max instead of snd_nxt
 	 * when filling in ti_seq.  But if we are in persist
 	 * state, snd_max might reflect one byte beyond the
 	 * right edge of the window, so use snd_nxt in that
 	 * case, since we know we aren't doing a retransmission.
 	 * (retransmit and persist are mutually exclusive...)
 	 */
 	if (sack_rxmit == 0) {
 		if (len || (flags & (TH_SYN|TH_FIN)) ||
 		    tcp_timer_active(tp, TT_PERSIST))
 			th->th_seq = htonl(tp->snd_nxt);
 		else
 			th->th_seq = htonl(tp->snd_max);
 	} else {
 		th->th_seq = htonl(p->rxmit);
 		p->rxmit += len;
 		tp->sackhint.sack_bytes_rexmit += len;
 	}
 	th->th_ack = htonl(tp->rcv_nxt);
 	if (optlen) {
 		bcopy(opt, th + 1, optlen);
 		th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
 	}
 	th->th_flags = flags;
 	/*
 	 * Calculate receive window.  Don't shrink window,
 	 * but avoid silly window syndrome.
 	 */
 	if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
 	    recwin < (long)tp->t_maxseg)
 		recwin = 0;
 	if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
 	    recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
 		recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
 	if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
 		recwin = (long)TCP_MAXWIN << tp->rcv_scale;
 
 	/*
 	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
 	 * or <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK>
 	 * case is handled in syncache.
 	 */
 	if (flags & TH_SYN)
 		th->th_win = htons((u_short)
 				(min(sbspace(&so->so_rcv), TCP_MAXWIN)));
 	else
 		th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
 
 	/*
 	 * Adjust the RXWIN0SENT flag - indicate that we have advertised
 	 * a 0 window.  This may cause the remote transmitter to stall.  This
 	 * flag tells soreceive() to disable delayed acknowledgements when
 	 * draining the buffer.  This can occur if the receiver is attempting
 	 * to read more data than can be buffered prior to transmitting on
 	 * the connection.
 	 */
 	if (th->th_win == 0) {
 		tp->t_sndzerowin++;
 		tp->t_flags |= TF_RXWIN0SENT;
 	} else
 		tp->t_flags &= ~TF_RXWIN0SENT;
 	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
 		th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
 		th->th_flags |= TH_URG;
 	} else
 		/*
 		 * If no urgent pointer to send, then we pull
 		 * the urgent pointer to the left edge of the send window
 		 * so that it doesn't drift into the send window on sequence
 		 * number wraparound.
 		 */
 		tp->snd_up = tp->snd_una;		/* drag it along */
 
 #ifdef TCP_SIGNATURE
 	if (to.to_flags & TOF_SIGNATURE) {
 		int sigoff = to.to_signature - opt;
 		tcp_signature_compute(m, 0, len, optlen,
 		    (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND);
 	}
 #endif
 
 	/*
 	 * Put TCP length in extended header, and then
 	 * checksum extended header and data.
 	 */
 	m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
 	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 #ifdef INET6
 	if (isipv6) {
 		/*
 		 * ip6_plen is not need to be filled now, and will be filled
 		 * in ip6_output.
 		 */
 		m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
 		th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
 		    optlen + len, IPPROTO_TCP, 0);
 	}
 #endif
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		m->m_pkthdr.csum_flags = CSUM_TCP;
 		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 		    htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen));
 
 		/* IP version must be set here for ipv4/ipv6 checking later */
 		KASSERT(ip->ip_v == IPVERSION,
 		    ("%s: IP version incorrect: %d", __func__, ip->ip_v));
 	}
 #endif
 
 	/*
 	 * Enable TSO and specify the size of the segments.
 	 * The TCP pseudo header checksum is always provided.
 	 */
 	if (tso) {
 		KASSERT(len > tp->t_maxseg - optlen,
 		    ("%s: len <= tso_segsz", __func__));
 		m->m_pkthdr.csum_flags |= CSUM_TSO;
 		m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
 	}
 
 #ifdef IPSEC
 	KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL),
 	    ("%s: mbuf chain shorter than expected: %ld + %u + %u - %u != %u",
 	    __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL)));
 #else
 	KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL),
 	    ("%s: mbuf chain shorter than expected: %ld + %u + %u != %u",
 	    __func__, len, hdrlen, ipoptlen, m_length(m, NULL)));
 #endif
 
 	/* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
 	hhook_run_tcp_est_out(tp, th, &to, len, tso);
 
 #ifdef TCPDEBUG
 	/*
 	 * Trace.
 	 */
 	if (so->so_options & SO_DEBUG) {
 		u_short save = 0;
 #ifdef INET6
 		if (!isipv6)
 #endif
 		{
 			save = ipov->ih_len;
 			ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */);
 		}
 		tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
 #ifdef INET6
 		if (!isipv6)
 #endif
 		ipov->ih_len = save;
 	}
 #endif /* TCPDEBUG */
 	TCP_PROBE3(debug__output, tp, th, mtod(m, const char *));
 
 	/*
 	 * Fill in IP length and desired time to live and
 	 * send to IP level.  There should be a better way
 	 * to handle ttl and tos; we could keep them in
 	 * the template, but need a way to checksum without them.
 	 */
 	/*
 	 * m->m_pkthdr.len should have been set before checksum calculation,
 	 * because in6_cksum() need it.
 	 */
 #ifdef INET6
 	if (isipv6) {
 		struct route_in6 ro;
 
 		bzero(&ro, sizeof(ro));
 		/*
 		 * we separately set hoplimit for every segment, since the
 		 * user might want to change the value via setsockopt.
 		 * Also, desired default hop limit might be changed via
 		 * Neighbor Discovery.
 		 */
 		ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
 
 		/*
 		 * Set the packet size here for the benefit of DTrace probes.
 		 * ip6_output() will set it properly; it's supposed to include
 		 * the option header lengths as well.
 		 */
 		ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
 
 		if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
 			tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 		else
 			tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 
 		if (tp->t_state == TCPS_SYN_SENT)
 			TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
 
 		TCP_PROBE5(send, NULL, tp, ip6, tp, th);
 
 #ifdef TCPPCAP
 		/* Save packet, if requested. */
 		tcp_pcap_add(th, m, &(tp->t_outpkts));
 #endif
 
 		/* TODO: IPv6 IP6TOS_ECT bit on */
 		error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &ro,
 		    ((so->so_options & SO_DONTROUTE) ?  IP_ROUTETOIF : 0),
 		    NULL, NULL, tp->t_inpcb);
 
 		if (error == EMSGSIZE && ro.ro_rt != NULL)
 			mtu = ro.ro_rt->rt_mtu;
 		RO_RTFREE(&ro);
 	}
 #endif /* INET6 */
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
     {
-	struct route ro;
-
-	bzero(&ro, sizeof(ro));
 	ip->ip_len = htons(m->m_pkthdr.len);
 #ifdef INET6
 	if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO)
 		ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL);
 #endif /* INET6 */
 	/*
 	 * If we do path MTU discovery, then we set DF on every packet.
 	 * This might not be the best thing to do according to RFC3390
 	 * Section 2. However the tcp hostcache migitates the problem
 	 * so it affects only the first tcp connection with a host.
 	 *
 	 * NB: Don't set DF on small MTU/MSS to have a safe fallback.
 	 */
 	if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
 		ip->ip_off |= htons(IP_DF);
 		tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 	} else {
 		tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 	}
 
 	if (tp->t_state == TCPS_SYN_SENT)
 		TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
 
 	TCP_PROBE5(send, NULL, tp, ip, tp, th);
 
 #ifdef TCPPCAP
 	/* Save packet, if requested. */
 	tcp_pcap_add(th, m, &(tp->t_outpkts));
 #endif
 
-	error = ip_output(m, tp->t_inpcb->inp_options, &ro,
+	error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
 	    ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
 	    tp->t_inpcb);
 
-	if (error == EMSGSIZE && ro.ro_rt != NULL)
-		mtu = ro.ro_rt->rt_mtu;
-	RO_RTFREE(&ro);
+	if (error == EMSGSIZE && tp->t_inpcb->inp_route.ro_rt != NULL)
+		mtu = tp->t_inpcb->inp_route.ro_rt->rt_mtu;
     }
 #endif /* INET */
 
 out:
 	/*
 	 * In transmit state, time the transmission and arrange for
 	 * the retransmit.  In persist state, just set snd_max.
 	 */
 	if ((tp->t_flags & TF_FORCEDATA) == 0 || 
 	    !tcp_timer_active(tp, TT_PERSIST)) {
 		tcp_seq startseq = tp->snd_nxt;
 
 		/*
 		 * Advance snd_nxt over sequence space of this segment.
 		 */
 		if (flags & (TH_SYN|TH_FIN)) {
 			if (flags & TH_SYN)
 				tp->snd_nxt++;
 			if (flags & TH_FIN) {
 				tp->snd_nxt++;
 				tp->t_flags |= TF_SENTFIN;
 			}
 		}
 		if (sack_rxmit)
 			goto timer;
 		tp->snd_nxt += len;
 		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
 			tp->snd_max = tp->snd_nxt;
 			/*
 			 * Time this transmission if not a retransmission and
 			 * not currently timing anything.
 			 */
 			if (tp->t_rtttime == 0) {
 				tp->t_rtttime = ticks;
 				tp->t_rtseq = startseq;
 				TCPSTAT_INC(tcps_segstimed);
 			}
 		}
 
 		/*
 		 * Set retransmit timer if not currently set,
 		 * and not doing a pure ack or a keep-alive probe.
 		 * Initial value for retransmit timer is smoothed
 		 * round-trip time + 2 * round-trip time variance.
 		 * Initialize shift counter which is used for backoff
 		 * of retransmit time.
 		 */
 timer:
 		if (!tcp_timer_active(tp, TT_REXMT) &&
 		    ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
 		     (tp->snd_nxt != tp->snd_una))) {
 			if (tcp_timer_active(tp, TT_PERSIST)) {
 				tcp_timer_activate(tp, TT_PERSIST, 0);
 				tp->t_rxtshift = 0;
 			}
 			tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 		} else if (len == 0 && sbavail(&so->so_snd) &&
 		    !tcp_timer_active(tp, TT_REXMT) &&
 		    !tcp_timer_active(tp, TT_PERSIST)) {
 			/*
 			 * Avoid a situation where we do not set persist timer
 			 * after a zero window condition. For example:
 			 * 1) A -> B: packet with enough data to fill the window
 			 * 2) B -> A: ACK for #1 + new data (0 window
 			 *    advertisement)
 			 * 3) A -> B: ACK for #2, 0 len packet
 			 *
 			 * In this case, A will not activate the persist timer,
 			 * because it chose to send a packet. Unless tcp_output
 			 * is called for some other reason (delayed ack timer,
 			 * another input packet from B, socket syscall), A will
 			 * not send zero window probes.
 			 *
 			 * So, if you send a 0-length packet, but there is data
 			 * in the socket buffer, and neither the rexmt or
 			 * persist timer is already set, then activate the
 			 * persist timer.
 			 */
 			tp->t_rxtshift = 0;
 			tcp_setpersist(tp);
 		}
 	} else {
 		/*
 		 * Persist case, update snd_max but since we are in
 		 * persist mode (no window) we do not update snd_nxt.
 		 */
 		int xlen = len;
 		if (flags & TH_SYN)
 			++xlen;
 		if (flags & TH_FIN) {
 			++xlen;
 			tp->t_flags |= TF_SENTFIN;
 		}
 		if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
 			tp->snd_max = tp->snd_nxt + len;
 	}
 
 	if (error) {
 
 		/*
 		 * We know that the packet was lost, so back out the
 		 * sequence number advance, if any.
 		 *
 		 * If the error is EPERM the packet got blocked by the
 		 * local firewall.  Normally we should terminate the
 		 * connection but the blocking may have been spurious
 		 * due to a firewall reconfiguration cycle.  So we treat
 		 * it like a packet loss and let the retransmit timer and
 		 * timeouts do their work over time.
 		 * XXX: It is a POLA question whether calling tcp_drop right
 		 * away would be the really correct behavior instead.
 		 */
 		if (((tp->t_flags & TF_FORCEDATA) == 0 ||
 		    !tcp_timer_active(tp, TT_PERSIST)) &&
 		    ((flags & TH_SYN) == 0) &&
 		    (error != EPERM)) {
 			if (sack_rxmit) {
 				p->rxmit -= len;
 				tp->sackhint.sack_bytes_rexmit -= len;
 				KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
 				    ("sackhint bytes rtx >= 0"));
 			} else
 				tp->snd_nxt -= len;
 		}
 		SOCKBUF_UNLOCK_ASSERT(&so->so_snd);	/* Check gotos. */
 		switch (error) {
 		case EPERM:
 			tp->t_softerror = error;
 			return (error);
 		case ENOBUFS:
 	                if (!tcp_timer_active(tp, TT_REXMT) &&
 			    !tcp_timer_active(tp, TT_PERSIST))
 	                        tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 			tp->snd_cwnd = tp->t_maxseg;
 			return (0);
 		case EMSGSIZE:
 			/*
 			 * For some reason the interface we used initially
 			 * to send segments changed to another or lowered
 			 * its MTU.
 			 * If TSO was active we either got an interface
 			 * without TSO capabilits or TSO was turned off.
 			 * If we obtained mtu from ip_output() then update
 			 * it and try again.
 			 */
 			if (tso)
 				tp->t_flags &= ~TF_TSO;
 			if (mtu != 0) {
 				tcp_mss_update(tp, -1, mtu, NULL, NULL);
 				goto again;
 			}
 			return (error);
 		case EHOSTDOWN:
 		case EHOSTUNREACH:
 		case ENETDOWN:
 		case ENETUNREACH:
 			if (TCPS_HAVERCVDSYN(tp->t_state)) {
 				tp->t_softerror = error;
 				return (0);
 			}
 			/* FALLTHROUGH */
 		default:
 			return (error);
 		}
 	}
 	TCPSTAT_INC(tcps_sndtotal);
 
 	/*
 	 * Data sent (as far as we can tell).
 	 * If this advertises a larger window than any other segment,
 	 * then remember the size of the advertised window.
 	 * Any pending ACK has now been sent.
 	 */
 	if (recwin >= 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
 		tp->rcv_adv = tp->rcv_nxt + recwin;
 	tp->last_ack_sent = tp->rcv_nxt;
 	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
 	if (tcp_timer_active(tp, TT_DELACK))
 		tcp_timer_activate(tp, TT_DELACK, 0);
 #if 0
 	/*
 	 * This completely breaks TCP if newreno is turned on.  What happens
 	 * is that if delayed-acks are turned on on the receiver, this code
 	 * on the transmitter effectively destroys the TCP window, forcing
 	 * it to four packets (1.5Kx4 = 6K window).
 	 */
 	if (sendalot && --maxburst)
 		goto again;
 #endif
 	if (sendalot)
 		goto again;
 	return (0);
 }
 
 void
 tcp_setpersist(struct tcpcb *tp)
 {
 	int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
 	int tt;
 
 	tp->t_flags &= ~TF_PREVVALID;
 	if (tcp_timer_active(tp, TT_REXMT))
 		panic("tcp_setpersist: retransmit pending");
 	/*
 	 * Start/restart persistance timer.
 	 */
 	TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
 		      tcp_persmin, tcp_persmax);
 	tcp_timer_activate(tp, TT_PERSIST, tt);
 	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
 		tp->t_rxtshift++;
 }
 
 /*
  * Insert TCP options according to the supplied parameters to the place
  * optp in a consistent way.  Can handle unaligned destinations.
  *
  * The order of the option processing is crucial for optimal packing and
  * alignment for the scarce option space.
  *
  * The optimal order for a SYN/SYN-ACK segment is:
  *   MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) +
  *   Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40.
  *
  * The SACK options should be last.  SACK blocks consume 8*n+2 bytes.
  * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks).
  * At minimum we need 10 bytes (to generate 1 SACK block).  If both
  * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present,
  * we only have 10 bytes for SACK options (40 - (12 + 18)).
  */
 int
 tcp_addoptions(struct tcpopt *to, u_char *optp)
 {
 	u_int32_t mask, optlen = 0;
 
 	for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) {
 		if ((to->to_flags & mask) != mask)
 			continue;
 		if (optlen == TCP_MAXOLEN)
 			break;
 		switch (to->to_flags & mask) {
 		case TOF_MSS:
 			while (optlen % 4) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG)
 				continue;
 			optlen += TCPOLEN_MAXSEG;
 			*optp++ = TCPOPT_MAXSEG;
 			*optp++ = TCPOLEN_MAXSEG;
 			to->to_mss = htons(to->to_mss);
 			bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss));
 			optp += sizeof(to->to_mss);
 			break;
 		case TOF_SCALE:
 			while (!optlen || optlen % 2 != 1) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW)
 				continue;
 			optlen += TCPOLEN_WINDOW;
 			*optp++ = TCPOPT_WINDOW;
 			*optp++ = TCPOLEN_WINDOW;
 			*optp++ = to->to_wscale;
 			break;
 		case TOF_SACKPERM:
 			while (optlen % 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED)
 				continue;
 			optlen += TCPOLEN_SACK_PERMITTED;
 			*optp++ = TCPOPT_SACK_PERMITTED;
 			*optp++ = TCPOLEN_SACK_PERMITTED;
 			break;
 		case TOF_TS:
 			while (!optlen || optlen % 4 != 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP)
 				continue;
 			optlen += TCPOLEN_TIMESTAMP;
 			*optp++ = TCPOPT_TIMESTAMP;
 			*optp++ = TCPOLEN_TIMESTAMP;
 			to->to_tsval = htonl(to->to_tsval);
 			to->to_tsecr = htonl(to->to_tsecr);
 			bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval));
 			optp += sizeof(to->to_tsval);
 			bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr));
 			optp += sizeof(to->to_tsecr);
 			break;
 #ifdef TCP_SIGNATURE
 		case TOF_SIGNATURE:
 			{
 			int siglen = TCPOLEN_SIGNATURE - 2;
 
 			while (!optlen || optlen % 4 != 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE)
 				continue;
 			optlen += TCPOLEN_SIGNATURE;
 			*optp++ = TCPOPT_SIGNATURE;
 			*optp++ = TCPOLEN_SIGNATURE;
 			to->to_signature = optp;
 			while (siglen--)
 				 *optp++ = 0;
 			break;
 			}
 #endif
 		case TOF_SACK:
 			{
 			int sackblks = 0;
 			struct sackblk *sack = (struct sackblk *)to->to_sacks;
 			tcp_seq sack_seq;
 
 			while (!optlen || optlen % 4 != 2) {
 				optlen += TCPOLEN_NOP;
 				*optp++ = TCPOPT_NOP;
 			}
 			if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK)
 				continue;
 			optlen += TCPOLEN_SACKHDR;
 			*optp++ = TCPOPT_SACK;
 			sackblks = min(to->to_nsacks,
 					(TCP_MAXOLEN - optlen) / TCPOLEN_SACK);
 			*optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK;
 			while (sackblks--) {
 				sack_seq = htonl(sack->start);
 				bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
 				optp += sizeof(sack_seq);
 				sack_seq = htonl(sack->end);
 				bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
 				optp += sizeof(sack_seq);
 				optlen += TCPOLEN_SACK;
 				sack++;
 			}
 			TCPSTAT_INC(tcps_sack_send_blocks);
 			break;
 			}
 #ifdef TCP_RFC7413
 		case TOF_FASTOPEN:
 			{
 			int total_len;
 
 			/* XXX is there any point to aligning this option? */
 			total_len = TCPOLEN_FAST_OPEN_EMPTY + to->to_tfo_len;
 			if (TCP_MAXOLEN - optlen < total_len)
 				continue;
 			*optp++ = TCPOPT_FAST_OPEN;
 			*optp++ = total_len;
 			if (to->to_tfo_len > 0) {
 				bcopy(to->to_tfo_cookie, optp, to->to_tfo_len);
 				optp += to->to_tfo_len;
 			}
 			optlen += total_len;
 			break;
 			}
 #endif
 		default:
 			panic("%s: unknown TCP option type", __func__);
 			break;
 		}
 	}
 
 	/* Terminate and pad TCP options to a 4 byte boundary. */
 	if (optlen % 4) {
 		optlen += TCPOLEN_EOL;
 		*optp++ = TCPOPT_EOL;
 	}
 	/*
 	 * According to RFC 793 (STD0007):
 	 *   "The content of the header beyond the End-of-Option option
 	 *    must be header padding (i.e., zero)."
 	 *   and later: "The padding is composed of zeros."
 	 */
 	while (optlen % 4) {
 		optlen += TCPOLEN_PAD;
 		*optp++ = TCPOPT_PAD;
 	}
 
 	KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__));
 	return (optlen);
 }
Index: head/sys/netinet/tcp_subr.c
===================================================================
--- head/sys/netinet/tcp_subr.c	(revision 297224)
+++ head/sys/netinet/tcp_subr.c	(revision 297225)
@@ -1,2993 +1,2997 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/callout.h>
 #include <sys/eventhandler.h>
 #include <sys/hhook.h>
 #include <sys/kernel.h>
 #include <sys/khelp.h>
 #include <sys/sysctl.h>
 #include <sys/jail.h>
 #include <sys/malloc.h>
 #include <sys/refcount.h>
 #include <sys/mbuf.h>
 #ifdef INET6
 #include <sys/domain.h>
 #endif
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/random.h>
 
 #include <vm/uma.h>
 
 #include <net/route.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #endif
 
 #ifdef TCP_RFC7413
 #include <netinet/tcp_fastopen.h>
 #endif
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_syncache.h>
 #include <netinet/cc/cc.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 #include <netinet/tcpip.h>
 #ifdef TCPPCAP
 #include <netinet/tcp_pcap.h>
 #endif
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
 #ifdef INET6
 #include <netinet6/ip6protosw.h>
 #endif
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/xform.h>
 #ifdef INET6
 #include <netipsec/ipsec6.h>
 #endif
 #include <netipsec/key.h>
 #include <sys/syslog.h>
 #endif /*IPSEC*/
 
 #include <machine/in_cksum.h>
 #include <sys/md5.h>
 
 #include <security/mac/mac_framework.h>
 
 VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS;
 #ifdef INET6
 VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS;
 #endif
 
 struct rwlock tcp_function_lock;
 
 static int
 sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS)
 {
 	int error, new;
 
 	new = V_tcp_mssdflt;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr) {
 		if (new < TCP_MINMSS)
 			error = EINVAL;
 		else
 			V_tcp_mssdflt = new;
 	}
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0,
     &sysctl_net_inet_tcp_mss_check, "I",
     "Default TCP Maximum Segment Size");
 
 #ifdef INET6
 static int
 sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS)
 {
 	int error, new;
 
 	new = V_tcp_v6mssdflt;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr) {
 		if (new < TCP_MINMSS)
 			error = EINVAL;
 		else
 			V_tcp_v6mssdflt = new;
 	}
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0,
     &sysctl_net_inet_tcp_mss_v6_check, "I",
    "Default TCP Maximum Segment Size for IPv6");
 #endif /* INET6 */
 
 /*
  * Minimum MSS we accept and use. This prevents DoS attacks where
  * we are forced to a ridiculous low MSS like 20 and send hundreds
  * of packets instead of one. The effect scales with the available
  * bandwidth and quickly saturates the CPU and network interface
  * with packet generation and sending. Set to zero to disable MINMSS
  * checking. This setting prevents us from sending too small packets.
  */
 VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW,
      &VNET_NAME(tcp_minmss), 0,
     "Minimum TCP Maximum Segment Size");
 
 VNET_DEFINE(int, tcp_do_rfc1323) = 1;
 SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc1323), 0,
     "Enable rfc1323 (high performance TCP) extensions");
 
 static int	tcp_log_debug = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW,
     &tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
 
 static int	tcp_tcbhashsize;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
 
 static int	do_tcpdrain = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
     "Enable tcp_drain routine for extra help when low on mbufs");
 
 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD,
     &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs");
 
 static VNET_DEFINE(int, icmp_may_rst) = 1;
 #define	V_icmp_may_rst			VNET(icmp_may_rst)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(icmp_may_rst), 0,
     "Certain ICMP unreachable messages may abort connections in SYN_SENT");
 
 static VNET_DEFINE(int, tcp_isn_reseed_interval) = 0;
 #define	V_tcp_isn_reseed_interval	VNET(tcp_isn_reseed_interval)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_isn_reseed_interval), 0,
     "Seconds between reseeding of ISN secret");
 
 static int	tcp_soreceive_stream;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN,
     &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets");
 
 #ifdef TCP_SIGNATURE
 static int	tcp_sig_checksigs = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, signature_verify_input, CTLFLAG_RW,
     &tcp_sig_checksigs, 0, "Verify RFC2385 digests on inbound traffic");
 #endif
 
 VNET_DEFINE(uma_zone_t, sack_hole_zone);
 #define	V_sack_hole_zone		VNET(sack_hole_zone)
 
 VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]);
 
 static struct inpcb *tcp_notify(struct inpcb *, int);
 static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int);
 static void tcp_mtudisc(struct inpcb *, int);
 static char *	tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
 		    void *ip4hdr, const void *ip6hdr);
 static void	tcp_timer_discard(struct tcpcb *, uint32_t);
 
 
 static struct tcp_function_block tcp_def_funcblk = {
 	"default",
 	tcp_output,
 	tcp_do_segment,
 	tcp_default_ctloutput,
 	NULL,
 	NULL,	
 	NULL,
 	NULL,
 	NULL,
 	NULL,
 	NULL,
 	0,
 	0
 };
 
 int t_functions_inited = 0;
 struct tcp_funchead t_functions;
 static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk;
 
 static void
 init_tcp_functions(void)
 {
 	if (t_functions_inited == 0) {
 		TAILQ_INIT(&t_functions);
 		rw_init_flags(&tcp_function_lock, "tcp_func_lock" , 0);
 		t_functions_inited = 1;
 	}
 }
 
 static struct tcp_function_block *
 find_tcp_functions_locked(struct tcp_function_set *fs)
 {
 	struct tcp_function *f;
 	struct tcp_function_block *blk=NULL;
 
 	TAILQ_FOREACH(f, &t_functions, tf_next) {
 		if (strcmp(f->tf_fb->tfb_tcp_block_name, fs->function_set_name) == 0) {
 			blk = f->tf_fb;
 			break;
 		}
 	}
 	return(blk);
 }
 
 static struct tcp_function_block *
 find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s)
 {
 	struct tcp_function_block *rblk=NULL;
 	struct tcp_function *f;
 
 	TAILQ_FOREACH(f, &t_functions, tf_next) {
 		if (f->tf_fb == blk) {
 			rblk = blk;
 			if (s) {
 				*s = f;
 			}
 			break;
 		}
 	}
 	return (rblk);
 }
 
 struct tcp_function_block *
 find_and_ref_tcp_functions(struct tcp_function_set *fs)
 {
 	struct tcp_function_block *blk;
 	
 	rw_rlock(&tcp_function_lock);	
 	blk = find_tcp_functions_locked(fs);
 	if (blk)
 		refcount_acquire(&blk->tfb_refcnt); 
 	rw_runlock(&tcp_function_lock);
 	return(blk);
 }
 
 struct tcp_function_block *
 find_and_ref_tcp_fb(struct tcp_function_block *blk)
 {
 	struct tcp_function_block *rblk;
 	
 	rw_rlock(&tcp_function_lock);	
 	rblk = find_tcp_fb_locked(blk, NULL);
 	if (rblk) 
 		refcount_acquire(&rblk->tfb_refcnt);
 	rw_runlock(&tcp_function_lock);
 	return(rblk);
 }
 
 
 static int
 sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS)
 {
 	int error=ENOENT;
 	struct tcp_function_set fs;
 	struct tcp_function_block *blk;
 
 	memset(&fs, 0, sizeof(fs));
 	rw_rlock(&tcp_function_lock);
 	blk = find_tcp_fb_locked(tcp_func_set_ptr, NULL);
 	if (blk) {
 		/* Found him */
 		strcpy(fs.function_set_name, blk->tfb_tcp_block_name);
 		fs.pcbcnt = blk->tfb_refcnt;
 	}
 	rw_runlock(&tcp_function_lock);	
 	error = sysctl_handle_string(oidp, fs.function_set_name,
 				     sizeof(fs.function_set_name), req);
 
 	/* Check for error or no change */
 	if (error != 0 || req->newptr == NULL)
 		return(error);
 
 	rw_wlock(&tcp_function_lock);
 	blk = find_tcp_functions_locked(&fs);
 	if ((blk == NULL) ||
 	    (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) { 
 		error = ENOENT; 
 		goto done;
 	}
 	tcp_func_set_ptr = blk;
 done:
 	rw_wunlock(&tcp_function_lock);
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default,
 	    CTLTYPE_STRING | CTLFLAG_RW,
 	    NULL, 0, sysctl_net_inet_default_tcp_functions, "A",
 	    "Set/get the default TCP functions");
 
 static int
 sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS)
 {
 	int error, cnt, linesz;
 	struct tcp_function *f;
 	char *buffer, *cp;
 	size_t bufsz, outsz;
 
 	cnt = 0;
 	rw_rlock(&tcp_function_lock);
 	TAILQ_FOREACH(f, &t_functions, tf_next) {
 		cnt++;
 	}
 	rw_runlock(&tcp_function_lock);
 
 	bufsz = (cnt+2) * (TCP_FUNCTION_NAME_LEN_MAX + 12) + 1;
 	buffer = malloc(bufsz, M_TEMP, M_WAITOK);
 
 	error = 0;
 	cp = buffer;
 
 	linesz = snprintf(cp, bufsz, "\n%-32s%c %s\n", "Stack", 'D', "PCB count");
 	cp += linesz;
 	bufsz -= linesz;
 	outsz = linesz;
 
 	rw_rlock(&tcp_function_lock);	
 	TAILQ_FOREACH(f, &t_functions, tf_next) {
 		linesz = snprintf(cp, bufsz, "%-32s%c %u\n",
 		    f->tf_fb->tfb_tcp_block_name,
 		    (f->tf_fb == tcp_func_set_ptr) ? '*' : ' ',
 		    f->tf_fb->tfb_refcnt);
 		if (linesz >= bufsz) {
 			error = EOVERFLOW;
 			break;
 		}
 		cp += linesz;
 		bufsz -= linesz;
 		outsz += linesz;
 	}
 	rw_runlock(&tcp_function_lock);
 	if (error == 0)
 		error = sysctl_handle_string(oidp, buffer, outsz + 1, req);
 	free(buffer, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available,
 	    CTLTYPE_STRING|CTLFLAG_RD,
 	    NULL, 0, sysctl_net_inet_list_available, "A",
 	    "list available TCP Function sets");
 
 /*
  * Target size of TCP PCB hash tables. Must be a power of two.
  *
  * Note that this can be overridden by the kernel environment
  * variable net.inet.tcp.tcbhashsize
  */
 #ifndef TCBHASHSIZE
 #define TCBHASHSIZE	0
 #endif
 
 /*
  * XXX
  * Callouts should be moved into struct tcp directly.  They are currently
  * separate because the tcpcb structure is exported to userland for sysctl
  * parsing purposes, which do not know about callouts.
  */
 struct tcpcb_mem {
 	struct	tcpcb		tcb;
 	struct	tcp_timer	tt;
 	struct	cc_var		ccv;
 	struct	osd		osd;
 };
 
 static VNET_DEFINE(uma_zone_t, tcpcb_zone);
 #define	V_tcpcb_zone			VNET(tcpcb_zone)
 
 MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
 MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory");
 
 static struct mtx isn_mtx;
 
 #define	ISN_LOCK_INIT()	mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF)
 #define	ISN_LOCK()	mtx_lock(&isn_mtx)
 #define	ISN_UNLOCK()	mtx_unlock(&isn_mtx)
 
 /*
  * TCP initialization.
  */
 static void
 tcp_zone_change(void *tag)
 {
 
 	uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets);
 	uma_zone_set_max(V_tcpcb_zone, maxsockets);
 	tcp_tw_zone_change();
 }
 
 static int
 tcp_inpcb_init(void *mem, int size, int flags)
 {
 	struct inpcb *inp = mem;
 
 	INP_LOCK_INIT(inp, "inp", "tcpinp");
 	return (0);
 }
 
 /*
  * Take a value and get the next power of 2 that doesn't overflow.
  * Used to size the tcp_inpcb hash buckets.
  */
 static int
 maketcp_hashsize(int size)
 {
 	int hashsize;
 
 	/*
 	 * auto tune.
 	 * get the next power of 2 higher than maxsockets.
 	 */
 	hashsize = 1 << fls(size);
 	/* catch overflow, and just go one power of 2 smaller */
 	if (hashsize < size) {
 		hashsize = 1 << (fls(size) - 1);
 	}
 	return (hashsize);
 }
 
 int
 register_tcp_functions(struct tcp_function_block *blk, int wait)
 {
 	struct tcp_function_block *lblk;
 	struct tcp_function *n;
 	struct tcp_function_set fs;
 
 	if (t_functions_inited == 0) {
 		init_tcp_functions();
 	}
 	if ((blk->tfb_tcp_output == NULL) ||
 	    (blk->tfb_tcp_do_segment == NULL) ||
 	    (blk->tfb_tcp_ctloutput == NULL) ||
 	    (strlen(blk->tfb_tcp_block_name) == 0)) {
 		/* 
 		 * These functions are required and you
 		 * need a name.
 		 */
 		return (EINVAL);
 	}
 	if (blk->tfb_tcp_timer_stop_all ||
 	    blk->tfb_tcp_timers_left ||
 	    blk->tfb_tcp_timer_activate ||
 	    blk->tfb_tcp_timer_active ||
 	    blk->tfb_tcp_timer_stop) {
 		/*
 		 * If you define one timer function you 
 		 * must have them all.
 		 */
 		if ((blk->tfb_tcp_timer_stop_all == NULL) ||
 		    (blk->tfb_tcp_timers_left  == NULL) ||
 		    (blk->tfb_tcp_timer_activate == NULL) ||
 		    (blk->tfb_tcp_timer_active == NULL) ||
 		    (blk->tfb_tcp_timer_stop == NULL)) {
 			return (EINVAL);			
 		}
 	}	
 	n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait);
 	if (n == NULL) {
 		return (ENOMEM);
 	}
 	n->tf_fb = blk;
 	strcpy(fs.function_set_name, blk->tfb_tcp_block_name);
 	rw_wlock(&tcp_function_lock);
 	lblk = find_tcp_functions_locked(&fs);
 	if (lblk) {
 		/* Duplicate name space not allowed */
 		rw_wunlock(&tcp_function_lock);
 		free(n, M_TCPFUNCTIONS);
 		return (EALREADY);
 	}
 	refcount_init(&blk->tfb_refcnt, 0);
 	blk->tfb_flags = 0;
 	TAILQ_INSERT_TAIL(&t_functions, n, tf_next);
 	rw_wunlock(&tcp_function_lock);
 	return(0);
 }	
 
 int
 deregister_tcp_functions(struct tcp_function_block *blk)
 {
 	struct tcp_function_block *lblk;
 	struct tcp_function *f;
 	int error=ENOENT;
 	
 	if (strcmp(blk->tfb_tcp_block_name, "default") == 0) {
 		/* You can't un-register the default */
 		return (EPERM);
 	}
 	rw_wlock(&tcp_function_lock);
 	if (blk == tcp_func_set_ptr) {
 		/* You can't free the current default */
 		rw_wunlock(&tcp_function_lock);
 		return (EBUSY);
 	}
 	if (blk->tfb_refcnt) {
 		/* Still tcb attached, mark it. */
 		blk->tfb_flags |= TCP_FUNC_BEING_REMOVED;
 		rw_wunlock(&tcp_function_lock);		
 		return (EBUSY);
 	}
 	lblk = find_tcp_fb_locked(blk, &f);
 	if (lblk) {
 		/* Found */
 		TAILQ_REMOVE(&t_functions, f, tf_next);
 		f->tf_fb = NULL;
 		free(f, M_TCPFUNCTIONS);
 		error = 0;
 	}
 	rw_wunlock(&tcp_function_lock);
 	return (error);
 }
 
 void
 tcp_init(void)
 {
 	const char *tcbhash_tuneable;
 	int hashsize;
 
 	tcbhash_tuneable = "net.inet.tcp.tcbhashsize";
 
 	if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN,
 	    &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register helper hook\n", __func__);
 	if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT,
 	    &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register helper hook\n", __func__);
 	hashsize = TCBHASHSIZE;
 	TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize);
 	if (hashsize == 0) {
 		/*
 		 * Auto tune the hash size based on maxsockets.
 		 * A perfect hash would have a 1:1 mapping
 		 * (hashsize = maxsockets) however it's been
 		 * suggested that O(2) average is better.
 		 */
 		hashsize = maketcp_hashsize(maxsockets / 4);
 		/*
 		 * Our historical default is 512,
 		 * do not autotune lower than this.
 		 */
 		if (hashsize < 512)
 			hashsize = 512;
 		if (bootverbose && IS_DEFAULT_VNET(curvnet))
 			printf("%s: %s auto tuned to %d\n", __func__,
 			    tcbhash_tuneable, hashsize);
 	}
 	/*
 	 * We require a hashsize to be a power of two.
 	 * Previously if it was not a power of two we would just reset it
 	 * back to 512, which could be a nasty surprise if you did not notice
 	 * the error message.
 	 * Instead what we do is clip it to the closest power of two lower
 	 * than the specified hash value.
 	 */
 	if (!powerof2(hashsize)) {
 		int oldhashsize = hashsize;
 
 		hashsize = maketcp_hashsize(hashsize);
 		/* prevent absurdly low value */
 		if (hashsize < 16)
 			hashsize = 16;
 		printf("%s: WARNING: TCB hash size not a power of 2, "
 		    "clipped from %d to %d.\n", __func__, oldhashsize,
 		    hashsize);
 	}
 	in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize,
 	    "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE,
 	    IPI_HASHFIELDS_4TUPLE);
 
 	/*
 	 * These have to be type stable for the benefit of the timers.
 	 */
 	V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_zone_set_max(V_tcpcb_zone, maxsockets);
 	uma_zone_set_warning(V_tcpcb_zone, "kern.ipc.maxsockets limit reached");
 
 	tcp_tw_init();
 	syncache_init();
 	tcp_hc_init();
 
 	TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack);
 	V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 
 	/* Skip initialization of globals for non-default instances. */
 	if (!IS_DEFAULT_VNET(curvnet))
 		return;
 
 	tcp_reass_global_init();
 
 	/* XXX virtualize those bellow? */
 	tcp_delacktime = TCPTV_DELACK;
 	tcp_keepinit = TCPTV_KEEP_INIT;
 	tcp_keepidle = TCPTV_KEEP_IDLE;
 	tcp_keepintvl = TCPTV_KEEPINTVL;
 	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
 	tcp_msl = TCPTV_MSL;
 	tcp_rexmit_min = TCPTV_MIN;
 	if (tcp_rexmit_min < 1)
 		tcp_rexmit_min = 1;
 	tcp_persmin = TCPTV_PERSMIN;
 	tcp_persmax = TCPTV_PERSMAX;
 	tcp_rexmit_slop = TCPTV_CPU_VAR;
 	tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;
 	tcp_tcbhashsize = hashsize;
 	/* Setup the tcp function block list */
 	init_tcp_functions();
 	register_tcp_functions(&tcp_def_funcblk, M_WAITOK);
 
 	if (tcp_soreceive_stream) {
 #ifdef INET
 		tcp_usrreqs.pru_soreceive = soreceive_stream;
 #endif
 #ifdef INET6
 		tcp6_usrreqs.pru_soreceive = soreceive_stream;
 #endif /* INET6 */
 	}
 
 #ifdef INET6
 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
 #else /* INET6 */
 #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
 #endif /* INET6 */
 	if (max_protohdr < TCP_MINPROTOHDR)
 		max_protohdr = TCP_MINPROTOHDR;
 	if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
 		panic("tcp_init");
 #undef TCP_MINPROTOHDR
 
 	ISN_LOCK_INIT();
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
 		SHUTDOWN_PRI_DEFAULT);
 	EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL,
 		EVENTHANDLER_PRI_ANY);
 #ifdef TCPPCAP
 	tcp_pcap_init();
 #endif
 
 #ifdef TCP_RFC7413
 	tcp_fastopen_init();
 #endif
 }
 
 #ifdef VIMAGE
 void
 tcp_destroy(void)
 {
 	int error;
 
 #ifdef TCP_RFC7413
 	tcp_fastopen_destroy();
 #endif
 	tcp_hc_destroy();
 	syncache_destroy();
 	tcp_tw_destroy();
 	in_pcbinfo_destroy(&V_tcbinfo);
 	uma_zdestroy(V_sack_hole_zone);
 	uma_zdestroy(V_tcpcb_zone);
 
 	error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister helper hook "
 		    "type=%d, id=%d: error %d returned\n", __func__,
 		    HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error);
 	}
 	error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister helper hook "
 		    "type=%d, id=%d: error %d returned\n", __func__,
 		    HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error);
 	}
 }
 #endif
 
 void
 tcp_fini(void *xtp)
 {
 
 }
 
 /*
  * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
  * tcp_template used to store this data in mbufs, but we now recopy it out
  * of the tcpcb each time to conserve mbufs.
  */
 void
 tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
 {
 	struct tcphdr *th = (struct tcphdr *)tcp_ptr;
 
 	INP_WLOCK_ASSERT(inp);
 
 #ifdef INET6
 	if ((inp->inp_vflag & INP_IPV6) != 0) {
 		struct ip6_hdr *ip6;
 
 		ip6 = (struct ip6_hdr *)ip_ptr;
 		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
 			(inp->inp_flow & IPV6_FLOWINFO_MASK);
 		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
 			(IPV6_VERSION & IPV6_VERSION_MASK);
 		ip6->ip6_nxt = IPPROTO_TCP;
 		ip6->ip6_plen = htons(sizeof(struct tcphdr));
 		ip6->ip6_src = inp->in6p_laddr;
 		ip6->ip6_dst = inp->in6p_faddr;
 	}
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		struct ip *ip;
 
 		ip = (struct ip *)ip_ptr;
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = 5;
 		ip->ip_tos = inp->inp_ip_tos;
 		ip->ip_len = 0;
 		ip->ip_id = 0;
 		ip->ip_off = 0;
 		ip->ip_ttl = inp->inp_ip_ttl;
 		ip->ip_sum = 0;
 		ip->ip_p = IPPROTO_TCP;
 		ip->ip_src = inp->inp_laddr;
 		ip->ip_dst = inp->inp_faddr;
 	}
 #endif /* INET */
 	th->th_sport = inp->inp_lport;
 	th->th_dport = inp->inp_fport;
 	th->th_seq = 0;
 	th->th_ack = 0;
 	th->th_x2 = 0;
 	th->th_off = 5;
 	th->th_flags = 0;
 	th->th_win = 0;
 	th->th_urp = 0;
 	th->th_sum = 0;		/* in_pseudo() is called later for ipv4 */
 }
 
 /*
  * Create template to be used to send tcp packets on a connection.
  * Allocates an mbuf and fills in a skeletal tcp/ip header.  The only
  * use for this function is in keepalives, which use tcp_respond.
  */
 struct tcptemp *
 tcpip_maketemplate(struct inpcb *inp)
 {
 	struct tcptemp *t;
 
 	t = malloc(sizeof(*t), M_TEMP, M_NOWAIT);
 	if (t == NULL)
 		return (NULL);
 	tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t);
 	return (t);
 }
 
 /*
  * Send a single message to the TCP at address specified by
  * the given TCP/IP header.  If m == NULL, then we make a copy
  * of the tcpiphdr at th and send directly to the addressed host.
  * This is used to force keep alive messages out using the TCP
  * template for a connection.  If flags are given then we send
  * a message back to the TCP which originated the segment th,
  * and discard the mbuf containing it and any other attached mbufs.
  *
  * In any case the ack and sequence number of the transmitted
  * segment are as specified by the parameters.
  *
  * NOTE: If m != NULL, then th must point to *inside* the mbuf.
  */
 void
 tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
     tcp_seq ack, tcp_seq seq, int flags)
 {
 	struct tcpopt to;
 	struct inpcb *inp;
 	struct ip *ip;
 	struct mbuf *optm;
 	struct tcphdr *nth;
 	u_char *optp;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 	int isipv6;
 #endif /* INET6 */
 	int optlen, tlen, win;
 	bool incl_opts;
 
 	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
 
 #ifdef INET6
 	isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4);
 	ip6 = ipgen;
 #endif /* INET6 */
 	ip = ipgen;
 
 	if (tp != NULL) {
 		inp = tp->t_inpcb;
 		KASSERT(inp != NULL, ("tcp control block w/o inpcb"));
 		INP_WLOCK_ASSERT(inp);
 	} else
 		inp = NULL;
 
 	incl_opts = false;
 	win = 0;
 	if (tp != NULL) {
 		if (!(flags & TH_RST)) {
 			win = sbspace(&inp->inp_socket->so_rcv);
 			if (win > (long)TCP_MAXWIN << tp->rcv_scale)
 				win = (long)TCP_MAXWIN << tp->rcv_scale;
 		}
 		if ((tp->t_flags & TF_NOOPT) == 0)
 			incl_opts = true;
 	}
 	if (m == NULL) {
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL)
 			return;
 		m->m_data += max_linkhdr;
 #ifdef INET6
 		if (isipv6) {
 			bcopy((caddr_t)ip6, mtod(m, caddr_t),
 			      sizeof(struct ip6_hdr));
 			ip6 = mtod(m, struct ip6_hdr *);
 			nth = (struct tcphdr *)(ip6 + 1);
 		} else
 #endif /* INET6 */
 		{
 			bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
 			ip = mtod(m, struct ip *);
 			nth = (struct tcphdr *)(ip + 1);
 		}
 		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
 		flags = TH_ACK;
 	} else {
 		/*
 		 *  reuse the mbuf. 
 		 * XXX MRT We inherrit the FIB, which is lucky.
 		 */
 		m_freem(m->m_next);
 		m->m_next = NULL;
 		m->m_data = (caddr_t)ipgen;
 		/* m_len is set later */
 #define xchg(a,b,type) { type t; t=a; a=b; b=t; }
 #ifdef INET6
 		if (isipv6) {
 			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
 			nth = (struct tcphdr *)(ip6 + 1);
 		} else
 #endif /* INET6 */
 		{
 			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t);
 			nth = (struct tcphdr *)(ip + 1);
 		}
 		if (th != nth) {
 			/*
 			 * this is usually a case when an extension header
 			 * exists between the IPv6 header and the
 			 * TCP header.
 			 */
 			nth->th_sport = th->th_sport;
 			nth->th_dport = th->th_dport;
 		}
 		xchg(nth->th_dport, nth->th_sport, uint16_t);
 #undef xchg
 	}
 	tlen = 0;
 #ifdef INET6
 	if (isipv6)
 		tlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 		tlen = sizeof (struct tcpiphdr);
 #endif
 #ifdef INVARIANTS
 	m->m_len = 0;
 	KASSERT(M_TRAILINGSPACE(m) >= tlen,
 	    ("Not enough trailing space for message (m=%p, need=%d, have=%ld)",
 	    m, tlen, (long)M_TRAILINGSPACE(m)));
 #endif
 	m->m_len = tlen;
 	to.to_flags = 0;
 	if (incl_opts) {
 		/* Make sure we have room. */
 		if (M_TRAILINGSPACE(m) < TCP_MAXOLEN) {
 			m->m_next = m_get(M_NOWAIT, MT_DATA);
 			if (m->m_next) {
 				optp = mtod(m->m_next, u_char *);
 				optm = m->m_next;
 			} else
 				incl_opts = false;
 		} else {
 			optp = (u_char *) (nth + 1);
 			optm = m;
 		}
 	}
 	if (incl_opts) {
 		/* Timestamps. */
 		if (tp->t_flags & TF_RCVD_TSTMP) {
 			to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
 			to.to_tsecr = tp->ts_recent;
 			to.to_flags |= TOF_TS;
 		}
 #ifdef TCP_SIGNATURE
 		/* TCP-MD5 (RFC2385). */
 		if (tp->t_flags & TF_SIGNATURE)
 			to.to_flags |= TOF_SIGNATURE;
 #endif
 
 		/* Add the options. */
 		tlen += optlen = tcp_addoptions(&to, optp);
 
 		/* Update m_len in the correct mbuf. */
 		optm->m_len += optlen;
 	} else
 		optlen = 0;
 #ifdef INET6
 	if (isipv6) {
 		ip6->ip6_flow = 0;
 		ip6->ip6_vfc = IPV6_VERSION;
 		ip6->ip6_nxt = IPPROTO_TCP;
 		ip6->ip6_plen = htons(tlen - sizeof(*ip6));
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		ip->ip_len = htons(tlen);
 		ip->ip_ttl = V_ip_defttl;
 		if (V_path_mtu_discovery)
 			ip->ip_off |= htons(IP_DF);
 	}
 #endif
 	m->m_pkthdr.len = tlen;
 	m->m_pkthdr.rcvif = NULL;
 #ifdef MAC
 	if (inp != NULL) {
 		/*
 		 * Packet is associated with a socket, so allow the
 		 * label of the response to reflect the socket label.
 		 */
 		INP_WLOCK_ASSERT(inp);
 		mac_inpcb_create_mbuf(inp, m);
 	} else {
 		/*
 		 * Packet is not associated with a socket, so possibly
 		 * update the label in place.
 		 */
 		mac_netinet_tcp_reply(m);
 	}
 #endif
 	nth->th_seq = htonl(seq);
 	nth->th_ack = htonl(ack);
 	nth->th_x2 = 0;
 	nth->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
 	nth->th_flags = flags;
 	if (tp != NULL)
 		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
 	else
 		nth->th_win = htons((u_short)win);
 	nth->th_urp = 0;
 
 #ifdef TCP_SIGNATURE
 	if (to.to_flags & TOF_SIGNATURE) {
 		tcp_signature_compute(m, 0, 0, optlen, to.to_signature,
 		    IPSEC_DIR_OUTBOUND);
 	}
 #endif
 
 	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 #ifdef INET6
 	if (isipv6) {
 		m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
 		nth->th_sum = in6_cksum_pseudo(ip6,
 		    tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0);
 		ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb :
 		    NULL, NULL);
 	}
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		m->m_pkthdr.csum_flags = CSUM_TCP;
 		nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 		    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
 	}
 #endif /* INET */
 #ifdef TCPDEBUG
 	if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
 #endif
 	TCP_PROBE3(debug__output, tp, th, mtod(m, const char *));
 	if (flags & TH_RST)
 		TCP_PROBE5(accept__refused, NULL, NULL, mtod(m, const char *),
 		    tp, nth);
 
 	TCP_PROBE5(send, NULL, tp, mtod(m, const char *), tp, nth);
 #ifdef INET6
 	if (isipv6)
 		(void) ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
 #endif /* INET6 */
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 		(void) ip_output(m, NULL, NULL, 0, NULL, inp);
 #endif
 }
 
 /*
  * Create a new TCP control block, making an
  * empty reassembly queue and hooking it to the argument
  * protocol control block.  The `inp' parameter must have
  * come from the zone allocator set up in tcp_init().
  */
 struct tcpcb *
 tcp_newtcpcb(struct inpcb *inp)
 {
 	struct tcpcb_mem *tm;
 	struct tcpcb *tp;
 #ifdef INET6
 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 #endif /* INET6 */
 
 	tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO);
 	if (tm == NULL)
 		return (NULL);
 	tp = &tm->tcb;
 
 	/* Initialise cc_var struct for this tcpcb. */
 	tp->ccv = &tm->ccv;
 	tp->ccv->type = IPPROTO_TCP;
 	tp->ccv->ccvc.tcp = tp;
 	rw_rlock(&tcp_function_lock);
 	tp->t_fb = tcp_func_set_ptr;
 	refcount_acquire(&tp->t_fb->tfb_refcnt);
 	rw_runlock(&tcp_function_lock);
 	if (tp->t_fb->tfb_tcp_fb_init) {
 		(*tp->t_fb->tfb_tcp_fb_init)(tp);
 	}
 	/*
 	 * Use the current system default CC algorithm.
 	 */
 	CC_LIST_RLOCK();
 	KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!"));
 	CC_ALGO(tp) = CC_DEFAULT();
 	CC_LIST_RUNLOCK();
 
 	if (CC_ALGO(tp)->cb_init != NULL)
 		if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) {
 			if (tp->t_fb->tfb_tcp_fb_fini)
 				(*tp->t_fb->tfb_tcp_fb_fini)(tp);
 			refcount_release(&tp->t_fb->tfb_refcnt);
 			uma_zfree(V_tcpcb_zone, tm);
 			return (NULL);
 		}
 
 	tp->osd = &tm->osd;
 	if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) {
 		if (tp->t_fb->tfb_tcp_fb_fini)
 			(*tp->t_fb->tfb_tcp_fb_fini)(tp);
 		refcount_release(&tp->t_fb->tfb_refcnt);
 		uma_zfree(V_tcpcb_zone, tm);
 		return (NULL);
 	}
 
 #ifdef VIMAGE
 	tp->t_vnet = inp->inp_vnet;
 #endif
 	tp->t_timers = &tm->tt;
 	/*	LIST_INIT(&tp->t_segq); */	/* XXX covered by M_ZERO */
 	tp->t_maxseg =
 #ifdef INET6
 		isipv6 ? V_tcp_v6mssdflt :
 #endif /* INET6 */
 		V_tcp_mssdflt;
 
 	/* Set up our timeouts. */
 	callout_init(&tp->t_timers->tt_rexmt, 1);
 	callout_init(&tp->t_timers->tt_persist, 1);
 	callout_init(&tp->t_timers->tt_keep, 1);
 	callout_init(&tp->t_timers->tt_2msl, 1);
 	callout_init(&tp->t_timers->tt_delack, 1);
 
 	if (V_tcp_do_rfc1323)
 		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
 	if (V_tcp_do_sack)
 		tp->t_flags |= TF_SACK_PERMIT;
 	TAILQ_INIT(&tp->snd_holes);
 	/*
 	 * The tcpcb will hold a reference on its inpcb until tcp_discardcb()
 	 * is called.
 	 */
 	in_pcbref(inp);	/* Reference for tcpcb */
 	tp->t_inpcb = inp;
 
 	/*
 	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
 	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
 	 * reasonable initial retransmit time.
 	 */
 	tp->t_srtt = TCPTV_SRTTBASE;
 	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
 	tp->t_rttmin = tcp_rexmit_min;
 	tp->t_rxtcur = TCPTV_RTOBASE;
 	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->t_rcvtime = ticks;
 	/*
 	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
 	 * because the socket may be bound to an IPv6 wildcard address,
 	 * which may match an IPv4-mapped IPv6 address.
 	 */
 	inp->inp_ip_ttl = V_ip_defttl;
 	inp->inp_ppcb = tp;
 #ifdef TCPPCAP
 	/*
 	 * Init the TCP PCAP queues.
 	 */
 	tcp_pcap_tcpcb_init(tp);
 #endif
 	return (tp);		/* XXX */
 }
 
 /*
  * Switch the congestion control algorithm back to NewReno for any active
  * control blocks using an algorithm which is about to go away.
  * This ensures the CC framework can allow the unload to proceed without leaving
  * any dangling pointers which would trigger a panic.
  * Returning non-zero would inform the CC framework that something went wrong
  * and it would be unsafe to allow the unload to proceed. However, there is no
  * way for this to occur with this implementation so we always return zero.
  */
 int
 tcp_ccalgounload(struct cc_algo *unload_algo)
 {
 	struct cc_algo *tmpalgo;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	/*
 	 * Check all active control blocks across all network stacks and change
 	 * any that are using "unload_algo" back to NewReno. If "unload_algo"
 	 * requires cleanup code to be run, call it.
 	 */
 	VNET_LIST_RLOCK();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		INP_INFO_WLOCK(&V_tcbinfo);
 		/*
 		 * New connections already part way through being initialised
 		 * with the CC algo we're removing will not race with this code
 		 * because the INP_INFO_WLOCK is held during initialisation. We
 		 * therefore don't enter the loop below until the connection
 		 * list has stabilised.
 		 */
 		LIST_FOREACH(inp, &V_tcb, inp_list) {
 			INP_WLOCK(inp);
 			/* Important to skip tcptw structs. */
 			if (!(inp->inp_flags & INP_TIMEWAIT) &&
 			    (tp = intotcpcb(inp)) != NULL) {
 				/*
 				 * By holding INP_WLOCK here, we are assured
 				 * that the connection is not currently
 				 * executing inside the CC module's functions
 				 * i.e. it is safe to make the switch back to
 				 * NewReno.
 				 */
 				if (CC_ALGO(tp) == unload_algo) {
 					tmpalgo = CC_ALGO(tp);
 					/* NewReno does not require any init. */
 					CC_ALGO(tp) = &newreno_cc_algo;
 					if (tmpalgo->cb_destroy != NULL)
 						tmpalgo->cb_destroy(tp->ccv);
 				}
 			}
 			INP_WUNLOCK(inp);
 		}
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK();
 
 	return (0);
 }
 
 /*
  * Drop a TCP connection, reporting
  * the specified error.  If connection is synchronized,
  * then send a RST to peer.
  */
 struct tcpcb *
 tcp_drop(struct tcpcb *tp, int errno)
 {
 	struct socket *so = tp->t_inpcb->inp_socket;
 
 	INP_INFO_LOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	if (TCPS_HAVERCVDSYN(tp->t_state)) {
 		tcp_state_change(tp, TCPS_CLOSED);
 		(void) tp->t_fb->tfb_tcp_output(tp);
 		TCPSTAT_INC(tcps_drops);
 	} else
 		TCPSTAT_INC(tcps_conndrops);
 	if (errno == ETIMEDOUT && tp->t_softerror)
 		errno = tp->t_softerror;
 	so->so_error = errno;
 	return (tcp_close(tp));
 }
 
 void
 tcp_discardcb(struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp->inp_socket;
 #ifdef INET6
 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 #endif /* INET6 */
 	int released;
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Make sure that all of our timers are stopped before we delete the
 	 * PCB.
 	 *
 	 * If stopping a timer fails, we schedule a discard function in same
 	 * callout, and the last discard function called will take care of
 	 * deleting the tcpcb.
 	 */
 	tcp_timer_stop(tp, TT_REXMT);
 	tcp_timer_stop(tp, TT_PERSIST);
 	tcp_timer_stop(tp, TT_KEEP);
 	tcp_timer_stop(tp, TT_2MSL);
 	tcp_timer_stop(tp, TT_DELACK);
 	if (tp->t_fb->tfb_tcp_timer_stop_all) {
 		/* Call the stop-all function of the methods */
 		tp->t_fb->tfb_tcp_timer_stop_all(tp);
 	}
 
 	/*
 	 * If we got enough samples through the srtt filter,
 	 * save the rtt and rttvar in the routing entry.
 	 * 'Enough' is arbitrarily defined as 4 rtt samples.
 	 * 4 samples is enough for the srtt filter to converge
 	 * to within enough % of the correct value; fewer samples
 	 * and we could save a bogus rtt. The danger is not high
 	 * as tcp quickly recovers from everything.
 	 * XXX: Works very well but needs some more statistics!
 	 */
 	if (tp->t_rttupdated >= 4) {
 		struct hc_metrics_lite metrics;
 		u_long ssthresh;
 
 		bzero(&metrics, sizeof(metrics));
 		/*
 		 * Update the ssthresh always when the conditions below
 		 * are satisfied. This gives us better new start value
 		 * for the congestion avoidance for new connections.
 		 * ssthresh is only set if packet loss occured on a session.
 		 *
 		 * XXXRW: 'so' may be NULL here, and/or socket buffer may be
 		 * being torn down.  Ideally this code would not use 'so'.
 		 */
 		ssthresh = tp->snd_ssthresh;
 		if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
 			/*
 			 * convert the limit from user data bytes to
 			 * packets then to packet data bytes.
 			 */
 			ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
 			if (ssthresh < 2)
 				ssthresh = 2;
 			ssthresh *= (u_long)(tp->t_maxseg +
 #ifdef INET6
 			    (isipv6 ? sizeof (struct ip6_hdr) +
 				sizeof (struct tcphdr) :
 #endif
 				sizeof (struct tcpiphdr)
 #ifdef INET6
 			    )
 #endif
 			    );
 		} else
 			ssthresh = 0;
 		metrics.rmx_ssthresh = ssthresh;
 
 		metrics.rmx_rtt = tp->t_srtt;
 		metrics.rmx_rttvar = tp->t_rttvar;
 		metrics.rmx_cwnd = tp->snd_cwnd;
 		metrics.rmx_sendpipe = 0;
 		metrics.rmx_recvpipe = 0;
 
 		tcp_hc_update(&inp->inp_inc, &metrics);
 	}
 
 	/* free the reassembly queue, if any */
 	tcp_reass_flush(tp);
 
 #ifdef TCP_OFFLOAD
 	/* Disconnect offload device, if any. */
 	if (tp->t_flags & TF_TOE)
 		tcp_offload_detach(tp);
 #endif
 		
 	tcp_free_sackholes(tp);
 
 #ifdef TCPPCAP
 	/* Free the TCP PCAP queues. */
 	tcp_pcap_drain(&(tp->t_inpkts));
 	tcp_pcap_drain(&(tp->t_outpkts));
 #endif
 
 	/* Allow the CC algorithm to clean up after itself. */
 	if (CC_ALGO(tp)->cb_destroy != NULL)
 		CC_ALGO(tp)->cb_destroy(tp->ccv);
 
 	khelp_destroy_osd(tp->osd);
 
 	CC_ALGO(tp) = NULL;
 	inp->inp_ppcb = NULL;
 	if ((tp->t_timers->tt_flags & TT_MASK) == 0) {
 		/* We own the last reference on tcpcb, let's free it. */
 		if ((tp->t_fb->tfb_tcp_timers_left) &&
 		    (tp->t_fb->tfb_tcp_timers_left(tp))) {
 			    /* Some fb timers left running! */
 			    return;
 		}
 		if (tp->t_fb->tfb_tcp_fb_fini)
 			(*tp->t_fb->tfb_tcp_fb_fini)(tp);
 		refcount_release(&tp->t_fb->tfb_refcnt);
 		tp->t_inpcb = NULL;
 		uma_zfree(V_tcpcb_zone, tp);
 		released = in_pcbrele_wlocked(inp);
 		KASSERT(!released, ("%s: inp %p should not have been released "
 			"here", __func__, inp));
 	}
 }
 
 void
 tcp_timer_2msl_discard(void *xtp)
 {
 
 	tcp_timer_discard((struct tcpcb *)xtp, TT_2MSL);
 }
 
 void
 tcp_timer_keep_discard(void *xtp)
 {
 
 	tcp_timer_discard((struct tcpcb *)xtp, TT_KEEP);
 }
 
 void
 tcp_timer_persist_discard(void *xtp)
 {
 
 	tcp_timer_discard((struct tcpcb *)xtp, TT_PERSIST);
 }
 
 void
 tcp_timer_rexmt_discard(void *xtp)
 {
 
 	tcp_timer_discard((struct tcpcb *)xtp, TT_REXMT);
 }
 
 void
 tcp_timer_delack_discard(void *xtp)
 {
 
 	tcp_timer_discard((struct tcpcb *)xtp, TT_DELACK);
 }
 
 void
 tcp_timer_discard(struct tcpcb *tp, uint32_t timer_type)
 {
 	struct inpcb *inp;
 
 	CURVNET_SET(tp->t_vnet);
 	INP_INFO_RLOCK(&V_tcbinfo);
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL",
 		__func__, tp));
 	INP_WLOCK(inp);
 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) != 0,
 		("%s: tcpcb has to be stopped here", __func__));
 	KASSERT((tp->t_timers->tt_flags & timer_type) != 0,
 		("%s: discard callout should be running", __func__));
 	tp->t_timers->tt_flags &= ~timer_type;
 	if ((tp->t_timers->tt_flags & TT_MASK) == 0) {
 		/* We own the last reference on this tcpcb, let's free it. */
 		if ((tp->t_fb->tfb_tcp_timers_left) &&
 		    (tp->t_fb->tfb_tcp_timers_left(tp))) {
 			    /* Some fb timers left running! */
 			    goto leave;
 		}
 		if (tp->t_fb->tfb_tcp_fb_fini)
 			(*tp->t_fb->tfb_tcp_fb_fini)(tp);
 		refcount_release(&tp->t_fb->tfb_refcnt);
 		tp->t_inpcb = NULL;
 		uma_zfree(V_tcpcb_zone, tp);
 		if (in_pcbrele_wlocked(inp)) {
 			INP_INFO_RUNLOCK(&V_tcbinfo);
 			CURVNET_RESTORE();
 			return;
 		}
 	}
 leave:
 	INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 }
 
 /*
  * Attempt to close a TCP control block, marking it as dropped, and freeing
  * the socket if we hold the only reference.
  */
 struct tcpcb *
 tcp_close(struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so;
 
 	INP_INFO_LOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 #ifdef TCP_OFFLOAD
 	if (tp->t_state == TCPS_LISTEN)
 		tcp_offload_listen_stop(tp);
 #endif
 #ifdef TCP_RFC7413
 	/*
 	 * This releases the TFO pending counter resource for TFO listen
 	 * sockets as well as passively-created TFO sockets that transition
 	 * from SYN_RECEIVED to CLOSED.
 	 */
 	if (tp->t_tfo_pending) {
 		tcp_fastopen_decrement_counter(tp->t_tfo_pending);
 		tp->t_tfo_pending = NULL;
 	}
 #endif
 	in_pcbdrop(inp);
 	TCPSTAT_INC(tcps_closed);
 	TCPSTATES_DEC(tp->t_state);
 	KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
 	so = inp->inp_socket;
 	soisdisconnected(so);
 	if (inp->inp_flags & INP_SOCKREF) {
 		KASSERT(so->so_state & SS_PROTOREF,
 		    ("tcp_close: !SS_PROTOREF"));
 		inp->inp_flags &= ~INP_SOCKREF;
 		INP_WUNLOCK(inp);
 		ACCEPT_LOCK();
 		SOCK_LOCK(so);
 		so->so_state &= ~SS_PROTOREF;
 		sofree(so);
 		return (NULL);
 	}
 	return (tp);
 }
 
 void
 tcp_drain(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	if (!do_tcpdrain)
 		return;
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		struct inpcb *inpb;
 		struct tcpcb *tcpb;
 
 	/*
 	 * Walk the tcpbs, if existing, and flush the reassembly queue,
 	 * if there is one...
 	 * XXX: The "Net/3" implementation doesn't imply that the TCP
 	 *      reassembly queue should be flushed, but in a situation
 	 *	where we're really low on mbufs, this is potentially
 	 *	useful.
 	 */
 		INP_INFO_WLOCK(&V_tcbinfo);
 		LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) {
 			if (inpb->inp_flags & INP_TIMEWAIT)
 				continue;
 			INP_WLOCK(inpb);
 			if ((tcpb = intotcpcb(inpb)) != NULL) {
 				tcp_reass_flush(tcpb);
 				tcp_clean_sackreport(tcpb);
 			}
 			INP_WUNLOCK(inpb);
 		}
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Notify a tcp user of an asynchronous error;
  * store error as soft error, but wake up user
  * (for now, won't do anything until can select for soft error).
  *
  * Do not wake up user since there currently is no mechanism for
  * reporting soft errors (yet - a kqueue filter may be added).
  */
 static struct inpcb *
 tcp_notify(struct inpcb *inp, int error)
 {
 	struct tcpcb *tp;
 
 	INP_INFO_LOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	if ((inp->inp_flags & INP_TIMEWAIT) ||
 	    (inp->inp_flags & INP_DROPPED))
 		return (inp);
 
 	tp = intotcpcb(inp);
 	KASSERT(tp != NULL, ("tcp_notify: tp == NULL"));
 
 	/*
 	 * Ignore some errors if we are hooked up.
 	 * If connection hasn't completed, has retransmitted several times,
 	 * and receives a second error, give up now.  This is better
 	 * than waiting a long time to establish a connection that
 	 * can never complete.
 	 */
 	if (tp->t_state == TCPS_ESTABLISHED &&
 	    (error == EHOSTUNREACH || error == ENETUNREACH ||
 	     error == EHOSTDOWN)) {
+		if (inp->inp_route.ro_rt) {
+			RTFREE(inp->inp_route.ro_rt);
+			inp->inp_route.ro_rt = (struct rtentry *)NULL;
+		}
 		return (inp);
 	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
 	    tp->t_softerror) {
 		tp = tcp_drop(tp, error);
 		if (tp != NULL)
 			return (inp);
 		else
 			return (NULL);
 	} else {
 		tp->t_softerror = error;
 		return (inp);
 	}
 #if 0
 	wakeup( &so->so_timeo);
 	sorwakeup(so);
 	sowwakeup(so);
 #endif
 }
 
 static int
 tcp_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	int error, i, m, n, pcb_count;
 	struct inpcb *inp, **inp_list;
 	inp_gen_t gencnt;
 	struct xinpgen xig;
 
 	/*
 	 * The process of preparing the TCB list is too time-consuming and
 	 * resource-intensive to repeat twice on every request.
 	 */
 	if (req->oldptr == NULL) {
 		n = V_tcbinfo.ipi_count +
 		    counter_u64_fetch(VNET(tcps_states)[TCPS_SYN_RECEIVED]);
 		n += imax(n / 8, 10);
 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
 		return (0);
 	}
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	/*
 	 * OK, now we're committed to doing something.
 	 */
 	INP_LIST_RLOCK(&V_tcbinfo);
 	gencnt = V_tcbinfo.ipi_gencnt;
 	n = V_tcbinfo.ipi_count;
 	INP_LIST_RUNLOCK(&V_tcbinfo);
 
 	m = counter_u64_fetch(VNET(tcps_states)[TCPS_SYN_RECEIVED]);
 
 	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
 		+ (n + m) * sizeof(struct xtcpcb));
 	if (error != 0)
 		return (error);
 
 	xig.xig_len = sizeof xig;
 	xig.xig_count = n + m;
 	xig.xig_gen = gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return (error);
 
 	error = syncache_pcblist(req, m, &pcb_count);
 	if (error)
 		return (error);
 
 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
 	if (inp_list == NULL)
 		return (ENOMEM);
 
 	INP_INFO_WLOCK(&V_tcbinfo);
 	for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0;
 	    inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) {
 		INP_WLOCK(inp);
 		if (inp->inp_gencnt <= gencnt) {
 			/*
 			 * XXX: This use of cr_cansee(), introduced with
 			 * TCP state changes, is not quite right, but for
 			 * now, better than nothing.
 			 */
 			if (inp->inp_flags & INP_TIMEWAIT) {
 				if (intotw(inp) != NULL)
 					error = cr_cansee(req->td->td_ucred,
 					    intotw(inp)->tw_cred);
 				else
 					error = EINVAL;	/* Skip this inp. */
 			} else
 				error = cr_canseeinpcb(req->td->td_ucred, inp);
 			if (error == 0) {
 				in_pcbref(inp);
 				inp_list[i++] = inp;
 			}
 		}
 		INP_WUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 	n = i;
 
 	error = 0;
 	for (i = 0; i < n; i++) {
 		inp = inp_list[i];
 		INP_RLOCK(inp);
 		if (inp->inp_gencnt <= gencnt) {
 			struct xtcpcb xt;
 			void *inp_ppcb;
 
 			bzero(&xt, sizeof(xt));
 			xt.xt_len = sizeof xt;
 			/* XXX should avoid extra copy */
 			bcopy(inp, &xt.xt_inp, sizeof *inp);
 			inp_ppcb = inp->inp_ppcb;
 			if (inp_ppcb == NULL)
 				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
 			else if (inp->inp_flags & INP_TIMEWAIT) {
 				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
 				xt.xt_tp.t_state = TCPS_TIME_WAIT;
 			} else {
 				bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
 				if (xt.xt_tp.t_timers)
 					tcp_timer_to_xtimer(&xt.xt_tp, xt.xt_tp.t_timers, &xt.xt_timer);
 			}
 			if (inp->inp_socket != NULL)
 				sotoxsocket(inp->inp_socket, &xt.xt_socket);
 			else {
 				bzero(&xt.xt_socket, sizeof xt.xt_socket);
 				xt.xt_socket.xso_protocol = IPPROTO_TCP;
 			}
 			xt.xt_inp.inp_gencnt = inp->inp_gencnt;
 			INP_RUNLOCK(inp);
 			error = SYSCTL_OUT(req, &xt, sizeof xt);
 		} else
 			INP_RUNLOCK(inp);
 	}
 	INP_INFO_RLOCK(&V_tcbinfo);
 	for (i = 0; i < n; i++) {
 		inp = inp_list[i];
 		INP_RLOCK(inp);
 		if (!in_pcbrele_rlocked(inp))
 			INP_RUNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.
 		 * If the generation differs from what we told
 		 * her before, she knows that something happened
 		 * while we were processing this request, and it
 		 * might be necessary to retry.
 		 */
 		INP_LIST_RLOCK(&V_tcbinfo);
 		xig.xig_gen = V_tcbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = V_tcbinfo.ipi_count + pcb_count;
 		INP_LIST_RUNLOCK(&V_tcbinfo);
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 	free(inp_list, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist,
     CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0,
     tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
 
 #ifdef INET
 static int
 tcp_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct xucred xuc;
 	struct sockaddr_in addrs[2];
 	struct inpcb *inp;
 	int error;
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error)
 		return (error);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 	inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
 	    addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL);
 	if (inp != NULL) {
 		if (inp->inp_socket == NULL)
 			error = ENOENT;
 		if (error == 0)
 			error = cr_canseeinpcb(req->td->td_ucred, inp);
 		if (error == 0)
 			cru2x(inp->inp_cred, &xuc);
 		INP_RUNLOCK(inp);
 	} else
 		error = ENOENT;
 	if (error == 0)
 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
     tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");
 #endif /* INET */
 
 #ifdef INET6
 static int
 tcp6_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct xucred xuc;
 	struct sockaddr_in6 addrs[2];
 	struct inpcb *inp;
 	int error;
 #ifdef INET
 	int mapped = 0;
 #endif
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error)
 		return (error);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 	if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 ||
 	    (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) {
 		return (error);
 	}
 	if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
 #ifdef INET
 		if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
 			mapped = 1;
 		else
 #endif
 			return (EINVAL);
 	}
 
 #ifdef INET
 	if (mapped == 1)
 		inp = in_pcblookup(&V_tcbinfo,
 			*(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
 			addrs[1].sin6_port,
 			*(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
 			addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL);
 	else
 #endif
 		inp = in6_pcblookup(&V_tcbinfo,
 			&addrs[1].sin6_addr, addrs[1].sin6_port,
 			&addrs[0].sin6_addr, addrs[0].sin6_port,
 			INPLOOKUP_RLOCKPCB, NULL);
 	if (inp != NULL) {
 		if (inp->inp_socket == NULL)
 			error = ENOENT;
 		if (error == 0)
 			error = cr_canseeinpcb(req->td->td_ucred, inp);
 		if (error == 0)
 			cru2x(inp->inp_cred, &xuc);
 		INP_RUNLOCK(inp);
 	} else
 		error = ENOENT;
 	if (error == 0)
 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
     tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
 #endif /* INET6 */
 
 
 #ifdef INET
 void
 tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 	struct ip *ip = vip;
 	struct tcphdr *th;
 	struct in_addr faddr;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
 	struct icmp *icp;
 	struct in_conninfo inc;
 	tcp_seq icmp_tcp_seq;
 	int mtu;
 
 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
 		return;
 
 	if (cmd == PRC_MSGSIZE)
 		notify = tcp_mtudisc_notify;
 	else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
 		cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
 		notify = tcp_drop_syn_sent;
-	/*
-	 * Redirects don't need to be handled up here.
-	 */
-	else if (PRC_IS_REDIRECT(cmd))
+	else if (PRC_IS_REDIRECT(cmd)) {
+		/* signal EHOSTDOWN, as it flushes the cached route */
+		in_pcbnotifyall(&tcbinfo, faddr, EHOSTDOWN, notify);
 		return;
+	}
 	/*
 	 * Hostdead is ugly because it goes linearly through all PCBs.
 	 * XXX: We never get this from ICMP, otherwise it makes an
 	 * excellent DoS attack on machines with many connections.
 	 */
 	else if (cmd == PRC_HOSTDEAD)
 		ip = NULL;
 	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
 		return;
 
 	if (ip == NULL) {
 		in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify);
 		return;
 	}
 
 	icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip));
 	th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 	INP_INFO_RLOCK(&V_tcbinfo);
 	inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, ip->ip_src,
 	    th->th_sport, INPLOOKUP_WLOCKPCB, NULL);
 	if (inp != NULL)  {
 		if (!(inp->inp_flags & INP_TIMEWAIT) &&
 		    !(inp->inp_flags & INP_DROPPED) &&
 		    !(inp->inp_socket == NULL)) {
 			icmp_tcp_seq = ntohl(th->th_seq);
 			tp = intotcpcb(inp);
 			if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
 			    SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
 				if (cmd == PRC_MSGSIZE) {
 					/*
 					 * MTU discovery:
 					 * If we got a needfrag set the MTU
 					 * in the route to the suggested new
 					 * value (if given) and then notify.
 					 */
 				    	mtu = ntohs(icp->icmp_nextmtu);
 					/*
 					 * If no alternative MTU was
 					 * proposed, try the next smaller
 					 * one.
 					 */
 					if (!mtu)
 						mtu = ip_next_mtu(
 						    ntohs(ip->ip_len), 1);
 					if (mtu < V_tcp_minmss +
 					    sizeof(struct tcpiphdr))
 						mtu = V_tcp_minmss +
 						    sizeof(struct tcpiphdr);
 					/*
 					 * Only process the offered MTU if it
 					 * is smaller than the current one.
 					 */
 					if (mtu < tp->t_maxseg +
 					    sizeof(struct tcpiphdr)) {
 						bzero(&inc, sizeof(inc));
 						inc.inc_faddr = faddr;
 						inc.inc_fibnum =
 						    inp->inp_inc.inc_fibnum;
 						tcp_hc_updatemtu(&inc, mtu);
 						tcp_mtudisc(inp, mtu);
 					}
 				} else
 					inp = (*notify)(inp,
 					    inetctlerrmap[cmd]);
 			}
 		}
 		if (inp != NULL)
 			INP_WUNLOCK(inp);
 	} else {
 		bzero(&inc, sizeof(inc));
 		inc.inc_fport = th->th_dport;
 		inc.inc_lport = th->th_sport;
 		inc.inc_faddr = faddr;
 		inc.inc_laddr = ip->ip_src;
 		syncache_unreach(&inc, th);
 	}
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 }
 #endif /* INET */
 
 #ifdef INET6
 void
 tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
 {
 	struct tcphdr th;
 	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
 	struct ip6_hdr *ip6;
 	struct mbuf *m;
 	struct ip6ctlparam *ip6cp = NULL;
 	const struct sockaddr_in6 *sa6_src = NULL;
 	int off;
 	struct tcp_portonly {
 		u_int16_t th_sport;
 		u_int16_t th_dport;
 	} *thp;
 
 	if (sa->sa_family != AF_INET6 ||
 	    sa->sa_len != sizeof(struct sockaddr_in6))
 		return;
 
 	if (cmd == PRC_MSGSIZE)
 		notify = tcp_mtudisc_notify;
 	else if (!PRC_IS_REDIRECT(cmd) &&
 		 ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
 		return;
 
 	/* if the parameter is from icmp6, decode it. */
 	if (d != NULL) {
 		ip6cp = (struct ip6ctlparam *)d;
 		m = ip6cp->ip6c_m;
 		ip6 = ip6cp->ip6c_ip6;
 		off = ip6cp->ip6c_off;
 		sa6_src = ip6cp->ip6c_src;
 	} else {
 		m = NULL;
 		ip6 = NULL;
 		off = 0;	/* fool gcc */
 		sa6_src = &sa6_any;
 	}
 
 	if (ip6 != NULL) {
 		struct in_conninfo inc;
 		/*
 		 * XXX: We assume that when IPV6 is non NULL,
 		 * M and OFF are valid.
 		 */
 
 		/* check if we can safely examine src and dst ports */
 		if (m->m_pkthdr.len < off + sizeof(*thp))
 			return;
 
 		bzero(&th, sizeof(th));
 		m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
 
 		in6_pcbnotify(&V_tcbinfo, sa, th.th_dport,
 		    (struct sockaddr *)ip6cp->ip6c_src,
 		    th.th_sport, cmd, NULL, notify);
 
 		bzero(&inc, sizeof(inc));
 		inc.inc_fport = th.th_dport;
 		inc.inc_lport = th.th_sport;
 		inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
 		inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
 		inc.inc_flags |= INC_ISIPV6;
 		INP_INFO_RLOCK(&V_tcbinfo);
 		syncache_unreach(&inc, &th);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 	} else
 		in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src,
 			      0, cmd, NULL, notify);
 }
 #endif /* INET6 */
 
 
 /*
  * Following is where TCP initial sequence number generation occurs.
  *
  * There are two places where we must use initial sequence numbers:
  * 1.  In SYN-ACK packets.
  * 2.  In SYN packets.
  *
  * All ISNs for SYN-ACK packets are generated by the syncache.  See
  * tcp_syncache.c for details.
  *
  * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
  * depends on this property.  In addition, these ISNs should be
  * unguessable so as to prevent connection hijacking.  To satisfy
  * the requirements of this situation, the algorithm outlined in
  * RFC 1948 is used, with only small modifications.
  *
  * Implementation details:
  *
  * Time is based off the system timer, and is corrected so that it
  * increases by one megabyte per second.  This allows for proper
  * recycling on high speed LANs while still leaving over an hour
  * before rollover.
  *
  * As reading the *exact* system time is too expensive to be done
  * whenever setting up a TCP connection, we increment the time
  * offset in two ways.  First, a small random positive increment
  * is added to isn_offset for each connection that is set up.
  * Second, the function tcp_isn_tick fires once per clock tick
  * and increments isn_offset as necessary so that sequence numbers
  * are incremented at approximately ISN_BYTES_PER_SECOND.  The
  * random positive increments serve only to ensure that the same
  * exact sequence number is never sent out twice (as could otherwise
  * happen when a port is recycled in less than the system tick
  * interval.)
  *
  * net.inet.tcp.isn_reseed_interval controls the number of seconds
  * between seeding of isn_secret.  This is normally set to zero,
  * as reseeding should not be necessary.
  *
  * Locking of the global variables isn_secret, isn_last_reseed, isn_offset,
  * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock.  In
  * general, this means holding an exclusive (write) lock.
  */
 
 #define ISN_BYTES_PER_SECOND 1048576
 #define ISN_STATIC_INCREMENT 4096
 #define ISN_RANDOM_INCREMENT (4096 - 1)
 
 static VNET_DEFINE(u_char, isn_secret[32]);
 static VNET_DEFINE(int, isn_last);
 static VNET_DEFINE(int, isn_last_reseed);
 static VNET_DEFINE(u_int32_t, isn_offset);
 static VNET_DEFINE(u_int32_t, isn_offset_old);
 
 #define	V_isn_secret			VNET(isn_secret)
 #define	V_isn_last			VNET(isn_last)
 #define	V_isn_last_reseed		VNET(isn_last_reseed)
 #define	V_isn_offset			VNET(isn_offset)
 #define	V_isn_offset_old		VNET(isn_offset_old)
 
 tcp_seq
 tcp_new_isn(struct tcpcb *tp)
 {
 	MD5_CTX isn_ctx;
 	u_int32_t md5_buffer[4];
 	tcp_seq new_isn;
 	u_int32_t projected_offset;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	ISN_LOCK();
 	/* Seed if this is the first use, reseed if requested. */
 	if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) &&
 	     (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz)
 		< (u_int)ticks))) {
 		read_random(&V_isn_secret, sizeof(V_isn_secret));
 		V_isn_last_reseed = ticks;
 	}
 
 	/* Compute the md5 hash and return the ISN. */
 	MD5Init(&isn_ctx);
 	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
 	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
 #ifdef INET6
 	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
 			  sizeof(struct in6_addr));
 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
 			  sizeof(struct in6_addr));
 	} else
 #endif
 	{
 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
 			  sizeof(struct in_addr));
 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
 			  sizeof(struct in_addr));
 	}
 	MD5Update(&isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret));
 	MD5Final((u_char *) &md5_buffer, &isn_ctx);
 	new_isn = (tcp_seq) md5_buffer[0];
 	V_isn_offset += ISN_STATIC_INCREMENT +
 		(arc4random() & ISN_RANDOM_INCREMENT);
 	if (ticks != V_isn_last) {
 		projected_offset = V_isn_offset_old +
 		    ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last);
 		if (SEQ_GT(projected_offset, V_isn_offset))
 			V_isn_offset = projected_offset;
 		V_isn_offset_old = V_isn_offset;
 		V_isn_last = ticks;
 	}
 	new_isn += V_isn_offset;
 	ISN_UNLOCK();
 	return (new_isn);
 }
 
 /*
  * When a specific ICMP unreachable message is received and the
  * connection state is SYN-SENT, drop the connection.  This behavior
  * is controlled by the icmp_may_rst sysctl.
  */
 struct inpcb *
 tcp_drop_syn_sent(struct inpcb *inp, int errno)
 {
 	struct tcpcb *tp;
 
 	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	if ((inp->inp_flags & INP_TIMEWAIT) ||
 	    (inp->inp_flags & INP_DROPPED))
 		return (inp);
 
 	tp = intotcpcb(inp);
 	if (tp->t_state != TCPS_SYN_SENT)
 		return (inp);
 
 	tp = tcp_drop(tp, errno);
 	if (tp != NULL)
 		return (inp);
 	else
 		return (NULL);
 }
 
 /*
  * When `need fragmentation' ICMP is received, update our idea of the MSS
  * based on the new value. Also nudge TCP to send something, since we
  * know the packet we just sent was dropped.
  * This duplicates some code in the tcp_mss() function in tcp_input.c.
  */
 static struct inpcb *
 tcp_mtudisc_notify(struct inpcb *inp, int error)
 {
 
 	tcp_mtudisc(inp, -1);
 	return (inp);
 }
 
 static void
 tcp_mtudisc(struct inpcb *inp, int mtuoffer)
 {
 	struct tcpcb *tp;
 	struct socket *so;
 
 	INP_WLOCK_ASSERT(inp);
 	if ((inp->inp_flags & INP_TIMEWAIT) ||
 	    (inp->inp_flags & INP_DROPPED))
 		return;
 
 	tp = intotcpcb(inp);
 	KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL"));
 
 	tcp_mss_update(tp, -1, mtuoffer, NULL, NULL);
   
 	so = inp->inp_socket;
 	SOCKBUF_LOCK(&so->so_snd);
 	/* If the mss is larger than the socket buffer, decrease the mss. */
 	if (so->so_snd.sb_hiwat < tp->t_maxseg)
 		tp->t_maxseg = so->so_snd.sb_hiwat;
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	TCPSTAT_INC(tcps_mturesent);
 	tp->t_rtttime = 0;
 	tp->snd_nxt = tp->snd_una;
 	tcp_free_sackholes(tp);
 	tp->snd_recover = tp->snd_max;
 	if (tp->t_flags & TF_SACK_PERMIT)
 		EXIT_FASTRECOVERY(tp->t_flags);
 	tp->t_fb->tfb_tcp_output(tp);
 }
 
 #ifdef INET
 /*
  * Look-up the routing entry to the peer of this inpcb.  If no route
  * is found and it cannot be allocated, then return 0.  This routine
  * is called by TCP routines that access the rmx structure and by
  * tcp_mss_update to get the peer/interface MTU.
  */
 u_long
 tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap)
 {
 	struct nhop4_extended nh4;
 	struct ifnet *ifp;
 	u_long maxmtu = 0;
 
 	KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));
 
 	if (inc->inc_faddr.s_addr != INADDR_ANY) {
 
 		if (fib4_lookup_nh_ext(inc->inc_fibnum, inc->inc_faddr,
 		    NHR_REF, 0, &nh4) != 0)
 			return (0);
 
 		ifp = nh4.nh_ifp;
 		maxmtu = nh4.nh_mtu;
 
 		/* Report additional interface capabilities. */
 		if (cap != NULL) {
 			if (ifp->if_capenable & IFCAP_TSO4 &&
 			    ifp->if_hwassist & CSUM_TSO) {
 				cap->ifcap |= CSUM_TSO;
 				cap->tsomax = ifp->if_hw_tsomax;
 				cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount;
 				cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize;
 			}
 		}
 		fib4_free_nh_ext(inc->inc_fibnum, &nh4);
 	}
 	return (maxmtu);
 }
 #endif /* INET */
 
 #ifdef INET6
 u_long
 tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap)
 {
 	struct nhop6_extended nh6;
 	struct in6_addr dst6;
 	uint32_t scopeid;
 	struct ifnet *ifp;
 	u_long maxmtu = 0;
 
 	KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));
 
 	if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
 		in6_splitscope(&inc->inc6_faddr, &dst6, &scopeid);
 		if (fib6_lookup_nh_ext(inc->inc_fibnum, &dst6, scopeid, 0,
 		    0, &nh6) != 0)
 			return (0);
 
 		ifp = nh6.nh_ifp;
 		maxmtu = nh6.nh_mtu;
 
 		/* Report additional interface capabilities. */
 		if (cap != NULL) {
 			if (ifp->if_capenable & IFCAP_TSO6 &&
 			    ifp->if_hwassist & CSUM_TSO) {
 				cap->ifcap |= CSUM_TSO;
 				cap->tsomax = ifp->if_hw_tsomax;
 				cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount;
 				cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize;
 			}
 		}
 		fib6_free_nh_ext(inc->inc_fibnum, &nh6);
 	}
 
 	return (maxmtu);
 }
 #endif /* INET6 */
 
 /*
  * Calculate effective SMSS per RFC5681 definition for a given TCP
  * connection at its current state, taking into account SACK and etc.
  */
 u_int
 tcp_maxseg(const struct tcpcb *tp)
 {
 	u_int optlen;
 
 	if (tp->t_flags & TF_NOOPT)
 		return (tp->t_maxseg);
 
 	/*
 	 * Here we have a simplified code from tcp_addoptions(),
 	 * without a proper loop, and having most of paddings hardcoded.
 	 * We might make mistakes with padding here in some edge cases,
 	 * but this is harmless, since result of tcp_maxseg() is used
 	 * only in cwnd and ssthresh estimations.
 	 */
 #define	PAD(len)	((((len) / 4) + !!((len) % 4)) * 4)
 	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
 		if (tp->t_flags & TF_RCVD_TSTMP)
 			optlen = TCPOLEN_TSTAMP_APPA;
 		else
 			optlen = 0;
 #ifdef TCP_SIGNATURE
 		if (tp->t_flags & TF_SIGNATURE)
 			optlen += PAD(TCPOLEN_SIGNATURE);
 #endif
 		if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) {
 			optlen += TCPOLEN_SACKHDR;
 			optlen += tp->rcv_numsacks * TCPOLEN_SACK;
 			optlen = PAD(optlen);
 		}
 	} else {
 		if (tp->t_flags & TF_REQ_TSTMP)
 			optlen = TCPOLEN_TSTAMP_APPA;
 		else
 			optlen = PAD(TCPOLEN_MAXSEG);
 		if (tp->t_flags & TF_REQ_SCALE)
 			optlen += PAD(TCPOLEN_WINDOW);
 #ifdef TCP_SIGNATURE
 		if (tp->t_flags & TF_SIGNATURE)
 			optlen += PAD(TCPOLEN_SIGNATURE);
 #endif
 		if (tp->t_flags & TF_SACK_PERMIT)
 			optlen += PAD(TCPOLEN_SACK_PERMITTED);
 	}
 #undef PAD
 	optlen = min(optlen, TCP_MAXOLEN);
 	return (tp->t_maxseg - optlen);
 }
 
 #ifdef IPSEC
 /* compute ESP/AH header size for TCP, including outer IP header. */
 size_t
 ipsec_hdrsiz_tcp(struct tcpcb *tp)
 {
 	struct inpcb *inp;
 	struct mbuf *m;
 	size_t hdrsiz;
 	struct ip *ip;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 #endif
 	struct tcphdr *th;
 
 	if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL) ||
 		(!key_havesp(IPSEC_DIR_OUTBOUND)))
 		return (0);
 	m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (!m)
 		return (0);
 
 #ifdef INET6
 	if ((inp->inp_vflag & INP_IPV6) != 0) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		th = (struct tcphdr *)(ip6 + 1);
 		m->m_pkthdr.len = m->m_len =
 			sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 		tcpip_fillheaders(inp, ip6, th);
 		hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
 	} else
 #endif /* INET6 */
 	{
 		ip = mtod(m, struct ip *);
 		th = (struct tcphdr *)(ip + 1);
 		m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
 		tcpip_fillheaders(inp, ip, th);
 		hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
 	}
 
 	m_free(m);
 	return (hdrsiz);
 }
 #endif /* IPSEC */
 
 #ifdef TCP_SIGNATURE
 /*
  * Callback function invoked by m_apply() to digest TCP segment data
  * contained within an mbuf chain.
  */
 static int
 tcp_signature_apply(void *fstate, void *data, u_int len)
 {
 
 	MD5Update(fstate, (u_char *)data, len);
 	return (0);
 }
 
 /*
  * XXX The key is retrieved from the system's PF_KEY SADB, by keying a
  * search with the destination IP address, and a 'magic SPI' to be
  * determined by the application. This is hardcoded elsewhere to 1179
 */
 struct secasvar *
 tcp_get_sav(struct mbuf *m, u_int direction)
 {
 	union sockaddr_union dst;
 	struct secasvar *sav;
 	struct ip *ip;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 	char ip6buf[INET6_ADDRSTRLEN];
 #endif
 
 	/* Extract the destination from the IP header in the mbuf. */
 	bzero(&dst, sizeof(union sockaddr_union));
 	ip = mtod(m, struct ip *);
 #ifdef INET6
 	ip6 = NULL;	/* Make the compiler happy. */
 #endif
 	switch (ip->ip_v) {
 #ifdef INET
 	case IPVERSION:
 		dst.sa.sa_len = sizeof(struct sockaddr_in);
 		dst.sa.sa_family = AF_INET;
 		dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ?
 		    ip->ip_src : ip->ip_dst;
 		break;
 #endif
 #ifdef INET6
 	case (IPV6_VERSION >> 4):
 		ip6 = mtod(m, struct ip6_hdr *);
 		dst.sa.sa_len = sizeof(struct sockaddr_in6);
 		dst.sa.sa_family = AF_INET6;
 		dst.sin6.sin6_addr = (direction == IPSEC_DIR_INBOUND) ?
 		    ip6->ip6_src : ip6->ip6_dst;
 		break;
 #endif
 	default:
 		return (NULL);
 		/* NOTREACHED */
 		break;
 	}
 
 	/* Look up an SADB entry which matches the address of the peer. */
 	sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI));
 	if (sav == NULL) {
 		ipseclog((LOG_ERR, "%s: SADB lookup failed for %s\n", __func__,
 		    (ip->ip_v == IPVERSION) ? inet_ntoa(dst.sin.sin_addr) :
 #ifdef INET6
 			(ip->ip_v == (IPV6_VERSION >> 4)) ?
 			    ip6_sprintf(ip6buf, &dst.sin6.sin6_addr) :
 #endif
 			"(unsupported)"));
 	}
 
 	return (sav);
 }
 
 /*
  * Compute TCP-MD5 hash of a TCP segment. (RFC2385)
  *
  * Parameters:
  * m		pointer to head of mbuf chain
  * len		length of TCP segment data, excluding options
  * optlen	length of TCP segment options
  * buf		pointer to storage for computed MD5 digest
  * sav		pointer to security assosiation
  *
  * We do this over ip, tcphdr, segment data, and the key in the SADB.
  * When called from tcp_input(), we can be sure that th_sum has been
  * zeroed out and verified already.
  *
  * Releases reference to SADB key before return. 
  *
  * Return 0 if successful, otherwise return -1.
  *
  */
 int
 tcp_signature_do_compute(struct mbuf *m, int len, int optlen,
     u_char *buf, struct secasvar *sav)
 {
 #ifdef INET
 	struct ippseudo ippseudo;
 #endif
 	MD5_CTX ctx;
 	int doff;
 	struct ip *ip;
 #ifdef INET
 	struct ipovly *ipovly;
 #endif
 	struct tcphdr *th;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 	struct in6_addr in6;
 	uint32_t plen;
 	uint16_t nhdr;
 #endif
 	u_short savecsum;
 
 	KASSERT(m != NULL, ("NULL mbuf chain"));
 	KASSERT(buf != NULL, ("NULL signature pointer"));
 
 	/* Extract the destination from the IP header in the mbuf. */
 	ip = mtod(m, struct ip *);
 #ifdef INET6
 	ip6 = NULL;	/* Make the compiler happy. */
 #endif
 
 	MD5Init(&ctx);
 	/*
 	 * Step 1: Update MD5 hash with IP(v6) pseudo-header.
 	 *
 	 * XXX The ippseudo header MUST be digested in network byte order,
 	 * or else we'll fail the regression test. Assume all fields we've
 	 * been doing arithmetic on have been in host byte order.
 	 * XXX One cannot depend on ipovly->ih_len here. When called from
 	 * tcp_output(), the underlying ip_len member has not yet been set.
 	 */
 	switch (ip->ip_v) {
 #ifdef INET
 	case IPVERSION:
 		ipovly = (struct ipovly *)ip;
 		ippseudo.ippseudo_src = ipovly->ih_src;
 		ippseudo.ippseudo_dst = ipovly->ih_dst;
 		ippseudo.ippseudo_pad = 0;
 		ippseudo.ippseudo_p = IPPROTO_TCP;
 		ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) +
 		    optlen);
 		MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo));
 
 		th = (struct tcphdr *)((u_char *)ip + sizeof(struct ip));
 		doff = sizeof(struct ip) + sizeof(struct tcphdr) + optlen;
 		break;
 #endif
 #ifdef INET6
 	/*
 	 * RFC 2385, 2.0  Proposal
 	 * For IPv6, the pseudo-header is as described in RFC 2460, namely the
 	 * 128-bit source IPv6 address, 128-bit destination IPv6 address, zero-
 	 * extended next header value (to form 32 bits), and 32-bit segment
 	 * length.
 	 * Note: Upper-Layer Packet Length comes before Next Header.
 	 */
 	case (IPV6_VERSION >> 4):
 		in6 = ip6->ip6_src;
 		in6_clearscope(&in6);
 		MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr));
 		in6 = ip6->ip6_dst;
 		in6_clearscope(&in6);
 		MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr));
 		plen = htonl(len + sizeof(struct tcphdr) + optlen);
 		MD5Update(&ctx, (char *)&plen, sizeof(uint32_t));
 		nhdr = 0;
 		MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
 		MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
 		MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
 		nhdr = IPPROTO_TCP;
 		MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
 
 		th = (struct tcphdr *)((u_char *)ip6 + sizeof(struct ip6_hdr));
 		doff = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + optlen;
 		break;
 #endif
 	default:
 		KEY_FREESAV(&sav);
 		return (-1);
 		/* NOTREACHED */
 		break;
 	}
 
 
 	/*
 	 * Step 2: Update MD5 hash with TCP header, excluding options.
 	 * The TCP checksum must be set to zero.
 	 */
 	savecsum = th->th_sum;
 	th->th_sum = 0;
 	MD5Update(&ctx, (char *)th, sizeof(struct tcphdr));
 	th->th_sum = savecsum;
 
 	/*
 	 * Step 3: Update MD5 hash with TCP segment data.
 	 *         Use m_apply() to avoid an early m_pullup().
 	 */
 	if (len > 0)
 		m_apply(m, doff, len, tcp_signature_apply, &ctx);
 
 	/*
 	 * Step 4: Update MD5 hash with shared secret.
 	 */
 	MD5Update(&ctx, sav->key_auth->key_data, _KEYLEN(sav->key_auth));
 	MD5Final(buf, &ctx);
 
 	key_sa_recordxfer(sav, m);
 	KEY_FREESAV(&sav);
 	return (0);
 }
 
 /*
  * Compute TCP-MD5 hash of a TCP segment. (RFC2385)
  *
  * Return 0 if successful, otherwise return -1.
  */
 int
 tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen,
     u_char *buf, u_int direction)
 {
 	struct secasvar *sav;
 
 	if ((sav = tcp_get_sav(m, direction)) == NULL)
 		return (-1);
 
 	return (tcp_signature_do_compute(m, len, optlen, buf, sav));
 }
 
 /*
  * Verify the TCP-MD5 hash of a TCP segment. (RFC2385)
  *
  * Parameters:
  * m		pointer to head of mbuf chain
  * len		length of TCP segment data, excluding options
  * optlen	length of TCP segment options
  * buf		pointer to storage for computed MD5 digest
  * direction	direction of flow (IPSEC_DIR_INBOUND or OUTBOUND)
  *
  * Return 1 if successful, otherwise return 0.
  */
 int
 tcp_signature_verify(struct mbuf *m, int off0, int tlen, int optlen,
     struct tcpopt *to, struct tcphdr *th, u_int tcpbflag)
 {
 	char tmpdigest[TCP_SIGLEN];
 
 	if (tcp_sig_checksigs == 0)
 		return (1);
 	if ((tcpbflag & TF_SIGNATURE) == 0) {
 		if ((to->to_flags & TOF_SIGNATURE) != 0) {
 
 			/*
 			 * If this socket is not expecting signature but
 			 * the segment contains signature just fail.
 			 */
 			TCPSTAT_INC(tcps_sig_err_sigopt);
 			TCPSTAT_INC(tcps_sig_rcvbadsig);
 			return (0);
 		}
 
 		/* Signature is not expected, and not present in segment. */
 		return (1);
 	}
 
 	/*
 	 * If this socket is expecting signature but the segment does not
 	 * contain any just fail.
 	 */
 	if ((to->to_flags & TOF_SIGNATURE) == 0) {
 		TCPSTAT_INC(tcps_sig_err_nosigopt);
 		TCPSTAT_INC(tcps_sig_rcvbadsig);
 		return (0);
 	}
 	if (tcp_signature_compute(m, off0, tlen, optlen, &tmpdigest[0],
 	    IPSEC_DIR_INBOUND) == -1) {
 		TCPSTAT_INC(tcps_sig_err_buildsig);
 		TCPSTAT_INC(tcps_sig_rcvbadsig);
 		return (0);
 	}
 	
 	if (bcmp(to->to_signature, &tmpdigest[0], TCP_SIGLEN) != 0) {
 		TCPSTAT_INC(tcps_sig_rcvbadsig);
 		return (0);
 	}
 	TCPSTAT_INC(tcps_sig_rcvgoodsig);
 	return (1);
 }
 #endif /* TCP_SIGNATURE */
 
 static int
 sysctl_drop(SYSCTL_HANDLER_ARGS)
 {
 	/* addrs[0] is a foreign socket, addrs[1] is a local one. */
 	struct sockaddr_storage addrs[2];
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct tcptw *tw;
 	struct sockaddr_in *fin, *lin;
 #ifdef INET6
 	struct sockaddr_in6 *fin6, *lin6;
 #endif
 	int error;
 
 	inp = NULL;
 	fin = lin = NULL;
 #ifdef INET6
 	fin6 = lin6 = NULL;
 #endif
 	error = 0;
 
 	if (req->oldptr != NULL || req->oldlen != 0)
 		return (EINVAL);
 	if (req->newptr == NULL)
 		return (EPERM);
 	if (req->newlen < sizeof(addrs))
 		return (ENOMEM);
 	error = SYSCTL_IN(req, &addrs, sizeof(addrs));
 	if (error)
 		return (error);
 
 	switch (addrs[0].ss_family) {
 #ifdef INET6
 	case AF_INET6:
 		fin6 = (struct sockaddr_in6 *)&addrs[0];
 		lin6 = (struct sockaddr_in6 *)&addrs[1];
 		if (fin6->sin6_len != sizeof(struct sockaddr_in6) ||
 		    lin6->sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 		if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) {
 			if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr))
 				return (EINVAL);
 			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]);
 			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]);
 			fin = (struct sockaddr_in *)&addrs[0];
 			lin = (struct sockaddr_in *)&addrs[1];
 			break;
 		}
 		error = sa6_embedscope(fin6, V_ip6_use_defzone);
 		if (error)
 			return (error);
 		error = sa6_embedscope(lin6, V_ip6_use_defzone);
 		if (error)
 			return (error);
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		fin = (struct sockaddr_in *)&addrs[0];
 		lin = (struct sockaddr_in *)&addrs[1];
 		if (fin->sin_len != sizeof(struct sockaddr_in) ||
 		    lin->sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 		break;
 #endif
 	default:
 		return (EINVAL);
 	}
 	INP_INFO_RLOCK(&V_tcbinfo);
 	switch (addrs[0].ss_family) {
 #ifdef INET6
 	case AF_INET6:
 		inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr,
 		    fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port,
 		    INPLOOKUP_WLOCKPCB, NULL);
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port,
 		    lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL);
 		break;
 #endif
 	}
 	if (inp != NULL) {
 		if (inp->inp_flags & INP_TIMEWAIT) {
 			/*
 			 * XXXRW: There currently exists a state where an
 			 * inpcb is present, but its timewait state has been
 			 * discarded.  For now, don't allow dropping of this
 			 * type of inpcb.
 			 */
 			tw = intotw(inp);
 			if (tw != NULL)
 				tcp_twclose(tw, 0);
 			else
 				INP_WUNLOCK(inp);
 		} else if (!(inp->inp_flags & INP_DROPPED) &&
 			   !(inp->inp_socket->so_options & SO_ACCEPTCONN)) {
 			tp = intotcpcb(inp);
 			tp = tcp_drop(tp, ECONNABORTED);
 			if (tp != NULL)
 				INP_WUNLOCK(inp);
 		} else
 			INP_WUNLOCK(inp);
 	} else
 		error = ESRCH;
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop,
     CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP, NULL,
     0, sysctl_drop, "", "Drop TCP connection");
 
 /*
  * Generate a standardized TCP log line for use throughout the
  * tcp subsystem.  Memory allocation is done with M_NOWAIT to
  * allow use in the interrupt context.
  *
  * NB: The caller MUST free(s, M_TCPLOG) the returned string.
  * NB: The function may return NULL if memory allocation failed.
  *
  * Due to header inclusion and ordering limitations the struct ip
  * and ip6_hdr pointers have to be passed as void pointers.
  */
 char *
 tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr,
     const void *ip6hdr)
 {
 
 	/* Is logging enabled? */
 	if (tcp_log_in_vain == 0)
 		return (NULL);
 
 	return (tcp_log_addr(inc, th, ip4hdr, ip6hdr));
 }
 
 char *
 tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr,
     const void *ip6hdr)
 {
 
 	/* Is logging enabled? */
 	if (tcp_log_debug == 0)
 		return (NULL);
 
 	return (tcp_log_addr(inc, th, ip4hdr, ip6hdr));
 }
 
 static char *
 tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr,
     const void *ip6hdr)
 {
 	char *s, *sp;
 	size_t size;
 	struct ip *ip;
 #ifdef INET6
 	const struct ip6_hdr *ip6;
 
 	ip6 = (const struct ip6_hdr *)ip6hdr;
 #endif /* INET6 */
 	ip = (struct ip *)ip4hdr;
 
 	/*
 	 * The log line looks like this:
 	 * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2<SYN>"
 	 */
 	size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") +
 	    sizeof(PRINT_TH_FLAGS) + 1 +
 #ifdef INET6
 	    2 * INET6_ADDRSTRLEN;
 #else
 	    2 * INET_ADDRSTRLEN;
 #endif /* INET6 */
 
 	s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT);
 	if (s == NULL)
 		return (NULL);
 
 	strcat(s, "TCP: [");
 	sp = s + strlen(s);
 
 	if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) {
 		inet_ntoa_r(inc->inc_faddr, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
 		sp = s + strlen(s);
 		inet_ntoa_r(inc->inc_laddr, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(inc->inc_lport));
 #ifdef INET6
 	} else if (inc) {
 		ip6_sprintf(sp, &inc->inc6_faddr);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
 		sp = s + strlen(s);
 		ip6_sprintf(sp, &inc->inc6_laddr);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(inc->inc_lport));
 	} else if (ip6 && th) {
 		ip6_sprintf(sp, &ip6->ip6_src);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(th->th_sport));
 		sp = s + strlen(s);
 		ip6_sprintf(sp, &ip6->ip6_dst);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(th->th_dport));
 #endif /* INET6 */
 #ifdef INET
 	} else if (ip && th) {
 		inet_ntoa_r(ip->ip_src, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(th->th_sport));
 		sp = s + strlen(s);
 		inet_ntoa_r(ip->ip_dst, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(th->th_dport));
 #endif /* INET */
 	} else {
 		free(s, M_TCPLOG);
 		return (NULL);
 	}
 	sp = s + strlen(s);
 	if (th)
 		sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS);
 	if (*(s + size - 1) != '\0')
 		panic("%s: string too long", __func__);
 	return (s);
 }
 
 /*
  * A subroutine which makes it easy to track TCP state changes with DTrace.
  * This function shouldn't be called for t_state initializations that don't
  * correspond to actual TCP state transitions.
  */
 void
 tcp_state_change(struct tcpcb *tp, int newstate)
 {
 #if defined(KDTRACE_HOOKS)
 	int pstate = tp->t_state;
 #endif
 
 	TCPSTATES_DEC(tp->t_state);
 	TCPSTATES_INC(newstate);
 	tp->t_state = newstate;
 	TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate);
 }
Index: head/sys/netinet/tcp_timer.c
===================================================================
--- head/sys/netinet/tcp_timer.c	(revision 297224)
+++ head/sys/netinet/tcp_timer.c	(revision 297225)
@@ -1,1014 +1,1016 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_tcpdebug.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/protosw.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/if.h>
 #include <net/route.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 #include <net/netisr.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_rss.h>
 #include <netinet/in_systm.h>
 #ifdef INET6
 #include <netinet6/in6_pcb.h>
 #endif
 #include <netinet/ip_var.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/cc/cc.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 #include <netinet/tcpip.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
 
 int    tcp_persmin;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval");
 
 int    tcp_persmax;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval");
 
 int	tcp_keepinit;
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");
 
 int	tcp_keepidle;
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin");
 
 int	tcp_keepintvl;
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes");
 
 int	tcp_delacktime;
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
     "Time before a delayed ACK is sent");
 
 int	tcp_msl;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
 
 int	tcp_rexmit_min;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
     "Minimum Retransmission Timeout");
 
 int	tcp_rexmit_slop;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
     "Retransmission Timer Slop");
 
 static int	always_keepalive = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
     &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");
 
 int    tcp_fast_finwait2_recycle = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 
     &tcp_fast_finwait2_recycle, 0,
     "Recycle closed FIN_WAIT_2 connections faster");
 
 int    tcp_finwait2_timeout;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
     &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");
 
 int	tcp_keepcnt = TCPTV_KEEPCNT;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
     "Number of keepalive probes to send");
 
 	/* max idle probes */
 int	tcp_maxpersistidle;
 
 static int	tcp_rexmit_drop_options = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
     &tcp_rexmit_drop_options, 0,
     "Drop TCP options from 3rd and later retransmitted SYN");
 
 static VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
 #define	V_tcp_pmtud_blackhole_detect	VNET(tcp_pmtud_blackhole_detect)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
     CTLFLAG_RW|CTLFLAG_VNET,
     &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
     "Path MTU Discovery Black Hole Detection Enabled");
 
 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated);
 #define	V_tcp_pmtud_blackhole_activated \
     VNET(tcp_pmtud_blackhole_activated)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated,
     CTLFLAG_RD|CTLFLAG_VNET,
     &VNET_NAME(tcp_pmtud_blackhole_activated), 0,
     "Path MTU Discovery Black Hole Detection, Activation Count");
 
 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss);
 #define	V_tcp_pmtud_blackhole_activated_min_mss \
     VNET(tcp_pmtud_blackhole_activated_min_mss)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss,
     CTLFLAG_RD|CTLFLAG_VNET,
     &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0,
     "Path MTU Discovery Black Hole Detection, Activation Count at min MSS");
 
 static VNET_DEFINE(int, tcp_pmtud_blackhole_failed);
 #define	V_tcp_pmtud_blackhole_failed	VNET(tcp_pmtud_blackhole_failed)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed,
     CTLFLAG_RD|CTLFLAG_VNET,
     &VNET_NAME(tcp_pmtud_blackhole_failed), 0,
     "Path MTU Discovery Black Hole Detection, Failure Count");
 
 #ifdef INET
 static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
 #define	V_tcp_pmtud_blackhole_mss	VNET(tcp_pmtud_blackhole_mss)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
     CTLFLAG_RW|CTLFLAG_VNET,
     &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
     "Path MTU Discovery Black Hole Detection lowered MSS");
 #endif
 
 #ifdef INET6
 static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
 #define	V_tcp_v6pmtud_blackhole_mss	VNET(tcp_v6pmtud_blackhole_mss)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
     CTLFLAG_RW|CTLFLAG_VNET,
     &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
     "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
 #endif
 
 #ifdef	RSS
 static int	per_cpu_timers = 1;
 #else
 static int	per_cpu_timers = 0;
 #endif
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
     &per_cpu_timers , 0, "run tcp timers on all cpus");
 
 #if 0
 #define	INP_CPU(inp)	(per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
 		((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
 #endif
 
 /*
  * Map the given inp to a CPU id.
  *
  * This queries RSS if it's compiled in, else it defaults to the current
  * CPU ID.
  */
 static inline int
 inp_to_cpuid(struct inpcb *inp)
 {
 	u_int cpuid;
 
 #ifdef	RSS
 	if (per_cpu_timers) {
 		cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
 		if (cpuid == NETISR_CPUID_NONE)
 			return (curcpu);	/* XXX */
 		else
 			return (cpuid);
 	}
 #else
 	/* Legacy, pre-RSS behaviour */
 	if (per_cpu_timers) {
 		/*
 		 * We don't have a flowid -> cpuid mapping, so cheat and
 		 * just map unknown cpuids to curcpu.  Not the best, but
 		 * apparently better than defaulting to swi 0.
 		 */
 		cpuid = inp->inp_flowid % (mp_maxid + 1);
 		if (! CPU_ABSENT(cpuid))
 			return (cpuid);
 		return (curcpu);
 	}
 #endif
 	/* Default for RSS and non-RSS - cpuid 0 */
 	else {
 		return (0);
 	}
 }
 
 /*
  * Tcp protocol timeout routine called every 500 ms.
  * Updates timestamps used for TCP
  * causes finite state machine actions if timers expire.
  */
 void
 tcp_slowtimo(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		(void) tcp_tw_2msl_scan(0);
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
     { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
 
 int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
     { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };
 
 static int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */
 
 /*
  * TCP timer processing.
  */
 
 void
 tcp_timer_delack(void *xtp)
 {
 	struct tcpcb *tp = xtp;
 	struct inpcb *inp;
 	CURVNET_SET(tp->t_vnet);
 
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	if (callout_pending(&tp->t_timers->tt_delack) ||
 	    !callout_active(&tp->t_timers->tt_delack)) {
 		INP_WUNLOCK(inp);
 		CURVNET_RESTORE();
 		return;
 	}
 	callout_deactivate(&tp->t_timers->tt_delack);
 	if ((inp->inp_flags & INP_DROPPED) != 0) {
 		INP_WUNLOCK(inp);
 		CURVNET_RESTORE();
 		return;
 	}
 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 	KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0,
 		("%s: tp %p delack callout should be running", __func__, tp));
 
 	tp->t_flags |= TF_ACKNOW;
 	TCPSTAT_INC(tcps_delack);
 	(void) tp->t_fb->tfb_tcp_output(tp);
 	INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
 }
 
 void
 tcp_timer_2msl(void *xtp)
 {
 	struct tcpcb *tp = xtp;
 	struct inpcb *inp;
 	CURVNET_SET(tp->t_vnet);
 #ifdef TCPDEBUG
 	int ostate;
 
 	ostate = tp->t_state;
 #endif
 	INP_INFO_RLOCK(&V_tcbinfo);
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	tcp_free_sackholes(tp);
 	if (callout_pending(&tp->t_timers->tt_2msl) ||
 	    !callout_active(&tp->t_timers->tt_2msl)) {
 		INP_WUNLOCK(tp->t_inpcb);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	callout_deactivate(&tp->t_timers->tt_2msl);
 	if ((inp->inp_flags & INP_DROPPED) != 0) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 	KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0,
 		("%s: tp %p 2msl callout should be running", __func__, tp));
 	/*
 	 * 2 MSL timeout in shutdown went off.  If we're closed but
 	 * still waiting for peer to close and connection has been idle
 	 * too long delete connection control block.  Otherwise, check
 	 * again in a bit.
 	 *
 	 * If in TIME_WAIT state just ignore as this timeout is handled in
 	 * tcp_tw_2msl_scan().
 	 *
 	 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, 
 	 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. 
 	 * Ignore fact that there were recent incoming segments.
 	 */
 	if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
 	    tp->t_inpcb && tp->t_inpcb->inp_socket && 
 	    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
 		TCPSTAT_INC(tcps_finwait2_drops);
 		tp = tcp_close(tp);             
 	} else {
 		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
 			if (!callout_reset(&tp->t_timers->tt_2msl,
 			   TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) {
 				tp->t_timers->tt_flags &= ~TT_2MSL_RST;
 			}
 		} else
 		       tp = tcp_close(tp);
        }
 
 #ifdef TCPDEBUG
 	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 			  PRU_SLOWTIMO);
 #endif
 	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 
 	if (tp != NULL)
 		INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 }
 
 void
 tcp_timer_keep(void *xtp)
 {
 	struct tcpcb *tp = xtp;
 	struct tcptemp *t_template;
 	struct inpcb *inp;
 	CURVNET_SET(tp->t_vnet);
 #ifdef TCPDEBUG
 	int ostate;
 
 	ostate = tp->t_state;
 #endif
 	INP_INFO_RLOCK(&V_tcbinfo);
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	if (callout_pending(&tp->t_timers->tt_keep) ||
 	    !callout_active(&tp->t_timers->tt_keep)) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	callout_deactivate(&tp->t_timers->tt_keep);
 	if ((inp->inp_flags & INP_DROPPED) != 0) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 	KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0,
 		("%s: tp %p keep callout should be running", __func__, tp));
 	/*
 	 * Keep-alive timer went off; send something
 	 * or drop connection if idle for too long.
 	 */
 	TCPSTAT_INC(tcps_keeptimeo);
 	if (tp->t_state < TCPS_ESTABLISHED)
 		goto dropit;
 	if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
 	    tp->t_state <= TCPS_CLOSING) {
 		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
 			goto dropit;
 		/*
 		 * Send a packet designed to force a response
 		 * if the peer is up and reachable:
 		 * either an ACK if the connection is still alive,
 		 * or an RST if the peer has closed the connection
 		 * due to timeout or reboot.
 		 * Using sequence number tp->snd_una-1
 		 * causes the transmitted zero-length segment
 		 * to lie outside the receive window;
 		 * by the protocol spec, this requires the
 		 * correspondent TCP to respond.
 		 */
 		TCPSTAT_INC(tcps_keepprobe);
 		t_template = tcpip_maketemplate(inp);
 		if (t_template) {
 			tcp_respond(tp, t_template->tt_ipgen,
 				    &t_template->tt_t, (struct mbuf *)NULL,
 				    tp->rcv_nxt, tp->snd_una - 1, 0);
 			free(t_template, M_TEMP);
 		}
 		if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
 		    tcp_timer_keep, tp)) {
 			tp->t_timers->tt_flags &= ~TT_KEEP_RST;
 		}
 	} else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
 		    tcp_timer_keep, tp)) {
 			tp->t_timers->tt_flags &= ~TT_KEEP_RST;
 		}
 
 #ifdef TCPDEBUG
 	if (inp->inp_socket->so_options & SO_DEBUG)
 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 			  PRU_SLOWTIMO);
 #endif
 	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 	INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 	return;
 
 dropit:
 	TCPSTAT_INC(tcps_keepdrops);
 	tp = tcp_drop(tp, ETIMEDOUT);
 
 #ifdef TCPDEBUG
 	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 			  PRU_SLOWTIMO);
 #endif
 	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 	if (tp != NULL)
 		INP_WUNLOCK(tp->t_inpcb);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 }
 
 void
 tcp_timer_persist(void *xtp)
 {
 	struct tcpcb *tp = xtp;
 	struct inpcb *inp;
 	CURVNET_SET(tp->t_vnet);
 #ifdef TCPDEBUG
 	int ostate;
 
 	ostate = tp->t_state;
 #endif
 	INP_INFO_RLOCK(&V_tcbinfo);
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	if (callout_pending(&tp->t_timers->tt_persist) ||
 	    !callout_active(&tp->t_timers->tt_persist)) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	callout_deactivate(&tp->t_timers->tt_persist);
 	if ((inp->inp_flags & INP_DROPPED) != 0) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 	KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0,
 		("%s: tp %p persist callout should be running", __func__, tp));
 	/*
 	 * Persistance timer into zero window.
 	 * Force a byte to be output, if possible.
 	 */
 	TCPSTAT_INC(tcps_persisttimeo);
 	/*
 	 * Hack: if the peer is dead/unreachable, we do not
 	 * time out if the window is closed.  After a full
 	 * backoff, drop the connection if the idle time
 	 * (no responses to probes) reaches the maximum
 	 * backoff that we would use if retransmitting.
 	 */
 	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
 	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
 	     ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
 		TCPSTAT_INC(tcps_persistdrop);
 		tp = tcp_drop(tp, ETIMEDOUT);
 		goto out;
 	}
 	/*
 	 * If the user has closed the socket then drop a persisting
 	 * connection after a much reduced timeout.
 	 */
 	if (tp->t_state > TCPS_CLOSE_WAIT &&
 	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
 		TCPSTAT_INC(tcps_persistdrop);
 		tp = tcp_drop(tp, ETIMEDOUT);
 		goto out;
 	}
 	tcp_setpersist(tp);
 	tp->t_flags |= TF_FORCEDATA;
 	(void) tp->t_fb->tfb_tcp_output(tp);
 	tp->t_flags &= ~TF_FORCEDATA;
 
 out:
 #ifdef TCPDEBUG
 	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
 		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
 #endif
 	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 	if (tp != NULL)
 		INP_WUNLOCK(inp);
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 }
 
 void
 tcp_timer_rexmt(void * xtp)
 {
 	struct tcpcb *tp = xtp;
 	CURVNET_SET(tp->t_vnet);
 	int rexmt;
 	int headlocked;
 	struct inpcb *inp;
 #ifdef TCPDEBUG
 	int ostate;
 
 	ostate = tp->t_state;
 #endif
 
 	INP_INFO_RLOCK(&V_tcbinfo);
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	if (callout_pending(&tp->t_timers->tt_rexmt) ||
 	    !callout_active(&tp->t_timers->tt_rexmt)) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	callout_deactivate(&tp->t_timers->tt_rexmt);
 	if ((inp->inp_flags & INP_DROPPED) != 0) {
 		INP_WUNLOCK(inp);
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 	KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0,
 		("%s: tp %p rexmt callout should be running", __func__, tp));
 	tcp_free_sackholes(tp);
 	/*
 	 * Retransmission timer went off.  Message has not
 	 * been acked within retransmit interval.  Back off
 	 * to a longer retransmit interval and retransmit one segment.
 	 */
 	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
 		tp->t_rxtshift = TCP_MAXRXTSHIFT;
 		TCPSTAT_INC(tcps_timeoutdrop);
 
 		tp = tcp_drop(tp, tp->t_softerror ?
 			      tp->t_softerror : ETIMEDOUT);
 		headlocked = 1;
 		goto out;
 	}
 	INP_INFO_RUNLOCK(&V_tcbinfo);
 	headlocked = 0;
 	if (tp->t_state == TCPS_SYN_SENT) {
 		/*
 		 * If the SYN was retransmitted, indicate CWND to be
 		 * limited to 1 segment in cc_conn_init().
 		 */
 		tp->snd_cwnd = 1;
 	} else if (tp->t_rxtshift == 1) {
 		/*
 		 * first retransmit; record ssthresh and cwnd so they can
 		 * be recovered if this turns out to be a "bad" retransmit.
 		 * A retransmit is considered "bad" if an ACK for this
 		 * segment is received within RTT/2 interval; the assumption
 		 * here is that the ACK was already in flight.  See
 		 * "On Estimating End-to-End Network Path Properties" by
 		 * Allman and Paxson for more details.
 		 */
 		tp->snd_cwnd_prev = tp->snd_cwnd;
 		tp->snd_ssthresh_prev = tp->snd_ssthresh;
 		tp->snd_recover_prev = tp->snd_recover;
 		if (IN_FASTRECOVERY(tp->t_flags))
 			tp->t_flags |= TF_WASFRECOVERY;
 		else
 			tp->t_flags &= ~TF_WASFRECOVERY;
 		if (IN_CONGRECOVERY(tp->t_flags))
 			tp->t_flags |= TF_WASCRECOVERY;
 		else
 			tp->t_flags &= ~TF_WASCRECOVERY;
 		tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
 		tp->t_flags |= TF_PREVVALID;
 	} else
 		tp->t_flags &= ~TF_PREVVALID;
 	TCPSTAT_INC(tcps_rexmttimeo);
 	if ((tp->t_state == TCPS_SYN_SENT) ||
 	    (tp->t_state == TCPS_SYN_RECEIVED))
 		rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
 	else
 		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
 	TCPT_RANGESET(tp->t_rxtcur, rexmt,
 		      tp->t_rttmin, TCPTV_REXMTMAX);
 
 	/*
 	 * We enter the path for PLMTUD if connection is established or, if
 	 * connection is FIN_WAIT_1 status, reason for the last is that if
 	 * amount of data we send is very small, we could send it in couple of
 	 * packets and process straight to FIN. In that case we won't catch
 	 * ESTABLISHED state.
 	 */
 	if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
 	    || (tp->t_state == TCPS_FIN_WAIT_1))) {
 #ifdef INET6
 		int isipv6;
 #endif
 
 		/*
 		 * Idea here is that at each stage of mtu probe (usually, 1448
 		 * -> 1188 -> 524) should be given 2 chances to recover before
 		 *  further clamping down. 'tp->t_rxtshift % 2 == 0' should
 		 *  take care of that.
 		 */
 		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
 		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
 		    (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) {
 			/*
 			 * Enter Path MTU Black-hole Detection mechanism:
 			 * - Disable Path MTU Discovery (IP "DF" bit).
 			 * - Reduce MTU to lower value than what we
 			 *   negotiated with peer.
 			 */
 			/* Record that we may have found a black hole. */
 			tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
 
 			/* Keep track of previous MSS. */
 			tp->t_pmtud_saved_maxseg = tp->t_maxseg;
 
 			/* 
 			 * Reduce the MSS to blackhole value or to the default
 			 * in an attempt to retransmit.
 			 */
 #ifdef INET6
 			isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
 			if (isipv6 &&
 			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
 				/* Use the sysctl tuneable blackhole MSS. */
 				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
 				V_tcp_pmtud_blackhole_activated++;
 			} else if (isipv6) {
 				/* Use the default MSS. */
 				tp->t_maxseg = V_tcp_v6mssdflt;
 				/*
 				 * Disable Path MTU Discovery when we switch to
 				 * minmss.
 				 */
 				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 				V_tcp_pmtud_blackhole_activated_min_mss++;
 			}
 #endif
 #if defined(INET6) && defined(INET)
 			else
 #endif
 #ifdef INET
 			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
 				/* Use the sysctl tuneable blackhole MSS. */
 				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
 				V_tcp_pmtud_blackhole_activated++;
 			} else {
 				/* Use the default MSS. */
 				tp->t_maxseg = V_tcp_mssdflt;
 				/*
 				 * Disable Path MTU Discovery when we switch to
 				 * minmss.
 				 */
 				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 				V_tcp_pmtud_blackhole_activated_min_mss++;
 			}
 #endif
 			/*
 			 * Reset the slow-start flight size
 			 * as it may depend on the new MSS.
 			 */
 			if (CC_ALGO(tp)->conn_init != NULL)
 				CC_ALGO(tp)->conn_init(tp->ccv);
 		} else {
 			/*
 			 * If further retransmissions are still unsuccessful
 			 * with a lowered MTU, maybe this isn't a blackhole and
 			 * we restore the previous MSS and blackhole detection
 			 * flags.
 			 * The limit '6' is determined by giving each probe
 			 * stage (1448, 1188, 524) 2 chances to recover.
 			 */
 			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
 			    (tp->t_rxtshift > 6)) {
 				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
 				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
 				V_tcp_pmtud_blackhole_failed++;
 				/*
 				 * Reset the slow-start flight size as it
 				 * may depend on the new MSS.
 				 */
 				if (CC_ALGO(tp)->conn_init != NULL)
 					CC_ALGO(tp)->conn_init(tp->ccv);
 			}
 		}
 	}
 
 	/*
 	 * Disable RFC1323 and SACK if we haven't got any response to
 	 * our third SYN to work-around some broken terminal servers
 	 * (most of which have hopefully been retired) that have bad VJ
 	 * header compression code which trashes TCP segments containing
 	 * unknown-to-them TCP options.
 	 */
 	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
 	    (tp->t_rxtshift == 3))
 		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
 	/*
 	 * If we backed off this far, our srtt estimate is probably bogus.
 	 * Clobber it so we'll take the next rtt measurement as our srtt;
 	 * move the current srtt into rttvar to keep the current
 	 * retransmit times until then.
 	 */
 	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
 #ifdef INET6
 		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
 			in6_losing(tp->t_inpcb);
+		else
 #endif
+			in_losing(tp->t_inpcb);
 		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
 		tp->t_srtt = 0;
 	}
 	tp->snd_nxt = tp->snd_una;
 	tp->snd_recover = tp->snd_max;
 	/*
 	 * Force a segment to be sent.
 	 */
 	tp->t_flags |= TF_ACKNOW;
 	/*
 	 * If timing a segment in this window, stop the timer.
 	 */
 	tp->t_rtttime = 0;
 
 	cc_cong_signal(tp, NULL, CC_RTO);
 
 	(void) tp->t_fb->tfb_tcp_output(tp);
 
 out:
 #ifdef TCPDEBUG
 	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 			  PRU_SLOWTIMO);
 #endif
 	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 	if (tp != NULL)
 		INP_WUNLOCK(inp);
 	if (headlocked)
 		INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 }
 
 void
 tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
 {
 	struct callout *t_callout;
 	timeout_t *f_callout;
 	struct inpcb *inp = tp->t_inpcb;
 	int cpu = inp_to_cpuid(inp);
 	uint32_t f_reset;
 
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE)
 		return;
 #endif
 
 	if (tp->t_timers->tt_flags & TT_STOPPED)
 		return;
 
 	switch (timer_type) {
 		case TT_DELACK:
 			t_callout = &tp->t_timers->tt_delack;
 			f_callout = tcp_timer_delack;
 			f_reset = TT_DELACK_RST;
 			break;
 		case TT_REXMT:
 			t_callout = &tp->t_timers->tt_rexmt;
 			f_callout = tcp_timer_rexmt;
 			f_reset = TT_REXMT_RST;
 			break;
 		case TT_PERSIST:
 			t_callout = &tp->t_timers->tt_persist;
 			f_callout = tcp_timer_persist;
 			f_reset = TT_PERSIST_RST;
 			break;
 		case TT_KEEP:
 			t_callout = &tp->t_timers->tt_keep;
 			f_callout = tcp_timer_keep;
 			f_reset = TT_KEEP_RST;
 			break;
 		case TT_2MSL:
 			t_callout = &tp->t_timers->tt_2msl;
 			f_callout = tcp_timer_2msl;
 			f_reset = TT_2MSL_RST;
 			break;
 		default:
 			if (tp->t_fb->tfb_tcp_timer_activate) {
 				tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
 				return;
 			}
 			panic("tp %p bad timer_type %#x", tp, timer_type);
 		}
 	if (delta == 0) {
 		if ((tp->t_timers->tt_flags & timer_type) &&
 		    (callout_stop(t_callout) > 0) &&
 		    (tp->t_timers->tt_flags & f_reset)) {
 			tp->t_timers->tt_flags &= ~(timer_type | f_reset);
 		}
 	} else {
 		if ((tp->t_timers->tt_flags & timer_type) == 0) {
 			tp->t_timers->tt_flags |= (timer_type | f_reset);
 			callout_reset_on(t_callout, delta, f_callout, tp, cpu);
 		} else {
 			/* Reset already running callout on the same CPU. */
 			if (!callout_reset(t_callout, delta, f_callout, tp)) {
 				/*
 				 * Callout not cancelled, consider it as not
 				 * properly restarted. */
 				tp->t_timers->tt_flags &= ~f_reset;
 			}
 		}
 	}
 }
 
 int
 tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
 {
 	struct callout *t_callout;
 
 	switch (timer_type) {
 		case TT_DELACK:
 			t_callout = &tp->t_timers->tt_delack;
 			break;
 		case TT_REXMT:
 			t_callout = &tp->t_timers->tt_rexmt;
 			break;
 		case TT_PERSIST:
 			t_callout = &tp->t_timers->tt_persist;
 			break;
 		case TT_KEEP:
 			t_callout = &tp->t_timers->tt_keep;
 			break;
 		case TT_2MSL:
 			t_callout = &tp->t_timers->tt_2msl;
 			break;
 		default:
 			if (tp->t_fb->tfb_tcp_timer_active) {
 				return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
 			}
 			panic("tp %p bad timer_type %#x", tp, timer_type);
 		}
 	return callout_active(t_callout);
 }
 
 void
 tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
 {
 	struct callout *t_callout;
 	timeout_t *f_callout;
 	uint32_t f_reset;
 
 	tp->t_timers->tt_flags |= TT_STOPPED;
 
 	switch (timer_type) {
 		case TT_DELACK:
 			t_callout = &tp->t_timers->tt_delack;
 			f_callout = tcp_timer_delack_discard;
 			f_reset = TT_DELACK_RST;
 			break;
 		case TT_REXMT:
 			t_callout = &tp->t_timers->tt_rexmt;
 			f_callout = tcp_timer_rexmt_discard;
 			f_reset = TT_REXMT_RST;
 			break;
 		case TT_PERSIST:
 			t_callout = &tp->t_timers->tt_persist;
 			f_callout = tcp_timer_persist_discard;
 			f_reset = TT_PERSIST_RST;
 			break;
 		case TT_KEEP:
 			t_callout = &tp->t_timers->tt_keep;
 			f_callout = tcp_timer_keep_discard;
 			f_reset = TT_KEEP_RST;
 			break;
 		case TT_2MSL:
 			t_callout = &tp->t_timers->tt_2msl;
 			f_callout = tcp_timer_2msl_discard;
 			f_reset = TT_2MSL_RST;
 			break;
 		default:
 			if (tp->t_fb->tfb_tcp_timer_stop) {
 				/* 
 				 * XXXrrs we need to look at this with the
 				 * stop case below (flags).
 				 */
 				tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
 				return;
 			}
 			panic("tp %p bad timer_type %#x", tp, timer_type);
 		}
 
 	if (tp->t_timers->tt_flags & timer_type) {
 		if ((callout_stop(t_callout) > 0) &&
 		    (tp->t_timers->tt_flags & f_reset)) {
 			tp->t_timers->tt_flags &= ~(timer_type | f_reset);
 		} else {
 			/*
 			 * Can't stop the callout, defer tcpcb actual deletion
 			 * to the last tcp timer discard callout.
 			 * The TT_STOPPED flag will ensure that no tcp timer
 			 * callouts can be restarted on our behalf, and
 			 * past this point currently running callouts waiting
 			 * on inp lock will return right away after the
 			 * classical check for callout reset/stop events:
 			 * callout_pending() || !callout_active()
 			 */
 			callout_reset(t_callout, 1, f_callout, tp);
 		}
 	}
 }
 
 #define	ticks_to_msecs(t)	(1000*(t) / hz)
 
 void
 tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
     struct xtcp_timer *xtimer)
 {
 	sbintime_t now;
 
 	bzero(xtimer, sizeof(*xtimer));
 	if (timer == NULL)
 		return;
 	now = getsbinuptime();
 	if (callout_active(&timer->tt_delack))
 		xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS;
 	if (callout_active(&timer->tt_rexmt))
 		xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS;
 	if (callout_active(&timer->tt_persist))
 		xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS;
 	if (callout_active(&timer->tt_keep))
 		xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS;
 	if (callout_active(&timer->tt_2msl))
 		xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS;
 	xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
 }
Index: head/sys/netinet/udp_usrreq.c
===================================================================
--- head/sys/netinet/udp_usrreq.c	(revision 297224)
+++ head/sys/netinet/udp_usrreq.c	(revision 297225)
@@ -1,1915 +1,1933 @@
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.
  * Copyright (c) 2008 Robert N. M. Watson
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * Copyright (c) 2014 Kevin Lo
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)udp_usrreq.c	8.6 (Berkeley) 5/23/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ipfw.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/domain.h>
 #include <sys/eventhandler.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/rss_config.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif
 #include <netinet/ip_icmp.h>
 #include <netinet/icmp_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #ifdef INET6
 #include <netinet6/ip6_var.h>
 #endif
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <netinet/udplite.h>
 #include <netinet/in_rss.h>
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/esp.h>
 #endif
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 /*
  * UDP and UDP-Lite protocols implementation.
  * Per RFC 768, August, 1980.
  * Per RFC 3828, July, 2004.
  */
 
 /*
  * BSD 4.2 defaulted the udp checksum to be off.  Turning off udp checksums
  * removes the only data integrity mechanism for packets and malformed
  * packets that would otherwise be discarded due to bad checksums, and may
  * cause problems (especially for NFS data blocks).
  */
 VNET_DEFINE(int, udp_cksum) = 1;
 SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(udp_cksum), 0, "compute udp checksum");
 
 int	udp_log_in_vain = 0;
 SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW,
     &udp_log_in_vain, 0, "Log all incoming UDP packets");
 
 VNET_DEFINE(int, udp_blackhole) = 0;
 SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(udp_blackhole), 0,
     "Do not send port unreachables for refused connects");
 
 u_long	udp_sendspace = 9216;		/* really max datagram size */
 SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
     &udp_sendspace, 0, "Maximum outgoing UDP datagram size");
 
 u_long	udp_recvspace = 40 * (1024 +
 #ifdef INET6
 				      sizeof(struct sockaddr_in6)
 #else
 				      sizeof(struct sockaddr_in)
 #endif
 				      );	/* 40 1K datagrams */
 
 SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
     &udp_recvspace, 0, "Maximum space for incoming UDP datagrams");
 
 VNET_DEFINE(struct inpcbhead, udb);		/* from udp_var.h */
 VNET_DEFINE(struct inpcbinfo, udbinfo);
 VNET_DEFINE(struct inpcbhead, ulitecb);
 VNET_DEFINE(struct inpcbinfo, ulitecbinfo);
 static VNET_DEFINE(uma_zone_t, udpcb_zone);
 #define	V_udpcb_zone			VNET(udpcb_zone)
 
 #ifndef UDBHASHSIZE
 #define	UDBHASHSIZE	128
 #endif
 
 VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat);		/* from udp_var.h */
 VNET_PCPUSTAT_SYSINIT(udpstat);
 SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat,
     udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)");
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(udpstat);
 #endif /* VIMAGE */
 #ifdef INET
 static void	udp_detach(struct socket *so);
 static int	udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
 		    struct mbuf *, struct thread *);
 #endif
 
 #ifdef IPSEC
 #ifdef IPSEC_NAT_T
 #define	UF_ESPINUDP_ALL	(UF_ESPINUDP_NON_IKE|UF_ESPINUDP)
 #ifdef INET
 static struct mbuf *udp4_espdecap(struct inpcb *, struct mbuf *, int);
 #endif
 #endif /* IPSEC_NAT_T */
 #endif /* IPSEC */
 
 static void
 udp_zone_change(void *tag)
 {
 
 	uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets);
 	uma_zone_set_max(V_udpcb_zone, maxsockets);
 }
 
 static int
 udp_inpcb_init(void *mem, int size, int flags)
 {
 	struct inpcb *inp;
 
 	inp = mem;
 	INP_LOCK_INIT(inp, "inp", "udpinp");
 	return (0);
 }
 
 static int
 udplite_inpcb_init(void *mem, int size, int flags)
 {
 	struct inpcb *inp;
 
 	inp = mem;
 	INP_LOCK_INIT(inp, "inp", "udpliteinp");
 	return (0);
 }
 
 void
 udp_init(void)
 {
 
 	/*
 	 * For now default to 2-tuple UDP hashing - until the fragment
 	 * reassembly code can also update the flowid.
 	 *
 	 * Once we can calculate the flowid that way and re-establish
 	 * a 4-tuple, flip this to 4-tuple.
 	 */
 	in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE,
 	    "udp_inpcb", udp_inpcb_init, NULL, 0,
 	    IPI_HASHFIELDS_2TUPLE);
 	V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	uma_zone_set_max(V_udpcb_zone, maxsockets);
 	uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached");
 	EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL,
 	    EVENTHANDLER_PRI_ANY);
 }
 
 void
 udplite_init(void)
 {
 
 	in_pcbinfo_init(&V_ulitecbinfo, "udplite", &V_ulitecb, UDBHASHSIZE,
 	    UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init, NULL,
 	    0, IPI_HASHFIELDS_2TUPLE);
 }
 
 /*
  * Kernel module interface for updating udpstat.  The argument is an index
  * into udpstat treated as an array of u_long.  While this encodes the
  * general layout of udpstat into the caller, it doesn't encode its location,
  * so that future changes to add, for example, per-CPU stats support won't
  * cause binary compatibility problems for kernel modules.
  */
 void
 kmod_udpstat_inc(int statnum)
 {
 
 	counter_u64_add(VNET(udpstat)[statnum], 1);
 }
 
 int
 udp_newudpcb(struct inpcb *inp)
 {
 	struct udpcb *up;
 
 	up = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO);
 	if (up == NULL)
 		return (ENOBUFS);
 	inp->inp_ppcb = up;
 	return (0);
 }
 
 void
 udp_discardcb(struct udpcb *up)
 {
 
 	uma_zfree(V_udpcb_zone, up);
 }
 
 #ifdef VIMAGE
 void
 udp_destroy(void)
 {
 
 	in_pcbinfo_destroy(&V_udbinfo);
 	uma_zdestroy(V_udpcb_zone);
 }
 
 void
 udplite_destroy(void)
 {
 
 	in_pcbinfo_destroy(&V_ulitecbinfo);
 }
 #endif
 
 #ifdef INET
 /*
  * Subroutine of udp_input(), which appends the provided mbuf chain to the
  * passed pcb/socket.  The caller must provide a sockaddr_in via udp_in that
  * contains the source address.  If the socket ends up being an IPv6 socket,
  * udp_append() will convert to a sockaddr_in6 before passing the address
  * into the socket code.
  *
  * In the normal case udp_append() will return 0, indicating that you
  * must unlock the inp. However if a tunneling protocol is in place we increment
  * the inpcb refcnt and unlock the inp, on return from the tunneling protocol we
  * then decrement the reference count. If the inp_rele returns 1, indicating the
  * inp is gone, we return that to the caller to tell them *not* to unlock
  * the inp. In the case of multi-cast this will cause the distribution
  * to stop (though most tunneling protocols known currently do *not* use
  * multicast).
  */
 static int
 udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
     struct sockaddr_in *udp_in)
 {
 	struct sockaddr *append_sa;
 	struct socket *so;
 	struct mbuf *opts = 0;
 #ifdef INET6
 	struct sockaddr_in6 udp_in6;
 #endif
 	struct udpcb *up;
 
 	INP_LOCK_ASSERT(inp);
 
 	/*
 	 * Engage the tunneling protocol.
 	 */
 	up = intoudpcb(inp);
 	if (up->u_tun_func != NULL) {
 		in_pcbref(inp);
 		INP_RUNLOCK(inp);
 		(*up->u_tun_func)(n, off, inp, (struct sockaddr *)udp_in,
 		    up->u_tun_ctx);
 		INP_RLOCK(inp);
 		return (in_pcbrele_rlocked(inp));
 	}
 
 	off += sizeof(struct udphdr);
 
 #ifdef IPSEC
 	/* Check AH/ESP integrity. */
 	if (ipsec4_in_reject(n, inp)) {
 		m_freem(n);
 		return (0);
 	}
 #ifdef IPSEC_NAT_T
 	up = intoudpcb(inp);
 	KASSERT(up != NULL, ("%s: udpcb NULL", __func__));
 	if (up->u_flags & UF_ESPINUDP_ALL) {	/* IPSec UDP encaps. */
 		n = udp4_espdecap(inp, n, off);
 		if (n == NULL)				/* Consumed. */
 			return (0);
 	}
 #endif /* IPSEC_NAT_T */
 #endif /* IPSEC */
 #ifdef MAC
 	if (mac_inpcb_check_deliver(inp, n) != 0) {
 		m_freem(n);
 		return (0);
 	}
 #endif /* MAC */
 	if (inp->inp_flags & INP_CONTROLOPTS ||
 	    inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
 #ifdef INET6
 		if (inp->inp_vflag & INP_IPV6)
 			(void)ip6_savecontrol_v4(inp, n, &opts, NULL);
 		else
 #endif /* INET6 */
 			ip_savecontrol(inp, &opts, ip, n);
 	}
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6) {
 		bzero(&udp_in6, sizeof(udp_in6));
 		udp_in6.sin6_len = sizeof(udp_in6);
 		udp_in6.sin6_family = AF_INET6;
 		in6_sin_2_v4mapsin6(udp_in, &udp_in6);
 		append_sa = (struct sockaddr *)&udp_in6;
 	} else
 #endif /* INET6 */
 		append_sa = (struct sockaddr *)udp_in;
 	m_adj(n, off);
 
 	so = inp->inp_socket;
 	SOCKBUF_LOCK(&so->so_rcv);
 	if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		m_freem(n);
 		if (opts)
 			m_freem(opts);
 		UDPSTAT_INC(udps_fullsock);
 	} else
 		sorwakeup_locked(so);
 	return (0);
 }
 
 int
 udp_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct ip *ip;
 	struct udphdr *uh;
 	struct ifnet *ifp;
 	struct inpcb *inp;
 	uint16_t len, ip_len;
 	struct inpcbinfo *pcbinfo;
 	struct ip save_ip;
 	struct sockaddr_in udp_in;
 	struct mbuf *m;
 	struct m_tag *fwd_tag;
 	int cscov_partial, iphlen;
 
 	m = *mp;
 	iphlen = *offp;
 	ifp = m->m_pkthdr.rcvif;
 	*mp = NULL;
 	UDPSTAT_INC(udps_ipackets);
 
 	/*
 	 * Strip IP options, if any; should skip this, make available to
 	 * user, and use on returned packets, but we don't yet have a way to
 	 * check the checksum with options still present.
 	 */
 	if (iphlen > sizeof (struct ip)) {
 		ip_stripoptions(m);
 		iphlen = sizeof(struct ip);
 	}
 
 	/*
 	 * Get IP and UDP header together in first mbuf.
 	 */
 	ip = mtod(m, struct ip *);
 	if (m->m_len < iphlen + sizeof(struct udphdr)) {
 		if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) {
 			UDPSTAT_INC(udps_hdrops);
 			return (IPPROTO_DONE);
 		}
 		ip = mtod(m, struct ip *);
 	}
 	uh = (struct udphdr *)((caddr_t)ip + iphlen);
 	cscov_partial = (proto == IPPROTO_UDPLITE) ? 1 : 0;
 
 	/*
 	 * Destination port of 0 is illegal, based on RFC768.
 	 */
 	if (uh->uh_dport == 0)
 		goto badunlocked;
 
 	/*
 	 * Construct sockaddr format source address.  Stuff source address
 	 * and datagram in user buffer.
 	 */
 	bzero(&udp_in, sizeof(udp_in));
 	udp_in.sin_len = sizeof(udp_in);
 	udp_in.sin_family = AF_INET;
 	udp_in.sin_port = uh->uh_sport;
 	udp_in.sin_addr = ip->ip_src;
 
 	/*
 	 * Make mbuf data length reflect UDP length.  If not enough data to
 	 * reflect UDP length, drop.
 	 */
 	len = ntohs((u_short)uh->uh_ulen);
 	ip_len = ntohs(ip->ip_len) - iphlen;
 	if (proto == IPPROTO_UDPLITE && (len == 0 || len == ip_len)) {
 		/* Zero means checksum over the complete packet. */
 		if (len == 0)
 			len = ip_len;
 		cscov_partial = 0;
 	}
 	if (ip_len != len) {
 		if (len > ip_len || len < sizeof(struct udphdr)) {
 			UDPSTAT_INC(udps_badlen);
 			goto badunlocked;
 		}
 		if (proto == IPPROTO_UDP)
 			m_adj(m, len - ip_len);
 	}
 
 	/*
 	 * Save a copy of the IP header in case we want restore it for
 	 * sending an ICMP error message in response.
 	 */
 	if (!V_udp_blackhole)
 		save_ip = *ip;
 	else
 		memset(&save_ip, 0, sizeof(save_ip));
 
 	/*
 	 * Checksum extended UDP header and data.
 	 */
 	if (uh->uh_sum) {
 		u_short uh_sum;
 
 		if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) &&
 		    !cscov_partial) {
 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 				uh_sum = m->m_pkthdr.csum_data;
 			else
 				uh_sum = in_pseudo(ip->ip_src.s_addr,
 				    ip->ip_dst.s_addr, htonl((u_short)len +
 				    m->m_pkthdr.csum_data + proto));
 			uh_sum ^= 0xffff;
 		} else {
 			char b[9];
 
 			bcopy(((struct ipovly *)ip)->ih_x1, b, 9);
 			bzero(((struct ipovly *)ip)->ih_x1, 9);
 			((struct ipovly *)ip)->ih_len = (proto == IPPROTO_UDP) ?
 			    uh->uh_ulen : htons(ip_len);
 			uh_sum = in_cksum(m, len + sizeof (struct ip));
 			bcopy(b, ((struct ipovly *)ip)->ih_x1, 9);
 		}
 		if (uh_sum) {
 			UDPSTAT_INC(udps_badsum);
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 	} else {
 		if (proto == IPPROTO_UDP) {
 			UDPSTAT_INC(udps_nosum);
 		} else {
 			/* UDPLite requires a checksum */
 			/* XXX: What is the right UDPLite MIB counter here? */
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 	}
 
 	pcbinfo = udp_get_inpcbinfo(proto);
 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
 	    in_broadcast(ip->ip_dst, ifp)) {
 		struct inpcb *last;
 		struct inpcbhead *pcblist;
 		struct ip_moptions *imo;
 
 		INP_INFO_RLOCK(pcbinfo);
 		pcblist = udp_get_pcblist(proto);
 		last = NULL;
 		LIST_FOREACH(inp, pcblist, inp_list) {
 			if (inp->inp_lport != uh->uh_dport)
 				continue;
 #ifdef INET6
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_laddr.s_addr != INADDR_ANY &&
 			    inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
 				continue;
 			if (inp->inp_faddr.s_addr != INADDR_ANY &&
 			    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
 				continue;
 			if (inp->inp_fport != 0 &&
 			    inp->inp_fport != uh->uh_sport)
 				continue;
 
 			INP_RLOCK(inp);
 
 			/*
 			 * XXXRW: Because we weren't holding either the inpcb
 			 * or the hash lock when we checked for a match
 			 * before, we should probably recheck now that the
 			 * inpcb lock is held.
 			 */
 
 			/*
 			 * Handle socket delivery policy for any-source
 			 * and source-specific multicast. [RFC3678]
 			 */
 			imo = inp->inp_moptions;
 			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
 				struct sockaddr_in	 group;
 				int			 blocked;
 				if (imo == NULL) {
 					INP_RUNLOCK(inp);
 					continue;
 				}
 				bzero(&group, sizeof(struct sockaddr_in));
 				group.sin_len = sizeof(struct sockaddr_in);
 				group.sin_family = AF_INET;
 				group.sin_addr = ip->ip_dst;
 
 				blocked = imo_multi_filter(imo, ifp,
 					(struct sockaddr *)&group,
 					(struct sockaddr *)&udp_in);
 				if (blocked != MCAST_PASS) {
 					if (blocked == MCAST_NOTGMEMBER)
 						IPSTAT_INC(ips_notmember);
 					if (blocked == MCAST_NOTSMEMBER ||
 					    blocked == MCAST_MUTED)
 						UDPSTAT_INC(udps_filtermcast);
 					INP_RUNLOCK(inp);
 					continue;
 				}
 			}
 			if (last != NULL) {
 				struct mbuf *n;
 
 				if ((n = m_copy(m, 0, M_COPYALL)) != NULL) {
 					UDP_PROBE(receive, NULL, last, ip,
 					    last, uh);
 					if (udp_append(last, ip, n, iphlen,
 						&udp_in)) {
 						goto inp_lost;
 					}
 				}
 				INP_RUNLOCK(last);
 			}
 			last = inp;
 			/*
 			 * Don't look for additional matches if this one does
 			 * not have either the SO_REUSEPORT or SO_REUSEADDR
 			 * socket options set.  This heuristic avoids
 			 * searching through all pcbs in the common case of a
 			 * non-shared port.  It assumes that an application
 			 * will never clear these options after setting them.
 			 */
 			if ((last->inp_socket->so_options &
 			    (SO_REUSEPORT|SO_REUSEADDR)) == 0)
 				break;
 		}
 
 		if (last == NULL) {
 			/*
 			 * No matching pcb found; discard datagram.  (No need
 			 * to send an ICMP Port Unreachable for a broadcast
 			 * or multicast datgram.)
 			 */
 			UDPSTAT_INC(udps_noportbcast);
 			if (inp)
 				INP_RUNLOCK(inp);
 			INP_INFO_RUNLOCK(pcbinfo);
 			goto badunlocked;
 		}
 		UDP_PROBE(receive, NULL, last, ip, last, uh);
 		if (udp_append(last, ip, m, iphlen, &udp_in) == 0) 
 			INP_RUNLOCK(last);
 	inp_lost:
 		INP_INFO_RUNLOCK(pcbinfo);
 		return (IPPROTO_DONE);
 	}
 
 	/*
 	 * Locate pcb for datagram.
 	 */
 
 	/*
 	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
 	 */
 	if ((m->m_flags & M_IP_NEXTHOP) &&
 	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
 		struct sockaddr_in *next_hop;
 
 		next_hop = (struct sockaddr_in *)(fwd_tag + 1);
 
 		/*
 		 * Transparently forwarded. Pretend to be the destination.
 		 * Already got one like this?
 		 */
 		inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
 		    ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m);
 		if (!inp) {
 			/*
 			 * It's new.  Try to find the ambushing socket.
 			 * Because we've rewritten the destination address,
 			 * any hardware-generated hash is ignored.
 			 */
 			inp = in_pcblookup(pcbinfo, ip->ip_src,
 			    uh->uh_sport, next_hop->sin_addr,
 			    next_hop->sin_port ? htons(next_hop->sin_port) :
 			    uh->uh_dport, INPLOOKUP_WILDCARD |
 			    INPLOOKUP_RLOCKPCB, ifp);
 		}
 		/* Remove the tag from the packet. We don't need it anymore. */
 		m_tag_delete(m, fwd_tag);
 		m->m_flags &= ~M_IP_NEXTHOP;
 	} else
 		inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
 		    ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD |
 		    INPLOOKUP_RLOCKPCB, ifp, m);
 	if (inp == NULL) {
 		if (udp_log_in_vain) {
 			char buf[4*sizeof "123"];
 
 			strcpy(buf, inet_ntoa(ip->ip_dst));
 			log(LOG_INFO,
 			    "Connection attempt to UDP %s:%d from %s:%d\n",
 			    buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src),
 			    ntohs(uh->uh_sport));
 		}
 		UDPSTAT_INC(udps_noport);
 		if (m->m_flags & (M_BCAST | M_MCAST)) {
 			UDPSTAT_INC(udps_noportbcast);
 			goto badunlocked;
 		}
 		if (V_udp_blackhole)
 			goto badunlocked;
 		if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
 			goto badunlocked;
 		*ip = save_ip;
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
 		return (IPPROTO_DONE);
 	}
 
 	/*
 	 * Check the minimum TTL for socket.
 	 */
 	INP_RLOCK_ASSERT(inp);
 	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) {
 		INP_RUNLOCK(inp);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 	if (cscov_partial) {
 		struct udpcb *up;
 
 		up = intoudpcb(inp);
 		if (up->u_rxcslen == 0 || up->u_rxcslen > len) {
 			INP_RUNLOCK(inp);
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 	}
 
 	UDP_PROBE(receive, NULL, inp, ip, inp, uh);
 	if (udp_append(inp, ip, m, iphlen, &udp_in) == 0) 
 		INP_RUNLOCK(inp);
 	return (IPPROTO_DONE);
 
 badunlocked:
 	m_freem(m);
 	return (IPPROTO_DONE);
 }
 #endif /* INET */
 
 /*
  * Notify a udp user of an asynchronous error; just wake up so that they can
  * collect error status.
  */
 struct inpcb *
 udp_notify(struct inpcb *inp, int errno)
 {
 
 	/*
 	 * While udp_ctlinput() always calls udp_notify() with a read lock
 	 * when invoking it directly, in_pcbnotifyall() currently uses write
 	 * locks due to sharing code with TCP.  For now, accept either a read
 	 * or a write lock, but a read lock is sufficient.
 	 */
 	INP_LOCK_ASSERT(inp);
+	if ((errno == EHOSTUNREACH || errno == ENETUNREACH ||
+	     errno == EHOSTDOWN) && inp->inp_route.ro_rt) {
+		RTFREE(inp->inp_route.ro_rt);
+		inp->inp_route.ro_rt = (struct rtentry *)NULL;
+	}
 
 	inp->inp_socket->so_error = errno;
 	sorwakeup(inp->inp_socket);
 	sowwakeup(inp->inp_socket);
 	return (inp);
 }
 
 #ifdef INET
 static void
 udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip,
     struct inpcbinfo *pcbinfo)
 {
 	struct ip *ip = vip;
 	struct udphdr *uh;
 	struct in_addr faddr;
 	struct inpcb *inp;
 
 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
 		return;
 
-	/*
-	 * Redirects don't need to be handled up here.
-	 */
-	if (PRC_IS_REDIRECT(cmd))
+	if (PRC_IS_REDIRECT(cmd)) {
+		/* signal EHOSTDOWN, as it flushes the cached route */
+		in_pcbnotifyall(&udbinfo, faddr, EHOSTDOWN, udp_notify);
 		return;
+	}
 
 	/*
 	 * Hostdead is ugly because it goes linearly through all PCBs.
 	 *
 	 * XXX: We never get this from ICMP, otherwise it makes an excellent
 	 * DoS attack on machines with many connections.
 	 */
 	if (cmd == PRC_HOSTDEAD)
 		ip = NULL;
 	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
 		return;
 	if (ip != NULL) {
 		uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 		inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport,
 		    ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL);
 		if (inp != NULL) {
 			INP_RLOCK_ASSERT(inp);
 			if (inp->inp_socket != NULL) {
 				udp_notify(inp, inetctlerrmap[cmd]);
 			}
 			INP_RUNLOCK(inp);
 		}
 	} else
 		in_pcbnotifyall(pcbinfo, faddr, inetctlerrmap[cmd],
 		    udp_notify);
 }
 void
 udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 
 	return (udp_common_ctlinput(cmd, sa, vip, &V_udbinfo));
 }
 
 void
 udplite_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 
 	return (udp_common_ctlinput(cmd, sa, vip, &V_ulitecbinfo));
 }
 #endif /* INET */
 
 static int
 udp_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	int error, i, n;
 	struct inpcb *inp, **inp_list;
 	inp_gen_t gencnt;
 	struct xinpgen xig;
 
 	/*
 	 * The process of preparing the PCB list is too time-consuming and
 	 * resource-intensive to repeat twice on every request.
 	 */
 	if (req->oldptr == 0) {
 		n = V_udbinfo.ipi_count;
 		n += imax(n / 8, 10);
 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
 		return (0);
 	}
 
 	if (req->newptr != 0)
 		return (EPERM);
 
 	/*
 	 * OK, now we're committed to doing something.
 	 */
 	INP_INFO_RLOCK(&V_udbinfo);
 	gencnt = V_udbinfo.ipi_gencnt;
 	n = V_udbinfo.ipi_count;
 	INP_INFO_RUNLOCK(&V_udbinfo);
 
 	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
 		+ n * sizeof(struct xinpcb));
 	if (error != 0)
 		return (error);
 
 	xig.xig_len = sizeof xig;
 	xig.xig_count = n;
 	xig.xig_gen = gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return (error);
 
 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
 	if (inp_list == 0)
 		return (ENOMEM);
 
 	INP_INFO_RLOCK(&V_udbinfo);
 	for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n;
 	     inp = LIST_NEXT(inp, inp_list)) {
 		INP_WLOCK(inp);
 		if (inp->inp_gencnt <= gencnt &&
 		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
 			in_pcbref(inp);
 			inp_list[i++] = inp;
 		}
 		INP_WUNLOCK(inp);
 	}
 	INP_INFO_RUNLOCK(&V_udbinfo);
 	n = i;
 
 	error = 0;
 	for (i = 0; i < n; i++) {
 		inp = inp_list[i];
 		INP_RLOCK(inp);
 		if (inp->inp_gencnt <= gencnt) {
 			struct xinpcb xi;
 
 			bzero(&xi, sizeof(xi));
 			xi.xi_len = sizeof xi;
 			/* XXX should avoid extra copy */
 			bcopy(inp, &xi.xi_inp, sizeof *inp);
 			if (inp->inp_socket)
 				sotoxsocket(inp->inp_socket, &xi.xi_socket);
 			xi.xi_inp.inp_gencnt = inp->inp_gencnt;
 			INP_RUNLOCK(inp);
 			error = SYSCTL_OUT(req, &xi, sizeof xi);
 		} else
 			INP_RUNLOCK(inp);
 	}
 	INP_INFO_WLOCK(&V_udbinfo);
 	for (i = 0; i < n; i++) {
 		inp = inp_list[i];
 		INP_RLOCK(inp);
 		if (!in_pcbrele_rlocked(inp))
 			INP_RUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(&V_udbinfo);
 
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.  If the
 		 * generation differs from what we told her before, she knows
 		 * that something happened while we were processing this
 		 * request, and it might be necessary to retry.
 		 */
 		INP_INFO_RLOCK(&V_udbinfo);
 		xig.xig_gen = V_udbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = V_udbinfo.ipi_count;
 		INP_INFO_RUNLOCK(&V_udbinfo);
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 	free(inp_list, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist,
     CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0,
     udp_pcblist, "S,xinpcb", "List of active UDP sockets");
 
 #ifdef INET
 static int
 udp_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct xucred xuc;
 	struct sockaddr_in addrs[2];
 	struct inpcb *inp;
 	int error;
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error)
 		return (error);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 	inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
 	    addrs[0].sin_addr, addrs[0].sin_port,
 	    INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
 	if (inp != NULL) {
 		INP_RLOCK_ASSERT(inp);
 		if (inp->inp_socket == NULL)
 			error = ENOENT;
 		if (error == 0)
 			error = cr_canseeinpcb(req->td->td_ucred, inp);
 		if (error == 0)
 			cru2x(inp->inp_cred, &xuc);
 		INP_RUNLOCK(inp);
 	} else
 		error = ENOENT;
 	if (error == 0)
 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
     udp_getcred, "S,xucred", "Get the xucred of a UDP connection");
 #endif /* INET */
 
 int
 udp_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct inpcb *inp;
 	struct udpcb *up;
 	int isudplite, error, optval;
 
 	error = 0;
 	isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 1 : 0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
 	INP_WLOCK(inp);
 	if (sopt->sopt_level != so->so_proto->pr_protocol) {
 #ifdef INET6
 		if (INP_CHECK_SOCKAF(so, AF_INET6)) {
 			INP_WUNLOCK(inp);
 			error = ip6_ctloutput(so, sopt);
 		}
 #endif
 #if defined(INET) && defined(INET6)
 		else
 #endif
 #ifdef INET
 		{
 			INP_WUNLOCK(inp);
 			error = ip_ctloutput(so, sopt);
 		}
 #endif
 		return (error);
 	}
 
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case UDP_ENCAP:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 			inp = sotoinpcb(so);
 			KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
 			INP_WLOCK(inp);
 #ifdef IPSEC_NAT_T
 			up = intoudpcb(inp);
 			KASSERT(up != NULL, ("%s: up == NULL", __func__));
 #endif
 			switch (optval) {
 			case 0:
 				/* Clear all UDP encap. */
 #ifdef IPSEC_NAT_T
 				up->u_flags &= ~UF_ESPINUDP_ALL;
 #endif
 				break;
 #ifdef IPSEC_NAT_T
 			case UDP_ENCAP_ESPINUDP:
 			case UDP_ENCAP_ESPINUDP_NON_IKE:
 				up->u_flags &= ~UF_ESPINUDP_ALL;
 				if (optval == UDP_ENCAP_ESPINUDP)
 					up->u_flags |= UF_ESPINUDP;
 				else if (optval == UDP_ENCAP_ESPINUDP_NON_IKE)
 					up->u_flags |= UF_ESPINUDP_NON_IKE;
 				break;
 #endif
 			default:
 				error = EINVAL;
 				break;
 			}
 			INP_WUNLOCK(inp);
 			break;
 		case UDPLITE_SEND_CSCOV:
 		case UDPLITE_RECV_CSCOV:
 			if (!isudplite) {
 				INP_WUNLOCK(inp);
 				error = ENOPROTOOPT;
 				break;
 			}
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof(optval),
 			    sizeof(optval));
 			if (error != 0)
 				break;
 			inp = sotoinpcb(so);
 			KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
 			INP_WLOCK(inp);
 			up = intoudpcb(inp);
 			KASSERT(up != NULL, ("%s: up == NULL", __func__));
 			if ((optval != 0 && optval < 8) || (optval > 65535)) {
 				INP_WUNLOCK(inp);
 				error = EINVAL;
 				break;
 			}
 			if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
 				up->u_txcslen = optval;
 			else
 				up->u_rxcslen = optval;
 			INP_WUNLOCK(inp);
 			break;
 		default:
 			INP_WUNLOCK(inp);
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 #ifdef IPSEC_NAT_T
 		case UDP_ENCAP:
 			up = intoudpcb(inp);
 			KASSERT(up != NULL, ("%s: up == NULL", __func__));
 			optval = up->u_flags & UF_ESPINUDP_ALL;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 #endif
 		case UDPLITE_SEND_CSCOV:
 		case UDPLITE_RECV_CSCOV:
 			if (!isudplite) {
 				INP_WUNLOCK(inp);
 				error = ENOPROTOOPT;
 				break;
 			}
 			up = intoudpcb(inp);
 			KASSERT(up != NULL, ("%s: up == NULL", __func__));
 			if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
 				optval = up->u_txcslen;
 			else
 				optval = up->u_rxcslen;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
 		default:
 			INP_WUNLOCK(inp);
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	}	
 	return (error);
 }
 
 #ifdef INET
 #define	UH_WLOCKED	2
 #define	UH_RLOCKED	1
 #define	UH_UNLOCKED	0
 static int
 udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
     struct mbuf *control, struct thread *td)
 {
 	struct udpiphdr *ui;
 	int len = m->m_pkthdr.len;
 	struct in_addr faddr, laddr;
 	struct cmsghdr *cm;
 	struct inpcbinfo *pcbinfo;
 	struct sockaddr_in *sin, src;
 	int cscov_partial = 0;
 	int error = 0;
 	int ipflags;
 	u_short fport, lport;
-	int unlock_udbinfo;
+	int unlock_udbinfo, unlock_inp;
 	u_char tos;
 	uint8_t pr;
 	uint16_t cscov = 0;
 	uint32_t flowid = 0;
 	uint8_t flowtype = M_HASHTYPE_NONE;
 
 	/*
 	 * udp_output() may need to temporarily bind or connect the current
 	 * inpcb.  As such, we don't know up front whether we will need the
 	 * pcbinfo lock or not.  Do any work to decide what is needed up
 	 * front before acquiring any locks.
 	 */
 	if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
 		if (control)
 			m_freem(control);
 		m_freem(m);
 		return (EMSGSIZE);
 	}
 
 	src.sin_family = 0;
-	INP_RLOCK(inp);
+	sin = (struct sockaddr_in *)addr;
+	if (sin == NULL ||
+	    (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) {
+		INP_WLOCK(inp);
+		unlock_inp = UH_WLOCKED;
+	} else {
+		INP_RLOCK(inp);
+		unlock_inp = UH_RLOCKED;
+	}
 	tos = inp->inp_ip_tos;
 	if (control != NULL) {
 		/*
 		 * XXX: Currently, we assume all the optional information is
 		 * stored in a single mbuf.
 		 */
 		if (control->m_next) {
-			INP_RUNLOCK(inp);
+			if (unlock_inp == UH_WLOCKED)
+				INP_WUNLOCK(inp);
+			else
+				INP_RUNLOCK(inp);
 			m_freem(control);
 			m_freem(m);
 			return (EINVAL);
 		}
 		for (; control->m_len > 0;
 		    control->m_data += CMSG_ALIGN(cm->cmsg_len),
 		    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
 			cm = mtod(control, struct cmsghdr *);
 			if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0
 			    || cm->cmsg_len > control->m_len) {
 				error = EINVAL;
 				break;
 			}
 			if (cm->cmsg_level != IPPROTO_IP)
 				continue;
 
 			switch (cm->cmsg_type) {
 			case IP_SENDSRCADDR:
 				if (cm->cmsg_len !=
 				    CMSG_LEN(sizeof(struct in_addr))) {
 					error = EINVAL;
 					break;
 				}
 				bzero(&src, sizeof(src));
 				src.sin_family = AF_INET;
 				src.sin_len = sizeof(src);
 				src.sin_port = inp->inp_lport;
 				src.sin_addr =
 				    *(struct in_addr *)CMSG_DATA(cm);
 				break;
 
 			case IP_TOS:
 				if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) {
 					error = EINVAL;
 					break;
 				}
 				tos = *(u_char *)CMSG_DATA(cm);
 				break;
 
 			case IP_FLOWID:
 				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
 					error = EINVAL;
 					break;
 				}
 				flowid = *(uint32_t *) CMSG_DATA(cm);
 				break;
 
 			case IP_FLOWTYPE:
 				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
 					error = EINVAL;
 					break;
 				}
 				flowtype = *(uint32_t *) CMSG_DATA(cm);
 				break;
 
 #ifdef	RSS
 			case IP_RSSBUCKETID:
 				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
 					error = EINVAL;
 					break;
 				}
 				/* This is just a placeholder for now */
 				break;
 #endif	/* RSS */
 			default:
 				error = ENOPROTOOPT;
 				break;
 			}
 			if (error)
 				break;
 		}
 		m_freem(control);
 	}
 	if (error) {
-		INP_RUNLOCK(inp);
+		if (unlock_inp == UH_WLOCKED)
+			INP_WUNLOCK(inp);
+		else
+			INP_RUNLOCK(inp);
 		m_freem(m);
 		return (error);
 	}
 
 	/*
 	 * Depending on whether or not the application has bound or connected
 	 * the socket, we may have to do varying levels of work.  The optimal
 	 * case is for a connected UDP socket, as a global lock isn't
 	 * required at all.
 	 *
 	 * In order to decide which we need, we require stability of the
 	 * inpcb binding, which we ensure by acquiring a read lock on the
 	 * inpcb.  This doesn't strictly follow the lock order, so we play
 	 * the trylock and retry game; note that we may end up with more
 	 * conservative locks than required the second time around, so later
 	 * assertions have to accept that.  Further analysis of the number of
 	 * misses under contention is required.
 	 *
 	 * XXXRW: Check that hash locking update here is correct.
 	 */
 	pr = inp->inp_socket->so_proto->pr_protocol;
 	pcbinfo = udp_get_inpcbinfo(pr);
 	sin = (struct sockaddr_in *)addr;
 	if (sin != NULL &&
 	    (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) {
-		INP_RUNLOCK(inp);
-		INP_WLOCK(inp);
 		INP_HASH_WLOCK(pcbinfo);
 		unlock_udbinfo = UH_WLOCKED;
 	} else if ((sin != NULL && (
 	    (sin->sin_addr.s_addr == INADDR_ANY) ||
 	    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
 	    (inp->inp_laddr.s_addr == INADDR_ANY) ||
 	    (inp->inp_lport == 0))) ||
 	    (src.sin_family == AF_INET)) {
 		INP_HASH_RLOCK(pcbinfo);
 		unlock_udbinfo = UH_RLOCKED;
 	} else
 		unlock_udbinfo = UH_UNLOCKED;
 
 	/*
 	 * If the IP_SENDSRCADDR control message was specified, override the
 	 * source address for this datagram.  Its use is invalidated if the
 	 * address thus specified is incomplete or clobbers other inpcbs.
 	 */
 	laddr = inp->inp_laddr;
 	lport = inp->inp_lport;
 	if (src.sin_family == AF_INET) {
 		INP_HASH_LOCK_ASSERT(pcbinfo);
 		if ((lport == 0) ||
 		    (laddr.s_addr == INADDR_ANY &&
 		     src.sin_addr.s_addr == INADDR_ANY)) {
 			error = EINVAL;
 			goto release;
 		}
 		error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
 		    &laddr.s_addr, &lport, td->td_ucred);
 		if (error)
 			goto release;
 	}
 
 	/*
 	 * If a UDP socket has been connected, then a local address/port will
 	 * have been selected and bound.
 	 *
 	 * If a UDP socket has not been connected to, then an explicit
 	 * destination address must be used, in which case a local
 	 * address/port may not have been selected and bound.
 	 */
 	if (sin != NULL) {
 		INP_LOCK_ASSERT(inp);
 		if (inp->inp_faddr.s_addr != INADDR_ANY) {
 			error = EISCONN;
 			goto release;
 		}
 
 		/*
 		 * Jail may rewrite the destination address, so let it do
 		 * that before we use it.
 		 */
 		error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
 		if (error)
 			goto release;
 
 		/*
 		 * If a local address or port hasn't yet been selected, or if
 		 * the destination address needs to be rewritten due to using
 		 * a special INADDR_ constant, invoke in_pcbconnect_setup()
 		 * to do the heavy lifting.  Once a port is selected, we
 		 * commit the binding back to the socket; we also commit the
 		 * binding of the address if in jail.
 		 *
 		 * If we already have a valid binding and we're not
 		 * requesting a destination address rewrite, use a fast path.
 		 */
 		if (inp->inp_laddr.s_addr == INADDR_ANY ||
 		    inp->inp_lport == 0 ||
 		    sin->sin_addr.s_addr == INADDR_ANY ||
 		    sin->sin_addr.s_addr == INADDR_BROADCAST) {
 			INP_HASH_LOCK_ASSERT(pcbinfo);
 			error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
 			    &lport, &faddr.s_addr, &fport, NULL,
 			    td->td_ucred);
 			if (error)
 				goto release;
 
 			/*
 			 * XXXRW: Why not commit the port if the address is
 			 * !INADDR_ANY?
 			 */
 			/* Commit the local port if newly assigned. */
 			if (inp->inp_laddr.s_addr == INADDR_ANY &&
 			    inp->inp_lport == 0) {
 				INP_WLOCK_ASSERT(inp);
 				INP_HASH_WLOCK_ASSERT(pcbinfo);
 				/*
 				 * Remember addr if jailed, to prevent
 				 * rebinding.
 				 */
 				if (prison_flag(td->td_ucred, PR_IP4))
 					inp->inp_laddr = laddr;
 				inp->inp_lport = lport;
 				if (in_pcbinshash(inp) != 0) {
 					inp->inp_lport = 0;
 					error = EAGAIN;
 					goto release;
 				}
 				inp->inp_flags |= INP_ANONPORT;
 			}
 		} else {
 			faddr = sin->sin_addr;
 			fport = sin->sin_port;
 		}
 	} else {
 		INP_LOCK_ASSERT(inp);
 		faddr = inp->inp_faddr;
 		fport = inp->inp_fport;
 		if (faddr.s_addr == INADDR_ANY) {
 			error = ENOTCONN;
 			goto release;
 		}
 	}
 
 	/*
 	 * Calculate data length and get a mbuf for UDP, IP, and possible
 	 * link-layer headers.  Immediate slide the data pointer back forward
 	 * since we won't use that space at this layer.
 	 */
 	M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT);
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto release;
 	}
 	m->m_data += max_linkhdr;
 	m->m_len -= max_linkhdr;
 	m->m_pkthdr.len -= max_linkhdr;
 
 	/*
 	 * Fill in mbuf with extended UDP header and addresses and length put
 	 * into network format.
 	 */
 	ui = mtod(m, struct udpiphdr *);
 	bzero(ui->ui_x1, sizeof(ui->ui_x1));	/* XXX still needed? */
 	ui->ui_pr = pr;
 	ui->ui_src = laddr;
 	ui->ui_dst = faddr;
 	ui->ui_sport = lport;
 	ui->ui_dport = fport;
 	ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
 	if (pr == IPPROTO_UDPLITE) {
 		struct udpcb *up;
 		uint16_t plen;
 
 		up = intoudpcb(inp);
 		cscov = up->u_txcslen;
 		plen = (u_short)len + sizeof(struct udphdr);
 		if (cscov >= plen)
 			cscov = 0;
 		ui->ui_len = htons(plen);
 		ui->ui_ulen = htons(cscov);
 		/*
 		 * For UDP-Lite, checksum coverage length of zero means
 		 * the entire UDPLite packet is covered by the checksum.
 		 */
 		cscov_partial = (cscov == 0) ? 0 : 1;
 	} else
 		ui->ui_v = IPVERSION << 4;
 
 	/*
 	 * Set the Don't Fragment bit in the IP header.
 	 */
 	if (inp->inp_flags & INP_DONTFRAG) {
 		struct ip *ip;
 
 		ip = (struct ip *)&ui->ui_i;
 		ip->ip_off |= htons(IP_DF);
 	}
 
 	ipflags = 0;
 	if (inp->inp_socket->so_options & SO_DONTROUTE)
 		ipflags |= IP_ROUTETOIF;
 	if (inp->inp_socket->so_options & SO_BROADCAST)
 		ipflags |= IP_ALLOWBROADCAST;
 	if (inp->inp_flags & INP_ONESBCAST)
 		ipflags |= IP_SENDONES;
 
 #ifdef MAC
 	mac_inpcb_create_mbuf(inp, m);
 #endif
 
 	/*
 	 * Set up checksum and output datagram.
 	 */
 	ui->ui_sum = 0;
 	if (pr == IPPROTO_UDPLITE) {
 		if (inp->inp_flags & INP_ONESBCAST)
 			faddr.s_addr = INADDR_BROADCAST;
 		if (cscov_partial) {
 			if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0)
 				ui->ui_sum = 0xffff;
 		} else {
 			if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0)
 				ui->ui_sum = 0xffff;
 		}
 	} else if (V_udp_cksum) {
 		if (inp->inp_flags & INP_ONESBCAST)
 			faddr.s_addr = INADDR_BROADCAST;
 		ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
 		    htons((u_short)len + sizeof(struct udphdr) + pr));
 		m->m_pkthdr.csum_flags = CSUM_UDP;
 		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 	}
 	((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len);
 	((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl;	/* XXX */
 	((struct ip *)ui)->ip_tos = tos;		/* XXX */
 	UDPSTAT_INC(udps_opackets);
 
 	/*
 	 * Setup flowid / RSS information for outbound socket.
 	 *
 	 * Once the UDP code decides to set a flowid some other way,
 	 * this allows the flowid to be overridden by userland.
 	 */
 	if (flowtype != M_HASHTYPE_NONE) {
 		m->m_pkthdr.flowid = flowid;
 		M_HASHTYPE_SET(m, flowtype);
 #ifdef	RSS
 	} else {
 		uint32_t hash_val, hash_type;
 		/*
 		 * Calculate an appropriate RSS hash for UDP and
 		 * UDP Lite.
 		 *
 		 * The called function will take care of figuring out
 		 * whether a 2-tuple or 4-tuple hash is required based
 		 * on the currently configured scheme.
 		 *
 		 * Later later on connected socket values should be
 		 * cached in the inpcb and reused, rather than constantly
 		 * re-calculating it.
 		 *
 		 * UDP Lite is a different protocol number and will
 		 * likely end up being hashed as a 2-tuple until
 		 * RSS / NICs grow UDP Lite protocol awareness.
 		 */
 		if (rss_proto_software_hash_v4(faddr, laddr, fport, lport,
 		    pr, &hash_val, &hash_type) == 0) {
 			m->m_pkthdr.flowid = hash_val;
 			M_HASHTYPE_SET(m, hash_type);
 		}
 #endif
 	}
 
 #ifdef	RSS
 	/*
 	 * Don't override with the inp cached flowid value.
 	 *
 	 * Depending upon the kind of send being done, the inp
 	 * flowid/flowtype values may actually not be appropriate
 	 * for this particular socket send.
 	 *
 	 * We should either leave the flowid at zero (which is what is
 	 * currently done) or set it to some software generated
 	 * hash value based on the packet contents.
 	 */
 	ipflags |= IP_NODEFAULTFLOWID;
 #endif	/* RSS */
 
 	if (unlock_udbinfo == UH_WLOCKED)
 		INP_HASH_WUNLOCK(pcbinfo);
 	else if (unlock_udbinfo == UH_RLOCKED)
 		INP_HASH_RUNLOCK(pcbinfo);
 	UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
-	error = ip_output(m, inp->inp_options, NULL, ipflags,
+	error = ip_output(m, inp->inp_options,
+	    (unlock_inp == UH_WLOCKED ? &inp->inp_route : NULL), ipflags,
 	    inp->inp_moptions, inp);
-	if (unlock_udbinfo == UH_WLOCKED)
+	if (unlock_inp == UH_WLOCKED)
 		INP_WUNLOCK(inp);
 	else
 		INP_RUNLOCK(inp);
 	return (error);
 
 release:
 	if (unlock_udbinfo == UH_WLOCKED) {
 		INP_HASH_WUNLOCK(pcbinfo);
 		INP_WUNLOCK(inp);
 	} else if (unlock_udbinfo == UH_RLOCKED) {
 		INP_HASH_RUNLOCK(pcbinfo);
 		INP_RUNLOCK(inp);
 	} else
 		INP_RUNLOCK(inp);
 	m_freem(m);
 	return (error);
 }
 
 
 #if defined(IPSEC) && defined(IPSEC_NAT_T)
 /*
  * Potentially decap ESP in UDP frame.  Check for an ESP header
  * and optional marker; if present, strip the UDP header and
  * push the result through IPSec.
  *
  * Returns mbuf to be processed (potentially re-allocated) or
  * NULL if consumed and/or processed.
  */
 static struct mbuf *
 udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off)
 {
 	size_t minlen, payload, skip, iphlen;
 	caddr_t data;
 	struct udpcb *up;
 	struct m_tag *tag;
 	struct udphdr *udphdr;
 	struct ip *ip;
 
 	INP_RLOCK_ASSERT(inp);
 
 	/* 
 	 * Pull up data so the longest case is contiguous:
 	 *    IP/UDP hdr + non ESP marker + ESP hdr.
 	 */
 	minlen = off + sizeof(uint64_t) + sizeof(struct esp);
 	if (minlen > m->m_pkthdr.len)
 		minlen = m->m_pkthdr.len;
 	if ((m = m_pullup(m, minlen)) == NULL) {
 		IPSECSTAT_INC(ips_in_inval);
 		return (NULL);		/* Bypass caller processing. */
 	}
 	data = mtod(m, caddr_t);	/* Points to ip header. */
 	payload = m->m_len - off;	/* Size of payload. */
 
 	if (payload == 1 && data[off] == '\xff')
 		return (m);		/* NB: keepalive packet, no decap. */
 
 	up = intoudpcb(inp);
 	KASSERT(up != NULL, ("%s: udpcb NULL", __func__));
 	KASSERT((up->u_flags & UF_ESPINUDP_ALL) != 0,
 	    ("u_flags 0x%x", up->u_flags));
 
 	/* 
 	 * Check that the payload is large enough to hold an
 	 * ESP header and compute the amount of data to remove.
 	 *
 	 * NB: the caller has already done a pullup for us.
 	 * XXX can we assume alignment and eliminate bcopys?
 	 */
 	if (up->u_flags & UF_ESPINUDP_NON_IKE) {
 		/*
 		 * draft-ietf-ipsec-nat-t-ike-0[01].txt and
 		 * draft-ietf-ipsec-udp-encaps-(00/)01.txt, ignoring
 		 * possible AH mode non-IKE marker+non-ESP marker
 		 * from draft-ietf-ipsec-udp-encaps-00.txt.
 		 */
 		uint64_t marker;
 
 		if (payload <= sizeof(uint64_t) + sizeof(struct esp))
 			return (m);	/* NB: no decap. */
 		bcopy(data + off, &marker, sizeof(uint64_t));
 		if (marker != 0)	/* Non-IKE marker. */
 			return (m);	/* NB: no decap. */
 		skip = sizeof(uint64_t) + sizeof(struct udphdr);
 	} else {
 		uint32_t spi;
 
 		if (payload <= sizeof(struct esp)) {
 			IPSECSTAT_INC(ips_in_inval);
 			m_freem(m);
 			return (NULL);	/* Discard. */
 		}
 		bcopy(data + off, &spi, sizeof(uint32_t));
 		if (spi == 0)		/* Non-ESP marker. */
 			return (m);	/* NB: no decap. */
 		skip = sizeof(struct udphdr);
 	}
 
 	/*
 	 * Setup a PACKET_TAG_IPSEC_NAT_T_PORT tag to remember
 	 * the UDP ports. This is required if we want to select
 	 * the right SPD for multiple hosts behind same NAT.
 	 *
 	 * NB: ports are maintained in network byte order everywhere
 	 *     in the NAT-T code.
 	 */
 	tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS,
 		2 * sizeof(uint16_t), M_NOWAIT);
 	if (tag == NULL) {
 		IPSECSTAT_INC(ips_in_nomem);
 		m_freem(m);
 		return (NULL);		/* Discard. */
 	}
 	iphlen = off - sizeof(struct udphdr);
 	udphdr = (struct udphdr *)(data + iphlen);
 	((uint16_t *)(tag + 1))[0] = udphdr->uh_sport;
 	((uint16_t *)(tag + 1))[1] = udphdr->uh_dport;
 	m_tag_prepend(m, tag);
 
 	/*
 	 * Remove the UDP header (and possibly the non ESP marker)
 	 * IP header length is iphlen
 	 * Before:
 	 *   <--- off --->
 	 *   +----+------+-----+
 	 *   | IP |  UDP | ESP |
 	 *   +----+------+-----+
 	 *        <-skip->
 	 * After:
 	 *          +----+-----+
 	 *          | IP | ESP |
 	 *          +----+-----+
 	 *   <-skip->
 	 */
 	ovbcopy(data, data + skip, iphlen);
 	m_adj(m, skip);
 
 	ip = mtod(m, struct ip *);
 	ip->ip_len = htons(ntohs(ip->ip_len) - skip);
 	ip->ip_p = IPPROTO_ESP;
 
 	/*
 	 * We cannot yet update the cksums so clear any
 	 * h/w cksum flags as they are no longer valid.
 	 */
 	if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)
 		m->m_pkthdr.csum_flags &= ~(CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
 
 	(void) ipsec_common_input(m, iphlen, offsetof(struct ip, ip_p),
 				AF_INET, ip->ip_p);
 	return (NULL);			/* NB: consumed, bypass processing. */
 }
 #endif /* defined(IPSEC) && defined(IPSEC_NAT_T) */
 
 static void
 udp_abort(struct socket *so)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_abort: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
 		INP_HASH_WLOCK(pcbinfo);
 		in_pcbdisconnect(inp);
 		inp->inp_laddr.s_addr = INADDR_ANY;
 		INP_HASH_WUNLOCK(pcbinfo);
 		soisdisconnected(so);
 	}
 	INP_WUNLOCK(inp);
 }
 
 static int
 udp_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	int error;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp == NULL, ("udp_attach: inp != NULL"));
 	error = soreserve(so, udp_sendspace, udp_recvspace);
 	if (error)
 		return (error);
 	INP_INFO_WLOCK(pcbinfo);
 	error = in_pcballoc(so, pcbinfo);
 	if (error) {
 		INP_INFO_WUNLOCK(pcbinfo);
 		return (error);
 	}
 
 	inp = sotoinpcb(so);
 	inp->inp_vflag |= INP_IPV4;
 	inp->inp_ip_ttl = V_ip_defttl;
 
 	error = udp_newudpcb(inp);
 	if (error) {
 		in_pcbdetach(inp);
 		in_pcbfree(inp);
 		INP_INFO_WUNLOCK(pcbinfo);
 		return (error);
 	}
 
 	INP_WUNLOCK(inp);
 	INP_INFO_WUNLOCK(pcbinfo);
 	return (0);
 }
 #endif /* INET */
 
 int
 udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, void *ctx)
 {
 	struct inpcb *inp;
 	struct udpcb *up;
 
 	KASSERT(so->so_type == SOCK_DGRAM,
 	    ("udp_set_kernel_tunneling: !dgram"));
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL"));
 	INP_WLOCK(inp);
 	up = intoudpcb(inp);
 	if (up->u_tun_func != NULL) {
 		INP_WUNLOCK(inp);
 		return (EBUSY);
 	}
 	up->u_tun_func = f;
 	up->u_tun_ctx = ctx;
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 #ifdef INET
 static int
 udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	int error;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_bind: inp == NULL"));
 	INP_WLOCK(inp);
 	INP_HASH_WLOCK(pcbinfo);
 	error = in_pcbbind(inp, nam, td->td_ucred);
 	INP_HASH_WUNLOCK(pcbinfo);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 static void
 udp_close(struct socket *so)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_close: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
 		INP_HASH_WLOCK(pcbinfo);
 		in_pcbdisconnect(inp);
 		inp->inp_laddr.s_addr = INADDR_ANY;
 		INP_HASH_WUNLOCK(pcbinfo);
 		soisdisconnected(so);
 	}
 	INP_WUNLOCK(inp);
 }
 
 static int
 udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	struct sockaddr_in *sin;
 	int error;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_connect: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
 		INP_WUNLOCK(inp);
 		return (EISCONN);
 	}
 	sin = (struct sockaddr_in *)nam;
 	error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
 	if (error != 0) {
 		INP_WUNLOCK(inp);
 		return (error);
 	}
 	INP_HASH_WLOCK(pcbinfo);
 	error = in_pcbconnect(inp, nam, td->td_ucred);
 	INP_HASH_WUNLOCK(pcbinfo);
 	if (error == 0)
 		soisconnected(so);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 static void
 udp_detach(struct socket *so)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	struct udpcb *up;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
 	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
 	    ("udp_detach: not disconnected"));
 	INP_INFO_WLOCK(pcbinfo);
 	INP_WLOCK(inp);
 	up = intoudpcb(inp);
 	KASSERT(up != NULL, ("%s: up == NULL", __func__));
 	inp->inp_ppcb = NULL;
 	in_pcbdetach(inp);
 	in_pcbfree(inp);
 	INP_INFO_WUNLOCK(pcbinfo);
 	udp_discardcb(up);
 }
 
 static int
 udp_disconnect(struct socket *so)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_disconnect: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_faddr.s_addr == INADDR_ANY) {
 		INP_WUNLOCK(inp);
 		return (ENOTCONN);
 	}
 	INP_HASH_WLOCK(pcbinfo);
 	in_pcbdisconnect(inp);
 	inp->inp_laddr.s_addr = INADDR_ANY;
 	INP_HASH_WUNLOCK(pcbinfo);
 	SOCK_LOCK(so);
 	so->so_state &= ~SS_ISCONNECTED;		/* XXX */
 	SOCK_UNLOCK(so);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 static int
 udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
     struct mbuf *control, struct thread *td)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_send: inp == NULL"));
 	return (udp_output(inp, m, addr, control, td));
 }
 #endif /* INET */
 
 int
 udp_shutdown(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_shutdown: inp == NULL"));
 	INP_WLOCK(inp);
 	socantsendmore(so);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 #ifdef INET
 struct pr_usrreqs udp_usrreqs = {
 	.pru_abort =		udp_abort,
 	.pru_attach =		udp_attach,
 	.pru_bind =		udp_bind,
 	.pru_connect =		udp_connect,
 	.pru_control =		in_control,
 	.pru_detach =		udp_detach,
 	.pru_disconnect =	udp_disconnect,
 	.pru_peeraddr =		in_getpeeraddr,
 	.pru_send =		udp_send,
 	.pru_soreceive =	soreceive_dgram,
 	.pru_sosend =		sosend_dgram,
 	.pru_shutdown =		udp_shutdown,
 	.pru_sockaddr =		in_getsockaddr,
 	.pru_sosetlabel =	in_pcbsosetlabel,
 	.pru_close =		udp_close,
 };
 #endif /* INET */
Index: head/sys/netinet6/in6_pcb.c
===================================================================
--- head/sys/netinet6/in6_pcb.c	(revision 297224)
+++ head/sys/netinet6/in6_pcb.c	(revision 297225)
@@ -1,1275 +1,1278 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: in6_pcb.c,v 1.31 2001/05/21 05:45:10 jinmei Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_pcb.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_pcbgroup.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/errno.h>
 #include <sys/time.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/jail.h>
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_systm.h>
 #include <netinet/tcp_var.h>
 #include <netinet/ip6.h>
 #include <netinet/ip_var.h>
 
 #include <netinet6/ip6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet/in_pcb.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/scope6_var.h>
 
 static struct inpcb *in6_pcblookup_hash_locked(struct inpcbinfo *,
     struct in6_addr *, u_int, struct in6_addr *, u_int, int, struct ifnet *);
 
 int
 in6_pcbbind(register struct inpcb *inp, struct sockaddr *nam,
     struct ucred *cred)
 {
 	struct socket *so = inp->inp_socket;
 	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)NULL;
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	u_short	lport = 0;
 	int error, lookupflags = 0;
 	int reuseport = (so->so_options & SO_REUSEPORT);
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	if (TAILQ_EMPTY(&V_in6_ifaddrhead))	/* XXX broken! */
 		return (EADDRNOTAVAIL);
 	if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
 		return (EINVAL);
 	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
 		lookupflags = INPLOOKUP_WILDCARD;
 	if (nam == NULL) {
 		if ((error = prison_local_ip6(cred, &inp->in6p_laddr,
 		    ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0)
 			return (error);
 	} else {
 		sin6 = (struct sockaddr_in6 *)nam;
 		if (nam->sa_len != sizeof(*sin6))
 			return (EINVAL);
 		/*
 		 * family check.
 		 */
 		if (nam->sa_family != AF_INET6)
 			return (EAFNOSUPPORT);
 
 		if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0)
 			return(error);
 
 		if ((error = prison_local_ip6(cred, &sin6->sin6_addr,
 		    ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0)
 			return (error);
 
 		lport = sin6->sin6_port;
 		if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
 			/*
 			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
 			 * allow compepte duplication of binding if
 			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
 			 * and a multicast address is bound on both
 			 * new and duplicated sockets.
 			 */
 			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
 				reuseport = SO_REUSEADDR|SO_REUSEPORT;
 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 			struct ifaddr *ifa;
 
 			sin6->sin6_port = 0;		/* yech... */
 			if ((ifa = ifa_ifwithaddr((struct sockaddr *)sin6)) ==
 			    NULL &&
 			    (inp->inp_flags & INP_BINDANY) == 0) {
 				return (EADDRNOTAVAIL);
 			}
 
 			/*
 			 * XXX: bind to an anycast address might accidentally
 			 * cause sending a packet with anycast source address.
 			 * We should allow to bind to a deprecated address, since
 			 * the application dares to use it.
 			 */
 			if (ifa != NULL &&
 			    ((struct in6_ifaddr *)ifa)->ia6_flags &
 			    (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|IN6_IFF_DETACHED)) {
 				ifa_free(ifa);
 				return (EADDRNOTAVAIL);
 			}
 			if (ifa != NULL)
 				ifa_free(ifa);
 		}
 		if (lport) {
 			struct inpcb *t;
 			struct tcptw *tw;
 
 			/* GROSS */
 			if (ntohs(lport) <= V_ipport_reservedhigh &&
 			    ntohs(lport) >= V_ipport_reservedlow &&
 			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT,
 			    0))
 				return (EACCES);
 			if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) &&
 			    priv_check_cred(inp->inp_cred,
 			    PRIV_NETINET_REUSEPORT, 0) != 0) {
 				t = in6_pcblookup_local(pcbinfo,
 				    &sin6->sin6_addr, lport,
 				    INPLOOKUP_WILDCARD, cred);
 				if (t &&
 				    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
 				    ((t->inp_flags & INP_TIMEWAIT) == 0) &&
 				    (so->so_type != SOCK_STREAM ||
 				     IN6_IS_ADDR_UNSPECIFIED(&t->in6p_faddr)) &&
 				    (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
 				     !IN6_IS_ADDR_UNSPECIFIED(&t->in6p_laddr) ||
 				     (t->inp_flags2 & INP_REUSEPORT) == 0) &&
 				    (inp->inp_cred->cr_uid !=
 				     t->inp_cred->cr_uid))
 					return (EADDRINUSE);
 
 				/*
 				 * If the socket is a BINDMULTI socket, then
 				 * the credentials need to match and the
 				 * original socket also has to have been bound
 				 * with BINDMULTI.
 				 */
 				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 					return (EADDRINUSE);
 
 #ifdef INET
 				if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 &&
 				    IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 					struct sockaddr_in sin;
 
 					in6_sin6_2_sin(&sin, sin6);
 					t = in_pcblookup_local(pcbinfo,
 					    sin.sin_addr, lport,
 					    INPLOOKUP_WILDCARD, cred);
 					if (t &&
 					    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
 					    ((t->inp_flags &
 					      INP_TIMEWAIT) == 0) &&
 					    (so->so_type != SOCK_STREAM ||
 					     ntohl(t->inp_faddr.s_addr) ==
 					      INADDR_ANY) &&
 					    (inp->inp_cred->cr_uid !=
 					     t->inp_cred->cr_uid))
 						return (EADDRINUSE);
 
 					if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 						return (EADDRINUSE);
 				}
 #endif
 			}
 			t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr,
 			    lport, lookupflags, cred);
 			if (t && (t->inp_flags & INP_TIMEWAIT)) {
 				/*
 				 * XXXRW: If an incpb has had its timewait
 				 * state recycled, we treat the address as
 				 * being in use (for now).  This is better
 				 * than a panic, but not desirable.
 				 */
 				tw = intotw(t);
 				if (tw == NULL ||
 				    (reuseport & tw->tw_so_options) == 0)
 					return (EADDRINUSE);
 			} else if (t && (reuseport & inp_so_options(t)) == 0) {
 				return (EADDRINUSE);
 			}
 #ifdef INET
 			if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 &&
 			    IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 				struct sockaddr_in sin;
 
 				in6_sin6_2_sin(&sin, sin6);
 				t = in_pcblookup_local(pcbinfo, sin.sin_addr,
 				    lport, lookupflags, cred);
 				if (t && t->inp_flags & INP_TIMEWAIT) {
 					tw = intotw(t);
 					if (tw == NULL)
 						return (EADDRINUSE);
 					if ((reuseport & tw->tw_so_options) == 0
 					    && (ntohl(t->inp_laddr.s_addr) !=
 					     INADDR_ANY || ((inp->inp_vflag &
 					     INP_IPV6PROTO) ==
 					     (t->inp_vflag & INP_IPV6PROTO))))
 						return (EADDRINUSE);
 				} else if (t &&
 				    (reuseport & inp_so_options(t)) == 0 &&
 				    (ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
 				    (t->inp_vflag & INP_IPV6PROTO) != 0))
 					return (EADDRINUSE);
 			}
 #endif
 		}
 		inp->in6p_laddr = sin6->sin6_addr;
 	}
 	if (lport == 0) {
 		if ((error = in6_pcbsetport(&inp->in6p_laddr, inp, cred)) != 0) {
 			/* Undo an address bind that may have occurred. */
 			inp->in6p_laddr = in6addr_any;
 			return (error);
 		}
 	} else {
 		inp->inp_lport = lport;
 		if (in_pcbinshash(inp) != 0) {
 			inp->in6p_laddr = in6addr_any;
 			inp->inp_lport = 0;
 			return (EAGAIN);
 		}
 	}
 	return (0);
 }
 
 /*
  *   Transform old in6_pcbconnect() into an inner subroutine for new
  *   in6_pcbconnect(): Do some validity-checking on the remote
  *   address (in mbuf 'nam') and then determine local host address
  *   (i.e., which interface) to use to access that remote host.
  *
  *   This preserves definition of in6_pcbconnect(), while supporting a
  *   slightly different version for T/TCP.  (This is more than
  *   a bit of a kludge, but cleaning up the internal interfaces would
  *   have forced minor changes in every protocol).
  */
 static int
 in6_pcbladdr(register struct inpcb *inp, struct sockaddr *nam,
     struct in6_addr *plocal_addr6)
 {
 	register struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
 	int error = 0;
 	int scope_ambiguous = 0;
 	struct in6_addr in6a;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);	/* XXXRW: why? */
 
 	if (nam->sa_len != sizeof (*sin6))
 		return (EINVAL);
 	if (sin6->sin6_family != AF_INET6)
 		return (EAFNOSUPPORT);
 	if (sin6->sin6_port == 0)
 		return (EADDRNOTAVAIL);
 
 	if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone)
 		scope_ambiguous = 1;
 	if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0)
 		return(error);
 
 	if (!TAILQ_EMPTY(&V_in6_ifaddrhead)) {
 		/*
 		 * If the destination address is UNSPECIFIED addr,
 		 * use the loopback addr, e.g ::1.
 		 */
 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
 			sin6->sin6_addr = in6addr_loopback;
 	}
 	if ((error = prison_remote_ip6(inp->inp_cred, &sin6->sin6_addr)) != 0)
 		return (error);
 
 	error = in6_selectsrc_socket(sin6, inp->in6p_outputopts,
 	    inp, inp->inp_cred, scope_ambiguous, &in6a, NULL);
 	if (error)
 		return (error);
 
 	/*
 	 * Do not update this earlier, in case we return with an error.
 	 *
 	 * XXX: this in6_selectsrc_socket result might replace the bound local
 	 * address with the address specified by setsockopt(IPV6_PKTINFO).
 	 * Is it the intended behavior?
 	 */
 	*plocal_addr6 = in6a;
 
 	/*
 	 * Don't do pcblookup call here; return interface in
 	 * plocal_addr6
 	 * and exit to caller, that will do the lookup.
 	 */
 
 	return (0);
 }
 
 /*
  * Outer subroutine:
  * Connect from a socket to a specified address.
  * Both address and port must be specified in argument sin.
  * If don't have a local address for this socket yet,
  * then pick one.
  */
 int
 in6_pcbconnect_mbuf(register struct inpcb *inp, struct sockaddr *nam,
     struct ucred *cred, struct mbuf *m)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	register struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
 	struct in6_addr addr6;
 	int error;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	/*
 	 * Call inner routine, to assign local interface address.
 	 * in6_pcbladdr() may automatically fill in sin6_scope_id.
 	 */
 	if ((error = in6_pcbladdr(inp, nam, &addr6)) != 0)
 		return (error);
 
 	if (in6_pcblookup_hash_locked(pcbinfo, &sin6->sin6_addr,
 			       sin6->sin6_port,
 			      IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
 			      ? &addr6 : &inp->in6p_laddr,
 			      inp->inp_lport, 0, NULL) != NULL) {
 		return (EADDRINUSE);
 	}
 	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
 		if (inp->inp_lport == 0) {
 			error = in6_pcbbind(inp, (struct sockaddr *)0, cred);
 			if (error)
 				return (error);
 		}
 		inp->in6p_laddr = addr6;
 	}
 	inp->in6p_faddr = sin6->sin6_addr;
 	inp->inp_fport = sin6->sin6_port;
 	/* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
 	inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
 	if (inp->inp_flags & IN6P_AUTOFLOWLABEL)
 		inp->inp_flow |=
 		    (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
 
 	in_pcbrehash_mbuf(inp, m);
 
 	return (0);
 }
 
 int
 in6_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
 {
 
 	return (in6_pcbconnect_mbuf(inp, nam, cred, NULL));
 }
 
 void
 in6_pcbdisconnect(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	bzero((caddr_t)&inp->in6p_faddr, sizeof(inp->in6p_faddr));
 	inp->inp_fport = 0;
 	/* clear flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
 	inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
 	in_pcbrehash(inp);
 }
 
 struct sockaddr *
 in6_sockaddr(in_port_t port, struct in6_addr *addr_p)
 {
 	struct sockaddr_in6 *sin6;
 
 	sin6 = malloc(sizeof *sin6, M_SONAME, M_WAITOK);
 	bzero(sin6, sizeof *sin6);
 	sin6->sin6_family = AF_INET6;
 	sin6->sin6_len = sizeof(*sin6);
 	sin6->sin6_port = port;
 	sin6->sin6_addr = *addr_p;
 	(void)sa6_recoverscope(sin6); /* XXX: should catch errors */
 
 	return (struct sockaddr *)sin6;
 }
 
 struct sockaddr *
 in6_v4mapsin6_sockaddr(in_port_t port, struct in_addr *addr_p)
 {
 	struct sockaddr_in sin;
 	struct sockaddr_in6 *sin6_p;
 
 	bzero(&sin, sizeof sin);
 	sin.sin_family = AF_INET;
 	sin.sin_len = sizeof(sin);
 	sin.sin_port = port;
 	sin.sin_addr = *addr_p;
 
 	sin6_p = malloc(sizeof *sin6_p, M_SONAME,
 		M_WAITOK);
 	in6_sin_2_v4mapsin6(&sin, sin6_p);
 
 	return (struct sockaddr *)sin6_p;
 }
 
 int
 in6_getsockaddr(struct socket *so, struct sockaddr **nam)
 {
 	register struct inpcb *inp;
 	struct in6_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in6_getsockaddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_lport;
 	addr = inp->in6p_laddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in6_sockaddr(port, &addr);
 	return 0;
 }
 
 int
 in6_getpeeraddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in6_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in6_getpeeraddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_fport;
 	addr = inp->in6p_faddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in6_sockaddr(port, &addr);
 	return 0;
 }
 
 int
 in6_mapped_sockaddr(struct socket *so, struct sockaddr **nam)
 {
 	struct	inpcb *inp;
 	int	error;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in6_mapped_sockaddr: inp == NULL"));
 
 #ifdef INET
 	if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) == INP_IPV4) {
 		error = in_getsockaddr(so, nam);
 		if (error == 0)
 			in6_sin_2_v4mapsin6_in_sock(nam);
 	} else
 #endif
 	{
 		/* scope issues will be handled in in6_getsockaddr(). */
 		error = in6_getsockaddr(so, nam);
 	}
 
 	return error;
 }
 
 int
 in6_mapped_peeraddr(struct socket *so, struct sockaddr **nam)
 {
 	struct	inpcb *inp;
 	int	error;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in6_mapped_peeraddr: inp == NULL"));
 
 #ifdef INET
 	if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) == INP_IPV4) {
 		error = in_getpeeraddr(so, nam);
 		if (error == 0)
 			in6_sin_2_v4mapsin6_in_sock(nam);
 	} else
 #endif
 	/* scope issues will be handled in in6_getpeeraddr(). */
 	error = in6_getpeeraddr(so, nam);
 
 	return error;
 }
 
 /*
  * Pass some notification to all connections of a protocol
  * associated with address dst.  The local address and/or port numbers
  * may be specified to limit the search.  The "usual action" will be
  * taken, depending on the ctlinput cmd.  The caller must filter any
  * cmds that are uninteresting (e.g., no error in the map).
  * Call the protocol specific routine (if any) to report
  * any errors for each matching socket.
  */
 void
 in6_pcbnotify(struct inpcbinfo *pcbinfo, struct sockaddr *dst,
     u_int fport_arg, const struct sockaddr *src, u_int lport_arg,
     int cmd, void *cmdarg,
     struct inpcb *(*notify)(struct inpcb *, int))
 {
 	struct inpcb *inp, *inp_temp;
 	struct sockaddr_in6 sa6_src, *sa6_dst;
 	u_short	fport = fport_arg, lport = lport_arg;
 	u_int32_t flowinfo;
 	int errno;
 
 	if ((unsigned)cmd >= PRC_NCMDS || dst->sa_family != AF_INET6)
 		return;
 
 	sa6_dst = (struct sockaddr_in6 *)dst;
 	if (IN6_IS_ADDR_UNSPECIFIED(&sa6_dst->sin6_addr))
 		return;
 
 	/*
 	 * note that src can be NULL when we get notify by local fragmentation.
 	 */
 	sa6_src = (src == NULL) ? sa6_any : *(const struct sockaddr_in6 *)src;
 	flowinfo = sa6_src.sin6_flowinfo;
 
 	/*
 	 * Redirects go to all references to the destination,
 	 * and use in6_rtchange to invalidate the route cache.
 	 * Dead host indications: also use in6_rtchange to invalidate
 	 * the cache, and deliver the error to all the sockets.
 	 * Otherwise, if we have knowledge of the local port and address,
 	 * deliver only to that socket.
 	 */
 	if (PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) {
 		fport = 0;
 		lport = 0;
 		bzero((caddr_t)&sa6_src.sin6_addr, sizeof(sa6_src.sin6_addr));
 
 		if (cmd != PRC_HOSTDEAD)
 			notify = in6_rtchange;
 	}
 	errno = inet6ctlerrmap[cmd];
 	INP_INFO_WLOCK(pcbinfo);
 	LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
 		INP_WLOCK(inp);
 		if ((inp->inp_vflag & INP_IPV6) == 0) {
 			INP_WUNLOCK(inp);
 			continue;
 		}
 
 		/*
 		 * If the error designates a new path MTU for a destination
 		 * and the application (associated with this socket) wanted to
 		 * know the value, notify.
 		 * XXX: should we avoid to notify the value to TCP sockets?
 		 */
 		if (cmd == PRC_MSGSIZE && cmdarg != NULL)
 			ip6_notify_pmtu(inp, (struct sockaddr_in6 *)dst,
 					*(u_int32_t *)cmdarg);
 
 		/*
 		 * Detect if we should notify the error. If no source and
 		 * destination ports are specifed, but non-zero flowinfo and
 		 * local address match, notify the error. This is the case
 		 * when the error is delivered with an encrypted buffer
 		 * by ESP. Otherwise, just compare addresses and ports
 		 * as usual.
 		 */
 		if (lport == 0 && fport == 0 && flowinfo &&
 		    inp->inp_socket != NULL &&
 		    flowinfo == (inp->inp_flow & IPV6_FLOWLABEL_MASK) &&
 		    IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &sa6_src.sin6_addr))
 			goto do_notify;
 		else if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr,
 					     &sa6_dst->sin6_addr) ||
 			 inp->inp_socket == 0 ||
 			 (lport && inp->inp_lport != lport) ||
 			 (!IN6_IS_ADDR_UNSPECIFIED(&sa6_src.sin6_addr) &&
 			  !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr,
 					      &sa6_src.sin6_addr)) ||
 			 (fport && inp->inp_fport != fport)) {
 			INP_WUNLOCK(inp);
 			continue;
 		}
 
 	  do_notify:
 		if (notify) {
 			if ((*notify)(inp, errno))
 				INP_WUNLOCK(inp);
 		} else
 			INP_WUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(pcbinfo);
 }
 
 /*
  * Lookup a PCB based on the local address and port.  Caller must hold the
  * hash lock.  No inpcb locks or references are acquired.
  */
 struct inpcb *
 in6_pcblookup_local(struct inpcbinfo *pcbinfo, struct in6_addr *laddr,
     u_short lport, int lookupflags, struct ucred *cred)
 {
 	register struct inpcb *inp;
 	int matchwild = 3, wildcard;
 
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
 		struct inpcbhead *head;
 		/*
 		 * Look for an unconnected (wildcard foreign addr) PCB that
 		 * matches the local address and port we're looking for.
 		 */
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(
 		    INP6_PCBHASHKEY(&in6addr_any), lport, 0,
 		    pcbinfo->ipi_hashmask)];
 		LIST_FOREACH(inp, head, inp_hash) {
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV6) == 0)
 				continue;
 			if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
 			    IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) &&
 			    inp->inp_lport == lport) {
 				/* Found. */
 				if (cred == NULL ||
 				    prison_equal_ip6(cred->cr_prison,
 					inp->inp_cred->cr_prison))
 					return (inp);
 			}
 		}
 		/*
 		 * Not found.
 		 */
 		return (NULL);
 	} else {
 		struct inpcbporthead *porthash;
 		struct inpcbport *phd;
 		struct inpcb *match = NULL;
 		/*
 		 * Best fit PCB lookup.
 		 *
 		 * First see if this local port is in use by looking on the
 		 * port hash list.
 		 */
 		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
 		    pcbinfo->ipi_porthashmask)];
 		LIST_FOREACH(phd, porthash, phd_hash) {
 			if (phd->phd_port == lport)
 				break;
 		}
 		if (phd != NULL) {
 			/*
 			 * Port is in use by one or more PCBs. Look for best
 			 * fit.
 			 */
 			LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
 				wildcard = 0;
 				if (cred != NULL &&
 				    !prison_equal_ip6(cred->cr_prison,
 					inp->inp_cred->cr_prison))
 					continue;
 				/* XXX inp locking */
 				if ((inp->inp_vflag & INP_IPV6) == 0)
 					continue;
 				if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
 					wildcard++;
 				if (!IN6_IS_ADDR_UNSPECIFIED(
 					&inp->in6p_laddr)) {
 					if (IN6_IS_ADDR_UNSPECIFIED(laddr))
 						wildcard++;
 					else if (!IN6_ARE_ADDR_EQUAL(
 					    &inp->in6p_laddr, laddr))
 						continue;
 				} else {
 					if (!IN6_IS_ADDR_UNSPECIFIED(laddr))
 						wildcard++;
 				}
 				if (wildcard < matchwild) {
 					match = inp;
 					matchwild = wildcard;
 					if (matchwild == 0)
 						break;
 				}
 			}
 		}
 		return (match);
 	}
 }
 
 void
 in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
 {
 	struct inpcb *in6p;
 	struct ip6_moptions *im6o;
 	int i, gap;
 
 	INP_INFO_WLOCK(pcbinfo);
 	LIST_FOREACH(in6p, pcbinfo->ipi_listhead, inp_list) {
 		INP_WLOCK(in6p);
 		im6o = in6p->in6p_moptions;
 		if ((in6p->inp_vflag & INP_IPV6) && im6o != NULL) {
 			/*
 			 * Unselect the outgoing ifp for multicast if it
 			 * is being detached.
 			 */
 			if (im6o->im6o_multicast_ifp == ifp)
 				im6o->im6o_multicast_ifp = NULL;
 			/*
 			 * Drop multicast group membership if we joined
 			 * through the interface being detached.
 			 */
 			gap = 0;
 			for (i = 0; i < im6o->im6o_num_memberships; i++) {
 				if (im6o->im6o_membership[i]->in6m_ifp ==
 				    ifp) {
 					in6_mc_leave(im6o->im6o_membership[i],
 					    NULL);
 					gap++;
 				} else if (gap != 0) {
 					im6o->im6o_membership[i - gap] =
 					    im6o->im6o_membership[i];
 				}
 			}
 			im6o->im6o_num_memberships -= gap;
 		}
 		INP_WUNLOCK(in6p);
 	}
 	INP_INFO_WUNLOCK(pcbinfo);
 }
 
 /*
  * Check for alternatives when higher level complains
  * about service problems.  For now, invalidate cached
  * routing information.  If the route was created dynamically
  * (by a redirect), time to try a default gateway again.
  */
 void
 in6_losing(struct inpcb *in6p)
 {
 
-	/*
-	 * We don't store route pointers in the routing table anymore
-	 */
+	if (in6p->inp_route6.ro_rt) {
+		RTFREE(in6p->inp_route6.ro_rt);
+		in6p->inp_route6.ro_rt = (struct rtentry *)NULL;
+	}
 	return;
 }
 
 /*
  * After a routing change, flush old routing
  * and allocate a (hopefully) better one.
  */
 struct inpcb *
 in6_rtchange(struct inpcb *inp, int errno)
 {
-	/*
-	 * We don't store route pointers in the routing table anymore
-	 */
+
+	if (inp->inp_route6.ro_rt) {
+		RTFREE(inp->inp_route6.ro_rt);
+		inp->inp_route6.ro_rt = (struct rtentry *)NULL;
+	}
 	return inp;
 }
 
 #ifdef PCBGROUP
 /*
  * Lookup PCB in hash list, using pcbgroup tables.
  */
 static struct inpcb *
 in6_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
     struct in6_addr *faddr, u_int fport_arg, struct in6_addr *laddr,
     u_int lport_arg, int lookupflags, struct ifnet *ifp)
 {
 	struct inpcbhead *head;
 	struct inpcb *inp, *tmpinp;
 	u_short fport = fport_arg, lport = lport_arg;
 
 	/*
 	 * First look for an exact match.
 	 */
 	tmpinp = NULL;
 	INP_GROUP_LOCK(pcbgroup);
 	head = &pcbgroup->ipg_hashbase[INP_PCBHASH(
 	    INP6_PCBHASHKEY(faddr), lport, fport, pcbgroup->ipg_hashmask)];
 	LIST_FOREACH(inp, head, inp_pcbgrouphash) {
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV6) == 0)
 			continue;
 		if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) &&
 		    IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) &&
 		    inp->inp_fport == fport &&
 		    inp->inp_lport == lport) {
 			/*
 			 * XXX We should be able to directly return
 			 * the inp here, without any checks.
 			 * Well unless both bound with SO_REUSEPORT?
 			 */
 			if (prison_flag(inp->inp_cred, PR_IP6))
 				goto found;
 			if (tmpinp == NULL)
 				tmpinp = inp;
 		}
 	}
 	if (tmpinp != NULL) {
 		inp = tmpinp;
 		goto found;
 	}
 
 	/*
 	 * Then look for a wildcard match in the pcbgroup.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 		struct inpcb *jail_wild = NULL;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 		head = &pcbgroup->ipg_hashbase[
 		    INP_PCBHASH(INADDR_ANY, lport, 0, pcbgroup->ipg_hashmask)];
 		LIST_FOREACH(inp, head, inp_pcbgrouphash) {
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV6) == 0)
 				continue;
 
 			if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) ||
 			    inp->inp_lport != lport) {
 				continue;
 			}
 
 			injail = prison_flag(inp->inp_cred, PR_IP6);
 			if (injail) {
 				if (prison_check_ip6(inp->inp_cred,
 				    laddr) != 0)
 					continue;
 			} else {
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) {
 				if (injail)
 					goto found;
 				else
 					local_exact = inp;
 			} else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
 				if (injail)
 					jail_wild = inp;
 				else
 					local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 
 		inp = jail_wild;
 		if (inp == NULL)
 			inp = jail_wild;
 		if (inp == NULL)
 			inp = local_exact;
 		if (inp == NULL)
 			inp = local_wild;
 		if (inp != NULL)
 			goto found;
 	}
 
 	/*
 	 * Then look for a wildcard match, if requested.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 		struct inpcb *jail_wild = NULL;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 		head = &pcbinfo->ipi_wildbase[INP_PCBHASH(
 		    INP6_PCBHASHKEY(&in6addr_any), lport, 0,
 		    pcbinfo->ipi_wildmask)];
 		LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV6) == 0)
 				continue;
 
 			if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) ||
 			    inp->inp_lport != lport) {
 				continue;
 			}
 
 			injail = prison_flag(inp->inp_cred, PR_IP6);
 			if (injail) {
 				if (prison_check_ip6(inp->inp_cred,
 				    laddr) != 0)
 					continue;
 			} else {
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) {
 				if (injail)
 					goto found;
 				else
 					local_exact = inp;
 			} else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
 				if (injail)
 					jail_wild = inp;
 				else
 					local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 
 		inp = jail_wild;
 		if (inp == NULL)
 			inp = jail_wild;
 		if (inp == NULL)
 			inp = local_exact;
 		if (inp == NULL)
 			inp = local_wild;
 		if (inp != NULL)
 			goto found;
 	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
 	INP_GROUP_UNLOCK(pcbgroup);
 	return (NULL);
 
 found:
 	in_pcbref(inp);
 	INP_GROUP_UNLOCK(pcbgroup);
 	if (lookupflags & INPLOOKUP_WLOCKPCB) {
 		INP_WLOCK(inp);
 		if (in_pcbrele_wlocked(inp))
 			return (NULL);
 	} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
 		INP_RLOCK(inp);
 		if (in_pcbrele_rlocked(inp))
 			return (NULL);
 	} else
 		panic("%s: locking buf", __func__);
 	return (inp);
 }
 #endif /* PCBGROUP */
 
 /*
  * Lookup PCB in hash list.
  */
 static struct inpcb *
 in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
     u_int fport_arg, struct in6_addr *laddr, u_int lport_arg,
     int lookupflags, struct ifnet *ifp)
 {
 	struct inpcbhead *head;
 	struct inpcb *inp, *tmpinp;
 	u_short fport = fport_arg, lport = lport_arg;
 
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	/*
 	 * First look for an exact match.
 	 */
 	tmpinp = NULL;
 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(
 	    INP6_PCBHASHKEY(faddr), lport, fport, pcbinfo->ipi_hashmask)];
 	LIST_FOREACH(inp, head, inp_hash) {
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV6) == 0)
 			continue;
 		if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) &&
 		    IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) &&
 		    inp->inp_fport == fport &&
 		    inp->inp_lport == lport) {
 			/*
 			 * XXX We should be able to directly return
 			 * the inp here, without any checks.
 			 * Well unless both bound with SO_REUSEPORT?
 			 */
 			if (prison_flag(inp->inp_cred, PR_IP6))
 				return (inp);
 			if (tmpinp == NULL)
 				tmpinp = inp;
 		}
 	}
 	if (tmpinp != NULL)
 		return (tmpinp);
 
 	/*
 	 * Then look for a wildcard match, if requested.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 		struct inpcb *jail_wild = NULL;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(
 		    INP6_PCBHASHKEY(&in6addr_any), lport, 0,
 		    pcbinfo->ipi_hashmask)];
 		LIST_FOREACH(inp, head, inp_hash) {
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV6) == 0)
 				continue;
 
 			if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) ||
 			    inp->inp_lport != lport) {
 				continue;
 			}
 
 			injail = prison_flag(inp->inp_cred, PR_IP6);
 			if (injail) {
 				if (prison_check_ip6(inp->inp_cred,
 				    laddr) != 0)
 					continue;
 			} else {
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) {
 				if (injail)
 					return (inp);
 				else
 					local_exact = inp;
 			} else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
 				if (injail)
 					jail_wild = inp;
 				else
 					local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 
 		if (jail_wild != NULL)
 			return (jail_wild);
 		if (local_exact != NULL)
 			return (local_exact);
 		if (local_wild != NULL)
 			return (local_wild);
 	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
 
 	/*
 	 * Not found.
 	 */
 	return (NULL);
 }
 
 /*
  * Lookup PCB in hash list, using pcbinfo tables.  This variation locks the
  * hash list lock, and will return the inpcb locked (i.e., requires
  * INPLOOKUP_LOCKPCB).
  */
 static struct inpcb *
 in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
     u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags,
     struct ifnet *ifp)
 {
 	struct inpcb *inp;
 
 	INP_HASH_RLOCK(pcbinfo);
 	inp = in6_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
 	    (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp);
 	if (inp != NULL) {
 		in_pcbref(inp);
 		INP_HASH_RUNLOCK(pcbinfo);
 		if (lookupflags & INPLOOKUP_WLOCKPCB) {
 			INP_WLOCK(inp);
 			if (in_pcbrele_wlocked(inp))
 				return (NULL);
 		} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
 			INP_RLOCK(inp);
 			if (in_pcbrele_rlocked(inp))
 				return (NULL);
 		} else
 			panic("%s: locking bug", __func__);
 	} else
 		INP_HASH_RUNLOCK(pcbinfo);
 	return (inp);
 }
 
 /*
  * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
  * from which a pre-calculated hash value may be extracted.
  *
  * Possibly more of this logic should be in in6_pcbgroup.c.
  */
 struct inpcb *
 in6_pcblookup(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport,
     struct in6_addr *laddr, u_int lport, int lookupflags, struct ifnet *ifp)
 {
 #if defined(PCBGROUP) && !defined(RSS)
 	struct inpcbgroup *pcbgroup;
 #endif
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 	/*
 	 * When not using RSS, use connection groups in preference to the
 	 * reservation table when looking up 4-tuples.  When using RSS, just
 	 * use the reservation table, due to the cost of the Toeplitz hash
 	 * in software.
 	 *
 	 * XXXRW: This policy belongs in the pcbgroup code, as in principle
 	 * we could be doing RSS with a non-Toeplitz hash that is affordable
 	 * in software.
 	 */
 #if defined(PCBGROUP) && !defined(RSS)
 	if (in_pcbgroup_enabled(pcbinfo)) {
 		pcbgroup = in6_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
 		    fport);
 		return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
 		    laddr, lport, lookupflags, ifp));
 	}
 #endif
 	return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp));
 }
 
 struct inpcb *
 in6_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
     u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags,
     struct ifnet *ifp, struct mbuf *m)
 {
 #ifdef PCBGROUP
 	struct inpcbgroup *pcbgroup;
 #endif
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 #ifdef PCBGROUP
 	/*
 	 * If we can use a hardware-generated hash to look up the connection
 	 * group, use that connection group to find the inpcb.  Otherwise
 	 * fall back on a software hash -- or the reservation table if we're
 	 * using RSS.
 	 *
 	 * XXXRW: As above, that policy belongs in the pcbgroup code.
 	 */
 	if (in_pcbgroup_enabled(pcbinfo) &&
 	    M_HASHTYPE_TEST(m, M_HASHTYPE_NONE) == 0) {
 		pcbgroup = in6_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
 		    m->m_pkthdr.flowid);
 		if (pcbgroup != NULL)
 			return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr,
 			    fport, laddr, lport, lookupflags, ifp));
 #ifndef RSS
 		pcbgroup = in6_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
 		    fport);
 		return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
 		    laddr, lport, lookupflags, ifp));
 #endif
 	}
 #endif
 	return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp));
 }
 
 void
 init_sin6(struct sockaddr_in6 *sin6, struct mbuf *m)
 {
 	struct ip6_hdr *ip;
 
 	ip = mtod(m, struct ip6_hdr *);
 	bzero(sin6, sizeof(*sin6));
 	sin6->sin6_len = sizeof(*sin6);
 	sin6->sin6_family = AF_INET6;
 	sin6->sin6_addr = ip->ip6_src;
 
 	(void)sa6_recoverscope(sin6); /* XXX: should catch errors... */
 
 	return;
 }
Index: head/sys/netinet6/ip6_output.c
===================================================================
--- head/sys/netinet6/ip6_output.c	(revision 297224)
+++ head/sys/netinet6/ip6_output.c	(revision 297225)
@@ -1,3062 +1,3075 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipfw.h"
 #include "opt_ipsec.h"
 #include "opt_sctp.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/errno.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syslog.h>
 #include <sys/ucred.h>
 
 #include <machine/in_cksum.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/pfil.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/in6_rss.h>
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/ipsec6.h>
 #include <netipsec/key.h>
 #include <netinet6/ip6_ipsec.h>
 #endif /* IPSEC */
 #ifdef SCTP
 #include <netinet/sctp.h>
 #include <netinet/sctp_crc32.h>
 #endif
 
 #include <netinet6/ip6protosw.h>
 #include <netinet6/scope6_var.h>
 
 #ifdef FLOWTABLE
 #include <net/flowtable.h>
 #endif
 
 extern int in6_mcast_loop;
 
 struct ip6_exthdrs {
 	struct mbuf *ip6e_ip6;
 	struct mbuf *ip6e_hbh;
 	struct mbuf *ip6e_dest1;
 	struct mbuf *ip6e_rthdr;
 	struct mbuf *ip6e_dest2;
 };
 
 static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **,
 			   struct ucred *, int);
 static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *,
 	struct socket *, struct sockopt *);
 static int ip6_getpcbopt(struct ip6_pktopts *, int, struct sockopt *);
 static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *,
 	struct ucred *, int, int, int);
 
 static int ip6_copyexthdr(struct mbuf **, caddr_t, int);
 static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int,
 	struct ip6_frag **);
 static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
 static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);
 static int ip6_getpmtu(struct route_in6 *, int,
 	struct ifnet *, struct in6_addr *, u_long *, int *, u_int);
 static int ip6_calcmtu(struct ifnet *, const struct in6_addr *, u_long,
 	u_long *, int *);
 static int ip6_getpmtu_ctl(u_int, struct in6_addr *, u_long *);
 static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);
 
 
 /*
  * Make an extension header from option data.  hp is the source, and
  * mp is the destination.
  */
 #define MAKE_EXTHDR(hp, mp)						\
     do {								\
 	if (hp) {							\
 		struct ip6_ext *eh = (struct ip6_ext *)(hp);		\
 		error = ip6_copyexthdr((mp), (caddr_t)(hp),		\
 		    ((eh)->ip6e_len + 1) << 3);				\
 		if (error)						\
 			goto freehdrs;					\
 	}								\
     } while (/*CONSTCOND*/ 0)
 
 /*
  * Form a chain of extension headers.
  * m is the extension header mbuf
  * mp is the previous mbuf in the chain
  * p is the next header
  * i is the type of option.
  */
 #define MAKE_CHAIN(m, mp, p, i)\
     do {\
 	if (m) {\
 		if (!hdrsplit) \
 			panic("assumption failed: hdr not split"); \
 		*mtod((m), u_char *) = *(p);\
 		*(p) = (i);\
 		p = mtod((m), u_char *);\
 		(m)->m_next = (mp)->m_next;\
 		(mp)->m_next = (m);\
 		(mp) = (m);\
 	}\
     } while (/*CONSTCOND*/ 0)
 
 void
 in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset)
 {
 	u_short csum;
 
 	csum = in_cksum_skip(m, offset + plen, offset);
 	if (m->m_pkthdr.csum_flags & CSUM_UDP_IPV6 && csum == 0)
 		csum = 0xffff;
 	offset += m->m_pkthdr.csum_data;	/* checksum offset */
 
 	if (offset + sizeof(u_short) > m->m_len) {
 		printf("%s: delayed m_pullup, m->len: %d plen %u off %u "
 		    "csum_flags=%b\n", __func__, m->m_len, plen, offset,
 		    (int)m->m_pkthdr.csum_flags, CSUM_BITS);
 		/*
 		 * XXX this should not happen, but if it does, the correct
 		 * behavior may be to insert the checksum in the appropriate
 		 * next mbuf in the chain.
 		 */
 		return;
 	}
 	*(u_short *)(m->m_data + offset) = csum;
 }
 
 int
 ip6_fragment(struct ifnet *ifp, struct mbuf *m0, int hlen, u_char nextproto,
     int mtu, uint32_t id)
 {
 	struct mbuf *m, **mnext, *m_frgpart;
 	struct ip6_hdr *ip6, *mhip6;
 	struct ip6_frag *ip6f;
 	int off;
 	int error;
 	int tlen = m0->m_pkthdr.len;
 
 	m = m0;
 	ip6 = mtod(m, struct ip6_hdr *);
 	mnext = &m->m_nextpkt;
 
 	for (off = hlen; off < tlen; off += mtu) {
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (!m) {
 			IP6STAT_INC(ip6s_odropped);
 			return (ENOBUFS);
 		}
 		m->m_flags = m0->m_flags & M_COPYFLAGS;
 		*mnext = m;
 		mnext = &m->m_nextpkt;
 		m->m_data += max_linkhdr;
 		mhip6 = mtod(m, struct ip6_hdr *);
 		*mhip6 = *ip6;
 		m->m_len = sizeof(*mhip6);
 		error = ip6_insertfraghdr(m0, m, hlen, &ip6f);
 		if (error) {
 			IP6STAT_INC(ip6s_odropped);
 			return (error);
 		}
 		ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7));
 		if (off + mtu >= tlen)
 			mtu = tlen - off;
 		else
 			ip6f->ip6f_offlg |= IP6F_MORE_FRAG;
 		mhip6->ip6_plen = htons((u_short)(mtu + hlen +
 		    sizeof(*ip6f) - sizeof(struct ip6_hdr)));
 		if ((m_frgpart = m_copy(m0, off, mtu)) == 0) {
 			IP6STAT_INC(ip6s_odropped);
 			return (ENOBUFS);
 		}
 		m_cat(m, m_frgpart);
 		m->m_pkthdr.len = mtu + hlen + sizeof(*ip6f);
 		m->m_pkthdr.fibnum = m0->m_pkthdr.fibnum;
 		m->m_pkthdr.rcvif = NULL;
 		ip6f->ip6f_reserved = 0;
 		ip6f->ip6f_ident = id;
 		ip6f->ip6f_nxt = nextproto;
 		IP6STAT_INC(ip6s_ofragments);
 		in6_ifstat_inc(ifp, ifs6_out_fragcreat);
 	}
 
 	return (0);
 }
 
 /*
  * IP6 output. The packet in mbuf chain m contains a skeletal IP6
  * header (with pri, len, nxt, hlim, src, dst).
  * This function may modify ver and hlim only.
  * The mbuf chain containing the packet will be freed.
  * The mbuf opt, if present, will not be freed.
  * If route_in6 ro is present and has ro_rt initialized, route lookup would be
  * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL,
  * then result of route lookup is stored in ro->ro_rt.
  *
  * type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int, and
  * nd_ifinfo.linkmtu is u_int32_t.  so we use u_long to hold largest one,
  * which is rt_mtu.
  *
  * ifpp - XXX: just for statistics
  */
 /*
  * XXX TODO: no flowid is assigned for outbound flows?
  */
 int
 ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
     struct route_in6 *ro, int flags, struct ip6_moptions *im6o,
     struct ifnet **ifpp, struct inpcb *inp)
 {
 	struct ip6_hdr *ip6;
 	struct ifnet *ifp, *origifp;
 	struct mbuf *m = m0;
 	struct mbuf *mprev = NULL;
 	int hlen, tlen, len;
 	struct route_in6 ip6route;
 	struct rtentry *rt = NULL;
 	struct sockaddr_in6 *dst, src_sa, dst_sa;
 	struct in6_addr odst;
 	int error = 0;
 	struct in6_ifaddr *ia = NULL;
 	u_long mtu;
 	int alwaysfrag, dontfrag;
 	u_int32_t optlen = 0, plen = 0, unfragpartlen = 0;
 	struct ip6_exthdrs exthdrs;
 	struct in6_addr finaldst, src0, dst0;
 	u_int32_t zone;
 	struct route_in6 *ro_pmtu = NULL;
 	int hdrsplit = 0;
 	int sw_csum, tso;
 	int needfiblookup;
 	uint32_t fibnum;
 	struct m_tag *fwd_tag = NULL;
 	uint32_t id;
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (ip6 == NULL) {
 		printf ("ip6 is NULL");
 		goto bad;
 	}
 
 	if (inp != NULL) {
 		M_SETFIB(m, inp->inp_inc.inc_fibnum);
 		if ((flags & IP_NODEFAULTFLOWID) == 0) {
 			/* unconditionally set flowid */
 			m->m_pkthdr.flowid = inp->inp_flowid;
 			M_HASHTYPE_SET(m, inp->inp_flowtype);
 		}
 	}
 
 	finaldst = ip6->ip6_dst;
 	bzero(&exthdrs, sizeof(exthdrs));
 	if (opt) {
 		/* Hop-by-Hop options header */
 		MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh);
 		/* Destination options header(1st part) */
 		if (opt->ip6po_rthdr) {
 			/*
 			 * Destination options header(1st part)
 			 * This only makes sense with a routing header.
 			 * See Section 9.2 of RFC 3542.
 			 * Disabling this part just for MIP6 convenience is
 			 * a bad idea.  We need to think carefully about a
 			 * way to make the advanced API coexist with MIP6
 			 * options, which might automatically be inserted in
 			 * the kernel.
 			 */
 			MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1);
 		}
 		/* Routing header */
 		MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr);
 		/* Destination options header(2nd part) */
 		MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2);
 	}
 
 #ifdef IPSEC
 	/*
 	 * IPSec checking which handles several cases.
 	 * FAST IPSEC: We re-injected the packet.
 	 * XXX: need scope argument.
 	 */
 	switch(ip6_ipsec_output(&m, inp, &error))
 	{
 	case 1:                 /* Bad packet */
 		goto freehdrs;
 	case -1:                /* IPSec done */
 		goto done;
 	case 0:                 /* No IPSec */
 	default:
 		break;
 	}
 #endif /* IPSEC */
 
 	/*
 	 * Calculate the total length of the extension header chain.
 	 * Keep the length of the unfragmentable part for fragmentation.
 	 */
 	optlen = 0;
 	if (exthdrs.ip6e_hbh)
 		optlen += exthdrs.ip6e_hbh->m_len;
 	if (exthdrs.ip6e_dest1)
 		optlen += exthdrs.ip6e_dest1->m_len;
 	if (exthdrs.ip6e_rthdr)
 		optlen += exthdrs.ip6e_rthdr->m_len;
 	unfragpartlen = optlen + sizeof(struct ip6_hdr);
 
 	/* NOTE: we don't add AH/ESP length here (done in ip6_ipsec_output) */
 	if (exthdrs.ip6e_dest2)
 		optlen += exthdrs.ip6e_dest2->m_len;
 
 	/*
 	 * If there is at least one extension header,
 	 * separate IP6 header from the payload.
 	 */
 	if (optlen && !hdrsplit) {
 		if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
 			m = NULL;
 			goto freehdrs;
 		}
 		m = exthdrs.ip6e_ip6;
 		hdrsplit++;
 	}
 
 	/* adjust pointer */
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/* adjust mbuf packet header length */
 	m->m_pkthdr.len += optlen;
 	plen = m->m_pkthdr.len - sizeof(*ip6);
 
 	/* If this is a jumbo payload, insert a jumbo payload option. */
 	if (plen > IPV6_MAXPACKET) {
 		if (!hdrsplit) {
 			if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
 				m = NULL;
 				goto freehdrs;
 			}
 			m = exthdrs.ip6e_ip6;
 			hdrsplit++;
 		}
 		/* adjust pointer */
 		ip6 = mtod(m, struct ip6_hdr *);
 		if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0)
 			goto freehdrs;
 		ip6->ip6_plen = 0;
 	} else
 		ip6->ip6_plen = htons(plen);
 
 	/*
 	 * Concatenate headers and fill in next header fields.
 	 * Here we have, on "m"
 	 *	IPv6 payload
 	 * and we insert headers accordingly.  Finally, we should be getting:
 	 *	IPv6 hbh dest1 rthdr ah* [esp* dest2 payload]
 	 *
 	 * during the header composing process, "m" points to IPv6 header.
 	 * "mprev" points to an extension header prior to esp.
 	 */
 	u_char *nexthdrp = &ip6->ip6_nxt;
 	mprev = m;
 
 	/*
 	 * we treat dest2 specially.  this makes IPsec processing
 	 * much easier.  the goal here is to make mprev point the
 	 * mbuf prior to dest2.
 	 *
 	 * result: IPv6 dest2 payload
 	 * m and mprev will point to IPv6 header.
 	 */
 	if (exthdrs.ip6e_dest2) {
 		if (!hdrsplit)
 			panic("assumption failed: hdr not split");
 		exthdrs.ip6e_dest2->m_next = m->m_next;
 		m->m_next = exthdrs.ip6e_dest2;
 		*mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt;
 		ip6->ip6_nxt = IPPROTO_DSTOPTS;
 	}
 
 	/*
 	 * result: IPv6 hbh dest1 rthdr dest2 payload
 	 * m will point to IPv6 header.  mprev will point to the
 	 * extension header prior to dest2 (rthdr in the above case).
 	 */
 	MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS);
 	MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp,
 		   IPPROTO_DSTOPTS);
 	MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp,
 		   IPPROTO_ROUTING);
 
 	/*
 	 * If there is a routing header, discard the packet.
 	 */
 	if (exthdrs.ip6e_rthdr) {
 		 error = EINVAL;
 		 goto bad;
 	}
 
 	/* Source address validation */
 	if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) &&
 	    (flags & IPV6_UNSPECSRC) == 0) {
 		error = EOPNOTSUPP;
 		IP6STAT_INC(ip6s_badscope);
 		goto bad;
 	}
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
 		error = EOPNOTSUPP;
 		IP6STAT_INC(ip6s_badscope);
 		goto bad;
 	}
 
 	IP6STAT_INC(ip6s_localout);
 
 	/*
 	 * Route packet.
 	 */
 	if (ro == 0) {
 		ro = &ip6route;
 		bzero((caddr_t)ro, sizeof(*ro));
 	}
 	ro_pmtu = ro;
 	if (opt && opt->ip6po_rthdr)
 		ro = &opt->ip6po_route;
 	dst = (struct sockaddr_in6 *)&ro->ro_dst;
 #ifdef FLOWTABLE
 	if (ro->ro_rt == NULL)
 		(void )flowtable_lookup(AF_INET6, m, (struct route *)ro);
 #endif
 	fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m);
 again:
 	/*
 	 * if specified, try to fill in the traffic class field.
 	 * do not override if a non-zero value is already set.
 	 * we check the diffserv field and the ecn field separately.
 	 */
 	if (opt && opt->ip6po_tclass >= 0) {
 		int mask = 0;
 
 		if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0)
 			mask |= 0xfc;
 		if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0)
 			mask |= 0x03;
 		if (mask != 0)
 			ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20);
 	}
 
 	/* fill in or override the hop limit field, if necessary. */
 	if (opt && opt->ip6po_hlim != -1)
 		ip6->ip6_hlim = opt->ip6po_hlim & 0xff;
 	else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		if (im6o != NULL)
 			ip6->ip6_hlim = im6o->im6o_multicast_hlim;
 		else
 			ip6->ip6_hlim = V_ip6_defmcasthlim;
 	}
 
 	/* adjust pointer */
 	ip6 = mtod(m, struct ip6_hdr *);
 
-	if (ro->ro_rt && fwd_tag == NULL) {
+	/*
+	 * Validate route against routing table additions;
+	 * a better/more specific route might have been added.
+	 * Make sure address family is set in route.
+	 */
+	if (inp) {
+		ro->ro_dst.sin6_family = AF_INET6;
+		RT_VALIDATE((struct route *)ro, &inp->inp_rt_cookie, fibnum);
+	}
+	if (ro->ro_rt && fwd_tag == NULL && (ro->ro_rt->rt_flags & RTF_UP) &&
+	    ro->ro_dst.sin6_family == AF_INET6 &&
+	    IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst)) {
 		rt = ro->ro_rt;
 		ifp = ro->ro_rt->rt_ifp;
 	} else {
 		if (fwd_tag == NULL) {
 			bzero(&dst_sa, sizeof(dst_sa));
 			dst_sa.sin6_family = AF_INET6;
 			dst_sa.sin6_len = sizeof(dst_sa);
 			dst_sa.sin6_addr = ip6->ip6_dst;
 		}
 		error = in6_selectroute_fib(&dst_sa, opt, im6o, ro, &ifp,
 		    &rt, fibnum);
 		if (error != 0) {
 			if (ifp != NULL)
 				in6_ifstat_inc(ifp, ifs6_out_discard);
 			goto bad;
 		}
 	}
 	if (rt == NULL) {
 		/*
 		 * If in6_selectroute() does not return a route entry,
 		 * dst may not have been updated.
 		 */
 		*dst = dst_sa;	/* XXX */
 	}
 
 	/*
 	 * then rt (for unicast) and ifp must be non-NULL valid values.
 	 */
 	if ((flags & IPV6_FORWARDING) == 0) {
 		/* XXX: the FORWARDING flag can be set for mrouting. */
 		in6_ifstat_inc(ifp, ifs6_out_request);
 	}
 	if (rt != NULL) {
 		ia = (struct in6_ifaddr *)(rt->rt_ifa);
 		counter_u64_add(rt->rt_pksent, 1);
 	}
 
 
 	/*
 	 * The outgoing interface must be in the zone of source and
 	 * destination addresses.
 	 */
 	origifp = ifp;
 
 	src0 = ip6->ip6_src;
 	if (in6_setscope(&src0, origifp, &zone))
 		goto badscope;
 	bzero(&src_sa, sizeof(src_sa));
 	src_sa.sin6_family = AF_INET6;
 	src_sa.sin6_len = sizeof(src_sa);
 	src_sa.sin6_addr = ip6->ip6_src;
 	if (sa6_recoverscope(&src_sa) || zone != src_sa.sin6_scope_id)
 		goto badscope;
 
 	dst0 = ip6->ip6_dst;
 	if (in6_setscope(&dst0, origifp, &zone))
 		goto badscope;
 	/* re-initialize to be sure */
 	bzero(&dst_sa, sizeof(dst_sa));
 	dst_sa.sin6_family = AF_INET6;
 	dst_sa.sin6_len = sizeof(dst_sa);
 	dst_sa.sin6_addr = ip6->ip6_dst;
 	if (sa6_recoverscope(&dst_sa) || zone != dst_sa.sin6_scope_id) {
 		goto badscope;
 	}
 
 	/* We should use ia_ifp to support the case of
 	 * sending packets to an address of our own.
 	 */
 	if (ia != NULL && ia->ia_ifp)
 		ifp = ia->ia_ifp;
 
 	/* scope check is done. */
 	goto routefound;
 
   badscope:
 	IP6STAT_INC(ip6s_badscope);
 	in6_ifstat_inc(origifp, ifs6_out_discard);
 	if (error == 0)
 		error = EHOSTUNREACH; /* XXX */
 	goto bad;
 
   routefound:
 	if (rt && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		if (opt && opt->ip6po_nextroute.ro_rt) {
 			/*
 			 * The nexthop is explicitly specified by the
 			 * application.  We assume the next hop is an IPv6
 			 * address.
 			 */
 			dst = (struct sockaddr_in6 *)opt->ip6po_nexthop;
 		}
 		else if ((rt->rt_flags & RTF_GATEWAY))
 			dst = (struct sockaddr_in6 *)rt->rt_gateway;
 	}
 
 	if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */
 	} else {
 		m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST;
 		in6_ifstat_inc(ifp, ifs6_out_mcast);
 		/*
 		 * Confirm that the outgoing interface supports multicast.
 		 */
 		if (!(ifp->if_flags & IFF_MULTICAST)) {
 			IP6STAT_INC(ip6s_noroute);
 			in6_ifstat_inc(ifp, ifs6_out_discard);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		if ((im6o == NULL && in6_mcast_loop) ||
 		    (im6o && im6o->im6o_multicast_loop)) {
 			/*
 			 * Loop back multicast datagram if not expressly
 			 * forbidden to do so, even if we have not joined
 			 * the address; protocols will filter it later,
 			 * thus deferring a hash lookup and lock acquisition
 			 * at the expense of an m_copym().
 			 */
 			ip6_mloopback(ifp, m);
 		} else {
 			/*
 			 * If we are acting as a multicast router, perform
 			 * multicast forwarding as if the packet had just
 			 * arrived on the interface to which we are about
 			 * to send.  The multicast forwarding function
 			 * recursively calls this function, using the
 			 * IPV6_FORWARDING flag to prevent infinite recursion.
 			 *
 			 * Multicasts that are looped back by ip6_mloopback(),
 			 * above, will be forwarded by the ip6_input() routine,
 			 * if necessary.
 			 */
 			if (V_ip6_mrouter && (flags & IPV6_FORWARDING) == 0) {
 				/*
 				 * XXX: ip6_mforward expects that rcvif is NULL
 				 * when it is called from the originating path.
 				 * However, it may not always be the case.
 				 */
 				m->m_pkthdr.rcvif = NULL;
 				if (ip6_mforward(ip6, ifp, m) != 0) {
 					m_freem(m);
 					goto done;
 				}
 			}
 		}
 		/*
 		 * Multicasts with a hoplimit of zero may be looped back,
 		 * above, but must not be transmitted on a network.
 		 * Also, multicasts addressed to the loopback interface
 		 * are not sent -- the above call to ip6_mloopback() will
 		 * loop back a copy if this host actually belongs to the
 		 * destination group on the loopback interface.
 		 */
 		if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) ||
 		    IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) {
 			m_freem(m);
 			goto done;
 		}
 	}
 
 	/*
 	 * Fill the outgoing inteface to tell the upper layer
 	 * to increment per-interface statistics.
 	 */
 	if (ifpp)
 		*ifpp = ifp;
 
 	/* Determine path MTU. */
 	if ((error = ip6_getpmtu(ro_pmtu, ro != ro_pmtu, ifp, &finaldst, &mtu,
 	    &alwaysfrag, fibnum)) != 0)
 		goto bad;
 
 	/*
 	 * The caller of this function may specify to use the minimum MTU
 	 * in some cases.
 	 * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU
 	 * setting.  The logic is a bit complicated; by default, unicast
 	 * packets will follow path MTU while multicast packets will be sent at
 	 * the minimum MTU.  If IP6PO_MINMTU_ALL is specified, all packets
 	 * including unicast ones will be sent at the minimum MTU.  Multicast
 	 * packets will always be sent at the minimum MTU unless
 	 * IP6PO_MINMTU_DISABLE is explicitly specified.
 	 * See RFC 3542 for more details.
 	 */
 	if (mtu > IPV6_MMTU) {
 		if ((flags & IPV6_MINMTU))
 			mtu = IPV6_MMTU;
 		else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL)
 			mtu = IPV6_MMTU;
 		else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
 			 (opt == NULL ||
 			  opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) {
 			mtu = IPV6_MMTU;
 		}
 	}
 
 	/*
 	 * clear embedded scope identifiers if necessary.
 	 * in6_clearscope will touch the addresses only when necessary.
 	 */
 	in6_clearscope(&ip6->ip6_src);
 	in6_clearscope(&ip6->ip6_dst);
 
 	/*
 	 * If the outgoing packet contains a hop-by-hop options header,
 	 * it must be examined and processed even by the source node.
 	 * (RFC 2460, section 4.)
 	 */
 	if (exthdrs.ip6e_hbh) {
 		struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *);
 		u_int32_t dummy; /* XXX unused */
 		u_int32_t plen = 0; /* XXX: ip6_process will check the value */
 
 #ifdef DIAGNOSTIC
 		if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len)
 			panic("ip6e_hbh is not contiguous");
 #endif
 		/*
 		 *  XXX: if we have to send an ICMPv6 error to the sender,
 		 *       we need the M_LOOP flag since icmp6_error() expects
 		 *       the IPv6 and the hop-by-hop options header are
 		 *       contiguous unless the flag is set.
 		 */
 		m->m_flags |= M_LOOP;
 		m->m_pkthdr.rcvif = ifp;
 		if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1),
 		    ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh),
 		    &dummy, &plen) < 0) {
 			/* m was already freed at this point */
 			error = EINVAL;/* better error? */
 			goto done;
 		}
 		m->m_flags &= ~M_LOOP; /* XXX */
 		m->m_pkthdr.rcvif = NULL;
 	}
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (!PFIL_HOOKED(&V_inet6_pfil_hook))
 		goto passout;
 
 	odst = ip6->ip6_dst;
 	/* Run through list of hooks for output packets. */
 	error = pfil_run_hooks(&V_inet6_pfil_hook, &m, ifp, PFIL_OUT, inp);
 	if (error != 0 || m == NULL)
 		goto done;
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	needfiblookup = 0;
 	/* See if destination IP address was changed by packet filter. */
 	if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		/* If destination is now ourself drop to ip6_input(). */
 		if (in6_localip(&ip6->ip6_dst)) {
 			m->m_flags |= M_FASTFWD_OURS;
 			if (m->m_pkthdr.rcvif == NULL)
 				m->m_pkthdr.rcvif = V_loif;
 			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 				m->m_pkthdr.csum_flags |=
 				    CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR;
 				m->m_pkthdr.csum_data = 0xffff;
 			}
 #ifdef SCTP
 			if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6)
 				m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 			error = netisr_queue(NETISR_IPV6, m);
 			goto done;
 		} else
 			needfiblookup = 1; /* Redo the routing table lookup. */
 	}
 	/* See if fib was changed by packet filter. */
 	if (fibnum != M_GETFIB(m)) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		fibnum = M_GETFIB(m);
 		RO_RTFREE(ro);
 		needfiblookup = 1;
 	}
 	if (needfiblookup)
 		goto again;
 
 	/* See if local, if yes, send it to netisr. */
 	if (m->m_flags & M_FASTFWD_OURS) {
 		if (m->m_pkthdr.rcvif == NULL)
 			m->m_pkthdr.rcvif = V_loif;
 		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 			m->m_pkthdr.csum_flags |=
 			    CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR;
 			m->m_pkthdr.csum_data = 0xffff;
 		}
 #ifdef SCTP
 		if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6)
 			m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 		error = netisr_queue(NETISR_IPV6, m);
 		goto done;
 	}
 	/* Or forward to some other address? */
 	if ((m->m_flags & M_IP6_NEXTHOP) &&
 	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
 		dst = (struct sockaddr_in6 *)&ro->ro_dst;
 		bcopy((fwd_tag+1), &dst_sa, sizeof(struct sockaddr_in6));
 		m->m_flags |= M_SKIP_FIREWALL;
 		m->m_flags &= ~M_IP6_NEXTHOP;
 		m_tag_delete(m, fwd_tag);
 		goto again;
 	}
 
 passout:
 	/*
 	 * Send the packet to the outgoing interface.
 	 * If necessary, do IPv6 fragmentation before sending.
 	 *
 	 * the logic here is rather complex:
 	 * 1: normal case (dontfrag == 0, alwaysfrag == 0)
 	 * 1-a:	send as is if tlen <= path mtu
 	 * 1-b:	fragment if tlen > path mtu
 	 *
 	 * 2: if user asks us not to fragment (dontfrag == 1)
 	 * 2-a:	send as is if tlen <= interface mtu
 	 * 2-b:	error if tlen > interface mtu
 	 *
 	 * 3: if we always need to attach fragment header (alwaysfrag == 1)
 	 *	always fragment
 	 *
 	 * 4: if dontfrag == 1 && alwaysfrag == 1
 	 *	error, as we cannot handle this conflicting request
 	 */
 	sw_csum = m->m_pkthdr.csum_flags;
 	if (!hdrsplit) {
 		tso = ((sw_csum & ifp->if_hwassist & CSUM_TSO) != 0) ? 1 : 0;
 		sw_csum &= ~ifp->if_hwassist;
 	} else
 		tso = 0;
 	/*
 	 * If we added extension headers, we will not do TSO and calculate the
 	 * checksums ourselves for now.
 	 * XXX-BZ  Need a framework to know when the NIC can handle it, even
 	 * with ext. hdrs.
 	 */
 	if (sw_csum & CSUM_DELAY_DATA_IPV6) {
 		sw_csum &= ~CSUM_DELAY_DATA_IPV6;
 		in6_delayed_cksum(m, plen, sizeof(struct ip6_hdr));
 	}
 #ifdef SCTP
 	if (sw_csum & CSUM_SCTP_IPV6) {
 		sw_csum &= ~CSUM_SCTP_IPV6;
 		sctp_delayed_cksum(m, sizeof(struct ip6_hdr));
 	}
 #endif
 	m->m_pkthdr.csum_flags &= ifp->if_hwassist;
 	tlen = m->m_pkthdr.len;
 
 	if ((opt && (opt->ip6po_flags & IP6PO_DONTFRAG)) || tso)
 		dontfrag = 1;
 	else
 		dontfrag = 0;
 	if (dontfrag && alwaysfrag) {	/* case 4 */
 		/* conflicting request - can't transmit */
 		error = EMSGSIZE;
 		goto bad;
 	}
 	if (dontfrag && tlen > IN6_LINKMTU(ifp) && !tso) {	/* case 2-b */
 		/*
 		 * Even if the DONTFRAG option is specified, we cannot send the
 		 * packet when the data length is larger than the MTU of the
 		 * outgoing interface.
 		 * Notify the error by sending IPV6_PATHMTU ancillary data if
 		 * application wanted to know the MTU value. Also return an
 		 * error code (this is not described in the API spec).
 		 */
 		if (inp != NULL)
 			ip6_notify_pmtu(inp, &dst_sa, (u_int32_t)mtu);
 		error = EMSGSIZE;
 		goto bad;
 	}
 
 	/*
 	 * transmit packet without fragmentation
 	 */
 	if (dontfrag || (!alwaysfrag && tlen <= mtu)) {	/* case 1-a and 2-a */
 		struct in6_ifaddr *ia6;
 
 		ip6 = mtod(m, struct ip6_hdr *);
 		ia6 = in6_ifawithifp(ifp, &ip6->ip6_src);
 		if (ia6) {
 			/* Record statistics for this interface address. */
 			counter_u64_add(ia6->ia_ifa.ifa_opackets, 1);
 			counter_u64_add(ia6->ia_ifa.ifa_obytes,
 			    m->m_pkthdr.len);
 			ifa_free(&ia6->ia_ifa);
 		}
-		error = nd6_output_ifp(ifp, origifp, m, dst, NULL);
+		error = nd6_output_ifp(ifp, origifp, m, dst,
+		    (struct route *)ro);
 		goto done;
 	}
 
 	/*
 	 * try to fragment the packet.  case 1-b and 3
 	 */
 	if (mtu < IPV6_MMTU) {
 		/* path MTU cannot be less than IPV6_MMTU */
 		error = EMSGSIZE;
 		in6_ifstat_inc(ifp, ifs6_out_fragfail);
 		goto bad;
 	} else if (ip6->ip6_plen == 0) {
 		/* jumbo payload cannot be fragmented */
 		error = EMSGSIZE;
 		in6_ifstat_inc(ifp, ifs6_out_fragfail);
 		goto bad;
 	} else {
 		u_char nextproto;
 
 		/*
 		 * Too large for the destination or interface;
 		 * fragment if possible.
 		 * Must be able to put at least 8 bytes per fragment.
 		 */
 		hlen = unfragpartlen;
 		if (mtu > IPV6_MAXPACKET)
 			mtu = IPV6_MAXPACKET;
 
 		len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7;
 		if (len < 8) {
 			error = EMSGSIZE;
 			in6_ifstat_inc(ifp, ifs6_out_fragfail);
 			goto bad;
 		}
 
 		/*
 		 * If the interface will not calculate checksums on
 		 * fragmented packets, then do it here.
 		 * XXX-BZ handle the hw offloading case.  Need flags.
 		 */
 		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 			in6_delayed_cksum(m, plen, hlen);
 			m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
 		}
 #ifdef SCTP
 		if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) {
 			sctp_delayed_cksum(m, hlen);
 			m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6;
 		}
 #endif
 		/*
 		 * Change the next header field of the last header in the
 		 * unfragmentable part.
 		 */
 		if (exthdrs.ip6e_rthdr) {
 			nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *);
 			*mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT;
 		} else if (exthdrs.ip6e_dest1) {
 			nextproto = *mtod(exthdrs.ip6e_dest1, u_char *);
 			*mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT;
 		} else if (exthdrs.ip6e_hbh) {
 			nextproto = *mtod(exthdrs.ip6e_hbh, u_char *);
 			*mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT;
 		} else {
 			nextproto = ip6->ip6_nxt;
 			ip6->ip6_nxt = IPPROTO_FRAGMENT;
 		}
 
 		/*
 		 * Loop through length of segment after first fragment,
 		 * make new header and copy data of each part and link onto
 		 * chain.
 		 */
 		m0 = m;
 		id = htonl(ip6_randomid());
 		if ((error = ip6_fragment(ifp, m, hlen, nextproto, len, id)))
 			goto sendorfree;
 
 		in6_ifstat_inc(ifp, ifs6_out_fragok);
 	}
 
 	/*
 	 * Remove leading garbages.
 	 */
 sendorfree:
 	m = m0->m_nextpkt;
 	m0->m_nextpkt = 0;
 	m_freem(m0);
 	for (m0 = m; m; m = m0) {
 		m0 = m->m_nextpkt;
 		m->m_nextpkt = 0;
 		if (error == 0) {
 			/* Record statistics for this interface address. */
 			if (ia) {
 				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_obytes,
 				    m->m_pkthdr.len);
 			}
-			error = nd6_output_ifp(ifp, origifp, m, dst, NULL);
+			error = nd6_output_ifp(ifp, origifp, m, dst,
+			    (struct route *)ro);
 		} else
 			m_freem(m);
 	}
 
 	if (error == 0)
 		IP6STAT_INC(ip6s_fragmented);
 
 done:
 	if (ro == &ip6route)
 		RO_RTFREE(ro);
 	return (error);
 
 freehdrs:
 	m_freem(exthdrs.ip6e_hbh);	/* m_freem will check if mbuf is 0 */
 	m_freem(exthdrs.ip6e_dest1);
 	m_freem(exthdrs.ip6e_rthdr);
 	m_freem(exthdrs.ip6e_dest2);
 	/* FALLTHROUGH */
 bad:
 	if (m)
 		m_freem(m);
 	goto done;
 }
 
 static int
 ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen)
 {
 	struct mbuf *m;
 
 	if (hlen > MCLBYTES)
 		return (ENOBUFS); /* XXX */
 
 	if (hlen > MLEN)
 		m = m_getcl(M_NOWAIT, MT_DATA, 0);
 	else
 		m = m_get(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (ENOBUFS);
 	m->m_len = hlen;
 	if (hdr)
 		bcopy(hdr, mtod(m, caddr_t), hlen);
 
 	*mp = m;
 	return (0);
 }
 
 /*
  * Insert jumbo payload option.
  */
 static int
 ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen)
 {
 	struct mbuf *mopt;
 	u_char *optbuf;
 	u_int32_t v;
 
 #define JUMBOOPTLEN	8	/* length of jumbo payload option and padding */
 
 	/*
 	 * If there is no hop-by-hop options header, allocate new one.
 	 * If there is one but it doesn't have enough space to store the
 	 * jumbo payload option, allocate a cluster to store the whole options.
 	 * Otherwise, use it to store the options.
 	 */
 	if (exthdrs->ip6e_hbh == 0) {
 		mopt = m_get(M_NOWAIT, MT_DATA);
 		if (mopt == NULL)
 			return (ENOBUFS);
 		mopt->m_len = JUMBOOPTLEN;
 		optbuf = mtod(mopt, u_char *);
 		optbuf[1] = 0;	/* = ((JUMBOOPTLEN) >> 3) - 1 */
 		exthdrs->ip6e_hbh = mopt;
 	} else {
 		struct ip6_hbh *hbh;
 
 		mopt = exthdrs->ip6e_hbh;
 		if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) {
 			/*
 			 * XXX assumption:
 			 * - exthdrs->ip6e_hbh is not referenced from places
 			 *   other than exthdrs.
 			 * - exthdrs->ip6e_hbh is not an mbuf chain.
 			 */
 			int oldoptlen = mopt->m_len;
 			struct mbuf *n;
 
 			/*
 			 * XXX: give up if the whole (new) hbh header does
 			 * not fit even in an mbuf cluster.
 			 */
 			if (oldoptlen + JUMBOOPTLEN > MCLBYTES)
 				return (ENOBUFS);
 
 			/*
 			 * As a consequence, we must always prepare a cluster
 			 * at this point.
 			 */
 			n = m_getcl(M_NOWAIT, MT_DATA, 0);
 			if (n == NULL)
 				return (ENOBUFS);
 			n->m_len = oldoptlen + JUMBOOPTLEN;
 			bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t),
 			    oldoptlen);
 			optbuf = mtod(n, caddr_t) + oldoptlen;
 			m_freem(mopt);
 			mopt = exthdrs->ip6e_hbh = n;
 		} else {
 			optbuf = mtod(mopt, u_char *) + mopt->m_len;
 			mopt->m_len += JUMBOOPTLEN;
 		}
 		optbuf[0] = IP6OPT_PADN;
 		optbuf[1] = 1;
 
 		/*
 		 * Adjust the header length according to the pad and
 		 * the jumbo payload option.
 		 */
 		hbh = mtod(mopt, struct ip6_hbh *);
 		hbh->ip6h_len += (JUMBOOPTLEN >> 3);
 	}
 
 	/* fill in the option. */
 	optbuf[2] = IP6OPT_JUMBO;
 	optbuf[3] = 4;
 	v = (u_int32_t)htonl(plen + JUMBOOPTLEN);
 	bcopy(&v, &optbuf[4], sizeof(u_int32_t));
 
 	/* finally, adjust the packet header length */
 	exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN;
 
 	return (0);
 #undef JUMBOOPTLEN
 }
 
 /*
  * Insert fragment header and copy unfragmentable header portions.
  */
 static int
 ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen,
     struct ip6_frag **frghdrp)
 {
 	struct mbuf *n, *mlast;
 
 	if (hlen > sizeof(struct ip6_hdr)) {
 		n = m_copym(m0, sizeof(struct ip6_hdr),
 		    hlen - sizeof(struct ip6_hdr), M_NOWAIT);
 		if (n == 0)
 			return (ENOBUFS);
 		m->m_next = n;
 	} else
 		n = m;
 
 	/* Search for the last mbuf of unfragmentable part. */
 	for (mlast = n; mlast->m_next; mlast = mlast->m_next)
 		;
 
 	if (M_WRITABLE(mlast) &&
 	    M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) {
 		/* use the trailing space of the last mbuf for the fragment hdr */
 		*frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) +
 		    mlast->m_len);
 		mlast->m_len += sizeof(struct ip6_frag);
 		m->m_pkthdr.len += sizeof(struct ip6_frag);
 	} else {
 		/* allocate a new mbuf for the fragment header */
 		struct mbuf *mfrg;
 
 		mfrg = m_get(M_NOWAIT, MT_DATA);
 		if (mfrg == NULL)
 			return (ENOBUFS);
 		mfrg->m_len = sizeof(struct ip6_frag);
 		*frghdrp = mtod(mfrg, struct ip6_frag *);
 		mlast->m_next = mfrg;
 	}
 
 	return (0);
 }
 
 /*
  * Calculates IPv6 path mtu for destination @dst.
  * Resulting MTU is stored in @mtup.
  *
  * Returns 0 on success.
  */
 static int
 ip6_getpmtu_ctl(u_int fibnum, struct in6_addr *dst, u_long *mtup)
 {
 	struct nhop6_extended nh6;
 	struct in6_addr kdst;
 	uint32_t scopeid;
 	struct ifnet *ifp;
 	u_long mtu;
 	int error;
 
 	in6_splitscope(dst, &kdst, &scopeid);
 	if (fib6_lookup_nh_ext(fibnum, &kdst, scopeid, NHR_REF, 0, &nh6) != 0)
 		return (EHOSTUNREACH);
 
 	ifp = nh6.nh_ifp;
 	mtu = nh6.nh_mtu;
 
 	error = ip6_calcmtu(ifp, dst, mtu, mtup, NULL);
 	fib6_free_nh_ext(fibnum, &nh6);
 
 	return (error);
 }
 
 /*
  * Calculates IPv6 path MTU for @dst based on transmit @ifp,
  * and cached data in @ro_pmtu.
  * MTU from (successful) route lookup is saved (along with dst)
  * inside @ro_pmtu to avoid subsequent route lookups after packet
  * filter processing.
  *
  * Stores mtu and always-frag value into @mtup and @alwaysfragp.
  * Returns 0 on success.
  */
 static int
 ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup,
     struct ifnet *ifp, struct in6_addr *dst, u_long *mtup,
     int *alwaysfragp, u_int fibnum)
 {
 	struct nhop6_basic nh6;
 	struct in6_addr kdst;
 	uint32_t scopeid;
 	struct sockaddr_in6 *sa6_dst;
 	u_long mtu;
 
 	mtu = 0;
 	if (do_lookup) {
 
 		/*
 		 * Here ro_pmtu has final destination address, while
 		 * ro might represent immediate destination.
 		 * Use ro_pmtu destination since mtu might differ.
 		 */
 		sa6_dst = (struct sockaddr_in6 *)&ro_pmtu->ro_dst;
 		if (!IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst))
 			ro_pmtu->ro_mtu = 0;
 
 		if (ro_pmtu->ro_mtu == 0) {
 			bzero(sa6_dst, sizeof(*sa6_dst));
 			sa6_dst->sin6_family = AF_INET6;
 			sa6_dst->sin6_len = sizeof(struct sockaddr_in6);
 			sa6_dst->sin6_addr = *dst;
 
 			in6_splitscope(dst, &kdst, &scopeid);
 			if (fib6_lookup_nh_basic(fibnum, &kdst, scopeid, 0, 0,
 			    &nh6) == 0)
 				ro_pmtu->ro_mtu = nh6.nh_mtu;
 		}
 
 		mtu = ro_pmtu->ro_mtu;
 	}
 
 	if (ro_pmtu->ro_rt)
 		mtu = ro_pmtu->ro_rt->rt_mtu;
 
 	return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp));
 }
 
 /*
  * Calculate MTU based on transmit @ifp, route mtu @rt_mtu and
  * hostcache data for @dst.
  * Stores mtu and always-frag value into @mtup and @alwaysfragp.
  *
  * Returns 0 on success.
  */
 static int
 ip6_calcmtu(struct ifnet *ifp, const struct in6_addr *dst, u_long rt_mtu,
     u_long *mtup, int *alwaysfragp)
 {
 	u_long mtu = 0;
 	int alwaysfrag = 0;
 	int error = 0;
 
 	if (rt_mtu > 0) {
 		u_int32_t ifmtu;
 		struct in_conninfo inc;
 
 		bzero(&inc, sizeof(inc));
 		inc.inc_flags |= INC_ISIPV6;
 		inc.inc6_faddr = *dst;
 
 		ifmtu = IN6_LINKMTU(ifp);
 		mtu = tcp_hc_getmtu(&inc);
 		if (mtu)
 			mtu = min(mtu, rt_mtu);
 		else
 			mtu = rt_mtu;
 		if (mtu == 0)
 			mtu = ifmtu;
 		else if (mtu < IPV6_MMTU) {
 			/*
 			 * RFC2460 section 5, last paragraph:
 			 * if we record ICMPv6 too big message with
 			 * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU
 			 * or smaller, with framgent header attached.
 			 * (fragment header is needed regardless from the
 			 * packet size, for translators to identify packets)
 			 */
 			alwaysfrag = 1;
 			mtu = IPV6_MMTU;
 		}
 	} else if (ifp) {
 		mtu = IN6_LINKMTU(ifp);
 	} else
 		error = EHOSTUNREACH; /* XXX */
 
 	*mtup = mtu;
 	if (alwaysfragp)
 		*alwaysfragp = alwaysfrag;
 	return (error);
 }
 
 /*
  * IP6 socket option processing.
  */
 int
 ip6_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int optdatalen, uproto;
 	void *optdata;
 	struct inpcb *in6p = sotoinpcb(so);
 	int error, optval;
 	int level, op, optname;
 	int optlen;
 	struct thread *td;
 #ifdef	RSS
 	uint32_t rss_bucket;
 	int retval;
 #endif
 
 	level = sopt->sopt_level;
 	op = sopt->sopt_dir;
 	optname = sopt->sopt_name;
 	optlen = sopt->sopt_valsize;
 	td = sopt->sopt_td;
 	error = 0;
 	optval = 0;
 	uproto = (int)so->so_proto->pr_protocol;
 
 	if (level != IPPROTO_IPV6) {
 		error = EINVAL;
 
 		if (sopt->sopt_level == SOL_SOCKET &&
 		    sopt->sopt_dir == SOPT_SET) {
 			switch (sopt->sopt_name) {
 			case SO_REUSEADDR:
 				INP_WLOCK(in6p);
 				if ((so->so_options & SO_REUSEADDR) != 0)
 					in6p->inp_flags2 |= INP_REUSEADDR;
 				else
 					in6p->inp_flags2 &= ~INP_REUSEADDR;
 				INP_WUNLOCK(in6p);
 				error = 0;
 				break;
 			case SO_REUSEPORT:
 				INP_WLOCK(in6p);
 				if ((so->so_options & SO_REUSEPORT) != 0)
 					in6p->inp_flags2 |= INP_REUSEPORT;
 				else
 					in6p->inp_flags2 &= ~INP_REUSEPORT;
 				INP_WUNLOCK(in6p);
 				error = 0;
 				break;
 			case SO_SETFIB:
 				INP_WLOCK(in6p);
 				in6p->inp_inc.inc_fibnum = so->so_fibnum;
 				INP_WUNLOCK(in6p);
 				error = 0;
 				break;
 			default:
 				break;
 			}
 		}
 	} else {		/* level == IPPROTO_IPV6 */
 		switch (op) {
 
 		case SOPT_SET:
 			switch (optname) {
 			case IPV6_2292PKTOPTIONS:
 #ifdef IPV6_PKTOPTIONS
 			case IPV6_PKTOPTIONS:
 #endif
 			{
 				struct mbuf *m;
 
 				error = soopt_getm(sopt, &m); /* XXX */
 				if (error != 0)
 					break;
 				error = soopt_mcopyin(sopt, m); /* XXX */
 				if (error != 0)
 					break;
 				error = ip6_pcbopts(&in6p->in6p_outputopts,
 						    m, so, sopt);
 				m_freem(m); /* XXX */
 				break;
 			}
 
 			/*
 			 * Use of some Hop-by-Hop options or some
 			 * Destination options, might require special
 			 * privilege.  That is, normal applications
 			 * (without special privilege) might be forbidden
 			 * from setting certain options in outgoing packets,
 			 * and might never see certain options in received
 			 * packets. [RFC 2292 Section 6]
 			 * KAME specific note:
 			 *  KAME prevents non-privileged users from sending or
 			 *  receiving ANY hbh/dst options in order to avoid
 			 *  overhead of parsing options in the kernel.
 			 */
 			case IPV6_RECVHOPOPTS:
 			case IPV6_RECVDSTOPTS:
 			case IPV6_RECVRTHDRDSTOPTS:
 				if (td != NULL) {
 					error = priv_check(td,
 					    PRIV_NETINET_SETHDROPTS);
 					if (error)
 						break;
 				}
 				/* FALLTHROUGH */
 			case IPV6_UNICAST_HOPS:
 			case IPV6_HOPLIMIT:
 
 			case IPV6_RECVPKTINFO:
 			case IPV6_RECVHOPLIMIT:
 			case IPV6_RECVRTHDR:
 			case IPV6_RECVPATHMTU:
 			case IPV6_RECVTCLASS:
 			case IPV6_RECVFLOWID:
 #ifdef	RSS
 			case IPV6_RECVRSSBUCKETID:
 #endif
 			case IPV6_V6ONLY:
 			case IPV6_AUTOFLOWLABEL:
 			case IPV6_BINDANY:
 			case IPV6_BINDMULTI:
 #ifdef	RSS
 			case IPV6_RSS_LISTEN_BUCKET:
 #endif
 				if (optname == IPV6_BINDANY && td != NULL) {
 					error = priv_check(td,
 					    PRIV_NETINET_BINDANY);
 					if (error)
 						break;
 				}
 
 				if (optlen != sizeof(int)) {
 					error = EINVAL;
 					break;
 				}
 				error = sooptcopyin(sopt, &optval,
 					sizeof optval, sizeof optval);
 				if (error)
 					break;
 				switch (optname) {
 
 				case IPV6_UNICAST_HOPS:
 					if (optval < -1 || optval >= 256)
 						error = EINVAL;
 					else {
 						/* -1 = kernel default */
 						in6p->in6p_hops = optval;
 						if ((in6p->inp_vflag &
 						     INP_IPV4) != 0)
 							in6p->inp_ip_ttl = optval;
 					}
 					break;
 #define OPTSET(bit) \
 do { \
 	INP_WLOCK(in6p); \
 	if (optval) \
 		in6p->inp_flags |= (bit); \
 	else \
 		in6p->inp_flags &= ~(bit); \
 	INP_WUNLOCK(in6p); \
 } while (/*CONSTCOND*/ 0)
 #define OPTSET2292(bit) \
 do { \
 	INP_WLOCK(in6p); \
 	in6p->inp_flags |= IN6P_RFC2292; \
 	if (optval) \
 		in6p->inp_flags |= (bit); \
 	else \
 		in6p->inp_flags &= ~(bit); \
 	INP_WUNLOCK(in6p); \
 } while (/*CONSTCOND*/ 0)
 #define OPTBIT(bit) (in6p->inp_flags & (bit) ? 1 : 0)
 
 #define OPTSET2(bit, val) do {						\
 	INP_WLOCK(in6p);						\
 	if (val)							\
 		in6p->inp_flags2 |= bit;				\
 	else								\
 		in6p->inp_flags2 &= ~bit;				\
 	INP_WUNLOCK(in6p);						\
 } while (0)
 #define OPTBIT2(bit) (in6p->inp_flags2 & (bit) ? 1 : 0)
 
 				case IPV6_RECVPKTINFO:
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_PKTINFO);
 					break;
 
 				case IPV6_HOPLIMIT:
 				{
 					struct ip6_pktopts **optp;
 
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					optp = &in6p->in6p_outputopts;
 					error = ip6_pcbopt(IPV6_HOPLIMIT,
 					    (u_char *)&optval, sizeof(optval),
 					    optp, (td != NULL) ? td->td_ucred :
 					    NULL, uproto);
 					break;
 				}
 
 				case IPV6_RECVHOPLIMIT:
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_HOPLIMIT);
 					break;
 
 				case IPV6_RECVHOPOPTS:
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_HOPOPTS);
 					break;
 
 				case IPV6_RECVDSTOPTS:
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_DSTOPTS);
 					break;
 
 				case IPV6_RECVRTHDRDSTOPTS:
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_RTHDRDSTOPTS);
 					break;
 
 				case IPV6_RECVRTHDR:
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_RTHDR);
 					break;
 
 				case IPV6_RECVPATHMTU:
 					/*
 					 * We ignore this option for TCP
 					 * sockets.
 					 * (RFC3542 leaves this case
 					 * unspecified.)
 					 */
 					if (uproto != IPPROTO_TCP)
 						OPTSET(IN6P_MTU);
 					break;
 
 				case IPV6_RECVFLOWID:
 					OPTSET2(INP_RECVFLOWID, optval);
 					break;
 
 #ifdef	RSS
 				case IPV6_RECVRSSBUCKETID:
 					OPTSET2(INP_RECVRSSBUCKETID, optval);
 					break;
 #endif
 
 				case IPV6_V6ONLY:
 					/*
 					 * make setsockopt(IPV6_V6ONLY)
 					 * available only prior to bind(2).
 					 * see ipng mailing list, Jun 22 2001.
 					 */
 					if (in6p->inp_lport ||
 					    !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_IPV6_V6ONLY);
 					if (optval)
 						in6p->inp_vflag &= ~INP_IPV4;
 					else
 						in6p->inp_vflag |= INP_IPV4;
 					break;
 				case IPV6_RECVTCLASS:
 					/* cannot mix with RFC2292 XXX */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					OPTSET(IN6P_TCLASS);
 					break;
 				case IPV6_AUTOFLOWLABEL:
 					OPTSET(IN6P_AUTOFLOWLABEL);
 					break;
 
 				case IPV6_BINDANY:
 					OPTSET(INP_BINDANY);
 					break;
 
 				case IPV6_BINDMULTI:
 					OPTSET2(INP_BINDMULTI, optval);
 					break;
 #ifdef	RSS
 				case IPV6_RSS_LISTEN_BUCKET:
 					if ((optval >= 0) &&
 					    (optval < rss_getnumbuckets())) {
 						in6p->inp_rss_listen_bucket = optval;
 						OPTSET2(INP_RSS_BUCKET_SET, 1);
 					} else {
 						error = EINVAL;
 					}
 					break;
 #endif
 				}
 				break;
 
 			case IPV6_TCLASS:
 			case IPV6_DONTFRAG:
 			case IPV6_USE_MIN_MTU:
 			case IPV6_PREFER_TEMPADDR:
 				if (optlen != sizeof(optval)) {
 					error = EINVAL;
 					break;
 				}
 				error = sooptcopyin(sopt, &optval,
 					sizeof optval, sizeof optval);
 				if (error)
 					break;
 				{
 					struct ip6_pktopts **optp;
 					optp = &in6p->in6p_outputopts;
 					error = ip6_pcbopt(optname,
 					    (u_char *)&optval, sizeof(optval),
 					    optp, (td != NULL) ? td->td_ucred :
 					    NULL, uproto);
 					break;
 				}
 
 			case IPV6_2292PKTINFO:
 			case IPV6_2292HOPLIMIT:
 			case IPV6_2292HOPOPTS:
 			case IPV6_2292DSTOPTS:
 			case IPV6_2292RTHDR:
 				/* RFC 2292 */
 				if (optlen != sizeof(int)) {
 					error = EINVAL;
 					break;
 				}
 				error = sooptcopyin(sopt, &optval,
 					sizeof optval, sizeof optval);
 				if (error)
 					break;
 				switch (optname) {
 				case IPV6_2292PKTINFO:
 					OPTSET2292(IN6P_PKTINFO);
 					break;
 				case IPV6_2292HOPLIMIT:
 					OPTSET2292(IN6P_HOPLIMIT);
 					break;
 				case IPV6_2292HOPOPTS:
 					/*
 					 * Check super-user privilege.
 					 * See comments for IPV6_RECVHOPOPTS.
 					 */
 					if (td != NULL) {
 						error = priv_check(td,
 						    PRIV_NETINET_SETHDROPTS);
 						if (error)
 							return (error);
 					}
 					OPTSET2292(IN6P_HOPOPTS);
 					break;
 				case IPV6_2292DSTOPTS:
 					if (td != NULL) {
 						error = priv_check(td,
 						    PRIV_NETINET_SETHDROPTS);
 						if (error)
 							return (error);
 					}
 					OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */
 					break;
 				case IPV6_2292RTHDR:
 					OPTSET2292(IN6P_RTHDR);
 					break;
 				}
 				break;
 			case IPV6_PKTINFO:
 			case IPV6_HOPOPTS:
 			case IPV6_RTHDR:
 			case IPV6_DSTOPTS:
 			case IPV6_RTHDRDSTOPTS:
 			case IPV6_NEXTHOP:
 			{
 				/* new advanced API (RFC3542) */
 				u_char *optbuf;
 				u_char optbuf_storage[MCLBYTES];
 				int optlen;
 				struct ip6_pktopts **optp;
 
 				/* cannot mix with RFC2292 */
 				if (OPTBIT(IN6P_RFC2292)) {
 					error = EINVAL;
 					break;
 				}
 
 				/*
 				 * We only ensure valsize is not too large
 				 * here.  Further validation will be done
 				 * later.
 				 */
 				error = sooptcopyin(sopt, optbuf_storage,
 				    sizeof(optbuf_storage), 0);
 				if (error)
 					break;
 				optlen = sopt->sopt_valsize;
 				optbuf = optbuf_storage;
 				optp = &in6p->in6p_outputopts;
 				error = ip6_pcbopt(optname, optbuf, optlen,
 				    optp, (td != NULL) ? td->td_ucred : NULL,
 				    uproto);
 				break;
 			}
 #undef OPTSET
 
 			case IPV6_MULTICAST_IF:
 			case IPV6_MULTICAST_HOPS:
 			case IPV6_MULTICAST_LOOP:
 			case IPV6_JOIN_GROUP:
 			case IPV6_LEAVE_GROUP:
 			case IPV6_MSFILTER:
 			case MCAST_BLOCK_SOURCE:
 			case MCAST_UNBLOCK_SOURCE:
 			case MCAST_JOIN_GROUP:
 			case MCAST_LEAVE_GROUP:
 			case MCAST_JOIN_SOURCE_GROUP:
 			case MCAST_LEAVE_SOURCE_GROUP:
 				error = ip6_setmoptions(in6p, sopt);
 				break;
 
 			case IPV6_PORTRANGE:
 				error = sooptcopyin(sopt, &optval,
 				    sizeof optval, sizeof optval);
 				if (error)
 					break;
 
 				INP_WLOCK(in6p);
 				switch (optval) {
 				case IPV6_PORTRANGE_DEFAULT:
 					in6p->inp_flags &= ~(INP_LOWPORT);
 					in6p->inp_flags &= ~(INP_HIGHPORT);
 					break;
 
 				case IPV6_PORTRANGE_HIGH:
 					in6p->inp_flags &= ~(INP_LOWPORT);
 					in6p->inp_flags |= INP_HIGHPORT;
 					break;
 
 				case IPV6_PORTRANGE_LOW:
 					in6p->inp_flags &= ~(INP_HIGHPORT);
 					in6p->inp_flags |= INP_LOWPORT;
 					break;
 
 				default:
 					error = EINVAL;
 					break;
 				}
 				INP_WUNLOCK(in6p);
 				break;
 
 #ifdef IPSEC
 			case IPV6_IPSEC_POLICY:
 			{
 				caddr_t req;
 				struct mbuf *m;
 
 				if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
 					break;
 				if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
 					break;
 				req = mtod(m, caddr_t);
 				error = ipsec_set_policy(in6p, optname, req,
 				    m->m_len, (sopt->sopt_td != NULL) ?
 				    sopt->sopt_td->td_ucred : NULL);
 				m_freem(m);
 				break;
 			}
 #endif /* IPSEC */
 
 			default:
 				error = ENOPROTOOPT;
 				break;
 			}
 			break;
 
 		case SOPT_GET:
 			switch (optname) {
 
 			case IPV6_2292PKTOPTIONS:
 #ifdef IPV6_PKTOPTIONS
 			case IPV6_PKTOPTIONS:
 #endif
 				/*
 				 * RFC3542 (effectively) deprecated the
 				 * semantics of the 2292-style pktoptions.
 				 * Since it was not reliable in nature (i.e.,
 				 * applications had to expect the lack of some
 				 * information after all), it would make sense
 				 * to simplify this part by always returning
 				 * empty data.
 				 */
 				sopt->sopt_valsize = 0;
 				break;
 
 			case IPV6_RECVHOPOPTS:
 			case IPV6_RECVDSTOPTS:
 			case IPV6_RECVRTHDRDSTOPTS:
 			case IPV6_UNICAST_HOPS:
 			case IPV6_RECVPKTINFO:
 			case IPV6_RECVHOPLIMIT:
 			case IPV6_RECVRTHDR:
 			case IPV6_RECVPATHMTU:
 
 			case IPV6_V6ONLY:
 			case IPV6_PORTRANGE:
 			case IPV6_RECVTCLASS:
 			case IPV6_AUTOFLOWLABEL:
 			case IPV6_BINDANY:
 			case IPV6_FLOWID:
 			case IPV6_FLOWTYPE:
 			case IPV6_RECVFLOWID:
 #ifdef	RSS
 			case IPV6_RSSBUCKETID:
 			case IPV6_RECVRSSBUCKETID:
 #endif
 			case IPV6_BINDMULTI:
 				switch (optname) {
 
 				case IPV6_RECVHOPOPTS:
 					optval = OPTBIT(IN6P_HOPOPTS);
 					break;
 
 				case IPV6_RECVDSTOPTS:
 					optval = OPTBIT(IN6P_DSTOPTS);
 					break;
 
 				case IPV6_RECVRTHDRDSTOPTS:
 					optval = OPTBIT(IN6P_RTHDRDSTOPTS);
 					break;
 
 				case IPV6_UNICAST_HOPS:
 					optval = in6p->in6p_hops;
 					break;
 
 				case IPV6_RECVPKTINFO:
 					optval = OPTBIT(IN6P_PKTINFO);
 					break;
 
 				case IPV6_RECVHOPLIMIT:
 					optval = OPTBIT(IN6P_HOPLIMIT);
 					break;
 
 				case IPV6_RECVRTHDR:
 					optval = OPTBIT(IN6P_RTHDR);
 					break;
 
 				case IPV6_RECVPATHMTU:
 					optval = OPTBIT(IN6P_MTU);
 					break;
 
 				case IPV6_V6ONLY:
 					optval = OPTBIT(IN6P_IPV6_V6ONLY);
 					break;
 
 				case IPV6_PORTRANGE:
 				    {
 					int flags;
 					flags = in6p->inp_flags;
 					if (flags & INP_HIGHPORT)
 						optval = IPV6_PORTRANGE_HIGH;
 					else if (flags & INP_LOWPORT)
 						optval = IPV6_PORTRANGE_LOW;
 					else
 						optval = 0;
 					break;
 				    }
 				case IPV6_RECVTCLASS:
 					optval = OPTBIT(IN6P_TCLASS);
 					break;
 
 				case IPV6_AUTOFLOWLABEL:
 					optval = OPTBIT(IN6P_AUTOFLOWLABEL);
 					break;
 
 				case IPV6_BINDANY:
 					optval = OPTBIT(INP_BINDANY);
 					break;
 
 				case IPV6_FLOWID:
 					optval = in6p->inp_flowid;
 					break;
 
 				case IPV6_FLOWTYPE:
 					optval = in6p->inp_flowtype;
 					break;
 
 				case IPV6_RECVFLOWID:
 					optval = OPTBIT2(INP_RECVFLOWID);
 					break;
 #ifdef	RSS
 				case IPV6_RSSBUCKETID:
 					retval =
 					    rss_hash2bucket(in6p->inp_flowid,
 					    in6p->inp_flowtype,
 					    &rss_bucket);
 					if (retval == 0)
 						optval = rss_bucket;
 					else
 						error = EINVAL;
 					break;
 
 				case IPV6_RECVRSSBUCKETID:
 					optval = OPTBIT2(INP_RECVRSSBUCKETID);
 					break;
 #endif
 
 				case IPV6_BINDMULTI:
 					optval = OPTBIT2(INP_BINDMULTI);
 					break;
 
 				}
 				if (error)
 					break;
 				error = sooptcopyout(sopt, &optval,
 					sizeof optval);
 				break;
 
 			case IPV6_PATHMTU:
 			{
 				u_long pmtu = 0;
 				struct ip6_mtuinfo mtuinfo;
 
 				if (!(so->so_state & SS_ISCONNECTED))
 					return (ENOTCONN);
 				/*
 				 * XXX: we dot not consider the case of source
 				 * routing, or optional information to specify
 				 * the outgoing interface.
 				 */
 				error = ip6_getpmtu_ctl(so->so_fibnum,
 				    &in6p->in6p_faddr, &pmtu);
 				if (error)
 					break;
 				if (pmtu > IPV6_MAXPACKET)
 					pmtu = IPV6_MAXPACKET;
 
 				bzero(&mtuinfo, sizeof(mtuinfo));
 				mtuinfo.ip6m_mtu = (u_int32_t)pmtu;
 				optdata = (void *)&mtuinfo;
 				optdatalen = sizeof(mtuinfo);
 				error = sooptcopyout(sopt, optdata,
 				    optdatalen);
 				break;
 			}
 
 			case IPV6_2292PKTINFO:
 			case IPV6_2292HOPLIMIT:
 			case IPV6_2292HOPOPTS:
 			case IPV6_2292RTHDR:
 			case IPV6_2292DSTOPTS:
 				switch (optname) {
 				case IPV6_2292PKTINFO:
 					optval = OPTBIT(IN6P_PKTINFO);
 					break;
 				case IPV6_2292HOPLIMIT:
 					optval = OPTBIT(IN6P_HOPLIMIT);
 					break;
 				case IPV6_2292HOPOPTS:
 					optval = OPTBIT(IN6P_HOPOPTS);
 					break;
 				case IPV6_2292RTHDR:
 					optval = OPTBIT(IN6P_RTHDR);
 					break;
 				case IPV6_2292DSTOPTS:
 					optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS);
 					break;
 				}
 				error = sooptcopyout(sopt, &optval,
 				    sizeof optval);
 				break;
 			case IPV6_PKTINFO:
 			case IPV6_HOPOPTS:
 			case IPV6_RTHDR:
 			case IPV6_DSTOPTS:
 			case IPV6_RTHDRDSTOPTS:
 			case IPV6_NEXTHOP:
 			case IPV6_TCLASS:
 			case IPV6_DONTFRAG:
 			case IPV6_USE_MIN_MTU:
 			case IPV6_PREFER_TEMPADDR:
 				error = ip6_getpcbopt(in6p->in6p_outputopts,
 				    optname, sopt);
 				break;
 
 			case IPV6_MULTICAST_IF:
 			case IPV6_MULTICAST_HOPS:
 			case IPV6_MULTICAST_LOOP:
 			case IPV6_MSFILTER:
 				error = ip6_getmoptions(in6p, sopt);
 				break;
 
 #ifdef IPSEC
 			case IPV6_IPSEC_POLICY:
 			  {
 				caddr_t req = NULL;
 				size_t len = 0;
 				struct mbuf *m = NULL;
 				struct mbuf **mp = &m;
 				size_t ovalsize = sopt->sopt_valsize;
 				caddr_t oval = (caddr_t)sopt->sopt_val;
 
 				error = soopt_getm(sopt, &m); /* XXX */
 				if (error != 0)
 					break;
 				error = soopt_mcopyin(sopt, m); /* XXX */
 				if (error != 0)
 					break;
 				sopt->sopt_valsize = ovalsize;
 				sopt->sopt_val = oval;
 				if (m) {
 					req = mtod(m, caddr_t);
 					len = m->m_len;
 				}
 				error = ipsec_get_policy(in6p, req, len, mp);
 				if (error == 0)
 					error = soopt_mcopyout(sopt, m); /* XXX */
 				if (error == 0 && m)
 					m_freem(m);
 				break;
 			  }
 #endif /* IPSEC */
 
 			default:
 				error = ENOPROTOOPT;
 				break;
 			}
 			break;
 		}
 	}
 	return (error);
 }
 
 int
 ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int error = 0, optval, optlen;
 	const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum);
 	struct inpcb *in6p = sotoinpcb(so);
 	int level, op, optname;
 
 	level = sopt->sopt_level;
 	op = sopt->sopt_dir;
 	optname = sopt->sopt_name;
 	optlen = sopt->sopt_valsize;
 
 	if (level != IPPROTO_IPV6) {
 		return (EINVAL);
 	}
 
 	switch (optname) {
 	case IPV6_CHECKSUM:
 		/*
 		 * For ICMPv6 sockets, no modification allowed for checksum
 		 * offset, permit "no change" values to help existing apps.
 		 *
 		 * RFC3542 says: "An attempt to set IPV6_CHECKSUM
 		 * for an ICMPv6 socket will fail."
 		 * The current behavior does not meet RFC3542.
 		 */
 		switch (op) {
 		case SOPT_SET:
 			if (optlen != sizeof(int)) {
 				error = EINVAL;
 				break;
 			}
 			error = sooptcopyin(sopt, &optval, sizeof(optval),
 					    sizeof(optval));
 			if (error)
 				break;
 			if ((optval % 2) != 0) {
 				/* the API assumes even offset values */
 				error = EINVAL;
 			} else if (so->so_proto->pr_protocol ==
 			    IPPROTO_ICMPV6) {
 				if (optval != icmp6off)
 					error = EINVAL;
 			} else
 				in6p->in6p_cksum = optval;
 			break;
 
 		case SOPT_GET:
 			if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
 				optval = icmp6off;
 			else
 				optval = in6p->in6p_cksum;
 
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
 
 		default:
 			error = EINVAL;
 			break;
 		}
 		break;
 
 	default:
 		error = ENOPROTOOPT;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Set up IP6 options in pcb for insertion in output packets or
  * specifying behavior of outgoing packets.
  */
 static int
 ip6_pcbopts(struct ip6_pktopts **pktopt, struct mbuf *m,
     struct socket *so, struct sockopt *sopt)
 {
 	struct ip6_pktopts *opt = *pktopt;
 	int error = 0;
 	struct thread *td = sopt->sopt_td;
 
 	/* turn off any old options. */
 	if (opt) {
 #ifdef DIAGNOSTIC
 		if (opt->ip6po_pktinfo || opt->ip6po_nexthop ||
 		    opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 ||
 		    opt->ip6po_rhinfo.ip6po_rhi_rthdr)
 			printf("ip6_pcbopts: all specified options are cleared.\n");
 #endif
 		ip6_clearpktopts(opt, -1);
 	} else
 		opt = malloc(sizeof(*opt), M_IP6OPT, M_WAITOK);
 	*pktopt = NULL;
 
 	if (!m || m->m_len == 0) {
 		/*
 		 * Only turning off any previous options, regardless of
 		 * whether the opt is just created or given.
 		 */
 		free(opt, M_IP6OPT);
 		return (0);
 	}
 
 	/*  set options specified by user. */
 	if ((error = ip6_setpktopts(m, opt, NULL, (td != NULL) ?
 	    td->td_ucred : NULL, so->so_proto->pr_protocol)) != 0) {
 		ip6_clearpktopts(opt, -1); /* XXX: discard all options */
 		free(opt, M_IP6OPT);
 		return (error);
 	}
 	*pktopt = opt;
 	return (0);
 }
 
 /*
  * initialize ip6_pktopts.  beware that there are non-zero default values in
  * the struct.
  */
 void
 ip6_initpktopts(struct ip6_pktopts *opt)
 {
 
 	bzero(opt, sizeof(*opt));
 	opt->ip6po_hlim = -1;	/* -1 means default hop limit */
 	opt->ip6po_tclass = -1;	/* -1 means default traffic class */
 	opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY;
 	opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM;
 }
 
 static int
 ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt,
     struct ucred *cred, int uproto)
 {
 	struct ip6_pktopts *opt;
 
 	if (*pktopt == NULL) {
 		*pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT,
 		    M_WAITOK);
 		ip6_initpktopts(*pktopt);
 	}
 	opt = *pktopt;
 
 	return (ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto));
 }
 
 static int
 ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt)
 {
 	void *optdata = NULL;
 	int optdatalen = 0;
 	struct ip6_ext *ip6e;
 	int error = 0;
 	struct in6_pktinfo null_pktinfo;
 	int deftclass = 0, on;
 	int defminmtu = IP6PO_MINMTU_MCASTONLY;
 	int defpreftemp = IP6PO_TEMPADDR_SYSTEM;
 
 	switch (optname) {
 	case IPV6_PKTINFO:
 		optdata = (void *)&null_pktinfo;
 		if (pktopt && pktopt->ip6po_pktinfo) {
 			bcopy(pktopt->ip6po_pktinfo, &null_pktinfo,
 			    sizeof(null_pktinfo));
 			in6_clearscope(&null_pktinfo.ipi6_addr);
 		} else {
 			/* XXX: we don't have to do this every time... */
 			bzero(&null_pktinfo, sizeof(null_pktinfo));
 		}
 		optdatalen = sizeof(struct in6_pktinfo);
 		break;
 	case IPV6_TCLASS:
 		if (pktopt && pktopt->ip6po_tclass >= 0)
 			optdata = (void *)&pktopt->ip6po_tclass;
 		else
 			optdata = (void *)&deftclass;
 		optdatalen = sizeof(int);
 		break;
 	case IPV6_HOPOPTS:
 		if (pktopt && pktopt->ip6po_hbh) {
 			optdata = (void *)pktopt->ip6po_hbh;
 			ip6e = (struct ip6_ext *)pktopt->ip6po_hbh;
 			optdatalen = (ip6e->ip6e_len + 1) << 3;
 		}
 		break;
 	case IPV6_RTHDR:
 		if (pktopt && pktopt->ip6po_rthdr) {
 			optdata = (void *)pktopt->ip6po_rthdr;
 			ip6e = (struct ip6_ext *)pktopt->ip6po_rthdr;
 			optdatalen = (ip6e->ip6e_len + 1) << 3;
 		}
 		break;
 	case IPV6_RTHDRDSTOPTS:
 		if (pktopt && pktopt->ip6po_dest1) {
 			optdata = (void *)pktopt->ip6po_dest1;
 			ip6e = (struct ip6_ext *)pktopt->ip6po_dest1;
 			optdatalen = (ip6e->ip6e_len + 1) << 3;
 		}
 		break;
 	case IPV6_DSTOPTS:
 		if (pktopt && pktopt->ip6po_dest2) {
 			optdata = (void *)pktopt->ip6po_dest2;
 			ip6e = (struct ip6_ext *)pktopt->ip6po_dest2;
 			optdatalen = (ip6e->ip6e_len + 1) << 3;
 		}
 		break;
 	case IPV6_NEXTHOP:
 		if (pktopt && pktopt->ip6po_nexthop) {
 			optdata = (void *)pktopt->ip6po_nexthop;
 			optdatalen = pktopt->ip6po_nexthop->sa_len;
 		}
 		break;
 	case IPV6_USE_MIN_MTU:
 		if (pktopt)
 			optdata = (void *)&pktopt->ip6po_minmtu;
 		else
 			optdata = (void *)&defminmtu;
 		optdatalen = sizeof(int);
 		break;
 	case IPV6_DONTFRAG:
 		if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG))
 			on = 1;
 		else
 			on = 0;
 		optdata = (void *)&on;
 		optdatalen = sizeof(on);
 		break;
 	case IPV6_PREFER_TEMPADDR:
 		if (pktopt)
 			optdata = (void *)&pktopt->ip6po_prefer_tempaddr;
 		else
 			optdata = (void *)&defpreftemp;
 		optdatalen = sizeof(int);
 		break;
 	default:		/* should not happen */
 #ifdef DIAGNOSTIC
 		panic("ip6_getpcbopt: unexpected option\n");
 #endif
 		return (ENOPROTOOPT);
 	}
 
 	error = sooptcopyout(sopt, optdata, optdatalen);
 
 	return (error);
 }
 
 void
 ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname)
 {
 	if (pktopt == NULL)
 		return;
 
 	if (optname == -1 || optname == IPV6_PKTINFO) {
 		if (pktopt->ip6po_pktinfo)
 			free(pktopt->ip6po_pktinfo, M_IP6OPT);
 		pktopt->ip6po_pktinfo = NULL;
 	}
 	if (optname == -1 || optname == IPV6_HOPLIMIT)
 		pktopt->ip6po_hlim = -1;
 	if (optname == -1 || optname == IPV6_TCLASS)
 		pktopt->ip6po_tclass = -1;
 	if (optname == -1 || optname == IPV6_NEXTHOP) {
 		if (pktopt->ip6po_nextroute.ro_rt) {
 			RTFREE(pktopt->ip6po_nextroute.ro_rt);
 			pktopt->ip6po_nextroute.ro_rt = NULL;
 		}
 		if (pktopt->ip6po_nexthop)
 			free(pktopt->ip6po_nexthop, M_IP6OPT);
 		pktopt->ip6po_nexthop = NULL;
 	}
 	if (optname == -1 || optname == IPV6_HOPOPTS) {
 		if (pktopt->ip6po_hbh)
 			free(pktopt->ip6po_hbh, M_IP6OPT);
 		pktopt->ip6po_hbh = NULL;
 	}
 	if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) {
 		if (pktopt->ip6po_dest1)
 			free(pktopt->ip6po_dest1, M_IP6OPT);
 		pktopt->ip6po_dest1 = NULL;
 	}
 	if (optname == -1 || optname == IPV6_RTHDR) {
 		if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr)
 			free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT);
 		pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL;
 		if (pktopt->ip6po_route.ro_rt) {
 			RTFREE(pktopt->ip6po_route.ro_rt);
 			pktopt->ip6po_route.ro_rt = NULL;
 		}
 	}
 	if (optname == -1 || optname == IPV6_DSTOPTS) {
 		if (pktopt->ip6po_dest2)
 			free(pktopt->ip6po_dest2, M_IP6OPT);
 		pktopt->ip6po_dest2 = NULL;
 	}
 }
 
 #define PKTOPT_EXTHDRCPY(type) \
 do {\
 	if (src->type) {\
 		int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\
 		dst->type = malloc(hlen, M_IP6OPT, canwait);\
 		if (dst->type == NULL && canwait == M_NOWAIT)\
 			goto bad;\
 		bcopy(src->type, dst->type, hlen);\
 	}\
 } while (/*CONSTCOND*/ 0)
 
 static int
 copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait)
 {
 	if (dst == NULL || src == NULL)  {
 		printf("ip6_clearpktopts: invalid argument\n");
 		return (EINVAL);
 	}
 
 	dst->ip6po_hlim = src->ip6po_hlim;
 	dst->ip6po_tclass = src->ip6po_tclass;
 	dst->ip6po_flags = src->ip6po_flags;
 	dst->ip6po_minmtu = src->ip6po_minmtu;
 	dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr;
 	if (src->ip6po_pktinfo) {
 		dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo),
 		    M_IP6OPT, canwait);
 		if (dst->ip6po_pktinfo == NULL)
 			goto bad;
 		*dst->ip6po_pktinfo = *src->ip6po_pktinfo;
 	}
 	if (src->ip6po_nexthop) {
 		dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len,
 		    M_IP6OPT, canwait);
 		if (dst->ip6po_nexthop == NULL)
 			goto bad;
 		bcopy(src->ip6po_nexthop, dst->ip6po_nexthop,
 		    src->ip6po_nexthop->sa_len);
 	}
 	PKTOPT_EXTHDRCPY(ip6po_hbh);
 	PKTOPT_EXTHDRCPY(ip6po_dest1);
 	PKTOPT_EXTHDRCPY(ip6po_dest2);
 	PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */
 	return (0);
 
   bad:
 	ip6_clearpktopts(dst, -1);
 	return (ENOBUFS);
 }
 #undef PKTOPT_EXTHDRCPY
 
 struct ip6_pktopts *
 ip6_copypktopts(struct ip6_pktopts *src, int canwait)
 {
 	int error;
 	struct ip6_pktopts *dst;
 
 	dst = malloc(sizeof(*dst), M_IP6OPT, canwait);
 	if (dst == NULL)
 		return (NULL);
 	ip6_initpktopts(dst);
 
 	if ((error = copypktopts(dst, src, canwait)) != 0) {
 		free(dst, M_IP6OPT);
 		return (NULL);
 	}
 
 	return (dst);
 }
 
 void
 ip6_freepcbopts(struct ip6_pktopts *pktopt)
 {
 	if (pktopt == NULL)
 		return;
 
 	ip6_clearpktopts(pktopt, -1);
 
 	free(pktopt, M_IP6OPT);
 }
 
 /*
  * Set IPv6 outgoing packet options based on advanced API.
  */
 int
 ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt,
     struct ip6_pktopts *stickyopt, struct ucred *cred, int uproto)
 {
 	struct cmsghdr *cm = 0;
 
 	if (control == NULL || opt == NULL)
 		return (EINVAL);
 
 	ip6_initpktopts(opt);
 	if (stickyopt) {
 		int error;
 
 		/*
 		 * If stickyopt is provided, make a local copy of the options
 		 * for this particular packet, then override them by ancillary
 		 * objects.
 		 * XXX: copypktopts() does not copy the cached route to a next
 		 * hop (if any).  This is not very good in terms of efficiency,
 		 * but we can allow this since this option should be rarely
 		 * used.
 		 */
 		if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0)
 			return (error);
 	}
 
 	/*
 	 * XXX: Currently, we assume all the optional information is stored
 	 * in a single mbuf.
 	 */
 	if (control->m_next)
 		return (EINVAL);
 
 	for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len),
 	    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
 		int error;
 
 		if (control->m_len < CMSG_LEN(0))
 			return (EINVAL);
 
 		cm = mtod(control, struct cmsghdr *);
 		if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len)
 			return (EINVAL);
 		if (cm->cmsg_level != IPPROTO_IPV6)
 			continue;
 
 		error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm),
 		    cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto);
 		if (error)
 			return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Set a particular packet option, as a sticky option or an ancillary data
  * item.  "len" can be 0 only when it's a sticky option.
  * We have 4 cases of combination of "sticky" and "cmsg":
  * "sticky=0, cmsg=0": impossible
  * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data
  * "sticky=1, cmsg=0": RFC3542 socket option
  * "sticky=1, cmsg=1": RFC2292 socket option
  */
 static int
 ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
     struct ucred *cred, int sticky, int cmsg, int uproto)
 {
 	int minmtupolicy, preftemp;
 	int error;
 
 	if (!sticky && !cmsg) {
 #ifdef DIAGNOSTIC
 		printf("ip6_setpktopt: impossible case\n");
 #endif
 		return (EINVAL);
 	}
 
 	/*
 	 * IPV6_2292xxx is for backward compatibility to RFC2292, and should
 	 * not be specified in the context of RFC3542.  Conversely,
 	 * RFC3542 types should not be specified in the context of RFC2292.
 	 */
 	if (!cmsg) {
 		switch (optname) {
 		case IPV6_2292PKTINFO:
 		case IPV6_2292HOPLIMIT:
 		case IPV6_2292NEXTHOP:
 		case IPV6_2292HOPOPTS:
 		case IPV6_2292DSTOPTS:
 		case IPV6_2292RTHDR:
 		case IPV6_2292PKTOPTIONS:
 			return (ENOPROTOOPT);
 		}
 	}
 	if (sticky && cmsg) {
 		switch (optname) {
 		case IPV6_PKTINFO:
 		case IPV6_HOPLIMIT:
 		case IPV6_NEXTHOP:
 		case IPV6_HOPOPTS:
 		case IPV6_DSTOPTS:
 		case IPV6_RTHDRDSTOPTS:
 		case IPV6_RTHDR:
 		case IPV6_USE_MIN_MTU:
 		case IPV6_DONTFRAG:
 		case IPV6_TCLASS:
 		case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */
 			return (ENOPROTOOPT);
 		}
 	}
 
 	switch (optname) {
 	case IPV6_2292PKTINFO:
 	case IPV6_PKTINFO:
 	{
 		struct ifnet *ifp = NULL;
 		struct in6_pktinfo *pktinfo;
 
 		if (len != sizeof(struct in6_pktinfo))
 			return (EINVAL);
 
 		pktinfo = (struct in6_pktinfo *)buf;
 
 		/*
 		 * An application can clear any sticky IPV6_PKTINFO option by
 		 * doing a "regular" setsockopt with ipi6_addr being
 		 * in6addr_any and ipi6_ifindex being zero.
 		 * [RFC 3542, Section 6]
 		 */
 		if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo &&
 		    pktinfo->ipi6_ifindex == 0 &&
 		    IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
 			ip6_clearpktopts(opt, optname);
 			break;
 		}
 
 		if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO &&
 		    sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
 			return (EINVAL);
 		}
 		if (IN6_IS_ADDR_MULTICAST(&pktinfo->ipi6_addr))
 			return (EINVAL);
 		/* validate the interface index if specified. */
 		if (pktinfo->ipi6_ifindex > V_if_index)
 			 return (ENXIO);
 		if (pktinfo->ipi6_ifindex) {
 			ifp = ifnet_byindex(pktinfo->ipi6_ifindex);
 			if (ifp == NULL)
 				return (ENXIO);
 		}
 		if (ifp != NULL && (
 		    ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED))
 			return (ENETDOWN);
 
 		if (ifp != NULL &&
 		    !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
 			struct in6_ifaddr *ia;
 
 			in6_setscope(&pktinfo->ipi6_addr, ifp, NULL);
 			ia = in6ifa_ifpwithaddr(ifp, &pktinfo->ipi6_addr);
 			if (ia == NULL)
 				return (EADDRNOTAVAIL);
 			ifa_free(&ia->ia_ifa);
 		}
 		/*
 		 * We store the address anyway, and let in6_selectsrc()
 		 * validate the specified address.  This is because ipi6_addr
 		 * may not have enough information about its scope zone, and
 		 * we may need additional information (such as outgoing
 		 * interface or the scope zone of a destination address) to
 		 * disambiguate the scope.
 		 * XXX: the delay of the validation may confuse the
 		 * application when it is used as a sticky option.
 		 */
 		if (opt->ip6po_pktinfo == NULL) {
 			opt->ip6po_pktinfo = malloc(sizeof(*pktinfo),
 			    M_IP6OPT, M_NOWAIT);
 			if (opt->ip6po_pktinfo == NULL)
 				return (ENOBUFS);
 		}
 		bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo));
 		break;
 	}
 
 	case IPV6_2292HOPLIMIT:
 	case IPV6_HOPLIMIT:
 	{
 		int *hlimp;
 
 		/*
 		 * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT
 		 * to simplify the ordering among hoplimit options.
 		 */
 		if (optname == IPV6_HOPLIMIT && sticky)
 			return (ENOPROTOOPT);
 
 		if (len != sizeof(int))
 			return (EINVAL);
 		hlimp = (int *)buf;
 		if (*hlimp < -1 || *hlimp > 255)
 			return (EINVAL);
 
 		opt->ip6po_hlim = *hlimp;
 		break;
 	}
 
 	case IPV6_TCLASS:
 	{
 		int tclass;
 
 		if (len != sizeof(int))
 			return (EINVAL);
 		tclass = *(int *)buf;
 		if (tclass < -1 || tclass > 255)
 			return (EINVAL);
 
 		opt->ip6po_tclass = tclass;
 		break;
 	}
 
 	case IPV6_2292NEXTHOP:
 	case IPV6_NEXTHOP:
 		if (cred != NULL) {
 			error = priv_check_cred(cred,
 			    PRIV_NETINET_SETHDROPTS, 0);
 			if (error)
 				return (error);
 		}
 
 		if (len == 0) {	/* just remove the option */
 			ip6_clearpktopts(opt, IPV6_NEXTHOP);
 			break;
 		}
 
 		/* check if cmsg_len is large enough for sa_len */
 		if (len < sizeof(struct sockaddr) || len < *buf)
 			return (EINVAL);
 
 		switch (((struct sockaddr *)buf)->sa_family) {
 		case AF_INET6:
 		{
 			struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf;
 			int error;
 
 			if (sa6->sin6_len != sizeof(struct sockaddr_in6))
 				return (EINVAL);
 
 			if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) ||
 			    IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) {
 				return (EINVAL);
 			}
 			if ((error = sa6_embedscope(sa6, V_ip6_use_defzone))
 			    != 0) {
 				return (error);
 			}
 			break;
 		}
 		case AF_LINK:	/* should eventually be supported */
 		default:
 			return (EAFNOSUPPORT);
 		}
 
 		/* turn off the previous option, then set the new option. */
 		ip6_clearpktopts(opt, IPV6_NEXTHOP);
 		opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT);
 		if (opt->ip6po_nexthop == NULL)
 			return (ENOBUFS);
 		bcopy(buf, opt->ip6po_nexthop, *buf);
 		break;
 
 	case IPV6_2292HOPOPTS:
 	case IPV6_HOPOPTS:
 	{
 		struct ip6_hbh *hbh;
 		int hbhlen;
 
 		/*
 		 * XXX: We don't allow a non-privileged user to set ANY HbH
 		 * options, since per-option restriction has too much
 		 * overhead.
 		 */
 		if (cred != NULL) {
 			error = priv_check_cred(cred,
 			    PRIV_NETINET_SETHDROPTS, 0);
 			if (error)
 				return (error);
 		}
 
 		if (len == 0) {
 			ip6_clearpktopts(opt, IPV6_HOPOPTS);
 			break;	/* just remove the option */
 		}
 
 		/* message length validation */
 		if (len < sizeof(struct ip6_hbh))
 			return (EINVAL);
 		hbh = (struct ip6_hbh *)buf;
 		hbhlen = (hbh->ip6h_len + 1) << 3;
 		if (len != hbhlen)
 			return (EINVAL);
 
 		/* turn off the previous option, then set the new option. */
 		ip6_clearpktopts(opt, IPV6_HOPOPTS);
 		opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT);
 		if (opt->ip6po_hbh == NULL)
 			return (ENOBUFS);
 		bcopy(hbh, opt->ip6po_hbh, hbhlen);
 
 		break;
 	}
 
 	case IPV6_2292DSTOPTS:
 	case IPV6_DSTOPTS:
 	case IPV6_RTHDRDSTOPTS:
 	{
 		struct ip6_dest *dest, **newdest = NULL;
 		int destlen;
 
 		if (cred != NULL) { /* XXX: see the comment for IPV6_HOPOPTS */
 			error = priv_check_cred(cred,
 			    PRIV_NETINET_SETHDROPTS, 0);
 			if (error)
 				return (error);
 		}
 
 		if (len == 0) {
 			ip6_clearpktopts(opt, optname);
 			break;	/* just remove the option */
 		}
 
 		/* message length validation */
 		if (len < sizeof(struct ip6_dest))
 			return (EINVAL);
 		dest = (struct ip6_dest *)buf;
 		destlen = (dest->ip6d_len + 1) << 3;
 		if (len != destlen)
 			return (EINVAL);
 
 		/*
 		 * Determine the position that the destination options header
 		 * should be inserted; before or after the routing header.
 		 */
 		switch (optname) {
 		case IPV6_2292DSTOPTS:
 			/*
 			 * The old advacned API is ambiguous on this point.
 			 * Our approach is to determine the position based
 			 * according to the existence of a routing header.
 			 * Note, however, that this depends on the order of the
 			 * extension headers in the ancillary data; the 1st
 			 * part of the destination options header must appear
 			 * before the routing header in the ancillary data,
 			 * too.
 			 * RFC3542 solved the ambiguity by introducing
 			 * separate ancillary data or option types.
 			 */
 			if (opt->ip6po_rthdr == NULL)
 				newdest = &opt->ip6po_dest1;
 			else
 				newdest = &opt->ip6po_dest2;
 			break;
 		case IPV6_RTHDRDSTOPTS:
 			newdest = &opt->ip6po_dest1;
 			break;
 		case IPV6_DSTOPTS:
 			newdest = &opt->ip6po_dest2;
 			break;
 		}
 
 		/* turn off the previous option, then set the new option. */
 		ip6_clearpktopts(opt, optname);
 		*newdest = malloc(destlen, M_IP6OPT, M_NOWAIT);
 		if (*newdest == NULL)
 			return (ENOBUFS);
 		bcopy(dest, *newdest, destlen);
 
 		break;
 	}
 
 	case IPV6_2292RTHDR:
 	case IPV6_RTHDR:
 	{
 		struct ip6_rthdr *rth;
 		int rthlen;
 
 		if (len == 0) {
 			ip6_clearpktopts(opt, IPV6_RTHDR);
 			break;	/* just remove the option */
 		}
 
 		/* message length validation */
 		if (len < sizeof(struct ip6_rthdr))
 			return (EINVAL);
 		rth = (struct ip6_rthdr *)buf;
 		rthlen = (rth->ip6r_len + 1) << 3;
 		if (len != rthlen)
 			return (EINVAL);
 
 		switch (rth->ip6r_type) {
 		case IPV6_RTHDR_TYPE_0:
 			if (rth->ip6r_len == 0)	/* must contain one addr */
 				return (EINVAL);
 			if (rth->ip6r_len % 2) /* length must be even */
 				return (EINVAL);
 			if (rth->ip6r_len / 2 != rth->ip6r_segleft)
 				return (EINVAL);
 			break;
 		default:
 			return (EINVAL);	/* not supported */
 		}
 
 		/* turn off the previous option */
 		ip6_clearpktopts(opt, IPV6_RTHDR);
 		opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT);
 		if (opt->ip6po_rthdr == NULL)
 			return (ENOBUFS);
 		bcopy(rth, opt->ip6po_rthdr, rthlen);
 
 		break;
 	}
 
 	case IPV6_USE_MIN_MTU:
 		if (len != sizeof(int))
 			return (EINVAL);
 		minmtupolicy = *(int *)buf;
 		if (minmtupolicy != IP6PO_MINMTU_MCASTONLY &&
 		    minmtupolicy != IP6PO_MINMTU_DISABLE &&
 		    minmtupolicy != IP6PO_MINMTU_ALL) {
 			return (EINVAL);
 		}
 		opt->ip6po_minmtu = minmtupolicy;
 		break;
 
 	case IPV6_DONTFRAG:
 		if (len != sizeof(int))
 			return (EINVAL);
 
 		if (uproto == IPPROTO_TCP || *(int *)buf == 0) {
 			/*
 			 * we ignore this option for TCP sockets.
 			 * (RFC3542 leaves this case unspecified.)
 			 */
 			opt->ip6po_flags &= ~IP6PO_DONTFRAG;
 		} else
 			opt->ip6po_flags |= IP6PO_DONTFRAG;
 		break;
 
 	case IPV6_PREFER_TEMPADDR:
 		if (len != sizeof(int))
 			return (EINVAL);
 		preftemp = *(int *)buf;
 		if (preftemp != IP6PO_TEMPADDR_SYSTEM &&
 		    preftemp != IP6PO_TEMPADDR_NOTPREFER &&
 		    preftemp != IP6PO_TEMPADDR_PREFER) {
 			return (EINVAL);
 		}
 		opt->ip6po_prefer_tempaddr = preftemp;
 		break;
 
 	default:
 		return (ENOPROTOOPT);
 	} /* end of switch */
 
 	return (0);
 }
 
 /*
  * Routine called from ip6_output() to loop back a copy of an IP6 multicast
  * packet to the input queue of a specified interface.  Note that this
  * calls the output routine of the loopback "driver", but with an interface
  * pointer that might NOT be &loif -- easier than replicating that code here.
  */
 void
 ip6_mloopback(struct ifnet *ifp, struct mbuf *m)
 {
 	struct mbuf *copym;
 	struct ip6_hdr *ip6;
 
 	copym = m_copy(m, 0, M_COPYALL);
 	if (copym == NULL)
 		return;
 
 	/*
 	 * Make sure to deep-copy IPv6 header portion in case the data
 	 * is in an mbuf cluster, so that we can safely override the IPv6
 	 * header portion later.
 	 */
 	if (!M_WRITABLE(copym) ||
 	    copym->m_len < sizeof(struct ip6_hdr)) {
 		copym = m_pullup(copym, sizeof(struct ip6_hdr));
 		if (copym == NULL)
 			return;
 	}
 	ip6 = mtod(copym, struct ip6_hdr *);
 	/*
 	 * clear embedded scope identifiers if necessary.
 	 * in6_clearscope will touch the addresses only when necessary.
 	 */
 	in6_clearscope(&ip6->ip6_src);
 	in6_clearscope(&ip6->ip6_dst);
 	if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 		copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 |
 		    CSUM_PSEUDO_HDR;
 		copym->m_pkthdr.csum_data = 0xffff;
 	}
 	if_simloop(ifp, copym, AF_INET6, 0);
 }
 
 /*
  * Chop IPv6 header off from the payload.
  */
 static int
 ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs)
 {
 	struct mbuf *mh;
 	struct ip6_hdr *ip6;
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (m->m_len > sizeof(*ip6)) {
 		mh = m_gethdr(M_NOWAIT, MT_DATA);
 		if (mh == NULL) {
 			m_freem(m);
 			return ENOBUFS;
 		}
 		m_move_pkthdr(mh, m);
 		M_ALIGN(mh, sizeof(*ip6));
 		m->m_len -= sizeof(*ip6);
 		m->m_data += sizeof(*ip6);
 		mh->m_next = m;
 		m = mh;
 		m->m_len = sizeof(*ip6);
 		bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6));
 	}
 	exthdrs->ip6e_ip6 = m;
 	return 0;
 }
 
 /*
  * Compute IPv6 extension header length.
  */
 int
 ip6_optlen(struct inpcb *in6p)
 {
 	int len;
 
 	if (!in6p->in6p_outputopts)
 		return 0;
 
 	len = 0;
 #define elen(x) \
     (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0)
 
 	len += elen(in6p->in6p_outputopts->ip6po_hbh);
 	if (in6p->in6p_outputopts->ip6po_rthdr)
 		/* dest1 is valid with rthdr only */
 		len += elen(in6p->in6p_outputopts->ip6po_dest1);
 	len += elen(in6p->in6p_outputopts->ip6po_rthdr);
 	len += elen(in6p->in6p_outputopts->ip6po_dest2);
 	return len;
 #undef elen
 }
Index: head/sys/netinet6/udp6_usrreq.c
===================================================================
--- head/sys/netinet6/udp6_usrreq.c	(revision 297224)
+++ head/sys/netinet6/udp6_usrreq.c	(revision 297225)
@@ -1,1279 +1,1279 @@
 /*-
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * Copyright (c) 2014 Kevin Lo
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: udp6_usrreq.c,v 1.27 2001/05/21 05:45:10 jinmei Exp $
  *	$KAME: udp6_output.c,v 1.31 2001/05/21 16:39:15 jinmei Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)udp_usrreq.c	8.6 (Berkeley) 5/23/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipfw.h"
 #include "opt_ipsec.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/rss_config.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp_var.h>
 #include <netinet/icmp6.h>
 #include <netinet/ip_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <netinet/udplite.h>
 
 #include <netinet6/ip6protosw.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/in6_rss.h>
 #include <netinet6/udp6_var.h>
 #include <netinet6/scope6_var.h>
 
 #ifdef IPSEC
 #include <netipsec/ipsec.h>
 #include <netipsec/ipsec6.h>
 #endif /* IPSEC */
 
 #include <security/mac/mac_framework.h>
 
 /*
  * UDP protocol implementation.
  * Per RFC 768, August, 1980.
  */
 
 extern struct protosw	inetsw[];
 static void		udp6_detach(struct socket *so);
 
 static int
 udp6_append(struct inpcb *inp, struct mbuf *n, int off,
     struct sockaddr_in6 *fromsa)
 {
 	struct socket *so;
 	struct mbuf *opts;
 	struct udpcb *up;
 
 	INP_LOCK_ASSERT(inp);
 
 	/*
 	 * Engage the tunneling protocol.
 	 */
 	up = intoudpcb(inp);
 	if (up->u_tun_func != NULL) {
 		in_pcbref(inp);
 		INP_RUNLOCK(inp);
 		(*up->u_tun_func)(n, off, inp, (struct sockaddr *)fromsa,
 		    up->u_tun_ctx);
 		INP_RLOCK(inp);
 		return (in_pcbrele_rlocked(inp));
 	}
 #ifdef IPSEC
 	/* Check AH/ESP integrity. */
 	if (ipsec6_in_reject(n, inp)) {
 		m_freem(n);
 		return (0);
 	}
 #endif /* IPSEC */
 #ifdef MAC
 	if (mac_inpcb_check_deliver(inp, n) != 0) {
 		m_freem(n);
 		return (0);
 	}
 #endif
 	opts = NULL;
 	if (inp->inp_flags & INP_CONTROLOPTS ||
 	    inp->inp_socket->so_options & SO_TIMESTAMP)
 		ip6_savecontrol(inp, n, &opts);
 	m_adj(n, off + sizeof(struct udphdr));
 
 	so = inp->inp_socket;
 	SOCKBUF_LOCK(&so->so_rcv);
 	if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)fromsa, n,
 	    opts) == 0) {
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		m_freem(n);
 		if (opts)
 			m_freem(opts);
 		UDPSTAT_INC(udps_fullsock);
 	} else
 		sorwakeup_locked(so);
 	return (0);
 }
 
 int
 udp6_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m = *mp;
 	struct ifnet *ifp;
 	struct ip6_hdr *ip6;
 	struct udphdr *uh;
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	struct udpcb *up;
 	int off = *offp;
 	int cscov_partial;
 	int plen, ulen;
 	struct sockaddr_in6 fromsa;
 	struct m_tag *fwd_tag;
 	uint16_t uh_sum;
 	uint8_t nxt;
 
 	ifp = m->m_pkthdr.rcvif;
 	ip6 = mtod(m, struct ip6_hdr *);
 
 #ifndef PULLDOWN_TEST
 	IP6_EXTHDR_CHECK(m, off, sizeof(struct udphdr), IPPROTO_DONE);
 	ip6 = mtod(m, struct ip6_hdr *);
 	uh = (struct udphdr *)((caddr_t)ip6 + off);
 #else
 	IP6_EXTHDR_GET(uh, struct udphdr *, m, off, sizeof(*uh));
 	if (!uh)
 		return (IPPROTO_DONE);
 #endif
 
 	UDPSTAT_INC(udps_ipackets);
 
 	/*
 	 * Destination port of 0 is illegal, based on RFC768.
 	 */
 	if (uh->uh_dport == 0)
 		goto badunlocked;
 
 	plen = ntohs(ip6->ip6_plen) - off + sizeof(*ip6);
 	ulen = ntohs((u_short)uh->uh_ulen);
 
 	nxt = proto;
 	cscov_partial = (nxt == IPPROTO_UDPLITE) ? 1 : 0;
 	if (nxt == IPPROTO_UDPLITE) {
 		/* Zero means checksum over the complete packet. */
 		if (ulen == 0)
 			ulen = plen;
 		if (ulen == plen)
 			cscov_partial = 0;
 		if ((ulen < sizeof(struct udphdr)) || (ulen > plen)) {
 			/* XXX: What is the right UDPLite MIB counter? */
 			goto badunlocked;
 		}
 		if (uh->uh_sum == 0) {
 			/* XXX: What is the right UDPLite MIB counter? */
 			goto badunlocked;
 		}
 	} else {
 		if ((ulen < sizeof(struct udphdr)) || (plen != ulen)) {
 			UDPSTAT_INC(udps_badlen);
 			goto badunlocked;
 		}
 		if (uh->uh_sum == 0) {
 			UDPSTAT_INC(udps_nosum);
 			goto badunlocked;
 		}
 	}
 
 	if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) &&
 	    !cscov_partial) {
 		if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 			uh_sum = m->m_pkthdr.csum_data;
 		else
 			uh_sum = in6_cksum_pseudo(ip6, ulen, nxt,
 			    m->m_pkthdr.csum_data);
 		uh_sum ^= 0xffff;
 	} else
 		uh_sum = in6_cksum_partial(m, nxt, off, plen, ulen);
 
 	if (uh_sum != 0) {
 		UDPSTAT_INC(udps_badsum);
 		goto badunlocked;
 	}
 
 	/*
 	 * Construct sockaddr format source address.
 	 */
 	init_sin6(&fromsa, m);
 	fromsa.sin6_port = uh->uh_sport;
 
 	pcbinfo = udp_get_inpcbinfo(nxt);
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		struct inpcb *last;
 		struct inpcbhead *pcblist;
 		struct ip6_moptions *imo;
 
 		INP_INFO_RLOCK(pcbinfo);
 		/*
 		 * In the event that laddr should be set to the link-local
 		 * address (this happens in RIPng), the multicast address
 		 * specified in the received packet will not match laddr.  To
 		 * handle this situation, matching is relaxed if the
 		 * receiving interface is the same as one specified in the
 		 * socket and if the destination multicast address matches
 		 * one of the multicast groups specified in the socket.
 		 */
 
 		/*
 		 * KAME note: traditionally we dropped udpiphdr from mbuf
 		 * here.  We need udphdr for IPsec processing so we do that
 		 * later.
 		 */
 		pcblist = udp_get_pcblist(nxt);
 		last = NULL;
 		LIST_FOREACH(inp, pcblist, inp_list) {
 			if ((inp->inp_vflag & INP_IPV6) == 0)
 				continue;
 			if (inp->inp_lport != uh->uh_dport)
 				continue;
 			if (inp->inp_fport != 0 &&
 			    inp->inp_fport != uh->uh_sport)
 				continue;
 			if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
 				if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr,
 							&ip6->ip6_dst))
 					continue;
 			}
 			if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
 				if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr,
 							&ip6->ip6_src) ||
 				    inp->inp_fport != uh->uh_sport)
 					continue;
 			}
 
 			/*
 			 * XXXRW: Because we weren't holding either the inpcb
 			 * or the hash lock when we checked for a match 
 			 * before, we should probably recheck now that the 
 			 * inpcb lock is (supposed to be) held.
 			 */
 
 			/*
 			 * Handle socket delivery policy for any-source
 			 * and source-specific multicast. [RFC3678]
 			 */
 			imo = inp->in6p_moptions;
 			if (imo && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 				struct sockaddr_in6	 mcaddr;
 				int			 blocked;
 
 				INP_RLOCK(inp);
 
 				bzero(&mcaddr, sizeof(struct sockaddr_in6));
 				mcaddr.sin6_len = sizeof(struct sockaddr_in6);
 				mcaddr.sin6_family = AF_INET6;
 				mcaddr.sin6_addr = ip6->ip6_dst;
 
 				blocked = im6o_mc_filter(imo, ifp,
 					(struct sockaddr *)&mcaddr,
 					(struct sockaddr *)&fromsa);
 				if (blocked != MCAST_PASS) {
 					if (blocked == MCAST_NOTGMEMBER)
 						IP6STAT_INC(ip6s_notmember);
 					if (blocked == MCAST_NOTSMEMBER ||
 					    blocked == MCAST_MUTED)
 						UDPSTAT_INC(udps_filtermcast);
 					INP_RUNLOCK(inp); /* XXX */
 					continue;
 				}
 
 				INP_RUNLOCK(inp);
 			}
 			if (last != NULL) {
 				struct mbuf *n;
 
 				if ((n = m_copy(m, 0, M_COPYALL)) != NULL) {
 					INP_RLOCK(last);
 					UDP_PROBE(receive, NULL, last, ip6,
 					    last, uh);
 					if (udp6_append(last, n, off, &fromsa))
 						goto inp_lost;
 					INP_RUNLOCK(last);
 				}
 			}
 			last = inp;
 			/*
 			 * Don't look for additional matches if this one does
 			 * not have either the SO_REUSEPORT or SO_REUSEADDR
 			 * socket options set.  This heuristic avoids
 			 * searching through all pcbs in the common case of a
 			 * non-shared port.  It assumes that an application
 			 * will never clear these options after setting them.
 			 */
 			if ((last->inp_socket->so_options &
 			     (SO_REUSEPORT|SO_REUSEADDR)) == 0)
 				break;
 		}
 
 		if (last == NULL) {
 			/*
 			 * No matching pcb found; discard datagram.  (No need
 			 * to send an ICMP Port Unreachable for a broadcast
 			 * or multicast datgram.)
 			 */
 			UDPSTAT_INC(udps_noport);
 			UDPSTAT_INC(udps_noportmcast);
 			goto badheadlocked;
 		}
 		INP_RLOCK(last);
 		INP_INFO_RUNLOCK(pcbinfo);
 		UDP_PROBE(receive, NULL, last, ip6, last, uh);
 		if (udp6_append(last, m, off, &fromsa) == 0) 
 			INP_RUNLOCK(last);
 	inp_lost:
 		return (IPPROTO_DONE);
 	}
 	/*
 	 * Locate pcb for datagram.
 	 */
 
 	/*
 	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
 	 */
 	if ((m->m_flags & M_IP6_NEXTHOP) &&
 	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
 		struct sockaddr_in6 *next_hop6;
 
 		next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1);
 
 		/*
 		 * Transparently forwarded. Pretend to be the destination.
 		 * Already got one like this?
 		 */
 		inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_src,
 		    uh->uh_sport, &ip6->ip6_dst, uh->uh_dport,
 		    INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m);
 		if (!inp) {
 			/*
 			 * It's new.  Try to find the ambushing socket.
 			 * Because we've rewritten the destination address,
 			 * any hardware-generated hash is ignored.
 			 */
 			inp = in6_pcblookup(pcbinfo, &ip6->ip6_src,
 			    uh->uh_sport, &next_hop6->sin6_addr,
 			    next_hop6->sin6_port ? htons(next_hop6->sin6_port) :
 			    uh->uh_dport, INPLOOKUP_WILDCARD |
 			    INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif);
 		}
 		/* Remove the tag from the packet. We don't need it anymore. */
 		m_tag_delete(m, fwd_tag);
 		m->m_flags &= ~M_IP6_NEXTHOP;
 	} else
 		inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_src,
 		    uh->uh_sport, &ip6->ip6_dst, uh->uh_dport,
 		    INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB,
 		    m->m_pkthdr.rcvif, m);
 	if (inp == NULL) {
 		if (udp_log_in_vain) {
 			char ip6bufs[INET6_ADDRSTRLEN];
 			char ip6bufd[INET6_ADDRSTRLEN];
 
 			log(LOG_INFO,
 			    "Connection attempt to UDP [%s]:%d from [%s]:%d\n",
 			    ip6_sprintf(ip6bufd, &ip6->ip6_dst),
 			    ntohs(uh->uh_dport),
 			    ip6_sprintf(ip6bufs, &ip6->ip6_src),
 			    ntohs(uh->uh_sport));
 		}
 		UDPSTAT_INC(udps_noport);
 		if (m->m_flags & M_MCAST) {
 			printf("UDP6: M_MCAST is set in a unicast packet.\n");
 			UDPSTAT_INC(udps_noportmcast);
 			goto badunlocked;
 		}
 		if (V_udp_blackhole)
 			goto badunlocked;
 		if (badport_bandlim(BANDLIM_ICMP6_UNREACH) < 0)
 			goto badunlocked;
 		icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0);
 		return (IPPROTO_DONE);
 	}
 	INP_RLOCK_ASSERT(inp);
 	up = intoudpcb(inp);
 	if (cscov_partial) {
 		if (up->u_rxcslen == 0 || up->u_rxcslen > ulen) {
 			INP_RUNLOCK(inp);
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 	}
 	UDP_PROBE(receive, NULL, inp, ip6, inp, uh);
 	if (udp6_append(inp, m, off, &fromsa) == 0)
 		INP_RUNLOCK(inp);
 	return (IPPROTO_DONE);
 
 badheadlocked:
 	INP_INFO_RUNLOCK(pcbinfo);
 badunlocked:
 	if (m)
 		m_freem(m);
 	return (IPPROTO_DONE);
 }
 
 static void
 udp6_common_ctlinput(int cmd, struct sockaddr *sa, void *d,
     struct inpcbinfo *pcbinfo)
 {
 	struct udphdr uh;
 	struct ip6_hdr *ip6;
 	struct mbuf *m;
 	int off = 0;
 	struct ip6ctlparam *ip6cp = NULL;
 	const struct sockaddr_in6 *sa6_src = NULL;
 	void *cmdarg;
 	struct inpcb *(*notify)(struct inpcb *, int) = udp_notify;
 	struct udp_portonly {
 		u_int16_t uh_sport;
 		u_int16_t uh_dport;
 	} *uhp;
 
 	if (sa->sa_family != AF_INET6 ||
 	    sa->sa_len != sizeof(struct sockaddr_in6))
 		return;
 
 	if ((unsigned)cmd >= PRC_NCMDS)
 		return;
 	if (PRC_IS_REDIRECT(cmd))
 		notify = in6_rtchange, d = NULL;
 	else if (cmd == PRC_HOSTDEAD)
 		d = NULL;
 	else if (inet6ctlerrmap[cmd] == 0)
 		return;
 
 	/* if the parameter is from icmp6, decode it. */
 	if (d != NULL) {
 		ip6cp = (struct ip6ctlparam *)d;
 		m = ip6cp->ip6c_m;
 		ip6 = ip6cp->ip6c_ip6;
 		off = ip6cp->ip6c_off;
 		cmdarg = ip6cp->ip6c_cmdarg;
 		sa6_src = ip6cp->ip6c_src;
 	} else {
 		m = NULL;
 		ip6 = NULL;
 		cmdarg = NULL;
 		sa6_src = &sa6_any;
 	}
 
 	if (ip6) {
 		/*
 		 * XXX: We assume that when IPV6 is non NULL,
 		 * M and OFF are valid.
 		 */
 
 		/* Check if we can safely examine src and dst ports. */
 		if (m->m_pkthdr.len < off + sizeof(*uhp))
 			return;
 
 		bzero(&uh, sizeof(uh));
 		m_copydata(m, off, sizeof(*uhp), (caddr_t)&uh);
 
 		(void)in6_pcbnotify(pcbinfo, sa, uh.uh_dport,
 		    (struct sockaddr *)ip6cp->ip6c_src, uh.uh_sport, cmd,
 		    cmdarg, notify);
 	} else
 		(void)in6_pcbnotify(pcbinfo, sa, 0,
 		    (const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify);
 }
 
 void
 udp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
 {
 
 	return (udp6_common_ctlinput(cmd, sa, d, &V_udbinfo));
 }
 
 void
 udplite6_ctlinput(int cmd, struct sockaddr *sa, void *d)
 {
 
 	return (udp6_common_ctlinput(cmd, sa, d, &V_ulitecbinfo));
 }
 
 static int
 udp6_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct xucred xuc;
 	struct sockaddr_in6 addrs[2];
 	struct inpcb *inp;
 	int error;
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error)
 		return (error);
 
 	if (req->newlen != sizeof(addrs))
 		return (EINVAL);
 	if (req->oldlen != sizeof(struct xucred))
 		return (EINVAL);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 	if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 ||
 	    (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) {
 		return (error);
 	}
 	inp = in6_pcblookup(&V_udbinfo, &addrs[1].sin6_addr,
 	    addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port,
 	    INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
 	if (inp != NULL) {
 		INP_RLOCK_ASSERT(inp);
 		if (inp->inp_socket == NULL)
 			error = ENOENT;
 		if (error == 0)
 			error = cr_canseesocket(req->td->td_ucred,
 			    inp->inp_socket);
 		if (error == 0)
 			cru2x(inp->inp_cred, &xuc);
 		INP_RUNLOCK(inp);
 	} else
 		error = ENOENT;
 	if (error == 0)
 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet6_udp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 0,
     0, udp6_getcred, "S,xucred", "Get the xucred of a UDP6 connection");
 
 static int
 udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr6,
     struct mbuf *control, struct thread *td)
 {
 	u_int32_t ulen = m->m_pkthdr.len;
 	u_int32_t plen = sizeof(struct udphdr) + ulen;
 	struct ip6_hdr *ip6;
 	struct udphdr *udp6;
 	struct in6_addr *laddr, *faddr, in6a;
 	struct sockaddr_in6 *sin6 = NULL;
 	int cscov_partial = 0;
 	int scope_ambiguous = 0;
 	u_short fport;
 	int error = 0;
 	uint8_t nxt;
 	uint16_t cscov = 0;
 	struct ip6_pktopts *optp, opt;
 	int af = AF_INET6, hlen = sizeof(struct ip6_hdr);
 	int flags;
 	struct sockaddr_in6 tmp;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	if (addr6) {
 		/* addr6 has been validated in udp6_send(). */
 		sin6 = (struct sockaddr_in6 *)addr6;
 
 		/* protect *sin6 from overwrites */
 		tmp = *sin6;
 		sin6 = &tmp;
 
 		/*
 		 * Application should provide a proper zone ID or the use of
 		 * default zone IDs should be enabled.  Unfortunately, some
 		 * applications do not behave as it should, so we need a
 		 * workaround.  Even if an appropriate ID is not determined,
 		 * we'll see if we can determine the outgoing interface.  If we
 		 * can, determine the zone ID based on the interface below.
 		 */
 		if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone)
 			scope_ambiguous = 1;
 		if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0)
 			return (error);
 	}
 
 	nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ?
 	    IPPROTO_UDP : IPPROTO_UDPLITE;
 	if (control) {
 		if ((error = ip6_setpktopts(control, &opt,
 		    inp->in6p_outputopts, td->td_ucred, nxt)) != 0)
 			goto release;
 		optp = &opt;
 	} else
 		optp = inp->in6p_outputopts;
 
 	if (sin6) {
 		faddr = &sin6->sin6_addr;
 
 		/*
 		 * Since we saw no essential reason for calling in_pcbconnect,
 		 * we get rid of such kind of logic, and call in6_selectsrc
 		 * and in6_pcbsetport in order to fill in the local address
 		 * and the local port.
 		 */
 		if (sin6->sin6_port == 0) {
 			error = EADDRNOTAVAIL;
 			goto release;
 		}
 
 		if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
 			/* how about ::ffff:0.0.0.0 case? */
 			error = EISCONN;
 			goto release;
 		}
 
 		fport = sin6->sin6_port; /* allow 0 port */
 
 		if (IN6_IS_ADDR_V4MAPPED(faddr)) {
 			if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) {
 				/*
 				 * I believe we should explicitly discard the
 				 * packet when mapped addresses are disabled,
 				 * rather than send the packet as an IPv6 one.
 				 * If we chose the latter approach, the packet
 				 * might be sent out on the wire based on the
 				 * default route, the situation which we'd
 				 * probably want to avoid.
 				 * (20010421 jinmei@kame.net)
 				 */
 				error = EINVAL;
 				goto release;
 			}
 			if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
 			    !IN6_IS_ADDR_V4MAPPED(&inp->in6p_laddr)) {
 				/*
 				 * when remote addr is an IPv4-mapped address,
 				 * local addr should not be an IPv6 address,
 				 * since you cannot determine how to map IPv6
 				 * source address to IPv4.
 				 */
 				error = EINVAL;
 				goto release;
 			}
 
 			af = AF_INET;
 		}
 
 		if (!IN6_IS_ADDR_V4MAPPED(faddr)) {
 			error = in6_selectsrc_socket(sin6, optp, inp,
 			    td->td_ucred, scope_ambiguous, &in6a, NULL);
 			if (error)
 				goto release;
 			laddr = &in6a;
 		} else
 			laddr = &inp->in6p_laddr;	/* XXX */
 		if (laddr == NULL) {
 			if (error == 0)
 				error = EADDRNOTAVAIL;
 			goto release;
 		}
 		if (inp->inp_lport == 0 &&
 		    (error = in6_pcbsetport(laddr, inp, td->td_ucred)) != 0) {
 			/* Undo an address bind that may have occurred. */
 			inp->in6p_laddr = in6addr_any;
 			goto release;
 		}
 	} else {
 		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
 			error = ENOTCONN;
 			goto release;
 		}
 		if (IN6_IS_ADDR_V4MAPPED(&inp->in6p_faddr)) {
 			if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) {
 				/*
 				 * XXX: this case would happen when the
 				 * application sets the V6ONLY flag after
 				 * connecting the foreign address.
 				 * Such applications should be fixed,
 				 * so we bark here.
 				 */
 				log(LOG_INFO, "udp6_output: IPV6_V6ONLY "
 				    "option was set for a connected socket\n");
 				error = EINVAL;
 				goto release;
 			} else
 				af = AF_INET;
 		}
 		laddr = &inp->in6p_laddr;
 		faddr = &inp->in6p_faddr;
 		fport = inp->inp_fport;
 	}
 
 	if (af == AF_INET)
 		hlen = sizeof(struct ip);
 
 	/*
 	 * Calculate data length and get a mbuf
 	 * for UDP and IP6 headers.
 	 */
 	M_PREPEND(m, hlen + sizeof(struct udphdr), M_NOWAIT);
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto release;
 	}
 
 	/*
 	 * Stuff checksum and output datagram.
 	 */
 	udp6 = (struct udphdr *)(mtod(m, caddr_t) + hlen);
 	udp6->uh_sport = inp->inp_lport; /* lport is always set in the PCB */
 	udp6->uh_dport = fport;
 	if (nxt == IPPROTO_UDPLITE) {
 		struct udpcb *up;
 
 		up = intoudpcb(inp);
 		cscov = up->u_txcslen;
 		if (cscov >= plen)
 			cscov = 0;
 		udp6->uh_ulen = htons(cscov);
 		/*
 		 * For UDP-Lite, checksum coverage length of zero means
 		 * the entire UDPLite packet is covered by the checksum.
 		 */
 		cscov_partial = (cscov == 0) ? 0 : 1;
 	} else if (plen <= 0xffff)
 		udp6->uh_ulen = htons((u_short)plen);
 	else
 		udp6->uh_ulen = 0;
 	udp6->uh_sum = 0;
 
 	switch (af) {
 	case AF_INET6:
 		ip6 = mtod(m, struct ip6_hdr *);
 		ip6->ip6_flow	= inp->inp_flow & IPV6_FLOWINFO_MASK;
 		ip6->ip6_vfc	&= ~IPV6_VERSION_MASK;
 		ip6->ip6_vfc	|= IPV6_VERSION;
 		ip6->ip6_plen	= htons((u_short)plen);
 		ip6->ip6_nxt	= nxt;
 		ip6->ip6_hlim	= in6_selecthlim(inp, NULL);
 		ip6->ip6_src	= *laddr;
 		ip6->ip6_dst	= *faddr;
 
 		if (cscov_partial) {
 			if ((udp6->uh_sum = in6_cksum_partial(m, nxt,
 			    sizeof(struct ip6_hdr), plen, cscov)) == 0)
 				udp6->uh_sum = 0xffff;
 		} else {
 			udp6->uh_sum = in6_cksum_pseudo(ip6, plen, nxt, 0);
 			m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 		}
 
 #ifdef	RSS
 		{
 			uint32_t hash_val, hash_type;
 			uint8_t pr;
 
 			pr = inp->inp_socket->so_proto->pr_protocol;
 			/*
 			 * Calculate an appropriate RSS hash for UDP and
 			 * UDP Lite.
 			 *
 			 * The called function will take care of figuring out
 			 * whether a 2-tuple or 4-tuple hash is required based
 			 * on the currently configured scheme.
 			 *
 			 * Later later on connected socket values should be
 			 * cached in the inpcb and reused, rather than constantly
 			 * re-calculating it.
 			 *
 			 * UDP Lite is a different protocol number and will
 			 * likely end up being hashed as a 2-tuple until
 			 * RSS / NICs grow UDP Lite protocol awareness.
 			 */
 			if (rss_proto_software_hash_v6(faddr, laddr, fport,
 			    inp->inp_lport, pr, &hash_val, &hash_type) == 0) {
 				m->m_pkthdr.flowid = hash_val;
 				M_HASHTYPE_SET(m, hash_type);
 			}
 		}
 #endif
 		flags = 0;
 #ifdef	RSS
 		/*
 		 * Don't override with the inp cached flowid.
 		 *
 		 * Until the whole UDP path is vetted, it may actually
 		 * be incorrect.
 		 */
 		flags |= IP_NODEFAULTFLOWID;
 #endif
 
 		UDP_PROBE(send, NULL, inp, ip6, inp, udp6);
 		UDPSTAT_INC(udps_opackets);
-		error = ip6_output(m, optp, NULL, flags, inp->in6p_moptions,
-		    NULL, inp);
+		error = ip6_output(m, optp, &inp->inp_route6, flags,
+		    inp->in6p_moptions, NULL, inp);
 		break;
 	case AF_INET:
 		error = EAFNOSUPPORT;
 		goto release;
 	}
 	goto releaseopt;
 
 release:
 	m_freem(m);
 
 releaseopt:
 	if (control) {
 		ip6_clearpktopts(&opt, -1);
 		m_freem(control);
 	}
 	return (error);
 }
 
 static void
 udp6_abort(struct socket *so)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp6_abort: inp == NULL"));
 
 	INP_WLOCK(inp);
 #ifdef INET
 	if (inp->inp_vflag & INP_IPV4) {
 		struct pr_usrreqs *pru;
 		uint8_t nxt;
 
 		nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ?
 		    IPPROTO_UDP : IPPROTO_UDPLITE;
 		INP_WUNLOCK(inp);
 		pru = inetsw[ip_protox[nxt]].pr_usrreqs;
 		(*pru->pru_abort)(so);
 		return;
 	}
 #endif
 
 	if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
 		INP_HASH_WLOCK(pcbinfo);
 		in6_pcbdisconnect(inp);
 		inp->in6p_laddr = in6addr_any;
 		INP_HASH_WUNLOCK(pcbinfo);
 		soisdisconnected(so);
 	}
 	INP_WUNLOCK(inp);
 }
 
 static int
 udp6_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	int error;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp == NULL, ("udp6_attach: inp != NULL"));
 
 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
 		error = soreserve(so, udp_sendspace, udp_recvspace);
 		if (error)
 			return (error);
 	}
 	INP_INFO_WLOCK(pcbinfo);
 	error = in_pcballoc(so, pcbinfo);
 	if (error) {
 		INP_INFO_WUNLOCK(pcbinfo);
 		return (error);
 	}
 	inp = (struct inpcb *)so->so_pcb;
 	inp->inp_vflag |= INP_IPV6;
 	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
 		inp->inp_vflag |= INP_IPV4;
 	inp->in6p_hops = -1;	/* use kernel default */
 	inp->in6p_cksum = -1;	/* just to be sure */
 	/*
 	 * XXX: ugly!!
 	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
 	 * because the socket may be bound to an IPv6 wildcard address,
 	 * which may match an IPv4-mapped IPv6 address.
 	 */
 	inp->inp_ip_ttl = V_ip_defttl;
 
 	error = udp_newudpcb(inp);
 	if (error) {
 		in_pcbdetach(inp);
 		in_pcbfree(inp);
 		INP_INFO_WUNLOCK(pcbinfo);
 		return (error);
 	}
 	INP_WUNLOCK(inp);
 	INP_INFO_WUNLOCK(pcbinfo);
 	return (0);
 }
 
 static int
 udp6_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	int error;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp6_bind: inp == NULL"));
 
 	INP_WLOCK(inp);
 	INP_HASH_WLOCK(pcbinfo);
 	inp->inp_vflag &= ~INP_IPV4;
 	inp->inp_vflag |= INP_IPV6;
 	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
 		struct sockaddr_in6 *sin6_p;
 
 		sin6_p = (struct sockaddr_in6 *)nam;
 
 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6_p->sin6_addr))
 			inp->inp_vflag |= INP_IPV4;
 #ifdef INET
 		else if (IN6_IS_ADDR_V4MAPPED(&sin6_p->sin6_addr)) {
 			struct sockaddr_in sin;
 
 			in6_sin6_2_sin(&sin, sin6_p);
 			inp->inp_vflag |= INP_IPV4;
 			inp->inp_vflag &= ~INP_IPV6;
 			error = in_pcbbind(inp, (struct sockaddr *)&sin,
 			    td->td_ucred);
 			goto out;
 		}
 #endif
 	}
 
 	error = in6_pcbbind(inp, nam, td->td_ucred);
 #ifdef INET
 out:
 #endif
 	INP_HASH_WUNLOCK(pcbinfo);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 static void
 udp6_close(struct socket *so)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp6_close: inp == NULL"));
 
 	INP_WLOCK(inp);
 #ifdef INET
 	if (inp->inp_vflag & INP_IPV4) {
 		struct pr_usrreqs *pru;
 		uint8_t nxt;
 
 		nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ?
 		    IPPROTO_UDP : IPPROTO_UDPLITE;
 		INP_WUNLOCK(inp);
 		pru = inetsw[ip_protox[nxt]].pr_usrreqs;
 		(*pru->pru_disconnect)(so);
 		return;
 	}
 #endif
 	if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
 		INP_HASH_WLOCK(pcbinfo);
 		in6_pcbdisconnect(inp);
 		inp->in6p_laddr = in6addr_any;
 		INP_HASH_WUNLOCK(pcbinfo);
 		soisdisconnected(so);
 	}
 	INP_WUNLOCK(inp);
 }
 
 static int
 udp6_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	struct sockaddr_in6 *sin6;
 	int error;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	sin6 = (struct sockaddr_in6 *)nam;
 	KASSERT(inp != NULL, ("udp6_connect: inp == NULL"));
 
 	/*
 	 * XXXRW: Need to clarify locking of v4/v6 flags.
 	 */
 	INP_WLOCK(inp);
 #ifdef INET
 	if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 		struct sockaddr_in sin;
 
 		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
 			error = EINVAL;
 			goto out;
 		}
 		if (inp->inp_faddr.s_addr != INADDR_ANY) {
 			error = EISCONN;
 			goto out;
 		}
 		in6_sin6_2_sin(&sin, sin6);
 		inp->inp_vflag |= INP_IPV4;
 		inp->inp_vflag &= ~INP_IPV6;
 		error = prison_remote_ip4(td->td_ucred, &sin.sin_addr);
 		if (error != 0)
 			goto out;
 		INP_HASH_WLOCK(pcbinfo);
 		error = in_pcbconnect(inp, (struct sockaddr *)&sin,
 		    td->td_ucred);
 		INP_HASH_WUNLOCK(pcbinfo);
 		if (error == 0)
 			soisconnected(so);
 		goto out;
 	}
 #endif
 	if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
 		error = EISCONN;
 		goto out;
 	}
 	inp->inp_vflag &= ~INP_IPV4;
 	inp->inp_vflag |= INP_IPV6;
 	error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr);
 	if (error != 0)
 		goto out;
 	INP_HASH_WLOCK(pcbinfo);
 	error = in6_pcbconnect(inp, nam, td->td_ucred);
 	INP_HASH_WUNLOCK(pcbinfo);
 	if (error == 0)
 		soisconnected(so);
 out:
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 static void
 udp6_detach(struct socket *so)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	struct udpcb *up;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp6_detach: inp == NULL"));
 
 	INP_INFO_WLOCK(pcbinfo);
 	INP_WLOCK(inp);
 	up = intoudpcb(inp);
 	KASSERT(up != NULL, ("%s: up == NULL", __func__));
 	in_pcbdetach(inp);
 	in_pcbfree(inp);
 	INP_INFO_WUNLOCK(pcbinfo);
 	udp_discardcb(up);
 }
 
 static int
 udp6_disconnect(struct socket *so)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	int error;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp6_disconnect: inp == NULL"));
 
 	INP_WLOCK(inp);
 #ifdef INET
 	if (inp->inp_vflag & INP_IPV4) {
 		struct pr_usrreqs *pru;
 		uint8_t nxt;
 
 		nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ?
 		    IPPROTO_UDP : IPPROTO_UDPLITE;
 		INP_WUNLOCK(inp);
 		pru = inetsw[ip_protox[nxt]].pr_usrreqs;
 		(void)(*pru->pru_disconnect)(so);
 		return (0);
 	}
 #endif
 
 	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
 		error = ENOTCONN;
 		goto out;
 	}
 
 	INP_HASH_WLOCK(pcbinfo);
 	in6_pcbdisconnect(inp);
 	inp->in6p_laddr = in6addr_any;
 	INP_HASH_WUNLOCK(pcbinfo);
 	SOCK_LOCK(so);
 	so->so_state &= ~SS_ISCONNECTED;		/* XXX */
 	SOCK_UNLOCK(so);
 out:
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 static int
 udp6_send(struct socket *so, int flags, struct mbuf *m,
     struct sockaddr *addr, struct mbuf *control, struct thread *td)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	int error = 0;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp6_send: inp == NULL"));
 
 	INP_WLOCK(inp);
 	if (addr) {
 		if (addr->sa_len != sizeof(struct sockaddr_in6)) {
 			error = EINVAL;
 			goto bad;
 		}
 		if (addr->sa_family != AF_INET6) {
 			error = EAFNOSUPPORT;
 			goto bad;
 		}
 	}
 
 #ifdef INET
 	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
 		int hasv4addr;
 		struct sockaddr_in6 *sin6 = 0;
 
 		if (addr == 0)
 			hasv4addr = (inp->inp_vflag & INP_IPV4);
 		else {
 			sin6 = (struct sockaddr_in6 *)addr;
 			hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)
 			    ? 1 : 0;
 		}
 		if (hasv4addr) {
 			struct pr_usrreqs *pru;
 			uint8_t nxt;
 
 			nxt = (inp->inp_socket->so_proto->pr_protocol ==
 			    IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE;
 			/*
 			 * XXXRW: We release UDP-layer locks before calling
 			 * udp_send() in order to avoid recursion.  However,
 			 * this does mean there is a short window where inp's
 			 * fields are unstable.  Could this lead to a
 			 * potential race in which the factors causing us to
 			 * select the UDPv4 output routine are invalidated?
 			 */
 			INP_WUNLOCK(inp);
 			if (sin6)
 				in6_sin6_2_sin_in_sock(addr);
 			pru = inetsw[ip_protox[nxt]].pr_usrreqs;
 			/* addr will just be freed in sendit(). */
 			return ((*pru->pru_send)(so, flags, m, addr, control,
 			    td));
 		}
 	}
 #endif
 #ifdef MAC
 	mac_inpcb_create_mbuf(inp, m);
 #endif
 	INP_HASH_WLOCK(pcbinfo);
 	error = udp6_output(inp, m, addr, control, td);
 	INP_HASH_WUNLOCK(pcbinfo);
 	INP_WUNLOCK(inp);
 	return (error);
 
 bad:
 	INP_WUNLOCK(inp);
 	m_freem(m);
 	return (error);
 }
 
 struct pr_usrreqs udp6_usrreqs = {
 	.pru_abort =		udp6_abort,
 	.pru_attach =		udp6_attach,
 	.pru_bind =		udp6_bind,
 	.pru_connect =		udp6_connect,
 	.pru_control =		in6_control,
 	.pru_detach =		udp6_detach,
 	.pru_disconnect =	udp6_disconnect,
 	.pru_peeraddr =		in6_mapped_peeraddr,
 	.pru_send =		udp6_send,
 	.pru_shutdown =		udp_shutdown,
 	.pru_sockaddr =		in6_mapped_sockaddr,
 	.pru_soreceive =	soreceive_dgram,
 	.pru_sosend =		sosend_dgram,
 	.pru_sosetlabel =	in_pcbsosetlabel,
 	.pru_close =		udp6_close
 };