Index: sys/net/if_ethersubr.c
===================================================================
--- sys/net/if_ethersubr.c
+++ sys/net/if_ethersubr.c
@@ -135,6 +135,54 @@
 	dst->m_pkthdr.csum_data = 0xffff;
 }
 
+/*
+ * Refresh the link-layer entry cached in a route that the caller marked
+ * RT_CACHING_CONTEXT.  If 'lle' is not already the cached entry, 'dst' is
+ * looked up in the interface's link-layer table for its address family and
+ * a referenced entry, when found, replaces the one held in ro->ro_lle.
+ *
+ * NOTE(review): LLTABLE6() is assumed to be visible here via
+ * <netinet6/in6_var.h>; confirm this file includes it under INET6.
+ */
+static __inline void
+update_cached_lle(struct route *ro, struct llentry *lle, struct ifnet *ifp,
+    const struct sockaddr *dst)
+{
+	struct lltable *llt;
+
+	if (lle == ro->ro_lle ||
+	    (ro->ro_flags & RT_CACHING_CONTEXT) == 0)
+		return;
+
+	/* Use the link-layer table that matches dst's address family. */
+	switch (dst->sa_family) {
+#ifdef INET
+	case AF_INET:
+		llt = LLTABLE(ifp);
+		break;
+#endif
+#ifdef INET6
+	case AF_INET6:
+		llt = LLTABLE6(ifp);
+		break;
+#endif
+	default:
+		return;
+	}
+
+	IF_AFDATA_RLOCK(ifp);
+	lle = lla_lookup(llt, LLE_EXCLUSIVE, dst);
+	IF_AFDATA_RUNLOCK(ifp);
+	if (lle != NULL) {
+		/* Take the cache's reference before dropping the entry lock. */
+		LLE_ADDREF(lle);
+		LLE_WUNLOCK(lle);
+		if (ro->ro_lle != NULL)
+			LLE_FREE(ro->ro_lle);
+		ro->ro_lle = lle;
+	}
+}
+
 /*
  * Ethernet output routine.
  * Encapsulate a packet of type family for the local net.
@@ -191,6 +239,8 @@
 		if (error)
 			return (error == EWOULDBLOCK ? 0 : error);
 		type = htons(ETHERTYPE_IP);
+		if (ro != NULL)
+			update_cached_lle(ro, lle, ifp, dst);
 		break;
 	case AF_ARP:
 	{
@@ -230,6 +280,8 @@
 		if (error)
 			return (error == EWOULDBLOCK ? 0 : error);
 		type = htons(ETHERTYPE_IPV6);
+		if (ro != NULL)
+			update_cached_lle(ro, lle, ifp, dst);
 		break;
 #endif
 	case pseudo_AF_HDRCMPLT:
Index: sys/net/route.h
===================================================================
--- sys/net/route.h
+++ sys/net/route.h
@@ -333,7 +333,7 @@
 
 #define	RO_RTFREE(_ro) do {					\
 	if ((_ro)->ro_rt) {					\
-		if ((_ro)->ro_flags & RT_NORTREF) {		\
+		if ((_ro)->ro_flags & RT_NORTREF ) {		\
 			(_ro)->ro_flags &= ~RT_NORTREF;		\
 			(_ro)->ro_rt = NULL;			\
 		} else {					\
Index: sys/net/route.c
===================================================================
--- sys/net/route.c
+++ sys/net/route.c
@@ -97,6 +97,11 @@
 u_int rt_numfibs = RT_NUMFIBS;
 SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RDTUN, &rt_numfibs, 0, "");
 
+u_int inpcb_rt_cache_enable = 0;
+SYSCTL_UINT(_net, OID_AUTO, conn_rt_cache, CTLFLAG_RW|CTLFLAG_TUN, &inpcb_rt_cache_enable, 0, "");
+TUNABLE_INT("net.conn_rt_cache", &inpcb_rt_cache_enable);
+
+
 /*
  * By default add routes to all fibs for new interfaces.
* Once this is set to 0 then only allocate routes on interface @@ -1019,6 +1024,7 @@ * but when callers invoke us blindly it may not (sigh). */ rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh); + atomic_add_int(&rnh->rnh_gen, 1); if (rn == NULL) { error = ESRCH; goto bad; @@ -1224,6 +1230,7 @@ * to the caller */ rn = rnh->rnh_deladdr(dst, netmask, rnh); + atomic_add_int(&rnh->rnh_gen, 1); KASSERT(rt == RNTORT(rn), ("radix node disappeared")); goto gwdelete; } @@ -1355,6 +1362,7 @@ * Complain if it is not there and do no more processing. */ rn = rnh->rnh_deladdr(dst, netmask, rnh); + atomic_add_int(&rnh->rnh_gen, 1); if (rn == NULL) senderr(ESRCH); if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) @@ -1515,6 +1523,7 @@ /* XXX mtu manipulation will be done in rnh_addaddr -- itojun */ rn = rnh->rnh_addaddr(ndst, netmask, rnh, rt->rt_nodes); + atomic_add_int(&rnh->rnh_gen, 1); /* * If it still failed to go into the tree, * then un-make it (this should be a function) Index: sys/netinet/in_pcb.h =================================================================== --- sys/netinet/in_pcb.h +++ sys/netinet/in_pcb.h @@ -155,6 +155,7 @@ * from the global list. * * Key: + * (a) - Atomically incremented * (c) - Constant after initialization * (g) - Protected by the pcbgroup lock * (i) - Protected by the inpcb lock @@ -201,6 +202,8 @@ u_char inp_ip_minttl; /* (i) minimum TTL or drop */ uint32_t inp_flowid; /* (x) flow id / queue id */ u_int inp_refcount; /* (i) refcount */ + struct in_ifaddr *inp_ifaddr; /* (i) reference to the local ifaddr */ + u_int inp_rt_gen; /* (a) generation count of routing entry */ void *inp_pspare[5]; /* (x) route caching / general use */ uint32_t inp_flowtype; /* (x) M_HASHTYPE value */ uint32_t inp_rss_listen_bucket; /* (x) overridden RSS listen bucket */ @@ -599,8 +602,6 @@ /* * Flags for inp_flags2. 
*/ -#define INP_LLE_VALID 0x00000001 /* cached lle is valid */ -#define INP_RT_VALID 0x00000002 /* cached rtentry is valid */ #define INP_PCBGROUPWILD 0x00000004 /* in pcbgroup wildcard list */ #define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */ #define INP_FREED 0x00000010 /* inp itself is not valid */ @@ -722,6 +723,8 @@ int in_pcbrele(struct inpcb *); int in_pcbrele_rlocked(struct inpcb *); int in_pcbrele_wlocked(struct inpcb *); +void in_pcbrtalloc(struct inpcb *inp); +int in_rt_valid(struct inpcb *inp); void in_pcbsetsolabel(struct socket *so); int in_getpeeraddr(struct socket *so, struct sockaddr **nam); int in_getsockaddr(struct socket *so, struct sockaddr **nam); Index: sys/netinet/in_pcb.c =================================================================== --- sys/netinet/in_pcb.c +++ sys/netinet/in_pcb.c @@ -44,6 +44,7 @@ #include "opt_inet6.h" #include "opt_pcbgroup.h" #include "opt_rss.h" +#include "opt_mpath.h" #include #include @@ -71,6 +72,7 @@ #include #include +#include #include #include #include @@ -133,6 +135,8 @@ #define V_ipport_tcplastcount VNET(ipport_tcplastcount) +extern u_int inpcb_rt_cache_enable; + static void in_pcbremlists(struct inpcb *inp); #ifdef INET static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, @@ -504,6 +508,180 @@ so_options |= SO_REUSEADDR; return (so_options); } + +/* + * in_rt_valid() both checks for, and attempts to ensure, that a cached route + * is present on a socket. It will call in_pcbrtalloc() if conditions are + * right (i.e. routing is enabled on the socket) and required (no route cached + * already or the cached rout is no longer valid). A route can only be + * installed if the caller passes the inp with a write lock, but the route may + * be used if a read lock is held. 
+ */
+
+int
+in_rt_valid(struct inpcb *inp)
+{
+	struct radix_node_head *rnh;
+
+	INP_WLOCK_ASSERT(inp);
+
+	if (inpcb_rt_cache_enable == 0 || inp->inp_socket == NULL)
+		return (0);
+	if (inp->inp_socket->so_options & SO_DONTROUTE)
+		return (0);
+	/* Pick the table by INP_IPV6, the same flag tcp_output() tests. */
+	if (inp->inp_vflag & INP_IPV6)
+		rnh = rt_tables_get_rnh(0, AF_INET6);
+	else
+		rnh = rt_tables_get_rnh(inp->inp_inc.inc_fibnum, AF_INET);
+	if (rnh == NULL)
+		return (0);
+	if (inp->inp_rt != NULL &&
+	    (inp->inp_rt->rt_flags & RTF_UP) &&
+	    inp->inp_rt_gen == rnh->rnh_gen)
+		return (1);
+	/*
+	 * This will handle selectively replacing one field or the other or
+	 * merely updating the inpcb's routing generation count.
+	 */
+	in_pcbrtalloc(inp);
+	return (inp->inp_rt != NULL && inp->inp_rt->rt_ifp != NULL);
+}
+
+/*
+ * in_pcbrtalloc() installs or refreshes the route cached on an inpcb.  The
+ * caller must hold the inpcb write lock.  The lookup is repeated when the
+ * routing table generation changes while it is in progress.
+ */
+void
+in_pcbrtalloc(struct inpcb *inp)
+{
+	struct route_in6 iproute;
+	struct radix_node_head *rnh = NULL;
+	struct rtentry *rt;
+	u_int gen;
+#ifdef INET6
+	struct route_in6 *sro6 = NULL;
+	struct sockaddr_in6 *sin6 = NULL;
+#endif
+#ifdef INET
+	struct sockaddr_in *sin = NULL;
+	struct route *sro = NULL;
+#endif
+
+	INP_WLOCK_ASSERT(inp);
+
+	if (inpcb_rt_cache_enable == 0 || inp->inp_socket == NULL)
+		return;
+	if (inp->inp_socket->so_options & SO_DONTROUTE)
+		return;
+
+	if (inp->inp_vflag & INP_IPV6) {
+#ifdef INET6
+		sro6 = &iproute;
+		bzero(sro6, sizeof(*sro6));
+		rnh = rt_tables_get_rnh(0, AF_INET6);
+		sin6 = (struct sockaddr_in6 *)&sro6->ro_dst;
+		sin6->sin6_family = AF_INET6;
+		sin6->sin6_len = sizeof(struct sockaddr_in6);
+		sin6->sin6_addr = inp->in6p_faddr;
+#endif
+	} else {
+#ifdef INET
+		sro = (struct route *)&iproute;
+		bzero(sro, sizeof(*sro));
+		rnh = rt_tables_get_rnh(inp->inp_inc.inc_fibnum, AF_INET);
+		sin = (struct sockaddr_in *)&sro->ro_dst;
+		sin->sin_family = AF_INET;
+		sin->sin_len = sizeof(struct sockaddr_in);
+		sin->sin_addr.s_addr = inp->inp_faddr.s_addr;
+#endif
+	}
+	/* No table: the family is not compiled in or not set up. */
+	if (rnh == NULL)
+		return;
+
+	if (inp->inp_rt != NULL &&
+	    inp->inp_rt_gen == rnh->rnh_gen) {
+		KASSERT(inp->inp_rt->rt_flags & RTF_UP,
+		    ("gen count unchanged but route invalid"));
+		return;
+	}
+resolve:
+	/* Each pass starts clean; a retry must not see a released route. */
+	gen = rnh->rnh_gen;
+	rt = NULL;
+
+	if (inp->inp_vflag & INP_IPV6) {
+#ifdef INET6
+		sro6->ro_rt = NULL;
+#ifdef RADIX_MPATH
+		rtalloc_mpath((struct route *)sro6,
+		    ntohl(sin6->sin6_addr.s6_addr32[3]));
+#else
+		sro6->ro_rt = rtalloc1(&((struct route *)sro6)
+		    ->ro_dst, 0, 0UL);
+		if (sro6->ro_rt)
+			RT_UNLOCK(sro6->ro_rt);
+#endif
+		rt = sro6->ro_rt;
+#endif
+	} else {
+#ifdef INET
+		sro->ro_rt = NULL;
+#ifdef RADIX_MPATH
+		rtalloc_mpath_fib(sro, ntohl(inp->inp_faddr.s_addr),
+		    inp->inp_inc.inc_fibnum);
+#else
+		rtalloc_ign_fib(sro, 0, inp->inp_inc.inc_fibnum);
+#endif
+		rt = sro->ro_rt;
+#endif
+	}
+
+	if (inp->inp_rt != NULL) {
+		if (rt == inp->inp_rt) {
+			/* Unchanged: drop the reference the lookup added. */
+			RTFREE(rt);
+			inp->inp_rt_gen = gen;
+			return;
+		}
+#ifdef INET
+		/* Release the ifaddr reference that the cache itself took. */
+		if (inp->inp_ifaddr != NULL) {
+			ifa_free(&inp->inp_ifaddr->ia_ifa);
+			inp->inp_ifaddr = NULL;
+		}
+#endif
+		/* Drop our reference to the old route */
+		RTFREE(inp->inp_rt);
+		inp->inp_rt = NULL;
+	}
+
+	if (inp->inp_lle != NULL) {
+		LLE_FREE(inp->inp_lle);
+		inp->inp_lle = NULL;
+	}
+	if (rt == NULL)
+		return;
+
+	if (gen != rnh->rnh_gen) {
+		/*
+		 * The routing tree was updated some time after we read its
+		 * generation counter.
+		 */
+		RTFREE(rt);
+		goto resolve;
+	}
+
+	inp->inp_rt = rt;
+#ifdef INET
+	inp->inp_ifaddr = ifatoia(rt->rt_ifa);
+	ifa_ref(&inp->inp_ifaddr->ia_ifa);
+#endif
+	inp->inp_rt_gen = gen;
+}
+
 #endif /* INET || INET6 */
 
 /*
@@ -743,6 +921,7 @@
 	inp->inp_laddr.s_addr = laddr;
 	inp->inp_faddr.s_addr = faddr;
 	inp->inp_fport = fport;
+
 	in_pcbrehash_mbuf(inp, m);
 
 	if (anonport)
@@ -795,7 +974,7 @@
 	 * Find out route to destination.
*/ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) - in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum); + rtalloc_ign_fib(&sro, 0, inp->inp_inc.inc_fibnum); /* * If we found a route, use the address corresponding to @@ -1120,6 +1299,19 @@ INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); + if (inp->inp_rt != NULL) { + RTFREE(inp->inp_rt); + inp->inp_rt = NULL; + } + if (inp->inp_ifaddr != NULL) { + ifa_free(&inp->inp_ifaddr->ia_ifa); + inp->inp_ifaddr = NULL; + } + if (inp->inp_lle != NULL) { + LLE_FREE(inp->inp_lle); + inp->inp_lle = NULL; + } + inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; in_pcbrehash(inp); @@ -1266,6 +1458,20 @@ #endif INP_WLOCK_ASSERT(inp); + if (inp->inp_rt != NULL) { + RTFREE(inp->inp_rt); + inp->inp_rt = NULL; +#ifdef INET + KASSERT(inp->inp_ifaddr != NULL, ("route valid but ifaddr not set")); + ifa_free(&inp->inp_ifaddr->ia_ifa); + inp->inp_ifaddr = NULL; +#endif + } + if (inp->inp_lle != NULL) { + LLE_FREE(inp->inp_lle); + inp->inp_lle = NULL; + } + /* XXXRW: Do as much as possible here. 
*/
 #ifdef IPSEC
 	if (inp->inp_sp != NULL)
Index: sys/netinet/ip_output.c
===================================================================
--- sys/netinet/ip_output.c
+++ sys/netinet/ip_output.c
@@ -222,7 +222,7 @@
 	struct sockaddr_in *dst;
 	const struct sockaddr_in *gw;
 	struct in_ifaddr *ia;
-	int isbroadcast;
+	int isbroadcast, nortfree = 0;
 	uint16_t ip_len, ip_off;
 	struct route iproute;
 	struct rtentry *rte;	/* cache for ro->ro_rt */
@@ -250,6 +250,10 @@
 #ifdef FLOWTABLE
 	if (ro->ro_rt == NULL)
 		(void )flowtable_lookup(AF_INET, m, ro);
+	else {
+		nortfree = 1;
+		ia = ro->ro_ia;
+	}
 #endif
 
 	if (opt) {
@@ -355,6 +359,7 @@
 		in_rtalloc_ign(ro, 0, fibnum);
 #endif
 		rte = ro->ro_rt;
+		nortfree = 0;
 	}
 	if (rte == NULL ||
 	    (rte->rt_flags & RTF_UP) == 0 ||
@@ -681,7 +686,7 @@
 		IPSTAT_INC(ips_fragmented);
 
 done:
-	if (ro == &iproute)
+	if (ro == &iproute && !nortfree)
 		RO_RTFREE(ro);
 	else if (rte == NULL)
 		/*
Index: sys/netinet/tcp_output.c
===================================================================
--- sys/netinet/tcp_output.c
+++ sys/netinet/tcp_output.c
@@ -177,6 +177,13 @@
 	struct ip *ip = NULL;
 	struct ipovly *ipov = NULL;
 	struct tcphdr *th;
+#ifdef INET
+	struct sockaddr_in *sin;
+#endif
+#ifdef INET6
+	struct sockaddr_in6 *sin6;
+#endif
+	struct inpcb *inp;
 	u_char opt[TCP_MAXOLEN];
 	unsigned ipoptlen, optlen, hdrlen;
 #ifdef IPSEC
@@ -197,7 +204,8 @@
 	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
 #endif
 
-	INP_WLOCK_ASSERT(tp->t_inpcb);
+	inp = tp->t_inpcb;
+	INP_WLOCK_ASSERT(inp);
 
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE)
@@ -1291,6 +1299,16 @@
 		 */
 		ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
 
+		if (in_rt_valid(inp)) {
+			sin6 = (struct sockaddr_in6 *)&ro.ro_dst;
+			sin6->sin6_family = AF_INET6;
+			sin6->sin6_len = sizeof(struct sockaddr_in6);
+			sin6->sin6_addr = inp->in6p_faddr;
+			ro.ro_rt = inp->inp_rt;
+			ro.ro_lle = inp->inp_lle;
+			ro.ro_flags |= RT_CACHING_CONTEXT;
+		}
+
 		/*
 		 * Set the packet size here for the benefit of DTrace probes.
 		 * ip6_output() will set it properly; it's supposed to include
@@ -1320,7 +1338,12 @@
 
 		if (error == EMSGSIZE && ro.ro_rt != NULL)
 			mtu = ro.ro_rt->rt_mtu;
-		RO_RTFREE(&ro);
+		if (ro.ro_flags & RT_CACHING_CONTEXT) {
+			/* Keep the lle that the output path cached in ro. */
+			inp->inp_lle = ro.ro_lle;
+		} else {
+			RO_RTFREE(&ro);
+		}
 	}
 #endif /* INET6 */
 #if defined(INET) && defined(INET6)
@@ -1361,13 +1384,29 @@
 	tcp_pcap_add(th, m, &(tp->t_outpkts));
 #endif
 
+	if (in_rt_valid(inp)) {
+		sin = (struct sockaddr_in *)&ro.ro_dst;
+		sin->sin_family = AF_INET;
+		sin->sin_len = sizeof(struct sockaddr_in);
+		sin->sin_addr.s_addr = inp->inp_faddr.s_addr;
+		ro.ro_rt = inp->inp_rt;
+		ro.ro_lle = inp->inp_lle;
+		ro.ro_ia = inp->inp_ifaddr;
+		ro.ro_flags |= RT_CACHING_CONTEXT;
+	}
+
 	error = ip_output(m, tp->t_inpcb->inp_options, &ro,
 	    ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
 	    tp->t_inpcb);
 
 	if (error == EMSGSIZE && ro.ro_rt != NULL)
 		mtu = ro.ro_rt->rt_mtu;
-	RO_RTFREE(&ro);
+	if (ro.ro_flags & RT_CACHING_CONTEXT) {
+		/* Keep the lle that the output path cached in ro. */
+		inp->inp_lle = ro.ro_lle;
+	} else {
+		RO_RTFREE(&ro);
+	}
     }
 #endif /* INET */
 
Index: sys/netinet/tcp_usrreq.c
===================================================================
--- sys/netinet/tcp_usrreq.c
+++ sys/netinet/tcp_usrreq.c
@@ -1216,6 +1216,7 @@
 			goto out;
 	}
 	inp->inp_laddr = laddr;
+	in_pcbrtalloc(inp);
 	in_pcbrehash(inp);
 	INP_HASH_WUNLOCK(&V_tcbinfo);
 
Index: sys/netinet6/in6_src.c
===================================================================
--- sys/netinet6/in6_src.c
+++ sys/netinet6/in6_src.c
@@ -662,7 +662,9 @@
 		     ((struct sockaddr *)(&ro->ro_dst))->sa_family != AF_INET6 ||
 		     !IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr,
 		     dst))) {
-			RTFREE(ro->ro_rt);
+
+			if (!(ro->ro_flags & RT_CACHING_CONTEXT))
+				RTFREE(ro->ro_rt);
 			ro->ro_rt = (struct rtentry *)NULL;
 		}
 		if (ro->ro_rt == (struct rtentry *)NULL) {
@@ -696,7 +698,8 @@
 			ifp = ro->ro_rt->rt_ifp;
 
 			if (ifp == NULL) { /* can this really happen?
*/ - RTFREE(ro->ro_rt); + if (!(ro->ro_flags & RT_CACHING_CONTEXT)) + RTFREE(ro->ro_rt); ro->ro_rt = NULL; } } Index: sys/netinet6/ip6_forward.c =================================================================== --- sys/netinet6/ip6_forward.c +++ sys/netinet6/ip6_forward.c @@ -571,7 +571,7 @@ goto bad; } - error = nd6_output_ifp(rt->rt_ifp, origifp, m, dst); + error = nd6_output_ifp(rt->rt_ifp, origifp, m, dst, &rin6); if (error) { in6_ifstat_inc(rt->rt_ifp, ifs6_out_discard); IP6STAT_INC(ip6s_cantforward); Index: sys/netinet6/ip6_output.c =================================================================== --- sys/netinet6/ip6_output.c +++ sys/netinet6/ip6_output.c @@ -935,7 +935,7 @@ m->m_pkthdr.len); ifa_free(&ia6->ia_ifa); } - error = nd6_output_ifp(ifp, origifp, m, dst); + error = nd6_output_ifp(ifp, origifp, m, dst, ro); goto done; } @@ -1034,7 +1034,7 @@ counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } - error = nd6_output_ifp(ifp, origifp, m, dst); + error = nd6_output_ifp(ifp, origifp, m, dst, ro); } else m_freem(m); } Index: sys/netinet6/nd6.h =================================================================== --- sys/netinet6/nd6.h +++ sys/netinet6/nd6.h @@ -422,7 +422,7 @@ int nd6_add_ifa_lle(struct in6_ifaddr *); void nd6_rem_ifa_lle(struct in6_ifaddr *, int); int nd6_output_ifp(struct ifnet *, struct ifnet *, struct mbuf *, - struct sockaddr_in6 *); + struct sockaddr_in6 *, struct route_in6 *); /* nd6_nbr.c */ void nd6_na_input(struct mbuf *, int, int); Index: sys/netinet6/nd6.c =================================================================== --- sys/netinet6/nd6.c +++ sys/netinet6/nd6.c @@ -1905,7 +1905,7 @@ int nd6_output_ifp(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m, - struct sockaddr_in6 *dst) + struct sockaddr_in6 *dst, struct route_in6 *ro) { int error; int ip6len; @@ -1944,7 +1944,7 @@ if ((ifp->if_flags & IFF_LOOPBACK) == 0) origifp = ifp; - error = (*ifp->if_output)(origifp, m, (struct sockaddr *)dst, NULL); + 
error = (*ifp->if_output)(origifp, m, (struct sockaddr *)dst, (struct route *)ro); return (error); } @@ -2192,7 +2192,8 @@ while (m_head) { m = m_head; m_head = m_head->m_nextpkt; - error = nd6_output_ifp(ifp, origifp, m, dst); + /* XXX TODO */ + error = nd6_output_ifp(ifp, origifp, m, dst, NULL); } /*