Index: sys/net/if_ethersubr.c =================================================================== --- sys/net/if_ethersubr.c +++ sys/net/if_ethersubr.c @@ -312,7 +312,11 @@ if (error != 0) return (error == EWOULDBLOCK ? 0 : error); } - + if (ro != NULL && ro->ro_prepend == NULL && (ro->ro_flags & RT_CACHING_CONTEXT) && + (ro->ro_prepend = malloc(ETHER_HDR_LEN, M_TEMP, M_NOWAIT)) != NULL) { + ro->ro_plen = hlen; + memcpy(ro->ro_prepend, phdr, ETHER_HDR_LEN); + } if ((pflags & RT_L2_ME) != 0) { update_mbuf_csumflags(m, m); return (if_simloop(ifp, m, dst->sa_family, 0)); Index: sys/net/if_llatbl.c =================================================================== --- sys/net/if_llatbl.c +++ sys/net/if_llatbl.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include @@ -369,6 +370,10 @@ size_t linkhdrsize; u_char *lladdr; int lladdr_off; + int i; +#if defined(INET) || defined(INET6) + struct rib_head *rnh; +#endif ifp = (struct ifnet *)farg; @@ -387,6 +392,16 @@ lltable_calc_llheader(ifp, llt->llt_af, lladdr, linkhdr, &linkhdrsize, &lladdr_off); memcpy(lle->r_linkdata, linkhdr, linkhdrsize); + for (i = 0; i < rt_numfibs; i++) { +#ifdef INET + rnh = rt_tables_get_rnh(i, AF_INET); + atomic_add_int(&rnh->rnh_gen, 1); +#endif +#ifdef INET6 + rnh = rt_tables_get_rnh(i, AF_INET6); + atomic_add_int(&rnh->rnh_gen, 1); +#endif + } LLE_WUNLOCK(lle); return (0); @@ -562,6 +577,10 @@ { struct llentry *lle; struct ifnet *ifp; + int i; +#if defined(INET) || defined(INET6) + struct rib_head *rnh; +#endif ifp = llt->llt_ifp; IF_AFDATA_WLOCK(ifp); @@ -578,6 +597,16 @@ } lltable_unlink_entry(llt, lle); + for (i = 0; i < rt_numfibs; i++) { +#ifdef INET + rnh = rt_tables_get_rnh(i, AF_INET); + atomic_add_int(&rnh->rnh_gen, 1); +#endif +#ifdef INET6 + rnh = rt_tables_get_rnh(i, AF_INET6); + atomic_add_int(&rnh->rnh_gen, 1); +#endif + } IF_AFDATA_WUNLOCK(ifp); llt->llt_delete_entry(llt, lle); Index: sys/net/route.c 
=================================================================== --- sys/net/route.c +++ sys/net/route.c @@ -98,6 +98,11 @@ u_int rt_numfibs = RT_NUMFIBS; SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RDTUN, &rt_numfibs, 0, ""); +u_int inpcb_rt_cache_enable = 1; +SYSCTL_UINT(_net, OID_AUTO, conn_rt_cache, CTLFLAG_RW|CTLFLAG_TUN, &inpcb_rt_cache_enable, 0, ""); +TUNABLE_INT("net.conn_rt_cache", &inpcb_rt_cache_enable); + + /* * By default add routes to all fibs for new interfaces. * Once this is set to 0 then only allocate routes on interface @@ -1179,6 +1184,7 @@ else #endif rn = rnh->rnh_deladdr(dst, netmask, &rnh->head); + atomic_add_int(&rnh->rnh_gen, 1); if (rn == NULL) return (NULL); @@ -1448,6 +1454,7 @@ * the first entry */ rn = rnh->rnh_deladdr(dst, netmask, &rnh->head); + atomic_add_int(&rnh->rnh_gen, 1); *perror = 0; return (rn); } @@ -1675,6 +1682,7 @@ /* XXX mtu manipulation will be done in rnh_addaddr -- itojun */ rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); + atomic_add_int(&rnh->rnh_gen, 1); rt_old = NULL; if (rn == NULL && (info->rti_flags & RTF_PINNED) != 0) { @@ -1690,9 +1698,11 @@ rt_old = rt_unlinkrte(rnh, info, &error); info->rti_flags |= RTF_PINNED; info->rti_info[RTAX_DST] = info_dst; - if (rt_old != NULL) + if (rt_old != NULL) { rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, - rt->rt_nodes); + rt->rt_nodes); + atomic_add_int(&rnh->rnh_gen, 1); + } } RIB_WUNLOCK(rnh); Index: sys/netinet/in_pcb.h =================================================================== --- sys/netinet/in_pcb.h +++ sys/netinet/in_pcb.h @@ -155,6 +155,7 @@ * from the global list. 
* * Key: + * (a) - Atomically incremented * (c) - Constant after initialization * (g) - Protected by the pcbgroup lock * (i) - Protected by the inpcb lock @@ -201,6 +202,7 @@ u_char inp_ip_minttl; /* (i) minimum TTL or drop */ uint32_t inp_flowid; /* (x) flow id / queue id */ u_int inp_refcount; /* (i) refcount */ + u_int inp_rt_gen; /* (a) generation count of routing entry */ void *inp_pspare[5]; /* (x) route caching / general use */ uint32_t inp_flowtype; /* (x) M_HASHTYPE value */ uint32_t inp_rss_listen_bucket; /* (x) overridden RSS listen bucket */ @@ -237,8 +239,9 @@ struct inpcbport *inp_phd; /* (i/h) head of this list */ #define inp_zero_size offsetof(struct inpcb, inp_gencnt) inp_gen_t inp_gencnt; /* (c) generation count */ - struct llentry *inp_lle; /* cached L2 information */ struct rtentry *inp_rt; /* cached L3 information */ + char *inp_prepend; /* cached L2 information */ + uint16_t inp_plen; struct rwlock inp_lock; }; #define inp_fport inp_inc.inc_fport @@ -599,8 +602,6 @@ /* * Flags for inp_flags2. 
*/ -#define INP_LLE_VALID 0x00000001 /* cached lle is valid */ -#define INP_RT_VALID 0x00000002 /* cached rtentry is valid */ #define INP_PCBGROUPWILD 0x00000004 /* in pcbgroup wildcard list */ #define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */ #define INP_FREED 0x00000010 /* inp itself is not valid */ @@ -722,6 +723,8 @@ int in_pcbrele(struct inpcb *); int in_pcbrele_rlocked(struct inpcb *); int in_pcbrele_wlocked(struct inpcb *); +void in_pcbrtalloc(struct inpcb *inp); +int in_rt_valid(struct inpcb *inp); void in_pcbsetsolabel(struct socket *so); int in_getpeeraddr(struct socket *so, struct sockaddr **nam); int in_getsockaddr(struct socket *so, struct sockaddr **nam); Index: sys/netinet/in_pcb.c =================================================================== --- sys/netinet/in_pcb.c +++ sys/netinet/in_pcb.c @@ -44,6 +44,7 @@ #include "opt_inet6.h" #include "opt_pcbgroup.h" #include "opt_rss.h" +#include "opt_mpath.h" #include #include @@ -72,11 +73,17 @@ #include #include +#include #include #include +#include #include #include +#include +#include + + #if defined(INET) || defined(INET6) #include #include @@ -87,12 +94,14 @@ #endif #ifdef INET #include +#include #endif #ifdef INET6 #include #include #include #include +#include #endif /* INET6 */ @@ -134,6 +143,8 @@ #define V_ipport_tcplastcount VNET(ipport_tcplastcount) +extern u_int inpcb_rt_cache_enable; + static void in_pcbremlists(struct inpcb *inp); #ifdef INET static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, @@ -505,6 +516,216 @@ so_options |= SO_REUSEADDR; return (so_options); }
+
+/*
+ * in_rt_valid() both checks for, and attempts to ensure, that a cached route
+ * is present on a socket.  It will call in_pcbrtalloc() if conditions are
+ * right (i.e. routing is enabled on the socket) and required (no route cached
+ * already or the cached route is no longer valid).  A route can only be
+ * installed if the caller passes the inp with a write lock, but the route may
+ * be used if a read lock is held.
+ *
+ * Returns 1 if inp->inp_rt holds a route on return and 0 otherwise.  Caching
+ * is refused when inpcb_rt_cache_enable is 0, the inpcb has no socket,
+ * SO_DONTROUTE is set, or either endpoint is loopback or both are equal.
+ */
+
+int
+in_rt_valid(struct inpcb *inp)
+{
+	struct rib_head *rnh = NULL;
+
+	INP_WLOCK_ASSERT(inp);
+
+	if (inpcb_rt_cache_enable == 0)
+		return (0);
+	if (inp->inp_socket == NULL)
+		return (0);
+	if (inp->inp_socket->so_options & SO_DONTROUTE)
+		return (0);
+	/*
+	 * Pick the address family with INP_IPV6, the flag tcp_output() uses
+	 * to compute isipv6, so the cached route matches the output path
+	 * that consumes it, and apply the self/loopback test to the
+	 * addresses of that family only.
+	 */
+#ifdef INET6
+	if (inp->inp_vflag & INP_IPV6) {
+		if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &inp->in6p_laddr) ||
+		    IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr) ||
+		    IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr))
+			return (0);
+		rnh = rt_tables_get_rnh(inp->inp_inc.inc_fibnum, AF_INET6);
+	}
+#endif
+#ifdef INET
+#ifdef INET6
+	else
+#endif
+	{
+		if (inp->inp_faddr.s_addr == inp->inp_laddr.s_addr ||
+		    (ntohl(inp->inp_faddr.s_addr) >> IN_CLASSA_NSHIFT) ==
+		    IN_LOOPBACKNET ||
+		    (ntohl(inp->inp_laddr.s_addr) >> IN_CLASSA_NSHIFT) ==
+		    IN_LOOPBACKNET)
+			return (0);
+		rnh = rt_tables_get_rnh(inp->inp_inc.inc_fibnum, AF_INET);
+	}
+#endif
+	/* Nothing to validate against if no routing table was selected. */
+	if (rnh == NULL)
+		return (0);
+	if (inp->inp_rt != NULL &&
+	    (inp->inp_rt->rt_flags & RTF_UP) &&
+	    inp->inp_rt_gen == rnh->rnh_gen)
+		return (1);
+	/*
+	 * This will handle selectively replacing one field or the other or
+	 * merely updating the inpcb's routing generation count.
+	 */
+	in_pcbrtalloc(inp);
+	if (inp->inp_rt != NULL)
+		MPASS(inp->inp_rt->rt_ifp != NULL);
+	return (inp->inp_rt != NULL);
+}
+
+/*
+ * in_pcbrtalloc will install or update a cached route on an inpcb.
+ *
+ * The inpcb must be write-locked.  Any cached L2 prepend is freed before the
+ * route is revalidated.  If the cached route's generation still matches the
+ * routing table's, nothing else changes; otherwise the foreign address is
+ * looked up again in the inpcb's fib, and a newly found route is installed
+ * only if the table's generation did not change during the lookup.
+ */
+
+void
+in_pcbrtalloc(struct inpcb *inp)
+{
+	struct rtentry *rt = NULL;
+	struct rib_head *rnh = NULL;
+	u_int gen;
+	struct route_in6 iproute;
+#ifdef INET6
+	struct route_in6 *sro6 = NULL;
+	struct sockaddr_in6 *sin6 = NULL;
+#endif
+#ifdef INET
+	struct sockaddr_in *sin = NULL;
+	struct route *sro = NULL;
+#endif
+	INP_WLOCK_ASSERT(inp);
+
+	if (inpcb_rt_cache_enable == 0)
+		return;
+
+	if (inp->inp_socket == NULL ||
+	    (inp->inp_socket->so_options & SO_DONTROUTE))
+		return;
+
+	if (inp->inp_prepend != NULL) {
+		free(inp->inp_prepend, M_TEMP);
+		inp->inp_prepend = NULL;
+	}
+
+#ifdef INET6
+	if (inp->inp_vflag & INP_IPV6) {
+		sro6 = &iproute;
+		bzero(sro6, sizeof(*sro6));
+		rnh = rt_tables_get_rnh(inp->inp_inc.inc_fibnum, AF_INET6);
+		sin6 = (struct sockaddr_in6 *)&sro6->ro_dst;
+		sin6->sin6_family = AF_INET6;
+		sin6->sin6_len = sizeof(struct sockaddr_in6);
+		sin6->sin6_addr = inp->in6p_faddr;
+	}
+#endif
+#ifdef INET
+#ifdef INET6
+	else
+#endif
+	{
+		sro = (struct route *)&iproute;
+		bzero(sro, sizeof(*sro));
+		rnh = rt_tables_get_rnh(inp->inp_inc.inc_fibnum, AF_INET);
+		sin = (struct sockaddr_in *)&sro->ro_dst;
+		sin->sin_family = AF_INET;
+		sin->sin_len = sizeof(struct sockaddr_in);
+		sin->sin_addr.s_addr = inp->inp_faddr.s_addr;
+	}
+#endif
+	if (rnh == NULL)
+		return;
+
+	if (inp->inp_rt != NULL &&
+	    inp->inp_rt_gen == rnh->rnh_gen) {
+		KASSERT(inp->inp_rt->rt_flags & RTF_UP,
+		    ("gen count unchanged but route invalid"));
+		return;
+	}
+resolve:
+	/*
+	 * Sample the generation before the lookup.  The scratch route is
+	 * cleared on every pass so that a retry never hands the lookup a
+	 * pointer whose reference was already released below.
+	 */
+	gen = rnh->rnh_gen;
+
+	if (inp->inp_vflag & INP_IPV6) {
+#ifdef INET6
+		sro6->ro_rt = NULL;
+#ifdef RADIX_MPATH
+		rtalloc_mpath_fib((struct route *)sro6,
+		    ntohl(sin6->sin6_addr.s6_addr32[3]),
+		    inp->inp_inc.inc_fibnum);
+#else
+		sro6->ro_rt = rtalloc1_fib((struct sockaddr *)&sro6->ro_dst,
+		    0, 0UL, inp->inp_inc.inc_fibnum);
+		if (sro6->ro_rt)
+			RT_UNLOCK(sro6->ro_rt);
+#endif
+		rt = sro6->ro_rt;
+#endif
+	} else {
+#ifdef INET
+		sro->ro_rt = NULL;
+#ifdef RADIX_MPATH
+		rtalloc_mpath_fib(sro, ntohl(inp->inp_faddr.s_addr),
+		    inp->inp_inc.inc_fibnum);
+#else
+		rtalloc_ign_fib(sro, 0, inp->inp_inc.inc_fibnum);
+#endif
+		rt = sro->ro_rt;
+#endif
+	}
+
+	if (inp->inp_rt != NULL) {
+		if (rt == inp->inp_rt) {
+			/*
+			 * The route is unchanged so we drop the added
+			 * reference and update the generation count.
+			 */
+			RTFREE(rt);
+			inp->inp_rt_gen = gen;
+			return;
+		}
+		RTFREE(inp->inp_rt);
+		inp->inp_rt = NULL;
+	}
+
+	if (rt == NULL)
+		return;
+	if (gen != rnh->rnh_gen) {
+		/*
+		 * The routing tree was updated some time after we read its
+		 * generation counter.
+		 */
+		RTFREE(rt);
+		goto resolve;
+	}
+	inp->inp_rt = rt;
+	inp->inp_rt_gen = gen;
+}
+
 #endif /* INET || INET6 */ /* @@ -744,6 +965,7 @@ inp->inp_laddr.s_addr = laddr; inp->inp_faddr.s_addr = faddr; inp->inp_fport = fport; + in_pcbrehash_mbuf(inp, m); if (anonport) @@ -796,7 +1018,7 @@ * Find out route to destination. */ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) - in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum); + rtalloc_ign_fib(&sro, 0, inp->inp_inc.inc_fibnum); /* * If we found a route, use the address corresponding to @@ -1121,6 +1343,15 @@ INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); + if (inp->inp_rt != NULL) { + RTFREE(inp->inp_rt); + inp->inp_rt = NULL; + } + if (inp->inp_prepend != NULL) { + free(inp->inp_prepend, M_TEMP); + inp->inp_prepend = NULL; + } + inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; in_pcbrehash(inp); @@ -1276,6 +1507,15 @@ #endif INP_WLOCK_ASSERT(inp); + if (inp->inp_rt != NULL) { + RTFREE(inp->inp_rt); + inp->inp_rt = NULL; + } + if (inp->inp_prepend != NULL) { + free(inp->inp_prepend, M_TEMP); + inp->inp_prepend = NULL; + } + /* XXXRW: Do as much as possible here. 
*/ #ifdef IPSEC if (inp->inp_sp != NULL) Index: sys/netinet/ip_output.c =================================================================== --- sys/netinet/ip_output.c +++ sys/netinet/ip_output.c @@ -222,7 +222,7 @@ struct sockaddr_in *dst; const struct sockaddr_in *gw; struct in_ifaddr *ia; - int isbroadcast; + int isbroadcast, nortfree; uint16_t ip_len, ip_off; struct route iproute; struct rtentry *rte; /* cache for ro->ro_rt */ @@ -242,9 +242,11 @@ } } + nortfree = 1; if (ro == NULL) { ro = &iproute; bzero(ro, sizeof (*ro)); + nortfree = 0; } #ifdef FLOWTABLE @@ -355,6 +357,7 @@ in_rtalloc_ign(ro, 0, fibnum); #endif rte = ro->ro_rt; + nortfree = 0; } if (rte == NULL || (rte->rt_flags & RTF_UP) == 0 || @@ -682,7 +685,7 @@ IPSTAT_INC(ips_fragmented); done: - if (ro == &iproute) + if (ro == &iproute && !nortfree) RO_RTFREE(ro); else if (rte == NULL) /* Index: sys/netinet/tcp_output.c =================================================================== --- sys/netinet/tcp_output.c +++ sys/netinet/tcp_output.c @@ -53,6 +53,7 @@ #include #include +#include #include #include @@ -181,6 +182,13 @@ struct ip *ip = NULL; struct ipovly *ipov = NULL; struct tcphdr *th; +#ifdef INET + struct sockaddr_in *sin; +#endif +#ifdef INET6 + struct sockaddr_in6 *sin6; +#endif + struct inpcb *inp; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; #ifdef IPSEC @@ -201,7 +209,8 @@ isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif - INP_WLOCK_ASSERT(tp->t_inpcb); + inp = tp->t_inpcb; + INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) @@ -1342,6 +1351,21 @@ */ ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL); + if (in_rt_valid(inp)) { + sin6 = (struct sockaddr_in6 *)&ro.ro_dst; + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(struct sockaddr_in6); + memcpy(&sin6->sin6_addr.s6_addr, &inp->in6p_faddr.s6_addr, 16); + ro.ro_rt = inp->inp_rt; + ro.ro_plen = inp->inp_plen; + if (inp->inp_prepend != NULL) { + ro.ro_prepend = inp->inp_prepend; + 
ro.ro_plen = inp->inp_plen; + } + if (ro.ro_rt != NULL) + ro.ro_flags |= RT_CACHING_CONTEXT; + } + /* * Set the packet size here for the benefit of DTrace probes. * ip6_output() will set it properly; it's supposed to include @@ -1371,7 +1395,15 @@ if (error == EMSGSIZE && ro.ro_rt != NULL) mtu = ro.ro_rt->rt_mtu; - RO_RTFREE(&ro); + if (inp->inp_prepend != ro.ro_prepend) { + if (inp->inp_prepend != NULL) + free(inp->inp_prepend, M_TEMP); + inp->inp_prepend = ro.ro_prepend; + inp->inp_plen = ro.ro_plen; + } + if (!(ro.ro_flags & RT_CACHING_CONTEXT)) { + RO_RTFREE(&ro); + } } #endif /* INET6 */ #if defined(INET) && defined(INET6) @@ -1412,13 +1444,36 @@ tcp_pcap_add(th, m, &(tp->t_outpkts)); #endif + if (in_rt_valid(inp)) { + sin = (struct sockaddr_in *)&ro.ro_dst; + sin->sin_family = AF_INET; + sin->sin_len = sizeof(struct sockaddr_in); + sin->sin_addr.s_addr = inp->inp_faddr.s_addr; + ro.ro_rt = inp->inp_rt; + if (inp->inp_prepend != NULL) { + ro.ro_prepend = inp->inp_prepend; + ro.ro_plen = inp->inp_plen; + } + if (ro.ro_rt != NULL) + ro.ro_flags |= RT_CACHING_CONTEXT; + } + error = ip_output(m, tp->t_inpcb->inp_options, &ro, ((so->so_options & SO_DONTROUTE) ? 
IP_ROUTETOIF : 0), 0, tp->t_inpcb); + if (inp->inp_prepend != ro.ro_prepend) { + if (inp->inp_prepend != NULL) + free(inp->inp_prepend, M_TEMP); + inp->inp_prepend = ro.ro_prepend; + inp->inp_plen = ro.ro_plen; + } + if (error == EMSGSIZE && ro.ro_rt != NULL) mtu = ro.ro_rt->rt_mtu; - RO_RTFREE(&ro); + if (!(ro.ro_flags & RT_CACHING_CONTEXT)) { + RO_RTFREE(&ro); + } } #endif /* INET */ Index: sys/netinet/tcp_usrreq.c =================================================================== --- sys/netinet/tcp_usrreq.c +++ sys/netinet/tcp_usrreq.c @@ -1239,6 +1239,7 @@ goto out; } inp->inp_laddr = laddr; + in_pcbrtalloc(inp); in_pcbrehash(inp); INP_HASH_WUNLOCK(&V_tcbinfo); Index: sys/netinet6/in6_src.c =================================================================== --- sys/netinet6/in6_src.c +++ sys/netinet6/in6_src.c @@ -739,7 +739,9 @@ ((struct sockaddr *)(&ro->ro_dst))->sa_family != AF_INET6 || !IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr, dst))) { - RTFREE(ro->ro_rt); + + if (!(ro->ro_flags & RT_CACHING_CONTEXT)) + RTFREE(ro->ro_rt); ro->ro_rt = (struct rtentry *)NULL; } if (ro->ro_rt == (struct rtentry *)NULL) { @@ -773,7 +775,8 @@ ifp = ro->ro_rt->rt_ifp; if (ifp == NULL) { /* can this really happen? 
*/ - RTFREE(ro->ro_rt); + if (!(ro->ro_flags & RT_CACHING_CONTEXT)) + RTFREE(ro->ro_rt); ro->ro_rt = NULL; } } Index: sys/netinet6/ip6_forward.c =================================================================== --- sys/netinet6/ip6_forward.c +++ sys/netinet6/ip6_forward.c @@ -574,7 +574,7 @@ goto bad; } - error = nd6_output_ifp(rt->rt_ifp, origifp, m, dst, NULL); + error = nd6_output_ifp(rt->rt_ifp, origifp, m, dst, (struct route *)&rin6); if (error) { in6_ifstat_inc(rt->rt_ifp, ifs6_out_discard); IP6STAT_INC(ip6s_cantforward); Index: sys/netinet6/ip6_output.c =================================================================== --- sys/netinet6/ip6_output.c +++ sys/netinet6/ip6_output.c @@ -939,7 +939,7 @@ m->m_pkthdr.len); ifa_free(&ia6->ia_ifa); } - error = nd6_output_ifp(ifp, origifp, m, dst, NULL); + error = nd6_output_ifp(ifp, origifp, m, dst, (struct route *)ro); goto done; } @@ -1038,7 +1038,7 @@ counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } - error = nd6_output_ifp(ifp, origifp, m, dst, NULL); + error = nd6_output_ifp(ifp, origifp, m, dst, (struct route *)ro); } else m_freem(m); }