/*
 * Patch summary: arpresolve() and its callers gain a trailing
 * struct llentry ** argument; ether_output() uses it to cache the resolved
 * link-layer entry in ro->ro_lle unless ro_lle is the lle_no_cache sentinel.
 * The patch also adds a routing-table generation counter (rt_gen_t, RT_GEN,
 * RT_VALIDATE) and a route cached in the inpcb that tcp_output() passes to
 * ip_output().  Callers of nd6_resolve() pass a matching extra argument.
 */
Index: sys/net/if_arcsubr.c =================================================================== --- sys/net/if_arcsubr.c +++ sys/net/if_arcsubr.c @@ -130,7 +130,8 @@ else if (ifp->if_flags & IFF_NOARP) adst = ntohl(SIN(dst)->sin_addr.s_addr) & 0xFF; else { - error = arpresolve(ifp, is_gw, m, dst, &adst, NULL); + error = arpresolve(ifp, is_gw, m, dst, &adst, NULL, + NULL); if (error) return (error == EWOULDBLOCK ? 0 : error); } @@ -171,7 +172,8 @@ if ((m->m_flags & M_MCAST) != 0) adst = arcbroadcastaddr; /* ARCnet broadcast address */ else { - error = nd6_resolve(ifp, is_gw, m, dst, &adst, NULL); + error = nd6_resolve(ifp, is_gw, m, dst, &adst, NULL, + NULL); if (error != 0) return (error == EWOULDBLOCK ? 0 : error); } Index: sys/net/if_ethersubr.c =================================================================== --- sys/net/if_ethersubr.c +++ sys/net/if_ethersubr.c @@ -156,12 +156,27 @@ int hlen; /* link layer header length */ int is_gw = 0; uint32_t pflags = 0; + int addref = 0; /* set when the resolver should hand back an lle to cache in ro->ro_lle */ if (ro != NULL) { if (!(m->m_flags & (M_BCAST | M_MCAST))) { lle = ro->ro_lle; - if (lle != NULL) - pflags = lle->la_flags; + if (lle == &lle_no_cache) + lle = NULL; /* sentinel: the caller asked for no lle caching */ + else { + if (lle != NULL && + (lle->la_flags & LLE_VALID) == 0) { + LLE_FREE(lle); + lle = NULL; /* redundant */ + ro->ro_lle = NULL; + } + if (lle != NULL) + pflags = lle->la_flags; + else { + /* if we lookup, keep cache */ + addref = 1; + } + } } rt0 = ro->ro_rt; if (rt0 != NULL && (rt0->rt_flags & RTF_GATEWAY) != 0) @@ -186,8 +201,13 @@ case AF_INET: if (lle != NULL && (pflags & LLE_VALID) != 0) memcpy(edst, &lle->ll_addr.mac16, sizeof(edst)); - else - error = arpresolve(ifp, is_gw, m, dst, edst, &pflags); + else { + error = arpresolve(ifp, is_gw, m, dst, edst, &pflags, + addref ? &lle : NULL); + if (addref && error == 0 && lle != NULL) { + ro->ro_lle = lle; /* cache the entry handed back by arpresolve() */ + } + } if (error) return (error == EWOULDBLOCK ? 
0 : error); type = htons(ETHERTYPE_IP); @@ -224,9 +244,12 @@ case AF_INET6: if (lle != NULL && (pflags & LLE_VALID)) memcpy(edst, &lle->ll_addr.mac16, sizeof(edst)); - else + else { error = nd6_resolve(ifp, is_gw, m, dst, (u_char *)edst, - &pflags); + &pflags, addref ? &lle : NULL); + if (addref && error == 0 && lle != NULL) + ro->ro_lle = lle; /* cache the entry handed back by nd6_resolve() */ + } if (error) return (error == EWOULDBLOCK ? 0 : error); type = htons(ETHERTYPE_IPV6); Index: sys/net/if_fddisubr.c =================================================================== --- sys/net/if_fddisubr.c +++ sys/net/if_fddisubr.c @@ -127,7 +127,7 @@ switch (dst->sa_family) { #ifdef INET case AF_INET: { - error = arpresolve(ifp, is_gw, m, dst, edst, NULL); + error = arpresolve(ifp, is_gw, m, dst, edst, NULL, NULL); /* trailing NULL: no lle pointer is requested back */ if (error) return (error == EWOULDBLOCK ? 0 : error); type = htons(ETHERTYPE_IP); Index: sys/net/if_fwsubr.c =================================================================== --- sys/net/if_fwsubr.c +++ sys/net/if_fwsubr.c @@ -149,7 +149,8 @@ if (ro != NULL && ro->ro_rt != NULL && (ro->ro_rt->rt_flags & RTF_GATEWAY) != 0) is_gw = 1; - error = arpresolve(ifp, is_gw, m, dst, (u_char *) destfw, NULL); + error = arpresolve(ifp, is_gw, m, dst, + (u_char *) destfw, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 0 : error); } @@ -179,7 +180,7 @@ case AF_INET6: if (unicast) { error = nd6_resolve(fc->fc_ifp, is_gw, m, dst, - (u_char *) destfw, NULL); + (u_char *) destfw, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 0 : error); } Index: sys/net/if_iso88025subr.c =================================================================== --- sys/net/if_iso88025subr.c +++ sys/net/if_iso88025subr.c @@ -258,7 +258,7 @@ switch (dst->sa_family) { #ifdef INET case AF_INET: - error = arpresolve(ifp, is_gw, m, dst, edst, NULL); + error = arpresolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 
0 : error); snap_type = ETHERTYPE_IP; @@ -293,7 +293,7 @@ #endif /* INET */ #ifdef INET6 case AF_INET6: - error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL); + error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 0 : error); snap_type = ETHERTYPE_IPV6; Index: sys/net/if_llatbl.h =================================================================== --- sys/net/if_llatbl.h +++ sys/net/if_llatbl.h @@ -114,11 +114,13 @@ } while (0) #define LLE_FREE_LOCKED(lle) do { \ - if ((lle)->lle_refcnt == 1) \ - (lle)->lle_free(lle); \ - else { \ - LLE_REMREF(lle); \ - LLE_WUNLOCK(lle); \ + if ((lle) != &lle_no_cache) { /* the no-cache sentinel is never unreferenced, unlocked or freed here */ \ + if ((lle)->lle_refcnt <= 1) \ + (lle)->lle_free(lle); \ + else { \ + LLE_REMREF(lle); \ + LLE_WUNLOCK(lle); \ + } \ } \ /* guard against invalid refs */ \ (lle) = NULL; \ @@ -191,6 +193,8 @@ #define LLATBL_HASH(key, mask) \ (((((((key >> 8) ^ key) >> 8) ^ key) >> 8) ^ key) & mask) +extern struct llentry lle_no_cache; /* sentinel stored in ro_lle to request that no lle be cached */ + struct lltable *lltable_allocate_htbl(uint32_t hsize); void lltable_free(struct lltable *); void lltable_link(struct lltable *llt); Index: sys/net/if_llatbl.c =================================================================== --- sys/net/if_llatbl.c +++ sys/net/if_llatbl.c @@ -78,6 +78,8 @@ static int htable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, void *farg); +struct llentry lle_no_cache; /* empty */ + /* * Dump lle state for a specific address family. */ Index: sys/net/radix.h =================================================================== --- sys/net/radix.h +++ sys/net/radix.h @@ -101,11 +101,18 @@ #define rm_mask rm_rmu.rmu_mask #define rm_leaf rm_rmu.rmu_leaf /* extra field would make 32 bytes */ +/* + * Keep a generation count of routing table, incremented on route addition, + * so we can invalidate caches. This is accessed without a lock, as precision + * is not required. 
+ */ +typedef volatile u_int rt_gen_t; /* tree generation (for adds) */ + typedef int walktree_f_t(struct radix_node *, void *); struct radix_node_head { struct radix_node *rnh_treetop; - u_int rnh_gen; /* generation counter */ + rt_gen_t rnh_gen; /* generation counter */ int rnh_multipath; /* multipath capable ? */ struct radix_node *(*rnh_addaddr) /* add based on sockaddr */ (void *v, void *mask, Index: sys/net/route.h =================================================================== --- sys/net/route.h +++ sys/net/route.h @@ -345,6 +345,27 @@ struct radix_node_head *rt_tables_get_rnh(int, int); +/* + * Keep a generation count of routing table, incremented on route addition, + * so we can invalidate caches. This is accessed without a lock, as precision + * is not required. + */ +#define RT_GEN(fibnum, af) rt_tables_get_rnh(fibnum, af)->rnh_gen /* NOTE(review): dereferences the lookup result unchecked -- confirm it cannot be NULL */ + +/* + * Validate a cached route based on a supplied cookie. If there is an + * out-of-date cache, simply free it. Update the generation number + * for the new allocation + */ +#define RT_VALIDATE(ro, cookiep, fibnum) do { \ + rt_gen_t cookie = RT_GEN(fibnum, (ro)->ro_dst.sa_family); \ + if (*(cookiep) != cookie && (ro)->ro_rt != NULL) { \ + RTFREE((ro)->ro_rt); \ + (ro)->ro_rt = NULL; \ + *(cookiep) = cookie; /* NOTE(review): the cookie is refreshed only when a cached route was dropped -- confirm */ \ + } \ +} while (0) + struct ifmultiaddr; void rt_ieee80211msg(struct ifnet *, int, void *, size_t); Index: sys/net/route.c =================================================================== --- sys/net/route.c +++ sys/net/route.c @@ -1551,6 +1551,7 @@ *ret_nrt = rt; RT_ADDREF(rt); } + rnh->rnh_gen++; /* Routing table updated */ RT_UNLOCK(rt); break; case RTM_CHANGE: Index: sys/netinet/if_ether.h =================================================================== --- sys/netinet/if_ether.h +++ sys/netinet/if_ether.h @@ -115,7 +115,8 @@ struct ifaddr; int arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m, - const struct sockaddr *dst, u_char 
*desten, uint32_t *pflags, + struct llentry **lle); void arprequest(struct ifnet *, const struct in_addr *, const struct in_addr *, u_char *); void arp_ifinit(struct ifnet *, struct ifaddr *); Index: sys/netinet/if_ether.c =================================================================== --- sys/netinet/if_ether.c +++ sys/netinet/if_ether.c @@ -212,6 +212,7 @@ } IF_AFDATA_UNLOCK(ifp); + lle->la_flags &= ~LLE_VALID; /* ether_output() drops a cached lle whose LLE_VALID bit is clear */ size_t pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); @@ -308,7 +309,8 @@ */ static int arpresolve_full(struct ifnet *ifp, int is_gw, int create, struct mbuf *m, - const struct sockaddr *dst, u_char *desten, uint32_t *pflags) + const struct sockaddr *dst, u_char *desten, uint32_t *pflags, + struct llentry **lle) { struct llentry *la = NULL, *la_tmp; struct mbuf *curr = NULL; @@ -368,6 +370,10 @@ if (pflags != NULL) *pflags = la->la_flags; + if (lle) { + LLE_ADDREF(la); /* reference for the pointer returned through *lle */ + *lle = la; + } LLE_WUNLOCK(la); if (renew == 1) @@ -413,6 +419,11 @@ else error = is_gw != 0 ? EHOSTUNREACH : EHOSTDOWN; + if (lle) { + LLE_ADDREF(la); + *lle = la; /* NOTE(review): error may already be set here, and ether_output() keeps lle only when error == 0 -- confirm this reference is released */ + } + if (renew) { int canceled; @@ -449,13 +460,26 @@ */ int arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m, - const struct sockaddr *dst, u_char *desten, uint32_t *pflags) + const struct sockaddr *dst, u_char *desten, uint32_t *pflags, + struct llentry **lle) { struct llentry *la = 0; int renew; if (pflags != NULL) *pflags = 0; + if (lle != NULL) { + /* + * If the caller had a cache and still called us, + * the cache was probably invalid. Just drop it. + */ + if (*lle != NULL) { + LLE_WLOCK(*lle); + LLE_REMREF(*lle); + LLE_WUNLOCK(*lle); + } + *lle = NULL; + } if (m != NULL) { if (m->m_flags & M_BCAST) { @@ -472,11 +496,12 @@ } IF_AFDATA_RLOCK(ifp); - la = lla_lookup(LLTABLE(ifp), 0, dst); + la = lla_lookup(LLTABLE(ifp), (lle != NULL) ? 
LLE_EXCLUSIVE : 0, dst); /* presumably write-locks the entry when a reference will be handed back; the unlock paths below use LLE_WUNLOCK in that case -- confirm */ IF_AFDATA_RUNLOCK(ifp); if (la == NULL) - return (arpresolve_full(ifp, is_gw, 1, m, dst, desten, pflags)); + return (arpresolve_full(ifp, is_gw, 1, m, dst, desten, pflags, + lle)); if ((la->la_flags & LLE_VALID) && ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) { @@ -496,16 +521,24 @@ if (pflags != NULL) *pflags = la->la_flags; - LLE_RUNLOCK(la); + if (lle) { + LLE_ADDREF(la); + LLE_WUNLOCK(la); + *lle = la; /* reference taken above belongs to the pointer returned through *lle */ + } else + LLE_RUNLOCK(la); if (renew == 1) arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL); return (0); } - LLE_RUNLOCK(la); + if (lle) + LLE_WUNLOCK(la); + else + LLE_RUNLOCK(la); - return (arpresolve_full(ifp, is_gw, 0, m, dst, desten, pflags)); + return (arpresolve_full(ifp, is_gw, 0, m, dst, desten, pflags, lle)); } /* Index: sys/netinet/in_pcb.h =================================================================== --- sys/netinet/in_pcb.h +++ sys/netinet/in_pcb.h @@ -42,6 +42,7 @@ #include #include #include +#include #ifdef _KERNEL #include @@ -240,6 +241,13 @@ struct llentry *inp_lle; /* cached L2 information */ struct rtentry *inp_rt; /* cached L3 information */ struct rwlock inp_lock; + rt_gen_t inp_rt_cookie; /* generation for route entry */ + union { + struct route inpu_route; + struct route_in6 inpu_route6; + } inp_rtu; /* route cached in the pcb; the IPv4 and IPv6 views share storage */ +#define inp_route inp_rtu.inpu_route +#define inp_route6 inp_rtu.inpu_route6 }; #define inp_fport inp_inc.inc_fport #define inp_lport inp_inc.inc_lport Index: sys/netinet/in_pcb.c =================================================================== --- sys/netinet/in_pcb.c +++ sys/netinet/in_pcb.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -1297,6 +1298,13 @@ if (inp->inp_moptions != NULL) inp_freemoptions(inp->inp_moptions); #endif + if (inp->inp_route.ro_rt) { + RTFREE(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = (struct rtentry *)NULL; + } + if (inp->inp_route.ro_lle) + LLE_FREE(inp->inp_route.ro_lle); /* zeros ro_lle */ + inp->inp_vflag = 0; 
inp->inp_flags2 |= INP_FREED; crfree(inp->inp_cred); @@ -1717,6 +1725,8 @@ if (inp != NULL) goto found; } + if (inp->inp_route.ro_lle) + LLE_FREE(inp->inp_route.ro_lle); /* zeros ro_lle */ /* NOTE(review): the context just above leaves via goto found when inp != NULL, so inp looks NULL when dereferenced here -- confirm */ #endif /* Index: sys/netinet/ip_output.c =================================================================== --- sys/netinet/ip_output.c +++ sys/netinet/ip_output.c @@ -245,6 +245,11 @@ if (ro == NULL) { ro = &iproute; bzero(ro, sizeof (*ro)); + /* + * We use a distinguished lle value to say + * we don't want to cache an lle. + */ + ro->ro_lle = &lle_no_cache; } #ifdef FLOWTABLE @@ -282,17 +287,39 @@ gw = dst = (struct sockaddr_in *)&ro->ro_dst; fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m); rte = ro->ro_rt; - /* - * The address family should also be checked in case of sharing - * the cache with IPv6. - */ - if (rte == NULL || dst->sin_family != AF_INET) { + if (rte == NULL) { bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; } again: + /* + * Validate route against routing table additions; + * a better/more specific route might have been added. + */ + if (inp) + RT_VALIDATE(ro, &inp->inp_rt_cookie, fibnum); /* frees ro->ro_rt when the cookie differs from the table generation */ + /* + * If there is a cached route, + * check that it is to the same destination + * and is still up. If not, free it and try again. + * The address family should also be checked in case of sharing the + * cache with IPv6. + * Also check whether routing cache needs invalidation. 
+ */ + rte = ro->ro_rt; /* re-read: RT_VALIDATE above may have dropped the cached route */ + if (rte && ((rte->rt_flags & RTF_UP) == 0 || + rte->rt_ifp == NULL || + !RT_LINK_IS_UP(rte->rt_ifp) || + dst->sin_family != AF_INET || + dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { + RTFREE(rte); + rte = ro->ro_rt = (struct rtentry *)NULL; + if (ro->ro_lle) + LLE_FREE(ro->ro_lle); /* zeros ro_lle */ + ro->ro_lle = (struct llentry *)NULL; /* NOTE(review): this also clears a no-cache sentinel if one was set -- confirm that is intended */ + } ia = NULL; have_ia_ref = 0; /* Index: sys/netinet/tcp_output.c =================================================================== --- sys/netinet/tcp_output.c +++ sys/netinet/tcp_output.c @@ -1328,9 +1328,6 @@ #endif #ifdef INET { - struct route ro; - - bzero(&ro, sizeof(ro)); ip->ip_len = htons(m->m_pkthdr.len); #ifdef INET6 if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO) @@ -1356,18 +1353,12 @@ TCP_PROBE5(send, NULL, tp, ip, tp, th); -#ifdef TCPPCAP - /* Save packet, if requested. */ - tcp_pcap_add(th, m, &(tp->t_outpkts)); -#endif - - error = ip_output(m, tp->t_inpcb->inp_options, &ro, + error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, /* the route cached in the inpcb replaces the per-call stack route */ /* NOTE(review): this hunk also removes the TCPPCAP tcp_pcap_add() call -- confirm that is intended */ ((so->so_options & SO_DONTROUTE) ? 
IP_ROUTETOIF : 0), 0, tp->t_inpcb); - if (error == EMSGSIZE && ro.ro_rt != NULL) - mtu = ro.ro_rt->rt_mtu; - RO_RTFREE(&ro); + if (error == EMSGSIZE && tp->t_inpcb->inp_route.ro_rt != NULL) + mtu = tp->t_inpcb->inp_route.ro_rt->rt_mtu; /* the route stays cached in the inpcb; it is no longer freed after each call */ } #endif /* INET */ Index: sys/netinet/tcp_subr.c =================================================================== --- sys/netinet/tcp_subr.c +++ sys/netinet/tcp_subr.c @@ -1226,6 +1226,10 @@ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { + if (inp->inp_route.ro_rt) { + RTFREE(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = (struct rtentry *)NULL; /* drop the cached route on an unreachable or host-down error */ + } return (inp); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { @@ -1519,11 +1523,11 @@ else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; - /* - * Redirects don't need to be handled up here. - */ - else if (PRC_IS_REDIRECT(cmd)) + else if (PRC_IS_REDIRECT(cmd)) { + /* signal EHOSTDOWN, as it flushes the cached route */ + in_pcbnotifyall(&tcbinfo, faddr, EHOSTDOWN, notify); /* NOTE(review): nearby context uses V_-prefixed globals (V_icmp_may_rst); confirm tcbinfo should not be V_tcbinfo */ return; + } /* * Hostdead is ugly because it goes linearly through all PCBs. 
* XXX: We never get this from ICMP, otherwise it makes an Index: sys/netinet/toecore.c =================================================================== --- sys/netinet/toecore.c +++ sys/netinet/toecore.c @@ -462,12 +462,12 @@ switch (sa->sa_family) { #ifdef INET case AF_INET: - rc = arpresolve(ifp, 0, NULL, sa, lladdr, NULL); + rc = arpresolve(ifp, 0, NULL, sa, lladdr, NULL, NULL); break; #endif #ifdef INET6 case AF_INET6: - rc = nd6_resolve(ifp, 0, NULL, sa, lladdr, NULL); + rc = nd6_resolve(ifp, 0, NULL, sa, lladdr, NULL, NULL); break; #endif default: Index: sys/netinet6/ip6_output.c =================================================================== --- sys/netinet6/ip6_output.c +++ sys/netinet6/ip6_output.c @@ -88,6 +88,7 @@ #include #include +#include #include #include #include @@ -502,6 +503,11 @@ if (ro == 0) { ro = &ip6route; bzero((caddr_t)ro, sizeof(*ro)); + /* + * We use a distinguished lle value to say + * we don't want to cache an lle. + */ + ro->ro_lle = &lle_no_cache; /* set only when no route was passed in (ro == 0) and ip6route is used */ } ro_pmtu = ro; if (opt && opt->ip6po_rthdr) @@ -542,7 +548,18 @@ /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); - if (ro->ro_rt && fwd_tag == NULL) { + /* + * Validate route against routing table additions; + * a better/more specific route might have been added. + * Make sure address family is set in route. 
+ */ + if (inp) { + ro->ro_dst.sin6_family = AF_INET6; + RT_VALIDATE((struct route *)ro, &inp->inp_rt_cookie, fibnum); /* the macro reads ro_dst.sa_family, hence the family assignment just before */ + } + if (ro->ro_rt && fwd_tag == NULL && (ro->ro_rt->rt_flags & RTF_UP) && + ro->ro_dst.sin6_family == AF_INET6 && + IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst)) { rt = ro->ro_rt; ifp = ro->ro_rt->rt_ifp; } else { @@ -935,7 +952,7 @@ m->m_pkthdr.len); ifa_free(&ia6->ia_ifa); } - error = nd6_output_ifp(ifp, origifp, m, dst, NULL); + error = nd6_output_ifp(ifp, origifp, m, dst, ro); goto done; } @@ -1034,7 +1051,7 @@ counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } - error = nd6_output_ifp(ifp, origifp, m, dst, NULL); + error = nd6_output_ifp(ifp, origifp, m, dst, ro); } else m_freem(m); } Index: sys/netinet6/nd6.h =================================================================== --- sys/netinet6/nd6.h +++ sys/netinet6/nd6.h @@ -422,7 +422,7 @@ int nd6_add_ifa_lle(struct in6_ifaddr *); void nd6_rem_ifa_lle(struct in6_ifaddr *, int); int nd6_output_ifp(struct ifnet *, struct ifnet *, struct mbuf *, - struct sockaddr_in6 *, struct route *); + struct sockaddr_in6 *, struct route_in6 *); /* nd6_nbr.c */ void nd6_na_input(struct mbuf *, int, int); Index: sys/netinet6/nd6.c =================================================================== --- sys/netinet6/nd6.c +++ sys/netinet6/nd6.c @@ -1905,7 +1905,7 @@ int nd6_output_ifp(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m, - struct sockaddr_in6 *dst, struct route *ro) + struct sockaddr_in6 *dst, struct route_in6 *ro) { int error; int ip6len; @@ -1944,7 +1944,8 @@ if ((ifp->if_flags & IFF_LOOPBACK) == 0) origifp = ifp; - error = (*ifp->if_output)(origifp, m, (struct sockaddr *)dst, ro); + error = (*ifp->if_output)(origifp, m, (struct sockaddr *)dst, + (struct route *)ro); /* the route_in6 is passed through as a generic struct route */ return (error); }