diff --git a/sys/net/route.c b/sys/net/route.c index 0cf56fc18364..8198bc0883be 100644 --- a/sys/net/route.c +++ b/sys/net/route.c @@ -1,841 +1,842 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)route.c 8.3.1.1 (Berkeley) 2/23/95 * $FreeBSD$ */ /************************************************************************ * Note: In this file a 'fib' is a "forwarding information base" * * Which is the new name for an in kernel routing (next hop) table. * ***********************************************************************/ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mrouting.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include VNET_PCPUSTAT_DEFINE(struct rtstat, rtstat); VNET_PCPUSTAT_SYSINIT(rtstat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(rtstat); #endif EVENTHANDLER_LIST_DEFINE(rt_addrmsg); static int rt_ifdelroute(const struct rtentry *rt, const struct nhop_object *, void *arg); static int rt_exportinfo(struct rtentry *rt, struct nhop_object *nh, struct rt_addrinfo *info, int flags); /* * route initialization must occur before ip6_init2(), which happenas at * SI_ORDER_MIDDLE. */ static void route_init(void) { nhops_init(); } SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, NULL); struct rib_head * rt_table_init(int offset, int family, u_int fibnum) { struct rib_head *rh; rh = malloc(sizeof(struct rib_head), M_RTABLE, M_WAITOK | M_ZERO); /* TODO: These details should be hidded inside radix.c */ /* Init masks tree */ rn_inithead_internal(&rh->head, rh->rnh_nodes, offset); rn_inithead_internal(&rh->rmhead.head, rh->rmhead.mask_nodes, 0); rh->head.rnh_masks = &rh->rmhead; /* Save metadata associated with this routing table. 
*/ rh->rib_family = family; rh->rib_fibnum = fibnum; #ifdef VIMAGE rh->rib_vnet = curvnet; #endif tmproutes_init(rh); /* Init locks */ RIB_LOCK_INIT(rh); nhops_init_rib(rh); /* Init subscription system */ rib_init_subscriptions(rh); /* Finally, set base callbacks */ rh->rnh_addaddr = rn_addroute; rh->rnh_deladdr = rn_delete; rh->rnh_matchaddr = rn_match; rh->rnh_lookup = rn_lookup; rh->rnh_walktree = rn_walktree; rh->rnh_walktree_from = rn_walktree_from; return (rh); } static int rt_freeentry(struct radix_node *rn, void *arg) { struct radix_head * const rnh = arg; struct radix_node *x; x = (struct radix_node *)rn_delete(rn + 2, NULL, rnh); if (x != NULL) R_Free(x); return (0); } void rt_table_destroy(struct rib_head *rh) { RIB_WLOCK(rh); rh->rib_dying = true; RIB_WUNLOCK(rh); #ifdef FIB_ALGO fib_destroy_rib(rh); #endif tmproutes_destroy(rh); rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head); nhops_destroy_rib(rh); rib_destroy_subscriptions(rh); /* Assume table is already empty */ RIB_LOCK_DESTROY(rh); free(rh, M_RTABLE); } /* * Adds a temporal redirect entry to the routing table. * @fibnum: fib number * @dst: destination to install redirect to * @gateway: gateway to go via * @author: sockaddr of originating router, can be NULL * @ifp: interface to use for the redirected route * @flags: set of flags to add. Allowed: RTF_GATEWAY * @lifetime_sec: time in seconds to expire this redirect. * * Retuns 0 on success, errno otherwise. */ int rib_add_redirect(u_int fibnum, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *author, struct ifnet *ifp, int flags, int lifetime_sec) { struct rib_cmd_info rc; int error; struct rt_addrinfo info; struct rt_metrics rti_rmx; struct ifaddr *ifa; NET_EPOCH_ASSERT(); if (rt_tables_get_rnh(fibnum, dst->sa_family) == NULL) return (EAFNOSUPPORT); /* Verify the allowed flag mask. */ KASSERT(((flags & ~(RTF_GATEWAY)) == 0), ("invalid redirect flags: %x", flags)); flags |= RTF_HOST | RTF_DYNAMIC; /* Get the best ifa for the given interface and gateway. */ if ((ifa = ifaof_ifpforaddr(gateway, ifp)) == NULL) return (ENETUNREACH); bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_ifa = ifa; info.rti_ifp = ifp; info.rti_flags = flags; /* Setup route metrics to define expire time. */ bzero(&rti_rmx, sizeof(rti_rmx)); /* Set expire time as absolute. */ rti_rmx.rmx_expire = lifetime_sec + time_second; info.rti_mflags |= RTV_EXPIRE; info.rti_rmx = &rti_rmx; error = rib_action(fibnum, RTM_ADD, &info, &rc); if (error != 0) { /* TODO: add per-fib redirect stats. */ return (error); } RTSTAT_INC(rts_dynamic); /* Send notification of a route addition to userland. */ bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_AUTHOR] = author; rt_missmsg_fib(RTM_REDIRECT, &info, flags | RTF_UP, error, fibnum); return (0); } /* * Routing table ioctl interface. */ int rtioctl_fib(u_long req, caddr_t data, u_int fibnum) { /* * If more ioctl commands are added here, make sure the proper * super-user checks are being performed because it is possible for * prison-root to make it this far if raw sockets have been enabled * in jails. */ #ifdef INET /* Multicast goop, grrr... */ return mrt_ioctl ? 
mrt_ioctl(req, data, fibnum) : EOPNOTSUPP; #else /* INET */ return ENXIO; #endif /* INET */ } struct ifaddr * ifa_ifwithroute(int flags, const struct sockaddr *dst, const struct sockaddr *gateway, u_int fibnum) { struct ifaddr *ifa; NET_EPOCH_ASSERT(); if ((flags & RTF_GATEWAY) == 0) { /* * If we are adding a route to an interface, * and the interface is a pt to pt link * we should search for the destination * as our clue to the interface. Otherwise * we can use the local address. */ ifa = NULL; if (flags & RTF_HOST) ifa = ifa_ifwithdstaddr(dst, fibnum); if (ifa == NULL) ifa = ifa_ifwithaddr(gateway); } else { /* * If we are adding a route to a remote net * or host, the gateway may still be on the * other end of a pt to pt link. */ ifa = ifa_ifwithdstaddr(gateway, fibnum); } if (ifa == NULL) ifa = ifa_ifwithnet(gateway, 0, fibnum); if (ifa == NULL) { struct nhop_object *nh; nh = rib_lookup(fibnum, gateway, NHR_NONE, 0); /* * dismiss a gateway that is reachable only * through the default router */ if ((nh == NULL) || (nh->nh_flags & NHF_DEFAULT)) return (NULL); ifa = nh->nh_ifa; } if (ifa->ifa_addr->sa_family != dst->sa_family) { struct ifaddr *oifa = ifa; ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp); if (ifa == NULL) ifa = oifa; } return (ifa); } /* * Copy most of @rt data into @info. * * If @flags contains NHR_COPY, copies dst,netmask and gw to the * pointers specified by @info structure. Assume such pointers * are zeroed sockaddr-like structures with sa_len field initialized * to reflect size of the provided buffer. if no NHR_COPY is specified, * point dst,netmask and gw @info fields to appropriate @rt values. * * if @flags contains NHR_REF, do refcouting on rt_ifp and rt_ifa. * * Returns 0 on success. */ static int rt_exportinfo(struct rtentry *rt, struct nhop_object *nh, struct rt_addrinfo *info, int flags) { struct rt_metrics *rmx; struct sockaddr *src, *dst; int sa_len; if (flags & NHR_COPY) { /* Copy destination if dst is non-zero */ src = rt_key(rt); dst = info->rti_info[RTAX_DST]; sa_len = src->sa_len; if (dst != NULL) { if (src->sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_DST; } /* Copy mask if set && dst is non-zero */ src = rt_mask(rt); dst = info->rti_info[RTAX_NETMASK]; if (src != NULL && dst != NULL) { /* * Radix stores different value in sa_len, * assume rt_mask() to have the same length * as rt_key() */ if (sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_NETMASK; } /* Copy gateway is set && dst is non-zero */ src = &nh->gw_sa; dst = info->rti_info[RTAX_GATEWAY]; if ((nhop_get_rtflags(nh) & RTF_GATEWAY) && src != NULL && dst != NULL) { if (src->sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_GATEWAY; } } else { info->rti_info[RTAX_DST] = rt_key(rt); info->rti_addrs |= RTA_DST; if (rt_mask(rt) != NULL) { info->rti_info[RTAX_NETMASK] = rt_mask(rt); info->rti_addrs |= RTA_NETMASK; } if (nhop_get_rtflags(nh) & RTF_GATEWAY) { info->rti_info[RTAX_GATEWAY] = &nh->gw_sa; info->rti_addrs |= RTA_GATEWAY; } } rmx = info->rti_rmx; if (rmx != NULL) { info->rti_mflags |= RTV_MTU; rmx->rmx_mtu = nh->nh_mtu; } info->rti_flags = rt->rte_flags | nhop_get_rtflags(nh); info->rti_ifp = nh->nh_ifp; info->rti_ifa = nh->nh_ifa; if (flags & NHR_REF) { if_ref(info->rti_ifp); ifa_ref(info->rti_ifa); } return (0); } /* * Lookups up route entry for @dst in RIB database for fib @fibnum. * Exports entry data to @info using rt_exportinfo(). 
* * If @flags contains NHR_REF, refcouting is performed on rt_ifp and rt_ifa. * All references can be released later by calling rib_free_info(). * * Returns 0 on success. * Returns ENOENT for lookup failure, ENOMEM for export failure. */ int rib_lookup_info(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, uint32_t flowid, struct rt_addrinfo *info) { RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; struct rtentry *rt; struct nhop_object *nh; int error; KASSERT((fibnum < rt_numfibs), ("rib_lookup_rte: bad fibnum")); rh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rh == NULL) return (ENOENT); RIB_RLOCK(rh); rn = rh->rnh_matchaddr(__DECONST(void *, dst), &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rt = RNTORT(rn); nh = nhop_select(rt->rt_nhop, flowid); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(nh->nh_ifp)) { flags = (flags & NHR_REF) | NHR_COPY; error = rt_exportinfo(rt, nh, info, flags); RIB_RUNLOCK(rh); return (error); } } RIB_RUNLOCK(rh); return (ENOENT); } /* * Releases all references acquired by rib_lookup_info() when * called with NHR_REF flags. */ void rib_free_info(struct rt_addrinfo *info) { ifa_free(info->rti_ifa); if_rele(info->rti_ifp); } /* * Delete Routes for a Network Interface * * Called for each routing entry via the rnh->rnh_walktree() call above * to delete all route entries referencing a detaching network interface. * * Arguments: * rt pointer to rtentry * nh pointer to nhop * arg argument passed to rnh->rnh_walktree() - detaching interface * * Returns: * 0 successful * errno failed - reason indicated */ static int rt_ifdelroute(const struct rtentry *rt, const struct nhop_object *nh, void *arg) { struct ifnet *ifp = arg; if (nh->nh_ifp != ifp) return (0); /* * Protect (sorta) against walktree recursion problems * with cloned routes */ if ((rt->rte_flags & RTF_UP) == 0) return (0); return (1); } void rt_flushifroutes(struct ifnet *ifp) { rib_foreach_table_walk_del(AF_UNSPEC, rt_ifdelroute, ifp); } /* * Tries to extract interface from RTAX_IFP passed in rt_addrinfo. * Interface can be specified ether as interface index (sdl_index) or * the interface name (sdl_data). * * Returns found ifp or NULL */ static struct ifnet * info_get_ifp(struct rt_addrinfo *info) { const struct sockaddr_dl *sdl; sdl = (const struct sockaddr_dl *)info->rti_info[RTAX_IFP]; if (sdl->sdl_family != AF_LINK) return (NULL); if (sdl->sdl_index != 0) return (ifnet_byindex(sdl->sdl_index)); if (sdl->sdl_nlen > 0) { char if_name[IF_NAMESIZE]; if (sdl->sdl_nlen + offsetof(struct sockaddr_dl, sdl_data) > sdl->sdl_len) return (NULL); if (sdl->sdl_nlen >= IF_NAMESIZE) return (NULL); bzero(if_name, sizeof(if_name)); memcpy(if_name, sdl->sdl_data, sdl->sdl_nlen); return (ifunit(if_name)); } return (NULL); } /* * Calculates proper ifa/ifp for the cases when gateway AF is different * from dst AF. * * Returns 0 on success. */ __noinline static int rt_getifa_family(struct rt_addrinfo *info, uint32_t fibnum) { if (info->rti_ifp == NULL) { struct ifaddr *ifa = NULL; /* * No transmit interface specified. Guess it by checking gw sa. 
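/*
 * Illustrative sketch (hypothetical helper, not part of this change):
 * rib_lookup_info() above exports with NHR_COPY semantics, so the
 * caller is expected to pass zeroed sockaddr buffers whose sa_len
 * reflects the buffer size, as described in the rt_exportinfo()
 * comment.  A minimal caller could look like this.
 */
static int
example_lookup_copy(uint32_t fibnum, const struct sockaddr *dst)
{
	struct sockaddr_storage buf[3];
	struct rt_addrinfo info;

	bzero(&info, sizeof(info));
	bzero(buf, sizeof(buf));
	buf[0].ss_len = buf[1].ss_len = buf[2].ss_len = sizeof(buf[0]);
	info.rti_info[RTAX_DST] = (struct sockaddr *)&buf[0];
	info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&buf[1];
	info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&buf[2];

	/* flowid 0: no flow hint for multipath nexthop selection. */
	return (rib_lookup_info(fibnum, dst, NHR_NONE, 0, &info));
}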
*/ const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY]; ifa = ifa_ifwithroute(RTF_GATEWAY, gw, gw, fibnum); if (ifa == NULL) return (ENETUNREACH); info->rti_ifp = ifa->ifa_ifp; } /* Prefer address from outgoing interface */ info->rti_ifa = ifaof_ifpforaddr(info->rti_info[RTAX_DST], info->rti_ifp); #ifdef INET if (info->rti_ifa == NULL) { /* Use first found IPv4 address */ bool loopback_ok = info->rti_ifp->if_flags & IFF_LOOPBACK; info->rti_ifa = (struct ifaddr *)in_findlocal(fibnum, loopback_ok); } #endif if (info->rti_ifa == NULL) return (ENETUNREACH); return (0); } /* * Fills in rti_ifp and rti_ifa for the provided fib. * * Assume basic consistency checks are executed by callers: * RTAX_DST exists, if RTF_GATEWAY is set, RTAX_GATEWAY exists as well. */ int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) { const struct sockaddr *dst, *gateway, *ifaaddr; int error, flags; dst = info->rti_info[RTAX_DST]; gateway = info->rti_info[RTAX_GATEWAY]; ifaaddr = info->rti_info[RTAX_IFA]; flags = info->rti_flags; /* * ifp may be specified by sockaddr_dl * when protocol address is ambiguous. */ error = 0; /* If we have interface specified by RTAX_IFP address, try to use it */ if ((info->rti_ifp == NULL) && (info->rti_info[RTAX_IFP] != NULL)) info->rti_ifp = info_get_ifp(info); /* * If we have source address specified, try to find it * TODO: avoid enumerating all ifas on all interfaces. */ if (info->rti_ifa == NULL && ifaaddr != NULL) info->rti_ifa = ifa_ifwithaddr(ifaaddr); if ((info->rti_ifa == NULL) && ((info->rti_flags & RTF_GATEWAY) != 0) && (gateway->sa_family != dst->sa_family)) return (rt_getifa_family(info, fibnum)); if (info->rti_ifa == NULL) { const struct sockaddr *sa; /* * Most common use case for the userland-supplied routes. * * Choose sockaddr to select ifa. * -- if ifp is set -- * Order of preference: * 1) IFA address * 2) gateway address * Note: for interface routes link-level gateway address * is specified to indicate the interface index without * specifying RTF_GATEWAY. In this case, ignore gateway * Note: gateway AF may be different from dst AF. In this case, * ignore gateway * 3) final destination. * 4) if all of these fails, try to get at least link-level ifa. * -- else -- * try to lookup gateway or dst in the routing table to get ifa */ if (info->rti_info[RTAX_IFA] != NULL) sa = info->rti_info[RTAX_IFA]; else if ((info->rti_flags & RTF_GATEWAY) != 0 && gateway->sa_family == dst->sa_family) sa = gateway; else sa = dst; if (info->rti_ifp != NULL) { info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp); /* Case 4 */ if (info->rti_ifa == NULL && gateway != NULL) info->rti_ifa = ifaof_ifpforaddr(gateway, info->rti_ifp); } else if (dst != NULL && gateway != NULL) info->rti_ifa = ifa_ifwithroute(flags, dst, gateway, fibnum); else if (sa != NULL) info->rti_ifa = ifa_ifwithroute(flags, sa, sa, fibnum); } if (info->rti_ifa != NULL) { if (info->rti_ifp == NULL) info->rti_ifp = info->rti_ifa->ifa_ifp; } else error = ENETUNREACH; return (error); } void rt_updatemtu(struct ifnet *ifp) { struct rib_head *rnh; int mtu; int i, j; /* * Try to update rt_mtu for all routes using this interface * Unfortunately the only way to do this is to traverse all * routing tables in all fibs/domains. 
*/ for (i = 1; i <= AF_MAX; i++) { mtu = if_getmtu_family(ifp, i); for (j = 0; j < rt_numfibs; j++) { rnh = rt_tables_get_rnh(j, i); if (rnh == NULL) continue; nhops_update_ifmtu(rnh, ifp, mtu); } } } #if 0 int p_sockaddr(char *buf, int buflen, struct sockaddr *s); int rt_print(char *buf, int buflen, struct rtentry *rt); int p_sockaddr(char *buf, int buflen, struct sockaddr *s) { void *paddr = NULL; switch (s->sa_family) { case AF_INET: paddr = &((struct sockaddr_in *)s)->sin_addr; break; case AF_INET6: paddr = &((struct sockaddr_in6 *)s)->sin6_addr; break; } if (paddr == NULL) return (0); if (inet_ntop(s->sa_family, paddr, buf, buflen) == NULL) return (0); return (strlen(buf)); } int rt_print(char *buf, int buflen, struct rtentry *rt) { struct sockaddr *addr, *mask; int i = 0; addr = rt_key(rt); mask = rt_mask(rt); i = p_sockaddr(buf, buflen, addr); if (!(rt->rt_flags & RTF_HOST)) { buf[i++] = '/'; i += p_sockaddr(buf + i, buflen - i, mask); } if (rt->rt_flags & RTF_GATEWAY) { buf[i++] = '>'; i += p_sockaddr(buf + i, buflen - i, &rt->rt_nhop->gw_sa); } return (i); } #endif void -rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask) +rt_maskedcopy(const struct sockaddr *src, struct sockaddr *dst, + const struct sockaddr *netmask) { - u_char *cp1 = (u_char *)src; + const u_char *cp1 = (const u_char *)src; u_char *cp2 = (u_char *)dst; - u_char *cp3 = (u_char *)netmask; + const u_char *cp3 = (const u_char *)netmask; u_char *cplim = cp2 + *cp3; u_char *cplim2 = cp2 + *cp1; *cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */ cp3 += 2; if (cplim > cplim2) cplim = cplim2; while (cp2 < cplim) *cp2++ = *cp1++ & *cp3++; if (cp2 < cplim2) bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2)); } /* * Announce interface address arrival/withdraw * Returns 0 on success. */ int rt_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) { #if defined(INET) || defined(INET6) struct sockaddr *sa = ifa->ifa_addr; struct ifnet *ifp = ifa->ifa_ifp; #endif KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT((fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); EVENTHANDLER_DIRECT_INVOKE(rt_addrmsg, ifa, cmd); #ifdef INET if (sa->sa_family == AF_INET) { char addrstr[INET_ADDRSTRLEN]; char strbuf[INET_ADDRSTRLEN + 12]; inet_ntoa_r(((struct sockaddr_in *)sa)->sin_addr, addrstr); snprintf(strbuf, sizeof(strbuf), "address=%s", addrstr); devctl_notify("IFNET", ifp->if_xname, (cmd == RTM_ADD) ? "ADDR_ADD" : "ADDR_DEL", strbuf); } #endif #ifdef INET6 if (sa->sa_family == AF_INET6) { char addrstr[INET6_ADDRSTRLEN]; char strbuf[INET6_ADDRSTRLEN + 12]; ip6_sprintf(addrstr, IFA_IN6(ifa)); snprintf(strbuf, sizeof(strbuf), "address=%s", addrstr); devctl_notify("IFNET", ifp->if_xname, (cmd == RTM_ADD) ? "ADDR_ADD" : "ADDR_DEL", strbuf); } #endif if (V_rt_add_addr_allfibs) fibnum = RT_ALL_FIBS; return (rtsock_addrmsg(cmd, ifa, fibnum)); } /* * Announce kernel-originated route addition/removal to rtsock based on @rt data. * cmd: RTM_ cmd * @rt: valid rtentry * @nh: nhop object to announce * @fibnum: fib id or RT_ALL_FIBS * * Returns 0 on success. 
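/*
 * Illustrative sketch (not part of this change): a standalone FreeBSD
 * userland re-implementation of the rt_maskedcopy() logic above,
 * showing that masking 192.0.2.123 with 255.255.255.0 produces the
 * network address 192.0.2.0.  The sin_len field is BSD-specific.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

static void
masked_copy(const unsigned char *src, unsigned char *dst,
    const unsigned char *mask)
{
	const unsigned char *cp1 = src, *cp3 = mask;
	unsigned char *cp2 = dst;
	unsigned char *lim = dst + *cp3, *lim2 = dst + *cp1;

	*cp2++ = *cp1++;	/* sa_len */
	*cp2++ = *cp1++;	/* sa_family */
	cp3 += 2;
	if (lim > lim2)
		lim = lim2;
	while (cp2 < lim)
		*cp2++ = *cp1++ & *cp3++;
	if (cp2 < lim2)
		memset(cp2, 0, (size_t)(lim2 - cp2));
}

int
main(void)
{
	struct sockaddr_in dst, net, mask;
	char buf[INET_ADDRSTRLEN];

	memset(&dst, 0, sizeof(dst));
	memset(&net, 0, sizeof(net));
	memset(&mask, 0, sizeof(mask));
	dst.sin_len = mask.sin_len = sizeof(struct sockaddr_in);
	dst.sin_family = mask.sin_family = AF_INET;
	inet_pton(AF_INET, "192.0.2.123", &dst.sin_addr);
	inet_pton(AF_INET, "255.255.255.0", &mask.sin_addr);

	masked_copy((const unsigned char *)&dst, (unsigned char *)&net,
	    (const unsigned char *)&mask);
	/* Prints "192.0.2.0". */
	printf("%s\n", inet_ntop(AF_INET, &net.sin_addr, buf, sizeof(buf)));
	return (0);
}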
*/ int rt_routemsg(int cmd, struct rtentry *rt, struct nhop_object *nh, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); KASSERT(rt_key(rt) != NULL, (":%s: rt_key must be supplied", __func__)); return (rtsock_routemsg(cmd, rt, nh, fibnum)); } /* * Announce kernel-originated route addition/removal to rtsock based on @rt data. * cmd: RTM_ cmd * @info: addrinfo structure with valid data. * @fibnum: fib id or RT_ALL_FIBS * * Returns 0 on success. */ int rt_routemsg_info(int cmd, struct rt_addrinfo *info, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE || cmd == RTM_CHANGE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); KASSERT(info->rti_info[RTAX_DST] != NULL, (":%s: RTAX_DST must be supplied", __func__)); return (rtsock_routemsg_info(cmd, info, fibnum)); } diff --git a/sys/net/route.h b/sys/net/route.h index 46dc0c555218..931b284b664d 100644 --- a/sys/net/route.h +++ b/sys/net/route.h @@ -1,454 +1,455 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)route.h 8.4 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _NET_ROUTE_H_ #define _NET_ROUTE_H_ #include /* * Kernel resident routing tables. * * The routing tables are initialized when interface addresses * are set by making entries for all directly connected interfaces. */ /* * Struct route consiste of a destination address, * a route entry pointer, link-layer prepend data pointer along * with its length. */ struct route { struct nhop_object *ro_nh; struct llentry *ro_lle; /* * ro_prepend and ro_plen are only used for bpf to pass in a * preformed header. They are not cacheable. 
*/ char *ro_prepend; uint16_t ro_plen; uint16_t ro_flags; uint16_t ro_mtu; /* saved ro_rt mtu */ uint16_t spare; struct sockaddr ro_dst; }; #define RT_L2_ME_BIT 2 /* dst L2 addr is our address */ #define RT_MAY_LOOP_BIT 3 /* dst may require loop copy */ #define RT_HAS_HEADER_BIT 4 /* mbuf already have its header prepended */ #define RT_L2_ME (1 << RT_L2_ME_BIT) /* 0x0004 */ #define RT_MAY_LOOP (1 << RT_MAY_LOOP_BIT) /* 0x0008 */ #define RT_HAS_HEADER (1 << RT_HAS_HEADER_BIT) /* 0x0010 */ #define RT_REJECT 0x0020 /* Destination is reject */ #define RT_BLACKHOLE 0x0040 /* Destination is blackhole */ #define RT_HAS_GW 0x0080 /* Destination has GW */ #define RT_LLE_CACHE 0x0100 /* Cache link layer */ struct rt_metrics { u_long rmx_locks; /* Kernel must leave these values alone */ u_long rmx_mtu; /* MTU for this path */ u_long rmx_hopcount; /* max hops expected */ u_long rmx_expire; /* lifetime for route, e.g. redirect */ u_long rmx_recvpipe; /* inbound delay-bandwidth product */ u_long rmx_sendpipe; /* outbound delay-bandwidth product */ u_long rmx_ssthresh; /* outbound gateway buffer limit */ u_long rmx_rtt; /* estimated round trip time */ u_long rmx_rttvar; /* estimated rtt variance */ u_long rmx_pksent; /* packets sent using this route */ u_long rmx_weight; /* route weight */ u_long rmx_nhidx; /* route nexhop index */ u_long rmx_filler[2]; /* will be used for T/TCP later */ }; /* * rmx_rtt and rmx_rttvar are stored as microseconds; * RTTTOPRHZ(rtt) converts to a value suitable for use * by a protocol slowtimo counter. */ #define RTM_RTTUNIT 1000000 /* units for rtt, rttvar, as units per sec */ #define RTTTOPRHZ(r) ((r) / (RTM_RTTUNIT / PR_SLOWHZ)) /* lle state is exported in rmx_state rt_metrics field */ #define rmx_state rmx_weight /* default route weight */ #define RT_DEFAULT_WEIGHT 1 #define RT_MAX_WEIGHT 16777215 /* 3 bytes */ /* * Keep a generation count of routing table, incremented on route addition, * so we can invalidate caches. This is accessed without a lock, as precision * is not required. 
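/*
 * Illustrative sketch (hypothetical helper, not part of this change):
 * rmx_expire carries an absolute timestamp, which is how
 * rib_add_redirect() earlier in route.c uses it (lifetime_sec +
 * time_second, together with RTV_EXPIRE in rti_mflags).
 */
static void
example_set_expire(struct rt_addrinfo *info, struct rt_metrics *rmx,
    int lifetime_sec)
{
	bzero(rmx, sizeof(*rmx));
	rmx->rmx_expire = lifetime_sec + time_second;	/* absolute time */
	info->rti_mflags |= RTV_EXPIRE;
	info->rti_rmx = rmx;
}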
*/ typedef volatile u_int rt_gen_t; /* tree generation (for adds) */ #define RT_GEN(fibnum, af) rt_tables_get_gen(fibnum, af) #define RT_DEFAULT_FIB 0 /* Explicitly mark fib=0 restricted cases */ #define RT_ALL_FIBS -1 /* Announce event for every fib */ #ifdef _KERNEL VNET_DECLARE(uint32_t, _rt_numfibs); /* number of existing route tables */ #define V_rt_numfibs VNET(_rt_numfibs) /* temporary compat arg */ #define rt_numfibs V_rt_numfibs VNET_DECLARE(u_int, rt_add_addr_allfibs); /* Announce interfaces to all fibs */ #define V_rt_add_addr_allfibs VNET(rt_add_addr_allfibs) /* Calculate flowid for locally-originated packets */ #define V_fib_hash_outbound VNET(fib_hash_outbound) VNET_DECLARE(u_int, fib_hash_outbound); /* Outbound flowid generation rules */ #ifdef RSS #define fib4_calc_packet_hash xps_proto_software_hash_v4 #define fib6_calc_packet_hash xps_proto_software_hash_v6 #define CALC_FLOWID_OUTBOUND_SENDTO true #ifdef ROUTE_MPATH #define CALC_FLOWID_OUTBOUND V_fib_hash_outbound #else #define CALC_FLOWID_OUTBOUND false #endif #else /* !RSS */ #define fib4_calc_packet_hash fib4_calc_software_hash #define fib6_calc_packet_hash fib6_calc_software_hash #ifdef ROUTE_MPATH #define CALC_FLOWID_OUTBOUND_SENDTO V_fib_hash_outbound #define CALC_FLOWID_OUTBOUND V_fib_hash_outbound #else #define CALC_FLOWID_OUTBOUND_SENDTO false #define CALC_FLOWID_OUTBOUND false #endif #endif /* RSS */ #endif /* _KERNEL */ /* * We distinguish between routes to hosts and routes to networks, * preferring the former if available. For each route we infer * the interface to use from the gateway address supplied when * the route was entered. Routes that forward packets through * gateways are marked so that the output routines know to address the * gateway rather than the ultimate destination. */ #define RTF_UP 0x1 /* route usable */ #define RTF_GATEWAY 0x2 /* destination is a gateway */ #define RTF_HOST 0x4 /* host entry (net otherwise) */ #define RTF_REJECT 0x8 /* host or net unreachable */ #define RTF_DYNAMIC 0x10 /* created dynamically (by redirect) */ #define RTF_MODIFIED 0x20 /* modified dynamically (by redirect) */ #define RTF_DONE 0x40 /* message confirmed */ /* 0x80 unused, was RTF_DELCLONE */ /* 0x100 unused, was RTF_CLONING */ #define RTF_XRESOLVE 0x200 /* external daemon resolves name */ #define RTF_LLINFO 0x400 /* DEPRECATED - exists ONLY for backward compatibility */ #define RTF_LLDATA 0x400 /* used by apps to add/del L2 entries */ #define RTF_STATIC 0x800 /* manually added */ #define RTF_BLACKHOLE 0x1000 /* just discard pkts (during updates) */ #define RTF_PROTO2 0x4000 /* protocol specific routing flag */ #define RTF_PROTO1 0x8000 /* protocol specific routing flag */ /* 0x10000 unused, was RTF_PRCLONING */ /* 0x20000 unused, was RTF_WASCLONED */ #define RTF_PROTO3 0x40000 /* protocol specific routing flag */ #define RTF_FIXEDMTU 0x80000 /* MTU was explicitly specified */ #define RTF_PINNED 0x100000 /* route is immutable */ #define RTF_LOCAL 0x200000 /* route represents a local address */ #define RTF_BROADCAST 0x400000 /* route represents a bcast address */ #define RTF_MULTICAST 0x800000 /* route represents a mcast address */ /* 0x8000000 and up unassigned */ #define RTF_STICKY 0x10000000 /* always route dst->src */ /* 0x40000000 unused, was RTF_RNH_LOCKED */ #define RTF_GWFLAG_COMPAT 0x80000000 /* a compatibility bit for interacting with existing routing apps */ /* Mask of RTF flags that are allowed to be modified by RTM_CHANGE. 
*/ #define RTF_FMASK \ (RTF_PROTO1 | RTF_PROTO2 | RTF_PROTO3 | RTF_BLACKHOLE | \ RTF_REJECT | RTF_STATIC | RTF_STICKY) /* * fib_ nexthop API flags. */ /* Consumer-visible nexthop info flags */ #define NHF_MULTIPATH 0x0008 /* Nexhop is a nexthop group */ #define NHF_REJECT 0x0010 /* RTF_REJECT */ #define NHF_BLACKHOLE 0x0020 /* RTF_BLACKHOLE */ #define NHF_REDIRECT 0x0040 /* RTF_DYNAMIC|RTF_MODIFIED */ #define NHF_DEFAULT 0x0080 /* Default route */ #define NHF_BROADCAST 0x0100 /* RTF_BROADCAST */ #define NHF_GATEWAY 0x0200 /* RTF_GATEWAY */ #define NHF_HOST 0x0400 /* RTF_HOST */ /* Nexthop request flags */ #define NHR_NONE 0x00 /* empty flags field */ #define NHR_REF 0x01 /* reference nexhop */ #define NHR_NODEFAULT 0x02 /* uRPF: do not consider default route */ /* Control plane route request flags */ #define NHR_COPY 0x100 /* Copy rte data */ #define NHR_UNLOCKED 0x200 /* Do not lock table */ /* * Routing statistics. */ struct rtstat { uint64_t rts_badredirect; /* bogus redirect calls */ uint64_t rts_dynamic; /* routes created by redirects */ uint64_t rts_newgateway; /* routes modified by redirects */ uint64_t rts_unreach; /* lookups which failed */ uint64_t rts_wildcard; /* lookups satisfied by a wildcard */ uint64_t rts_nh_idx_alloc_failure; /* nexthop index alloc failure*/ uint64_t rts_nh_alloc_failure; /* nexthop allocation failure*/ uint64_t rts_add_failure; /* # of route addition failures */ uint64_t rts_add_retry; /* # of route addition retries */ uint64_t rts_del_failure; /* # of route deletion failure */ uint64_t rts_del_retry; /* # of route deletion retries */ }; /* * Structures for routing messages. */ struct rt_msghdr { u_short rtm_msglen; /* to skip over non-understood messages */ u_char rtm_version; /* future binary compatibility */ u_char rtm_type; /* message type */ u_short rtm_index; /* index for associated ifp */ u_short _rtm_spare1; int rtm_flags; /* flags, incl. kern & message, e.g. DONE */ int rtm_addrs; /* bitmask identifying sockaddrs in msg */ pid_t rtm_pid; /* identify sender */ int rtm_seq; /* for sender to identify action */ int rtm_errno; /* why failed */ int rtm_fmask; /* bitmask used in RTM_CHANGE message */ u_long rtm_inits; /* which metrics we are initializing */ struct rt_metrics rtm_rmx; /* metrics themselves */ }; #define RTM_VERSION 5 /* Up the ante and ignore older versions */ /* * Message types. * * The format for each message is annotated below using the following * identifiers: * * (1) struct rt_msghdr * (2) struct ifa_msghdr * (3) struct if_msghdr * (4) struct ifma_msghdr * (5) struct if_announcemsghdr * */ #define RTM_ADD 0x1 /* (1) Add Route */ #define RTM_DELETE 0x2 /* (1) Delete Route */ #define RTM_CHANGE 0x3 /* (1) Change Metrics or flags */ #define RTM_GET 0x4 /* (1) Report Metrics */ #define RTM_LOSING 0x5 /* (1) Kernel Suspects Partitioning */ #define RTM_REDIRECT 0x6 /* (1) Told to use different route */ #define RTM_MISS 0x7 /* (1) Lookup failed on this address */ #define RTM_LOCK 0x8 /* (1) fix specified metrics */ /* 0x9 */ /* 0xa */ #define RTM_RESOLVE 0xb /* (1) req to resolve dst to LL addr */ #define RTM_NEWADDR 0xc /* (2) address being added to iface */ #define RTM_DELADDR 0xd /* (2) address being removed from iface */ #define RTM_IFINFO 0xe /* (3) iface going up/down etc. 
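/*
 * Illustrative sketch (hypothetical helper, not part of this change):
 * one plausible application of RTF_FMASK on RTM_CHANGE, keeping the
 * non-changeable bits of the existing route and taking only the
 * RTF_FMASK bits from the request.
 */
static int
example_merge_rtflags(int old_flags, int req_flags)
{
	return ((old_flags & ~RTF_FMASK) | (req_flags & RTF_FMASK));
}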
*/ #define RTM_NEWMADDR 0xf /* (4) mcast group membership being added to if */ #define RTM_DELMADDR 0x10 /* (4) mcast group membership being deleted */ #define RTM_IFANNOUNCE 0x11 /* (5) iface arrival/departure */ #define RTM_IEEE80211 0x12 /* (5) IEEE80211 wireless event */ /* * Bitmask values for rtm_inits and rmx_locks. */ #define RTV_MTU 0x1 /* init or lock _mtu */ #define RTV_HOPCOUNT 0x2 /* init or lock _hopcount */ #define RTV_EXPIRE 0x4 /* init or lock _expire */ #define RTV_RPIPE 0x8 /* init or lock _recvpipe */ #define RTV_SPIPE 0x10 /* init or lock _sendpipe */ #define RTV_SSTHRESH 0x20 /* init or lock _ssthresh */ #define RTV_RTT 0x40 /* init or lock _rtt */ #define RTV_RTTVAR 0x80 /* init or lock _rttvar */ #define RTV_WEIGHT 0x100 /* init or lock _weight */ /* * Bitmask values for rtm_addrs. */ #define RTA_DST 0x1 /* destination sockaddr present */ #define RTA_GATEWAY 0x2 /* gateway sockaddr present */ #define RTA_NETMASK 0x4 /* netmask sockaddr present */ #define RTA_GENMASK 0x8 /* cloning mask sockaddr present */ #define RTA_IFP 0x10 /* interface name sockaddr present */ #define RTA_IFA 0x20 /* interface addr sockaddr present */ #define RTA_AUTHOR 0x40 /* sockaddr for author of redirect */ #define RTA_BRD 0x80 /* for NEWADDR, broadcast or p-p dest addr */ /* * Index offsets for sockaddr array for alternate internal encoding. */ #define RTAX_DST 0 /* destination sockaddr present */ #define RTAX_GATEWAY 1 /* gateway sockaddr present */ #define RTAX_NETMASK 2 /* netmask sockaddr present */ #define RTAX_GENMASK 3 /* cloning mask sockaddr present */ #define RTAX_IFP 4 /* interface name sockaddr present */ #define RTAX_IFA 5 /* interface addr sockaddr present */ #define RTAX_AUTHOR 6 /* sockaddr for author of redirect */ #define RTAX_BRD 7 /* for NEWADDR, broadcast or p-p dest addr */ #define RTAX_MAX 8 /* size of array to allocate */ struct rtentry; struct nhop_object; typedef int rib_filter_f_t(const struct rtentry *, const struct nhop_object *, void *); struct rt_addrinfo { int rti_addrs; /* Route RTF_ flags */ int rti_flags; /* Route RTF_ flags */ struct sockaddr *rti_info[RTAX_MAX]; /* Sockaddr data */ struct ifaddr *rti_ifa; /* value of rt_ifa addr */ struct ifnet *rti_ifp; /* route interface */ rib_filter_f_t *rti_filter; /* filter function */ void *rti_filterdata; /* filter parameters */ u_long rti_mflags; /* metrics RTV_ flags */ u_long rti_spare; /* Will be used for fib */ struct rt_metrics *rti_rmx; /* Pointer to route metrics */ }; /* * This macro returns the size of a struct sockaddr when passed * through a routing socket. Basically we round up sa_len to * a multiple of sizeof(long), with a minimum of sizeof(long). * The case sa_len == 0 should only apply to empty structures. */ #define SA_SIZE(sa) \ ( (((struct sockaddr *)(sa))->sa_len == 0) ? 
\ sizeof(long) : \ 1 + ( (((struct sockaddr *)(sa))->sa_len - 1) | (sizeof(long) - 1) ) ) #define sa_equal(a, b) ( \ (((const struct sockaddr *)(a))->sa_len == ((const struct sockaddr *)(b))->sa_len) && \ (bcmp((a), (b), ((const struct sockaddr *)(b))->sa_len) == 0)) #ifdef _KERNEL #define RT_LINK_IS_UP(ifp) (!((ifp)->if_capabilities & IFCAP_LINKSTATE) \ || (ifp)->if_link_state == LINK_STATE_UP) #define RO_NHFREE(_ro) do { \ if ((_ro)->ro_nh) { \ NH_FREE((_ro)->ro_nh); \ (_ro)->ro_nh = NULL; \ } \ } while (0) #define RO_INVALIDATE_CACHE(ro) do { \ if ((ro)->ro_lle != NULL) { \ LLE_FREE((ro)->ro_lle); \ (ro)->ro_lle = NULL; \ } \ if ((ro)->ro_nh != NULL) { \ NH_FREE((ro)->ro_nh); \ (ro)->ro_nh = NULL; \ } \ } while (0) #define RO_GET_FAMILY(ro, dst) ((ro) != NULL && \ (ro)->ro_flags & RT_HAS_GW \ ? (ro)->ro_dst.sa_family : (dst)->sa_family) /* * Validate a cached route based on a supplied cookie. If there is an * out-of-date cache, simply free it. Update the generation number * for the new allocation */ #define NH_VALIDATE(ro, cookiep, fibnum) do { \ rt_gen_t cookie = RT_GEN(fibnum, (ro)->ro_dst.sa_family); \ if (*(cookiep) != cookie) { \ RO_INVALIDATE_CACHE(ro); \ *(cookiep) = cookie; \ } \ } while (0) struct ifmultiaddr; struct rib_head; void rt_ieee80211msg(struct ifnet *, int, void *, size_t); void rt_ifannouncemsg(struct ifnet *, int); void rt_ifmsg(struct ifnet *); void rt_missmsg(int, struct rt_addrinfo *, int, int); void rt_missmsg_fib(int, struct rt_addrinfo *, int, int, int); int rt_addrmsg(int, struct ifaddr *, int); int rt_routemsg(int, struct rtentry *, struct nhop_object *, int); int rt_routemsg_info(int, struct rt_addrinfo *, int); void rt_newmaddrmsg(int, struct ifmultiaddr *); -void rt_maskedcopy(struct sockaddr *, struct sockaddr *, struct sockaddr *); +void rt_maskedcopy(const struct sockaddr *, struct sockaddr *, + const struct sockaddr *); struct rib_head *rt_table_init(int, int, u_int); void rt_table_destroy(struct rib_head *); u_int rt_tables_get_gen(uint32_t table, sa_family_t family); struct sockaddr *rtsock_fix_netmask(const struct sockaddr *dst, const struct sockaddr *smask, struct sockaddr_storage *dmask); void rt_updatemtu(struct ifnet *); void rt_flushifroutes(struct ifnet *ifp); /* XXX MRT NEW VERSIONS THAT USE FIBs * For now the protocol indepedent versions are the same as the AF_INET ones * but this will change.. */ int rtioctl_fib(u_long, caddr_t, u_int); int rib_lookup_info(uint32_t, const struct sockaddr *, uint32_t, uint32_t, struct rt_addrinfo *); void rib_free_info(struct rt_addrinfo *info); /* New API */ void rib_flush_routes_family(int family); struct nhop_object *rib_lookup(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, uint32_t flowid); const char *rib_print_family(int family); #endif #endif diff --git a/sys/net/route/route_ctl.c b/sys/net/route/route_ctl.c index 34a029746fa1..8f116cd65aa9 100644 --- a/sys/net/route/route_ctl.c +++ b/sys/net/route/route_ctl.c @@ -1,1627 +1,1604 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2020 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME route_ctl #define DEBUG_MAX_LEVEL LOG_DEBUG #include _DECLARE_DEBUG(LOG_INFO); /* * This file contains control plane routing tables functions. * * All functions assumes they are called in net epoch. */ struct rib_subscription { CK_STAILQ_ENTRY(rib_subscription) next; rib_subscription_cb_t *func; void *arg; struct rib_head *rnh; enum rib_subscription_type type; struct epoch_context epoch_ctx; }; static int add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc); static int change_route_byinfo(struct rib_head *rnh, struct rtentry *rt, struct rt_addrinfo *info, struct route_nhop_data *nhd_orig, struct rib_cmd_info *rc); static int add_route(struct rib_head *rnh, struct rtentry *rt, struct route_nhop_data *rnd, struct rib_cmd_info *rc); static int delete_route(struct rib_head *rnh, struct rtentry *rt, struct rib_cmd_info *rc); static int rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt, int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc); static void rib_notify(struct rib_head *rnh, enum rib_subscription_type type, struct rib_cmd_info *rc); static int get_prio_from_info(const struct rt_addrinfo *info); static int nhop_get_prio(const struct nhop_object *nh); static void destroy_subscription_epoch(epoch_context_t ctx); #ifdef ROUTE_MPATH static bool rib_can_multipath(struct rib_head *rh); #endif /* Per-vnet multipath routing configuration */ SYSCTL_DECL(_net_route); #define V_rib_route_multipath VNET(rib_route_multipath) #ifdef ROUTE_MPATH #define _MP_FLAGS CTLFLAG_RW #else #define _MP_FLAGS CTLFLAG_RD #endif VNET_DEFINE(u_int, rib_route_multipath) = 1; SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET, &VNET_NAME(rib_route_multipath), 0, "Enable route multipath"); #undef _MP_FLAGS #if defined(INET) && defined(INET6) FEATURE(ipv4_rfc5549_support, "Route IPv4 packets via IPv6 nexthops"); #define V_rib_route_ipv6_nexthop VNET(rib_route_ipv6_nexthop) VNET_DEFINE(u_int, rib_route_ipv6_nexthop) = 1; SYSCTL_UINT(_net_route, OID_AUTO, ipv6_nexthop, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(rib_route_ipv6_nexthop), 0, "Enable IPv4 route via IPv6 Next Hop address"); #endif /* Routing table UMA zone */ VNET_DEFINE_STATIC(uma_zone_t, rtzone); #define 
V_rtzone VNET(rtzone) /* Debug bits */ SYSCTL_NODE(_net_route, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); void vnet_rtzone_init(void) { V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } #ifdef VIMAGE void vnet_rtzone_destroy(void) { uma_zdestroy(V_rtzone); } #endif static void destroy_rtentry(struct rtentry *rt) { #ifdef VIMAGE struct nhop_object *nh = rt->rt_nhop; /* * At this moment rnh, nh_control may be already freed. * nhop interface may have been migrated to a different vnet. * Use vnet stored in the nexthop to delete the entry. */ #ifdef ROUTE_MPATH if (NH_IS_NHGRP(nh)) { const struct weightened_nhop *wn; uint32_t num_nhops; wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops); nh = wn[0].nh; } #endif CURVNET_SET(nhop_get_vnet(nh)); #endif /* Unreference nexthop */ nhop_free_any(rt->rt_nhop); uma_zfree(V_rtzone, rt); CURVNET_RESTORE(); } /* * Epoch callback indicating rtentry is safe to destroy */ static void destroy_rtentry_epoch(epoch_context_t ctx) { struct rtentry *rt; rt = __containerof(ctx, struct rtentry, rt_epoch_ctx); destroy_rtentry(rt); } /* * Schedule rtentry deletion */ static void rtfree(struct rtentry *rt) { KASSERT(rt != NULL, ("%s: NULL rt", __func__)); epoch_call(net_epoch_preempt, destroy_rtentry_epoch, &rt->rt_epoch_ctx); } static struct rib_head * get_rnh(uint32_t fibnum, const struct rt_addrinfo *info) { struct rib_head *rnh; struct sockaddr *dst; KASSERT((fibnum < rt_numfibs), ("rib_add_route: bad fibnum")); dst = info->rti_info[RTAX_DST]; rnh = rt_tables_get_rnh(fibnum, dst->sa_family); return (rnh); } #if defined(INET) && defined(INET6) static bool rib_can_ipv6_nexthop_address(struct rib_head *rh) { int result; CURVNET_SET(rh->rib_vnet); result = !!V_rib_route_ipv6_nexthop; CURVNET_RESTORE(); return (result); } #endif #ifdef ROUTE_MPATH static bool rib_can_multipath(struct rib_head *rh) { int result; CURVNET_SET(rh->rib_vnet); result = !!V_rib_route_multipath; CURVNET_RESTORE(); return (result); } /* * Check is nhop is multipath-eligible. * Avoid nhops without gateways and redirects. * * Returns 1 for multipath-eligible nexthop, * 0 otherwise. */ bool nhop_can_multipath(const struct nhop_object *nh) { if ((nh->nh_flags & NHF_MULTIPATH) != 0) return (1); if ((nh->nh_flags & NHF_GATEWAY) == 0) return (0); if ((nh->nh_flags & NHF_REDIRECT) != 0) return (0); return (1); } #endif static int get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight) { uint32_t weight; if (info->rti_mflags & RTV_WEIGHT) weight = info->rti_rmx->rmx_weight; else weight = default_weight; /* Keep upper 1 byte for adm distance purposes */ if (weight > RT_MAX_WEIGHT) weight = RT_MAX_WEIGHT; else if (weight == 0) weight = default_weight; return (weight); } bool rt_is_host(const struct rtentry *rt) { return (rt->rte_flags & RTF_HOST); } sa_family_t rt_get_family(const struct rtentry *rt) { const struct sockaddr *dst; dst = (const struct sockaddr *)rt_key_const(rt); return (dst->sa_family); } /* * Returns pointer to nexthop or nexthop group * associated with @rt */ struct nhop_object * rt_get_raw_nhop(const struct rtentry *rt) { return (rt->rt_nhop); } #ifdef INET /* * Stores IPv4 address and prefix length of @rt inside * @paddr and @plen. * @pscopeid is currently always set to 0. 
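/*
 * Illustrative sketch (hypothetical helper, not part of this change):
 * the effect of get_info_weight() above is to honor an RTV_WEIGHT
 * request, clamp it to RT_MAX_WEIGHT and never return zero.
 */
static uint32_t
example_pick_weight(uint32_t requested, uint32_t fallback)
{
	uint32_t weight = (requested != 0) ? requested : fallback;

	return (weight > RT_MAX_WEIGHT ? RT_MAX_WEIGHT : weight);
}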
*/ void rt_get_inet_prefix_plen(const struct rtentry *rt, struct in_addr *paddr, int *plen, uint32_t *pscopeid) { const struct sockaddr_in *dst; dst = (const struct sockaddr_in *)rt_key_const(rt); KASSERT((dst->sin_family == AF_INET), ("rt family is %d, not inet", dst->sin_family)); *paddr = dst->sin_addr; dst = (const struct sockaddr_in *)rt_mask_const(rt); if (dst == NULL) *plen = 32; else *plen = bitcount32(dst->sin_addr.s_addr); *pscopeid = 0; } /* * Stores IPv4 address and prefix mask of @rt inside * @paddr and @pmask. Sets mask to INADDR_ANY for host routes. * @pscopeid is currently always set to 0. */ void rt_get_inet_prefix_pmask(const struct rtentry *rt, struct in_addr *paddr, struct in_addr *pmask, uint32_t *pscopeid) { const struct sockaddr_in *dst; dst = (const struct sockaddr_in *)rt_key_const(rt); KASSERT((dst->sin_family == AF_INET), ("rt family is %d, not inet", dst->sin_family)); *paddr = dst->sin_addr; dst = (const struct sockaddr_in *)rt_mask_const(rt); if (dst == NULL) pmask->s_addr = INADDR_BROADCAST; else *pmask = dst->sin_addr; *pscopeid = 0; } #endif #ifdef INET6 static int inet6_get_plen(const struct in6_addr *addr) { return (bitcount32(addr->s6_addr32[0]) + bitcount32(addr->s6_addr32[1]) + bitcount32(addr->s6_addr32[2]) + bitcount32(addr->s6_addr32[3])); } /* * Stores IPv6 address and prefix length of @rt inside * @paddr and @plen. Addresses are returned in de-embedded form. * Scopeid is set to 0 for non-LL addresses. */ void rt_get_inet6_prefix_plen(const struct rtentry *rt, struct in6_addr *paddr, int *plen, uint32_t *pscopeid) { const struct sockaddr_in6 *dst; dst = (const struct sockaddr_in6 *)rt_key_const(rt); KASSERT((dst->sin6_family == AF_INET6), ("rt family is %d, not inet6", dst->sin6_family)); if (IN6_IS_SCOPE_LINKLOCAL(&dst->sin6_addr)) in6_splitscope(&dst->sin6_addr, paddr, pscopeid); else *paddr = dst->sin6_addr; dst = (const struct sockaddr_in6 *)rt_mask_const(rt); if (dst == NULL) *plen = 128; else *plen = inet6_get_plen(&dst->sin6_addr); } /* * Stores IPv6 address and prefix mask of @rt inside * @paddr and @pmask. Addresses are returned in de-embedded form. * Scopeid is set to 0 for non-LL addresses. */ void rt_get_inet6_prefix_pmask(const struct rtentry *rt, struct in6_addr *paddr, struct in6_addr *pmask, uint32_t *pscopeid) { const struct sockaddr_in6 *dst; dst = (const struct sockaddr_in6 *)rt_key_const(rt); KASSERT((dst->sin6_family == AF_INET6), ("rt family is %d, not inet", dst->sin6_family)); if (IN6_IS_SCOPE_LINKLOCAL(&dst->sin6_addr)) in6_splitscope(&dst->sin6_addr, paddr, pscopeid); else *paddr = dst->sin6_addr; dst = (const struct sockaddr_in6 *)rt_mask_const(rt); if (dst == NULL) memset(pmask, 0xFF, sizeof(struct in6_addr)); else *pmask = dst->sin6_addr; } #endif /* * Check if specified @gw matches gw data in the nexthop @nh. * * Returns true if matches, false otherwise. */ bool match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw) { if (nh->gw_sa.sa_family != gw->sa_family) return (false); switch (gw->sa_family) { case AF_INET: return (nh->gw4_sa.sin_addr.s_addr == ((const struct sockaddr_in *)gw)->sin_addr.s_addr); case AF_INET6: { const struct sockaddr_in6 *gw6; gw6 = (const struct sockaddr_in6 *)gw; /* * Currently (2020-09) IPv6 gws in kernel have their * scope embedded. Once this becomes false, this code * has to be revisited. 
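/*
 * Illustrative userland sketch (not part of this change): prefix
 * length as a population count of the mask, mirroring the
 * bitcount32()-based logic in rt_get_inet_prefix_plen() and
 * inet6_get_plen() above.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>

static int
plen4(struct in_addr mask)
{
	return (__builtin_popcount(mask.s_addr));
}

static int
plen6(const struct in6_addr *mask)
{
	int plen = 0;

	for (int i = 0; i < 16; i++)
		plen += __builtin_popcount(mask->s6_addr[i]);
	return (plen);
}

int
main(void)
{
	struct in_addr m4;
	struct in6_addr m6;

	inet_pton(AF_INET, "255.255.255.0", &m4);
	inet_pton(AF_INET6, "ffff:ffff::", &m6);
	/* Prints "24 32". */
	printf("%d %d\n", plen4(m4), plen6(&m6));
	return (0);
}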
*/ if (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr, &gw6->sin6_addr)) return (true); return (false); } case AF_LINK: { const struct sockaddr_dl *sdl; sdl = (const struct sockaddr_dl *)gw; return (nh->gwl_sa.sdl_index == sdl->sdl_index); } default: return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0); } /* NOTREACHED */ return (false); } struct gw_filter_data { const struct sockaddr *gw; int count; }; static int gw_filter_func(const struct rtentry *rt, const struct nhop_object *nh, void *_data) { struct gw_filter_data *gwd = (struct gw_filter_data *)_data; /* Return only first match to make rtsock happy */ if (match_nhop_gw(nh, gwd->gw) && gwd->count++ == 0) return (1); return (0); } /* * Checks if data in @info matches nexhop @nh. * * Returns 0 on success, * ESRCH if not matched, * ENOENT if filter function returned false */ int check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt, const struct nhop_object *nh) { const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY]; if (info->rti_filter != NULL) { if (info->rti_filter(rt, nh, info->rti_filterdata) == 0) return (ENOENT); else return (0); } if ((gw != NULL) && !match_nhop_gw(nh, gw)) return (ESRCH); return (0); } /* * Runs exact prefix match based on @dst and @netmask. * Returns matched @rtentry if found or NULL. * If rtentry was found, saves nexthop / weight value into @rnd. */ static struct rtentry * lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst, const struct sockaddr *netmask, struct route_nhop_data *rnd) { struct rtentry *rt; RIB_LOCK_ASSERT(rnh); rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head); if (rt != NULL) { rnd->rnd_nhop = rt->rt_nhop; rnd->rnd_weight = rt->rt_weight; } else { rnd->rnd_nhop = NULL; rnd->rnd_weight = 0; } return (rt); } struct rtentry * lookup_prefix_rt(struct rib_head *rnh, const struct rtentry *rt, struct route_nhop_data *rnd) { return (lookup_prefix_bysa(rnh, rt_key_const(rt), rt_mask_const(rt), rnd)); } /* * Runs exact prefix match based on dst/netmask from @info. * Assumes RIB lock is held. * Returns matched @rtentry if found or NULL. * If rtentry was found, saves nexthop / weight value into @rnd. */ struct rtentry * lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info, struct route_nhop_data *rnd) { struct rtentry *rt; rt = lookup_prefix_bysa(rnh, info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK], rnd); return (rt); } /* * Adds route defined by @info into the kernel table specified by @fibnum and * sa_family in @info->rti_info[RTAX_DST]. * * Returns 0 on success and fills in operation metadata into @rc. */ int rib_add_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc) { struct rib_head *rnh; int error; NET_EPOCH_ASSERT(); rnh = get_rnh(fibnum, info); if (rnh == NULL) return (EAFNOSUPPORT); /* * Check consistency between RTF_HOST flag and netmask * existence. */ if (info->rti_flags & RTF_HOST) info->rti_info[RTAX_NETMASK] = NULL; else if (info->rti_info[RTAX_NETMASK] == NULL) { FIB_RH_LOG(LOG_DEBUG, rnh, "error: no RTF_HOST and empty netmask"); return (EINVAL); } bzero(rc, sizeof(struct rib_cmd_info)); rc->rc_cmd = RTM_ADD; error = add_route_byinfo(rnh, info, rc); if (error == 0) rib_notify(rnh, RIB_NOTIFY_DELAYED, rc); return (error); } /* * Checks if @dst and @gateway is valid combination. * * Returns true if is valid, false otherwise. 
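/*
 * Illustrative sketch (hypothetical helper, not part of this change):
 * a minimal in-kernel caller of rib_add_route() above.  The caller
 * must already be inside the net epoch; the sockaddrs and flags are
 * placeholders.
 */
static int
example_add_gateway_route(uint32_t fibnum, struct sockaddr *dst,
    struct sockaddr *netmask, struct sockaddr *gw)
{
	struct rt_addrinfo info;
	struct rib_cmd_info rc;

	bzero(&info, sizeof(info));
	info.rti_flags = RTF_GATEWAY | RTF_STATIC;
	info.rti_info[RTAX_DST] = dst;
	/* Required unless RTF_HOST is set in rti_flags. */
	info.rti_info[RTAX_NETMASK] = netmask;
	info.rti_info[RTAX_GATEWAY] = gw;

	return (rib_add_route(fibnum, &info, &rc));
}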
*/ static bool check_gateway(struct rib_head *rnh, struct sockaddr *dst, struct sockaddr *gateway) { if (dst->sa_family == gateway->sa_family) return (true); else if (gateway->sa_family == AF_UNSPEC) return (true); else if (gateway->sa_family == AF_LINK) return (true); #if defined(INET) && defined(INET6) else if (dst->sa_family == AF_INET && gateway->sa_family == AF_INET6 && rib_can_ipv6_nexthop_address(rnh)) return (true); #endif else return (false); } /* * Creates rtentry and nexthop based on @info data. * Return 0 and fills in rtentry into @prt on success, * Note: rtentry mask will be set to RTAX_NETMASK, thus its pointer is required * to be stable till the end of the operation (radix rt insertion/change/removal). * return errno otherwise. */ +static struct rtentry * +create_rtentry(struct rib_head *rnh, const struct sockaddr *dst, + struct sockaddr *netmask) +{ + MPASS(dst->sa_len <= sizeof(((struct rtentry *)NULL)->rt_dstb)); + + struct rtentry *rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO); + if (rt == NULL) + return (NULL); + rt->rte_flags = RTF_UP | (netmask == NULL ? RTF_HOST : 0); + + /* Fill in dst, ensuring it's masked if needed. */ + if (netmask != NULL) { + rt_maskedcopy(dst, &rt->rt_dst, netmask); + } else + bcopy(dst, &rt->rt_dst, dst->sa_len); + rt_key(rt) = &rt->rt_dst; + /* Set netmask to the storage from info. It will be updated upon insertion */ + rt_mask(rt) = netmask; + + return (rt); +} + static int -create_rtentry(struct rib_head *rnh, struct rt_addrinfo *info, - struct rtentry **prt) +add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info, + struct rib_cmd_info *rc) { - struct sockaddr *dst, *ndst, *gateway, *netmask; - struct rtentry *rt; + struct nhop_object *nh_orig; + struct route_nhop_data rnd_orig, rnd_add; struct nhop_object *nh; + struct rtentry *rt, *rt_orig; + struct sockaddr *dst, *gateway, *netmask; int error, flags; dst = info->rti_info[RTAX_DST]; gateway = info->rti_info[RTAX_GATEWAY]; netmask = info->rti_info[RTAX_NETMASK]; flags = info->rti_flags; if ((flags & RTF_GATEWAY) && !gateway) { FIB_RH_LOG(LOG_DEBUG, rnh, "error: RTF_GATEWAY set with empty gw"); return (EINVAL); } if (dst && gateway && !check_gateway(rnh, dst, gateway)) { FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid dst/gateway family combination (%d, %d)", dst->sa_family, gateway->sa_family); return (EINVAL); } if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb)) { FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large: %d", dst->sa_len); return (EINVAL); } if (info->rti_ifa == NULL) { error = rt_getifa_fib(info, rnh->rib_fibnum); if (error) return (error); } - error = nhop_create_from_info(rnh, info, &nh); - if (error != 0) - return (error); - - rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO); - if (rt == NULL) { - nhop_free(nh); + if ((rt = create_rtentry(rnh, dst, netmask)) == NULL) return (ENOBUFS); - } - rt->rte_flags = (RTF_UP | flags) & RTE_RT_FLAG_MASK; - rt->rt_nhop = nh; - - /* Fill in dst */ - memcpy(&rt->rt_dst, dst, dst->sa_len); - rt_key(rt) = &rt->rt_dst; - /* - * point to the (possibly newly malloc'd) dest address. - */ - ndst = (struct sockaddr *)rt_key(rt); - - /* - * make sure it contains the value we want (masked if needed). - */ - if (netmask) { - rt_maskedcopy(dst, ndst, netmask); - } else - bcopy(dst, ndst, dst->sa_len); - /* Set netmask to the storage from info. It will be updated upon insertion */ - rt_mask(rt) = netmask; - - /* - * We use the ifa reference returned by rt_getifa_fib(). 
- * This moved from below so that rnh->rnh_addaddr() can - * examine the ifa and ifa->ifa_ifp if it so desires. - */ - rt->rt_weight = get_info_weight(info, RT_DEFAULT_WEIGHT); - - *prt = rt; - return (0); -} - -static int -add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info, - struct rib_cmd_info *rc) -{ - struct nhop_object *nh_orig; - struct route_nhop_data rnd_orig, rnd_add; - struct nhop_object *nh; - struct rtentry *rt, *rt_orig; - int error; - - error = create_rtentry(rnh, info, &rt); - if (error != 0) + error = nhop_create_from_info(rnh, info, &nh); + if (error != 0) { + uma_zfree(V_rtzone, rt); return (error); + } - rnd_add.rnd_nhop = rt->rt_nhop; - rnd_add.rnd_weight = rt->rt_weight; - nh = rt->rt_nhop; + rnd_add.rnd_nhop = nh; + rnd_add.rnd_weight = get_info_weight(info, RT_DEFAULT_WEIGHT); RIB_WLOCK(rnh); error = add_route(rnh, rt, &rnd_add, rc); if (error == 0) { RIB_WUNLOCK(rnh); return (0); } /* addition failed. Lookup prefix in the rib to determine the cause */ rt_orig = lookup_prefix(rnh, info, &rnd_orig); if (rt_orig == NULL) { /* No prefix -> rnh_addaddr() failed to allocate memory */ RIB_WUNLOCK(rnh); nhop_free(nh); uma_zfree(V_rtzone, rt); return (ENOMEM); } /* We have existing route in the RIB. */ nh_orig = rnd_orig.rnd_nhop; /* Check if new route has higher preference */ if (get_prio_from_info(info) > nhop_get_prio(nh_orig)) { /* Update nexthop to the new route */ change_route(rnh, rt_orig, &rnd_add, rc); RIB_WUNLOCK(rnh); uma_zfree(V_rtzone, rt); nhop_free(nh_orig); return (0); } RIB_WUNLOCK(rnh); #ifdef ROUTE_MPATH if (rib_can_multipath(rnh) && nhop_can_multipath(rnd_add.rnd_nhop) && nhop_can_multipath(rnd_orig.rnd_nhop)) error = add_route_mpath(rnh, info, rt, &rnd_add, &rnd_orig, rc); else #endif /* Unable to add - another route with the same preference exists */ error = EEXIST; /* * ROUTE_MPATH disabled: failed to add route, free both nhop and rt. * ROUTE_MPATH enabled: original nhop reference is unused in any case, * free rt only if not _adding_ new route to rib (e.g. the case * when initial lookup returned existing route, but then it got * deleted prior to multipath group insertion, leading to a simple * non-multipath add as a result). */ nhop_free(nh); if ((error != 0) || rc->rc_cmd != RTM_ADD) uma_zfree(V_rtzone, rt); return (error); } /* * Removes route defined by @info from the kernel table specified by @fibnum and * sa_family in @info->rti_info[RTAX_DST]. * * Returns 0 on success and fills in operation metadata into @rc. 
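/*
 * Illustrative sketch (hypothetical helper, not part of this change):
 * a minimal caller of rib_del_route() below.  Supplying RTAX_GATEWAY
 * narrows the deletion to the path via that gateway, using
 * gw_filter_func() defined earlier.
 */
static int
example_del_route(uint32_t fibnum, struct sockaddr *dst,
    struct sockaddr *netmask, struct sockaddr *gw)
{
	struct rt_addrinfo info;
	struct rib_cmd_info rc;

	bzero(&info, sizeof(info));
	info.rti_info[RTAX_DST] = dst;
	info.rti_info[RTAX_NETMASK] = netmask;
	info.rti_info[RTAX_GATEWAY] = gw;	/* optional path selector */

	return (rib_del_route(fibnum, &info, &rc));
}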

/*
 * Removes route defined by @info from the kernel table specified by @fibnum and
 * sa_family in @info->rti_info[RTAX_DST].
 *
 * Returns 0 on success and fills in operation metadata into @rc.
 */
int
rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
{
	struct rib_head *rnh;
	struct sockaddr *dst, *netmask;
	struct sockaddr_storage mdst;
	int error;

	NET_EPOCH_ASSERT();

	rnh = get_rnh(fibnum, info);
	if (rnh == NULL)
		return (EAFNOSUPPORT);

	bzero(rc, sizeof(struct rib_cmd_info));
	rc->rc_cmd = RTM_DELETE;

	dst = info->rti_info[RTAX_DST];
	netmask = info->rti_info[RTAX_NETMASK];

	if (netmask != NULL) {
		/* Ensure @dst is always properly masked */
		if (dst->sa_len > sizeof(mdst)) {
			FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large");
			return (EINVAL);
		}
		rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
		dst = (struct sockaddr *)&mdst;
	}

	rib_filter_f_t *filter_func = NULL;
	void *filter_arg = NULL;
	struct gw_filter_data gwd = { .gw = info->rti_info[RTAX_GATEWAY] };

	if (info->rti_filter != NULL) {
		filter_func = info->rti_filter;
		filter_arg = info->rti_filterdata;
	} else if (gwd.gw != NULL) {
		filter_func = gw_filter_func;
		filter_arg = &gwd;
	}

	int prio = get_prio_from_info(info);

	RIB_WLOCK(rnh);
	struct route_nhop_data rnd;
	struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
	if (rt != NULL) {
		error = rt_delete_conditional(rnh, rt, prio, filter_func,
		    filter_arg, rc);
	} else
		error = ESRCH;
	RIB_WUNLOCK(rnh);

	if (error != 0)
		return (error);

	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);

	if (rc->rc_cmd == RTM_DELETE)
		rtfree(rc->rc_rt);
#ifdef ROUTE_MPATH
	else {
		/*
		 * Deleting 1 path may result in RTM_CHANGE to
		 * a different mpath group/nhop.
		 * Free old mpath group.
		 */
		nhop_free_any(rc->rc_nh_old);
	}
#endif

	return (0);
}

/*
 * File-local concept for distinguishing between the normal and
 * RTF_PINNED routes that can override the "normal" one.
 */
#define	NH_PRIORITY_HIGH	2
#define	NH_PRIORITY_NORMAL	1
static int
get_prio_from_info(const struct rt_addrinfo *info)
{
	if (info->rti_flags & RTF_PINNED)
		return (NH_PRIORITY_HIGH);
	return (NH_PRIORITY_NORMAL);
}

static int
nhop_get_prio(const struct nhop_object *nh)
{
	if (NH_IS_PINNED(nh))
		return (NH_PRIORITY_HIGH);
	return (NH_PRIORITY_NORMAL);
}

/*
 * Conditionally unlinks rtentry paths from @rnh matching @cb.
 * Returns 0 on success with operation result stored in @rc.
 * On error, returns:
 * ESRCH - if prefix was not found or filter function failed to match
 * EADDRINUSE - if trying to delete higher priority route.
 */
static int
rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
    int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc)
{
	struct nhop_object *nh = rt->rt_nhop;
	struct route_nhop_data rnd;

#ifdef ROUTE_MPATH
	if (NH_IS_NHGRP(nh)) {
		struct nhgrp_object *nhg = (struct nhgrp_object *)nh;
		int error;

		if (cb == NULL)
			return (ESRCH);
		error = nhgrp_get_filtered_group(rnh, rt, nhg, cb, cbdata, &rnd);
		if (error == 0) {
			if (rnd.rnd_nhgrp == nhg) {
				/* No match, unreference new group and return. */
				nhop_free_any(rnd.rnd_nhop);
				return (ESRCH);
			}
			error = change_route(rnh, rt, &rnd, rc);
		}
		return (error);
	}
#endif
	if (cb != NULL && !cb(rt, nh, cbdata))
		return (ESRCH);

	if (prio < nhop_get_prio(nh))
		return (EADDRINUSE);

	return (delete_route(rnh, rt, rc));
}
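
/*
 * Illustrative sketch (not part of this change): deleting only the path(s) of
 * a prefix that point at a particular gateway.  rib_del_route() above selects
 * gw_filter_func automatically when RTAX_GATEWAY is set and no explicit
 * rti_filter callback is supplied.  The helper name and parameters are
 * hypothetical.
 */
#if 0
static int
example_del_route_via_gw(uint32_t fibnum, struct sockaddr *dst,
    struct sockaddr *netmask, struct sockaddr *gw)
{
	struct rt_addrinfo info;
	struct rib_cmd_info rc;
	struct epoch_tracker et;
	int error;

	bzero(&info, sizeof(info));
	info.rti_info[RTAX_DST] = dst;
	info.rti_info[RTAX_NETMASK] = netmask;
	/* Only paths whose nexthop matches @gw are removed. */
	info.rti_info[RTAX_GATEWAY] = gw;

	NET_EPOCH_ENTER(et);
	error = rib_action(fibnum, RTM_DELETE, &info, &rc);
	NET_EPOCH_EXIT(et);

	/* Trying to delete a pinned route this way yields EADDRINUSE. */
	return (error);
}
#endif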

int
rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
    struct rib_cmd_info *rc)
{
	RIB_RLOCK_TRACKER;
	struct route_nhop_data rnd_orig;
	struct rib_head *rnh;
	struct rtentry *rt;
	int error;

	NET_EPOCH_ASSERT();

	rnh = get_rnh(fibnum, info);
	if (rnh == NULL)
		return (EAFNOSUPPORT);

	bzero(rc, sizeof(struct rib_cmd_info));
	rc->rc_cmd = RTM_CHANGE;

	/* Check if updated gateway exists */
	if ((info->rti_flags & RTF_GATEWAY) &&
	    (info->rti_info[RTAX_GATEWAY] == NULL)) {
		/*
		 * route(8) adds RTF_GATEWAY flag if -interface is not set.
		 * Remove RTF_GATEWAY to enforce consistency and maintain
		 * compatibility.
		 */
		info->rti_flags &= ~RTF_GATEWAY;
	}

	/*
	 * A route change is done in multiple steps, with the lock dropped
	 * and reacquired in between.  When multiple processes change the
	 * same route concurrently, the route may be modified between the
	 * steps.  Address it by retrying the operation multiple times
	 * before failing.
	 */
	RIB_RLOCK(rnh);
	rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
	    info->rti_info[RTAX_NETMASK], &rnh->head);
	if (rt == NULL) {
		RIB_RUNLOCK(rnh);
		return (ESRCH);
	}

	rnd_orig.rnd_nhop = rt->rt_nhop;
	rnd_orig.rnd_weight = rt->rt_weight;

	RIB_RUNLOCK(rnh);

	for (int i = 0; i < RIB_MAX_RETRIES; i++) {
		error = change_route_byinfo(rnh, rt, info, &rnd_orig, rc);
		if (error != EAGAIN)
			break;
	}

	return (error);
}

static int
change_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
    struct nhop_object *nh_orig, struct nhop_object **nh_new)
{
	int error;

	/*
	 * New gateway could require new ifaddr, ifp;
	 * flags may also be different; ifp may be specified
	 * by ll sockaddr when protocol address is ambiguous
	 */
	if (((nh_orig->nh_flags & NHF_GATEWAY) &&
	    info->rti_info[RTAX_GATEWAY] != NULL) ||
	    info->rti_info[RTAX_IFP] != NULL ||
	    (info->rti_info[RTAX_IFA] != NULL &&
	     !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) {
		error = rt_getifa_fib(info, rnh->rib_fibnum);
		if (error != 0) {
			info->rti_ifa = NULL;
			return (error);
		}
	}

	error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new);
	info->rti_ifa = NULL;

	return (error);
}

#ifdef ROUTE_MPATH
static int
change_mpath_route(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
    struct rib_cmd_info *rc)
{
	int error = 0, found_idx = 0;
	struct nhop_object *nh_orig = NULL, *nh_new;
	struct route_nhop_data rnd_new = {};
	const struct weightened_nhop *wn = NULL;
	struct weightened_nhop *wn_new;
	uint32_t num_nhops;

	wn = nhgrp_get_nhops(rnd_orig->rnd_nhgrp, &num_nhops);
	for (int i = 0; i < num_nhops; i++) {
		if (check_info_match_nhop(info, NULL, wn[i].nh) == 0) {
			nh_orig = wn[i].nh;
			found_idx = i;
			break;
		}
	}

	if (nh_orig == NULL)
		return (ESRCH);

	error = change_nhop(rnh, info, nh_orig, &nh_new);
	if (error != 0)
		return (error);

	wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop),
	    M_TEMP, M_NOWAIT | M_ZERO);
	if (wn_new == NULL) {
		nhop_free(nh_new);
		return (EAGAIN);
	}

	memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop));
	wn_new[found_idx].nh = nh_new;
	wn_new[found_idx].weight = get_info_weight(info, wn[found_idx].weight);

	error = nhgrp_get_group(rnh, wn_new, num_nhops, &rnd_new.rnd_nhgrp);
	nhop_free(nh_new);
	free(wn_new, M_TEMP);

	if (error != 0)
		return (error);

	error = change_route_conditional(rnh, rt, rnd_orig,
	    &rnd_new, rc);

	return (error);
}
#endif

static int
change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
    struct rib_cmd_info *rc)
{
	int error = 0;
	struct nhop_object *nh_orig;
	struct route_nhop_data rnd_new;

	nh_orig = rnd_orig->rnd_nhop;
	if (nh_orig == NULL)
		return (ESRCH);

#ifdef ROUTE_MPATH
	if (NH_IS_NHGRP(nh_orig))
		return (change_mpath_route(rnh, rt, info, rnd_orig, rc));
#endif

	rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight);
	error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop);
	if (error != 0)
		return (error);
	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);

	return (error);
}

/*
 * Insert @rt with nhop data from @rnd to @rnh.
 * Returns 0 on success and stores operation results in @rc.
 */
static int
add_route(struct rib_head *rnh, struct rtentry *rt,
    struct route_nhop_data *rnd, struct rib_cmd_info *rc)
{
	struct radix_node *rn;

	RIB_WLOCK_ASSERT(rnh);

	rt->rt_nhop = rnd->rnd_nhop;
	rt->rt_weight = rnd->rnd_weight;
	rn = rnh->rnh_addaddr(rt_key(rt), rt_mask_const(rt), &rnh->head,
	    rt->rt_nodes);

	if (rn != NULL) {
		if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
			tmproutes_update(rnh, rt, rnd->rnd_nhop);

		/* Finalize notification */
		rib_bump_gen(rnh);
		rnh->rnh_prefixes++;

		rc->rc_cmd = RTM_ADD;
		rc->rc_rt = rt;
		rc->rc_nh_old = NULL;
		rc->rc_nh_new = rnd->rnd_nhop;
		rc->rc_nh_weight = rnd->rnd_weight;

		rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
		return (0);
	}

	/* Existing route or memory allocation failure. */
	return (EEXIST);
}

/*
 * Unconditionally deletes @rt from @rnh.
 */
static int
delete_route(struct rib_head *rnh, struct rtentry *rt,
    struct rib_cmd_info *rc)
{
	RIB_WLOCK_ASSERT(rnh);

	/* Route deletion requested. */
	struct radix_node *rn;

	rn = rnh->rnh_deladdr(rt_key_const(rt), rt_mask_const(rt), &rnh->head);
	if (rn == NULL)
		return (ESRCH);
	rt = RNTORT(rn);
	rt->rte_flags &= ~RTF_UP;

	rib_bump_gen(rnh);
	rnh->rnh_prefixes--;

	rc->rc_cmd = RTM_DELETE;
	rc->rc_rt = rt;
	rc->rc_nh_old = rt->rt_nhop;
	rc->rc_nh_new = NULL;
	rc->rc_nh_weight = rt->rt_weight;

	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);

	return (0);
}

/*
 * Switch @rt nhop/weight to the ones specified in @rnd.
 * Returns 0 on success.
 */
int
change_route(struct rib_head *rnh, struct rtentry *rt,
    struct route_nhop_data *rnd, struct rib_cmd_info *rc)
{
	struct nhop_object *nh_orig;

	RIB_WLOCK_ASSERT(rnh);

	nh_orig = rt->rt_nhop;

	if (rnd->rnd_nhop == NULL)
		return (delete_route(rnh, rt, rc));

	/* Changing nexthop & weight to a new one */
	rt->rt_nhop = rnd->rnd_nhop;
	rt->rt_weight = rnd->rnd_weight;
	if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
		tmproutes_update(rnh, rt, rnd->rnd_nhop);

	/* Finalize notification */
	rib_bump_gen(rnh);

	rc->rc_cmd = RTM_CHANGE;
	rc->rc_rt = rt;
	rc->rc_nh_old = nh_orig;
	rc->rc_nh_new = rnd->rnd_nhop;
	rc->rc_nh_weight = rnd->rnd_weight;

	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);

	return (0);
}
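
/*
 * Illustrative sketch (not part of this change): pointing an existing prefix
 * at a new gateway via rib_action(RTM_CHANGE).  rib_change_route() retries
 * the conditional update internally (up to RIB_MAX_RETRIES) when a concurrent
 * writer modifies the route between the lookup and the swap, so the caller
 * only sees the final result in @rc: rc_cmd, rc_nh_old and rc_nh_new describe
 * the transition.  The helper name and parameters are hypothetical.
 */
#if 0
static int
example_change_gateway(uint32_t fibnum, struct sockaddr *dst,
    struct sockaddr *netmask, struct sockaddr *new_gw)
{
	struct rt_addrinfo info;
	struct rib_cmd_info rc;
	struct epoch_tracker et;
	int error;

	bzero(&info, sizeof(info));
	info.rti_flags = RTF_GATEWAY;
	info.rti_info[RTAX_DST] = dst;
	info.rti_info[RTAX_NETMASK] = netmask;
	info.rti_info[RTAX_GATEWAY] = new_gw;

	NET_EPOCH_ENTER(et);
	error = rib_action(fibnum, RTM_CHANGE, &info, &rc);
	NET_EPOCH_EXIT(et);

	return (error);
}
#endif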

/*
 * Conditionally update route nhop/weight IFF data in @rnd_orig is
 * consistent with the current route data.
 * Nexthop in @rnd_new is consumed.
 */
int
change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
    struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_new,
    struct rib_cmd_info *rc)
{
	struct rtentry *rt_new;
	int error = 0;

#if DEBUG_MAX_LEVEL >= LOG_DEBUG2
	{
		char buf_old[NHOP_PRINT_BUFSIZE], buf_new[NHOP_PRINT_BUFSIZE];
		nhop_print_buf_any(rnd_orig->rnd_nhop, buf_old, NHOP_PRINT_BUFSIZE);
		nhop_print_buf_any(rnd_new->rnd_nhop, buf_new, NHOP_PRINT_BUFSIZE);
		FIB_LOG(LOG_DEBUG2, rnh->rib_fibnum, rnh->rib_family,
		    "trying change %s -> %s", buf_old, buf_new);
	}
#endif
	RIB_WLOCK(rnh);

	struct route_nhop_data rnd;
	rt_new = lookup_prefix_rt(rnh, rt, &rnd);

	if (rt_new == NULL) {
		if (rnd_orig->rnd_nhop == NULL)
			error = add_route(rnh, rt, rnd_new, rc);
		else {
			/*
			 * Prefix does not exist, which was not our assumption.
			 * Update @rnd_orig with the new data and return
			 */
			rnd_orig->rnd_nhop = NULL;
			rnd_orig->rnd_weight = 0;
			error = EAGAIN;
		}
	} else {
		/* Prefix exists, try to update */
		if (rnd_orig->rnd_nhop == rt_new->rt_nhop) {
			/*
			 * Nhop/mpath group hasn't changed. Flip
			 * to the new precalculated one and return
			 */
			error = change_route(rnh, rt_new, rnd_new, rc);
		} else {
			/* Update and retry */
			rnd_orig->rnd_nhop = rt_new->rt_nhop;
			rnd_orig->rnd_weight = rt_new->rt_weight;
			error = EAGAIN;
		}
	}

	RIB_WUNLOCK(rnh);

	if (error == 0) {
		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);

		if (rnd_orig->rnd_nhop != NULL)
			nhop_free_any(rnd_orig->rnd_nhop);
	} else {
		if (rnd_new->rnd_nhop != NULL)
			nhop_free_any(rnd_new->rnd_nhop);
	}

	return (error);
}

/*
 * Performs modification of routing table specified by @action.
 * Table is specified by @fibnum and sa_family in @info->rti_info[RTAX_DST].
 * Needs to be run in network epoch.
 *
 * Returns 0 on success and fills in @rc with action result.
 */
int
rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info,
    struct rib_cmd_info *rc)
{
	int error;

	switch (action) {
	case RTM_ADD:
		error = rib_add_route(fibnum, info, rc);
		break;
	case RTM_DELETE:
		error = rib_del_route(fibnum, info, rc);
		break;
	case RTM_CHANGE:
		error = rib_change_route(fibnum, info, rc);
		break;
	default:
		error = ENOTSUP;
	}

	return (error);
}

struct rt_delinfo {
	struct rib_head *rnh;
	struct rtentry *head;
	rib_filter_f_t *filter_f;
	void *filter_arg;
	int prio;
	struct rib_cmd_info rc;
};

/*
 * Conditionally unlinks rtentries or paths from radix tree based
 * on the callback data passed in @arg.
 */
static int
rt_checkdelroute(struct radix_node *rn, void *arg)
{
	struct rt_delinfo *di = (struct rt_delinfo *)arg;
	struct rtentry *rt = (struct rtentry *)rn;

	if (rt_delete_conditional(di->rnh, rt, di->prio,
	    di->filter_f, di->filter_arg, &di->rc) != 0)
		return (0);

	/*
	 * Add deleted rtentries to the list to GC them
	 * after dropping the lock.
	 *
	 * XXX: Delayed notifications not implemented
	 * for nexthop updates.
	 */
	if (di->rc.rc_cmd == RTM_DELETE) {
		/* Add to the list and return */
		rt->rt_chain = di->head;
		di->head = rt;
#ifdef ROUTE_MPATH
	} else {
		/*
		 * RTM_CHANGE to a different nexthop or nexthop group.
		 * Free old multipath group.
		 */
		nhop_free_any(di->rc.rc_nh_old);
#endif
	}

	return (0);
}
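
/*
 * Illustrative sketch (not part of this change): a caller-supplied
 * rib_filter_f_t driving rib_walk_del() below to purge every IPv4 route
 * whose nexthop transmits via a given interface.  The callback shape
 * mirrors the cb(rt, nh, cbdata) invocation in rt_delete_conditional();
 * the exact typedef lives in the route_ctl header.  Accessing the
 * interface as nh->nh_ifp is an assumption of this sketch, and the
 * function names are hypothetical.
 */
#if 0
static int
example_match_ifp(const struct rtentry *rt, const struct nhop_object *nh,
    void *arg)
{
	const struct ifnet *ifp = arg;

	/* Non-zero return value marks the path for deletion. */
	return (nh->nh_ifp == ifp);
}

static void
example_purge_routes_by_ifp(uint32_t fibnum, struct ifnet *ifp)
{
	/* Delete matching paths and send RTM_DELETE rtsock reports. */
	rib_walk_del(fibnum, AF_INET, example_match_ifp, ifp, true);
}
#endif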

/*
 * Iterates over a routing table specified by @fibnum and @family and
 * deletes elements marked by @filter_f.
 * @fibnum: rtable id
 * @family: AF_ address family
 * @filter_f: function returning non-zero value for items to delete
 * @arg: data to pass to the @filter_f function
 * @report: true if rtsock notification is needed.
 */
void
rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f,
    void *filter_arg, bool report)
{
	struct rib_head *rnh;
	struct rtentry *rt;
	struct nhop_object *nh;
	struct epoch_tracker et;

	rnh = rt_tables_get_rnh(fibnum, family);
	if (rnh == NULL)
		return;

	struct rt_delinfo di = {
		.rnh = rnh,
		.filter_f = filter_f,
		.filter_arg = filter_arg,
		.prio = NH_PRIORITY_NORMAL,
	};

	NET_EPOCH_ENTER(et);

	RIB_WLOCK(rnh);
	rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di);
	RIB_WUNLOCK(rnh);

	/* We might have something to reclaim. */
	bzero(&di.rc, sizeof(di.rc));
	di.rc.rc_cmd = RTM_DELETE;
	while (di.head != NULL) {
		rt = di.head;
		di.head = rt->rt_chain;
		rt->rt_chain = NULL;
		nh = rt->rt_nhop;

		di.rc.rc_rt = rt;
		di.rc.rc_nh_old = nh;
		rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc);

		if (report) {
#ifdef ROUTE_MPATH
			struct nhgrp_object *nhg;
			const struct weightened_nhop *wn;
			uint32_t num_nhops;
			if (NH_IS_NHGRP(nh)) {
				nhg = (struct nhgrp_object *)nh;
				wn = nhgrp_get_nhops(nhg, &num_nhops);
				for (int i = 0; i < num_nhops; i++)
					rt_routemsg(RTM_DELETE, rt, wn[i].nh, fibnum);
			} else
#endif
			rt_routemsg(RTM_DELETE, rt, nh, fibnum);
		}

		rtfree(rt);
	}

	NET_EPOCH_EXIT(et);
}

static int
rt_delete_unconditional(struct radix_node *rn, void *arg)
{
	struct rtentry *rt = RNTORT(rn);
	struct rib_head *rnh = (struct rib_head *)arg;

	rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), &rnh->head);
	if (RNTORT(rn) == rt)
		rtfree(rt);

	return (0);
}

/*
 * Removes all routes from the routing table without executing notifications.
 * rtentries will be removed after the end of the current epoch.
 */
static void
rib_flush_routes(struct rib_head *rnh)
{
	RIB_WLOCK(rnh);
	rnh->rnh_walktree(&rnh->head, rt_delete_unconditional, rnh);
	RIB_WUNLOCK(rnh);
}

void
rib_flush_routes_family(int family)
{
	struct rib_head *rnh;

	for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
		if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL)
			rib_flush_routes(rnh);
	}
}

const char *
rib_print_family(int family)
{
	switch (family) {
	case AF_INET:
		return ("inet");
	case AF_INET6:
		return ("inet6");
	case AF_LINK:
		return ("link");
	}
	return ("unknown");
}

static void
rib_notify(struct rib_head *rnh, enum rib_subscription_type type,
    struct rib_cmd_info *rc)
{
	struct rib_subscription *rs;

	CK_STAILQ_FOREACH(rs, &rnh->rnh_subscribers, next) {
		if (rs->type == type)
			rs->func(rnh, rc, rs->arg);
	}
}

static struct rib_subscription *
allocate_subscription(rib_subscription_cb_t *f, void *arg,
    enum rib_subscription_type type, bool waitok)
{
	struct rib_subscription *rs;
	int flags = M_ZERO | (waitok ? M_WAITOK : M_NOWAIT);

	rs = malloc(sizeof(struct rib_subscription), M_RTABLE, flags);
	if (rs == NULL)
		return (NULL);

	rs->func = f;
	rs->arg = arg;
	rs->type = type;

	return (rs);
}

/*
 * Subscribe for the changes in the routing table specified by @fibnum and
 * @family.
 *
 * Returns pointer to the subscription structure on success.
 */
struct rib_subscription *
rib_subscribe(uint32_t fibnum, int family, rib_subscription_cb_t *f, void *arg,
    enum rib_subscription_type type, bool waitok)
{
	struct rib_head *rnh;
	struct epoch_tracker et;

	NET_EPOCH_ENTER(et);
	KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__));
	rnh = rt_tables_get_rnh(fibnum, family);
	NET_EPOCH_EXIT(et);

	return (rib_subscribe_internal(rnh, f, arg, type, waitok));
}

struct rib_subscription *
rib_subscribe_internal(struct rib_head *rnh, rib_subscription_cb_t *f, void *arg,
    enum rib_subscription_type type, bool waitok)
{
	struct rib_subscription *rs;
	struct epoch_tracker et;

	if ((rs = allocate_subscription(f, arg, type, waitok)) == NULL)
		return (NULL);
	rs->rnh = rnh;

	NET_EPOCH_ENTER(et);
	RIB_WLOCK(rnh);
	CK_STAILQ_INSERT_HEAD(&rnh->rnh_subscribers, rs, next);
	RIB_WUNLOCK(rnh);
	NET_EPOCH_EXIT(et);

	return (rs);
}

struct rib_subscription *
rib_subscribe_locked(struct rib_head *rnh, rib_subscription_cb_t *f, void *arg,
    enum rib_subscription_type type)
{
	struct rib_subscription *rs;

	NET_EPOCH_ASSERT();
	RIB_WLOCK_ASSERT(rnh);

	if ((rs = allocate_subscription(f, arg, type, false)) == NULL)
		return (NULL);
	rs->rnh = rnh;

	CK_STAILQ_INSERT_HEAD(&rnh->rnh_subscribers, rs, next);

	return (rs);
}

/*
 * Remove rtable subscription @rs from the routing table.
 * Needs to be run in network epoch.
 */
void
rib_unsubscribe(struct rib_subscription *rs)
{
	struct rib_head *rnh = rs->rnh;

	NET_EPOCH_ASSERT();

	RIB_WLOCK(rnh);
	CK_STAILQ_REMOVE(&rnh->rnh_subscribers, rs, rib_subscription, next);
	RIB_WUNLOCK(rnh);

	epoch_call(net_epoch_preempt, destroy_subscription_epoch,
	    &rs->epoch_ctx);
}

void
rib_unsubscribe_locked(struct rib_subscription *rs)
{
	struct rib_head *rnh = rs->rnh;

	NET_EPOCH_ASSERT();
	RIB_WLOCK_ASSERT(rnh);

	CK_STAILQ_REMOVE(&rnh->rnh_subscribers, rs, rib_subscription, next);

	epoch_call(net_epoch_preempt, destroy_subscription_epoch,
	    &rs->epoch_ctx);
}

/*
 * Epoch callback indicating subscription is safe to destroy
 */
static void
destroy_subscription_epoch(epoch_context_t ctx)
{
	struct rib_subscription *rs;

	rs = __containerof(ctx, struct rib_subscription, epoch_ctx);

	free(rs, M_RTABLE);
}

void
rib_init_subscriptions(struct rib_head *rnh)
{
	CK_STAILQ_INIT(&rnh->rnh_subscribers);
}

void
rib_destroy_subscriptions(struct rib_head *rnh)
{
	struct rib_subscription *rs;
	struct epoch_tracker et;

	NET_EPOCH_ENTER(et);
	RIB_WLOCK(rnh);
	while ((rs = CK_STAILQ_FIRST(&rnh->rnh_subscribers)) != NULL) {
		CK_STAILQ_REMOVE_HEAD(&rnh->rnh_subscribers, next);
		epoch_call(net_epoch_preempt, destroy_subscription_epoch,
		    &rs->epoch_ctx);
	}
	RIB_WUNLOCK(rnh);
	NET_EPOCH_EXIT(et);
}
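
/*
 * Illustrative sketch (not part of this change): consuming the subscription
 * API above.  The callback signature follows the rs->func(rnh, rc, rs->arg)
 * invocation in rib_notify(); the exact rib_subscription_cb_t typedef lives
 * in the route_ctl header.  Function and variable names here are hypothetical.
 */
#if 0
static void
example_route_event(struct rib_head *rnh, struct rib_cmd_info *rc, void *arg)
{
	/*
	 * rc describes the transition: rc_cmd is RTM_ADD/RTM_CHANGE/RTM_DELETE,
	 * rc_nh_old/rc_nh_new are the previous and the new nexthop (or group).
	 */
	printf("%s: fib %u cmd %d\n", __func__, rnh->rib_fibnum, rc->rc_cmd);
}

static struct rib_subscription *example_rs;

static void
example_subscribe(uint32_t fibnum)
{
	/* Delayed notifications are delivered outside the RIB write lock. */
	example_rs = rib_subscribe(fibnum, AF_INET, example_route_event, NULL,
	    RIB_NOTIFY_DELAYED, true);
}

static void
example_unsubscribe(void)
{
	struct epoch_tracker et;

	/* rib_unsubscribe() requires the caller to be in the network epoch. */
	NET_EPOCH_ENTER(et);
	rib_unsubscribe(example_rs);
	NET_EPOCH_EXIT(et);
}
#endif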